/* HTTPContentDigest
 *
 * $Id: HTTPContentDigest.java 4654 2006-09-25 20:19:54Z paul_jack $
 *
 * Created on 5.1.2005
 *
 * Copyright (C) 2005 Kristinn Sigurðsson
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package org.archive.crawler.extractor;

import java.io.IOException;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;

import javax.management.AttributeNotFoundException;

import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.settings.SimpleType;
import org.archive.io.ReplayCharSequence;
import org.archive.util.Base32;
import org.archive.util.TextUtils;

/**
 * A processor for calculating custom HTTP content digests in place of the
 * default (if any) computed by the HTTP fetcher processors.
 * <p>
 * This processor allows the user to specify a regular expression called
 * <i>strip-reg-expr</i>. Any segment of a document (text only; binary files
 * will be skipped) that matches this regular expression will be rewritten
 * with the blank character (character 32 in the ANSI character set) <b>for
 * the purpose of the digest</b>; this has no effect on the document for
 * subsequent processing or archiving.
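 * <p>
 * For example (purely illustrative), setting <i>strip-reg-expr</i> to
 * <code>&lt;!--.*?--&gt;</code> would blank out HTML comments, a common
 * carrier of per-fetch timestamps or session identifiers, so that two
 * otherwise identical pages produce the same digest.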
 * <p>
 * NOTE: Content digest only accounts for the document body, not headers.
 * <p>
 * The operator will also be able to specify a maximum length for documents
 * being evaluated by this processor. Documents exceeding that length will be
 * ignored.
 * <p>
 * To further discriminate by file type or URL, an operator should use the
 * override and refinement options.
 * <p>
 * It is generally recommended that this recalculation only be performed when
 * absolutely needed (because of stripping data that changes automatically each
 * time the URL is fetched) as this is an expensive operation.
 *
 * @author Kristinn Sigurdsson
 */
public class HTTPContentDigest extends Processor {

    private static final long serialVersionUID = 8055532198737384358L;

    private static Logger logger =
        Logger.getLogger(HTTPContentDigest.class.getName());

    /** A regular expression detailing elements to strip before making digest */
    public final static String ATTR_STRIP_REG_EXPR = "strip-reg-expr";
    protected final static String DEFAULT_STRIP_REG_EXPR = "";
    /** Maximum file size to process - longer files will be ignored. -1 = unlimited */
    public final static String ATTR_MAX_SIZE_BYTES = "max-size-bytes";
    protected final static Long DEFAULT_MAX_SIZE_BYTES = Long.valueOf(1048576L); // 1 megabyte

    private static final String SHA1 = "SHA1";

    /**
     * Constructor.
     * @param name Processor name
     */
    public HTTPContentDigest(String name) {
        super(name, "Calculate custom - stripped - content digests. " +
                "A processor for calculating custom HTTP content digests " +
                "in place of the default (if any) computed by the HTTP " +
                "fetcher processors. " +
                "This processor enables you to specify a regular expression " +
                "called strip-reg-expr. Any segment of a document (text " +
                "only, binary files will be skipped) that matches this " +
                "regular expression will be rewritten with the blank " +
                "character (character 32 in the ANSI character set) FOR THE " +
                "PURPOSE OF THE DIGEST; this has no effect on the document " +
                "for subsequent processing or archiving. You can also " +
                "specify a maximum length for documents being evaluated by " +
                "this processor. Documents exceeding that length will be " +
                "ignored. " +
                "To further discriminate by file type or URL, you should use " +
                "the override and refinement options (the processor can be " +
                "disabled by default and only enabled as needed in overrides " +
                "and refinements). " +
                "It is generally recommended that this recalculation only be " +
                "performed when absolutely needed (because of stripping data " +
                "that changes automatically each time the URL is fetched) as " +
                "this is an expensive operation.");

        addElementToDefinition(new SimpleType(ATTR_STRIP_REG_EXPR,
                "A regular expression that matches those portions of " +
                "downloaded documents that need to be ignored when " +
                "calculating the content digest. " +
                "Segments matching this expression will be rewritten with " +
                "the blank character for the content digest.",
                DEFAULT_STRIP_REG_EXPR));
        addElementToDefinition(new SimpleType(ATTR_MAX_SIZE_BYTES,
                "Maximum size of documents to recalculate the digest for." +
                " Documents that exceed this value (bytes) will be ignored." +
                " Defaults to 1048576 bytes, or 1 MB. " +
                "-1 denotes unlimited size. A setting of 0 will effectively " +
                "disable the processor.",
                DEFAULT_MAX_SIZE_BYTES));
    }

    protected void innerProcess(CrawlURI curi) throws InterruptedException {
        if (!curi.isHttpTransaction()) {
            // Only handles HTTP documents.
            return;
        }
        if (!TextUtils.matches("^text.*$", curi.getContentType())) {
            // Only handles text based documents.
            return;
        }
        long maxsize = DEFAULT_MAX_SIZE_BYTES.longValue();
        try {
            maxsize = ((Long)getAttribute(curi, ATTR_MAX_SIZE_BYTES)).longValue();
        } catch (AttributeNotFoundException e) {
            logger.severe("Missing max-size-bytes attribute when processing " +
                    curi.toString());
            // Fall through and use the default maximum size.
        }
        if (maxsize < curi.getContentSize() && maxsize > -1) {
            // Document too big.
            return;
        }

        // Ok, if we got this far we need to calculate the content digest.
        // Get the regexpr.
        String regexpr = "";
        try {
            regexpr = (String)getAttribute(curi, ATTR_STRIP_REG_EXPR);
        } catch (AttributeNotFoundException e2) {
            logger.severe("Missing strip-reg-expr attribute when processing " +
                    curi.toString());
            return; // Can't do anything without it.
        }

        // Get a replay of the document character seq.
        ReplayCharSequence cs = null;

        try {
            cs = curi.getHttpRecorder().getReplayCharSequence();
        } catch (Exception e) {
            curi.addLocalizedError(this.getName(), e,
                "Failed get of replay char sequence " + curi.toString() +
                    " " + e.getMessage());
            logger.warning("Failed get of replay char sequence " +
                curi.toString() + " " + e.getMessage() + " " +
                Thread.currentThread().getName());
            return; // Can't proceed if this happens.
        }

        // Create a MessageDigest
        MessageDigest digest = null;

        // We have a ReplayCharSequence open.  Wrap all in finally so we
        // for sure close it before we leave.
        try {
            try {
                digest = MessageDigest.getInstance(SHA1);
            } catch (NoSuchAlgorithmException e1) {
                // SHA1 should always be available; log and give up if not.
                logger.severe("SHA1 algorithm not available: " + e1);
                return;
            }

            digest.reset();

            String s = null;

            if (regexpr.length() == 0) {
                s = cs.toString();
            } else {
                // Process the document
                Matcher m = TextUtils.getMatcher(regexpr, cs);
                s = m.replaceAll(" ");
                TextUtils.recycleMatcher(m);
            }
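            // Note: getBytes() encodes the (possibly stripped) character
            // sequence with the JVM default charset, so digests are only
            // comparable across runs that use the same default encoding.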
            digest.update(s.getBytes());

            // Get the new digest value
            byte[] newDigestValue = digest.digest();

            // Log if needed.
            if (logger.isLoggable(Level.FINEST)) {
                logger.finest("Recalculated content digest for "
                        + curi.toString() + " old: "
                        + Base32.encode((byte[]) curi.getContentDigest())
                        + ", new: " + Base32.encode(newDigestValue));
            }
            // Save new digest value
            curi.setContentDigest(SHA1, newDigestValue);
        } finally {
            if (cs != null) {
                try {
                    cs.close();
                } catch (IOException ioe) {
                    logger.warning(TextUtils.exceptionToString(
                            "Failed close of ReplayCharSequence.", ioe));
                }
            }
        }
    }
}