package org.archive.crawler.extractor;

import java.io.IOException;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;

import javax.management.AttributeNotFoundException;

import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.settings.SimpleType;
import org.archive.io.ReplayCharSequence;
import org.archive.util.Base32;
import org.archive.util.TextUtils;

/**
 * A processor for calculating custom HTTP content digests in place of the
 * default (if any) computed by the HTTP fetcher processors.
 * <p>
 * This processor allows the user to specify a regular expression called
 * <i>strip-reg-expr</i>. Any segment of a document (text only, binary files
 * will be skipped) that matches this regular expression will be rewritten
 * with the blank character (character 32 in the ASCII character set) <b>for
 * the purpose of the digest</b>; this has no effect on the document for
 * subsequent processing or archiving.
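 * <p>
 * For example, a hypothetical expression that blanks out HTML comments (a
 * common carrier of per-fetch timestamps or session identifiers) would be
 * <code>&lt;!--.*?--&gt;</code>. With it, the body
 * <code>&lt;p&gt;Hi&lt;/p&gt;&lt;!-- ts=1234 --&gt;</code> is digested as if
 * each matched segment had been replaced by a single blank character. The
 * expression is purely illustrative; the default is empty, meaning no
 * stripping is performed.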
 * <p>
 * NOTE: The content digest only accounts for the document body, not headers.
 * <p>
 * The operator can also specify a maximum length for documents being
 * evaluated by this processor. Documents exceeding that length will be
 * ignored.
 * <p>
 * To further discriminate by file type or URL, an operator should use the
 * override and refinement options.
 * <p>
 * It is generally recommended that this recalculation only be performed when
 * absolutely needed (because of stripping data that changes automatically
 * each time the URL is fetched) as this is an expensive operation.
 *
 * @author Kristinn Sigurdsson
 */
public class HTTPContentDigest extends Processor {

    private static final long serialVersionUID = 8055532198737384358L;

    private static Logger logger =
        Logger.getLogger(HTTPContentDigest.class.getName());

    /** A regular expression detailing elements to strip before making the digest. */
    public final static String ATTR_STRIP_REG_EXPR = "strip-reg-expr";
    protected final static String DEFAULT_STRIP_REG_EXPR = "";
    /** Maximum file size to process; longer files will be ignored. -1 = unlimited. */
    public final static String ATTR_MAX_SIZE_BYTES = "max-size-bytes";
    protected final static Long DEFAULT_MAX_SIZE_BYTES = Long.valueOf(1048576L);

    private static final String SHA1 = "SHA1";

    /**
     * Constructor.
     * @param name Processor name
     */
    public HTTPContentDigest(String name) {
        super(name, "Calculate custom - stripped - content digests. " +
                "A processor for calculating custom HTTP content digests " +
                "in place of the default (if any) computed by the HTTP " +
                "fetcher processors. " +
                "This processor enables you to specify a regular expression " +
                "called strip-reg-expr. Any segment of a document (text " +
                "only, binary files will be skipped) that matches this " +
                "regular expression will be rewritten with the blank " +
                "character (character 32 in the ASCII character set) FOR THE " +
                "PURPOSE OF THE DIGEST; this has no effect on the document " +
                "for subsequent processing or archiving. You can also " +
                "specify a maximum length for documents being evaluated by " +
                "this processor. Documents exceeding that length will be " +
                "ignored. " +
                "To further discriminate by file type or URL, you should use " +
                "the override and refinement options (the processor can be " +
                "disabled by default and only enabled as needed in overrides " +
                "and refinements). " +
                "It is generally recommended that this recalculation only be " +
                "performed when absolutely needed (because of stripping data " +
                "that changes automatically each time the URL is fetched) as " +
                "this is an expensive operation.");

        addElementToDefinition(new SimpleType(ATTR_STRIP_REG_EXPR,
                "A regular expression that matches those portions of " +
                "downloaded documents that need to be ignored when " +
                "calculating the content digest. " +
                "Segments matching this expression will be rewritten with " +
                "the blank character for the content digest.",
                DEFAULT_STRIP_REG_EXPR));
        addElementToDefinition(new SimpleType(ATTR_MAX_SIZE_BYTES,
                "Maximum size of documents to recalculate the digest for. " +
                "Documents that exceed this value (bytes) will be ignored. " +
                "Defaults to 1048576 bytes, or 1 MB. " +
                "-1 denotes unlimited size. A setting of 0 will effectively " +
                "disable the processor.",
                DEFAULT_MAX_SIZE_BYTES));
    }

    protected void innerProcess(CrawlURI curi) throws InterruptedException {
        // Only document bodies fetched over HTTP are eligible.
        if (!curi.isHttpTransaction()) {
            return;
        }
        // Binary files are skipped; only text content is digested here.
        if (!TextUtils.matches("^text.*$", curi.getContentType())) {
            return;
        }
        // Documents larger than max-size-bytes are ignored (-1 = unlimited).
        long maxsize = DEFAULT_MAX_SIZE_BYTES.longValue();
        try {
            maxsize = ((Long)getAttribute(curi, ATTR_MAX_SIZE_BYTES)).longValue();
        } catch (AttributeNotFoundException e) {
            logger.severe("Missing max-size-bytes attribute when processing " +
                curi.toString());
        }
        if (maxsize < curi.getContentSize() && maxsize > -1) {
            return;
        }

        // Get the regular expression that selects the segments to blank out.
        String regexpr = "";
        try {
            regexpr = (String)getAttribute(curi, ATTR_STRIP_REG_EXPR);
        } catch (AttributeNotFoundException e2) {
            logger.severe("Missing strip-reg-expr attribute when processing " +
                curi.toString());
            return;
        }

        // Get a character-level replay of the document body.
        ReplayCharSequence cs = null;
        try {
            cs = curi.getHttpRecorder().getReplayCharSequence();
        } catch (Exception e) {
            curi.addLocalizedError(this.getName(), e,
                "Failed get of replay char sequence " + curi.toString() +
                " " + e.getMessage());
            logger.warning("Failed get of replay char sequence " +
                curi.toString() + " " + e.getMessage() + " " +
                Thread.currentThread().getName());
            return; // Can't proceed without the document text.
        }

        // Create the SHA1 digest that will replace the one set by the fetcher.
        MessageDigest digest = null;
        try {
            try {
                digest = MessageDigest.getInstance(SHA1);
            } catch (NoSuchAlgorithmException e1) {
                // Every Java platform is required to provide SHA1; if it is
                // somehow missing, log it and give up rather than crash.
                logger.severe("SHA1 algorithm unavailable: " + e1.getMessage());
                return;
            }

            digest.reset();

            String s = null;
            if (regexpr.length() == 0) {
                // No stripping configured; digest the document text as-is.
                s = cs.toString();
            } else {
                // Rewrite each matching segment as a single blank character
                // for the digest only; the stored document is unchanged.
                Matcher m = TextUtils.getMatcher(regexpr, cs);
                s = m.replaceAll(" ");
                TextUtils.recycleMatcher(m);
            }
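            // Hypothetical example: with strip-reg-expr set to "<!--.*?-->",
            // the body "<p>Hi</p><!-- ts=1234 -->" is digested as
            // "<p>Hi</p> " - the comment collapses to one blank, so per-fetch
            // noise no longer changes the digest.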
            // Note: getBytes() encodes with the platform default charset.
            digest.update(s.getBytes());

            byte[] newDigestValue = digest.digest();

            if (logger.isLoggable(Level.FINEST)) {
                logger.finest("Recalculated content digest for "
                    + curi.toString() + " old: "
                    + Base32.encode((byte[])curi.getContentDigest())
                    + ", new: " + Base32.encode(newDigestValue));
            }

            curi.setContentDigest(SHA1, newDigestValue);
        } finally {
            // Always release the replay char sequence's backing resources.
            if (cs != null) {
                try {
                    cs.close();
                } catch (IOException ioe) {
                    logger.warning(TextUtils.exceptionToString(
                        "Failed close of ReplayCharSequence.", ioe));
                }
            }
        }
    }
}
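
/*
 * A minimal, self-contained sketch of the strip-then-digest technique the
 * processor above implements, using only plain JDK calls plus Base32 from
 * this codebase. The regular expression and input are illustrative
 * assumptions, not defaults of this class; getBytes() mirrors the
 * platform-default-charset behavior above.
 *
 *   // (inside a method declaring throws NoSuchAlgorithmException)
 *   MessageDigest md = MessageDigest.getInstance("SHA1");
 *   String content = "<p>Hi</p><!-- ts=1234 -->";
 *   String stripped = content.replaceAll("<!--.*?-->", " ");
 *   byte[] digestValue = md.digest(stripped.getBytes());
 *   System.out.println(Base32.encode(digestValue));
 */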