1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler.extractor;
26
27 import java.util.logging.Level;
28 import java.util.logging.Logger;
29
30 import org.archive.crawler.datamodel.CrawlURI;
31 import org.archive.crawler.framework.Processor;
32 import org.archive.crawler.frontier.AdaptiveRevisitAttributeConstants;
33 import org.archive.util.Base32;
34
35 /***
36 * This processor compares the CrawlURI's current
37 * {@link org.archive.crawler.datamodel.CrawlURI#getContentDigest() content digest}
38 * with the one from a previous crawl. If they are equal, then further
39 * processing is skipped (going straight to the post processor chain) and the
40 * CrawlURI is marked appropriately.
41 *
42 * @author Kristinn Sigurdsson
43 */
44 public class ChangeEvaluator extends Processor
45 implements AdaptiveRevisitAttributeConstants {
46
47 private static final long serialVersionUID = 5547590621493534632L;
48 private static final Logger logger =
49 Logger.getLogger(ChangeEvaluator.class.getName());
50
51 /***
52 * Constructor
53 * @param name The name of the module
54 */
55 public ChangeEvaluator(String name) {
56 super(name, "Compares CrawlURI's current " +
57 "content digest with digest from previous crawl. If " +
58 "equal, further processing is skipped (going " +
59 "straight to the post processor chain) and the CrawlURI is " +
60 "marked appropriately. Should be located at the start of " +
61 "the Extractor chain.");
62
63
64 CrawlURI.addAlistPersistentMember(A_LAST_CONTENT_DIGEST);
65 CrawlURI.addAlistPersistentMember(A_NUMBER_OF_VISITS);
66 CrawlURI.addAlistPersistentMember(A_NUMBER_OF_VERSIONS);
67 }
68
69 protected void innerProcess(CrawlURI curi) throws InterruptedException {
70 if (curi.isSuccess() == false) {
71
72 if (logger.isLoggable(Level.FINEST)) {
73 logger.finest("Not handling " + curi.toString()
74 + ", did not " + "succeed.");
75 }
76 return;
77 }
78
79
80
81 if (!curi.containsKey(A_CONTENT_STATE_KEY) ||
82 curi.getInt(A_CONTENT_STATE_KEY) != CONTENT_UNCHANGED) {
83 String currentDigest = null;
84 Object digest = curi.getContentDigest();
85 if (digest != null) {
86 currentDigest = Base32.encode((byte[])digest);
87 }
88
89 String oldDigest = null;
90 if (curi.containsKey(A_LAST_CONTENT_DIGEST)) {
91 oldDigest = curi.getString(A_LAST_CONTENT_DIGEST);
92 }
93
94
95 if (currentDigest == null && oldDigest == null) {
96
97 if (logger.isLoggable(Level.FINER)) {
98 logger.finer("On " + curi.toString()
99 + " both digest are null");
100 }
101
102 return;
103 }
104
105 if (currentDigest != null && oldDigest != null
106 && currentDigest.equals(oldDigest)) {
107
108 if (logger.isLoggable(Level.FINER)) {
109 logger.finer("On " + curi.toString()
110 + " both digest are " + "equal. Old: " + oldDigest
111 + ", new: " + currentDigest);
112 }
113 curi.putInt(A_CONTENT_STATE_KEY, CONTENT_UNCHANGED);
114
115
116 curi.skipToProcessorChain(getController().getPostprocessorChain());
117
118 curi.clearOutlinks();
119
120 curi.addAnnotation("unchanged");
121
122 curi.setContentSize(0);
123 } else {
124
125 if (logger.isLoggable(Level.FINER)) {
126 logger.finer("On " + curi.toString()
127 + " digest are not " + "equal. Old: "
128 + (oldDigest == null? "null": oldDigest)
129 + ", new: "
130 + (currentDigest == null? "null": currentDigest));
131 }
132
133 curi.putInt(A_CONTENT_STATE_KEY, CONTENT_CHANGED);
134 curi.putString(A_LAST_CONTENT_DIGEST, currentDigest);
135 }
136 } else {
137 if (logger.isLoggable(Level.FINER)) {
138 logger.finer("On " + curi.toString()
139 + " content state was " + "already set as UNCHANGED.");
140 }
141
142 curi.skipToProcessorChain(getController().getPostprocessorChain());
143
144 curi.clearOutlinks();
145 }
146
147
148 int visits = 1;
149 if(curi.containsKey(A_NUMBER_OF_VISITS)) {
150 visits = curi.getInt(A_NUMBER_OF_VISITS) + 1;
151 }
152 curi.putInt(A_NUMBER_OF_VISITS, visits);
153
154
155 if(curi.getInt(A_CONTENT_STATE_KEY) == CONTENT_CHANGED) {
156 int versions = 1;
157 if(curi.containsKey(A_NUMBER_OF_VERSIONS)) {
158 versions = curi.getInt(A_NUMBER_OF_VERSIONS) + 1;
159 }
160 curi.putInt(A_NUMBER_OF_VERSIONS,versions);
161 }
162 }
163 }