1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler.filter;
26
27
28 import java.util.logging.Level;
29 import java.util.logging.Logger;
30
31 import org.apache.commons.httpclient.HttpMethod;
32 import org.archive.crawler.datamodel.CrawlURI;
33 import org.archive.crawler.framework.Filter;
34 import org.archive.crawler.frontier.AdaptiveRevisitAttributeConstants;
35
36 /***
37 * A mid fetch filter for HTTP fetcher processors. It will evaluate the HTTP
38 * header to try and predict if the document has changed since it last passed
39 * through this filter. It does this by comparing the last-modified and etag
40 * values with the same values stored during the last processing of the URI.
41 * <p>
42 * If both values are present, they must agree on predicting no change,
43 * otherwise a change is predicted (return true).
44 * <p>
45 * If only one of the values is present, it alone is used to predict if a
46 * change has occured.
47 * <p>
48 * If neither value is present the filter will return true (predict change)
49 *
50 * @author Kristinn Sigurdsson
51 */
52 public class HTTPMidFetchUnchangedFilter extends Filter
53 implements AdaptiveRevisitAttributeConstants {
54
55 private static final long serialVersionUID = -7416477243375196980L;
56
57 private static final Logger logger =
58 Logger.getLogger(HTTPMidFetchUnchangedFilter.class.getName());
59
60
61 public static final int HEADER_PREDICTS_MISSING = -1;
62 public static final int HEADER_PREDICTS_UNCHANGED = 0;
63 public static final int HEADER_PREDICTS_CHANGED = 1;
64
65 /***
66 * Constructor
67 *
68 * @param name Module name
69 */
70 public HTTPMidFetchUnchangedFilter(String name){
71 this(name, "Filters out unchanged documents. " +
72 "Examines HTTP Header timestamp and etags. " +
73 "This filter should" +
74 "only be used in the 'midfetch-filters' on the FetchHTTP " +
75 "processor. Earlier then that, the headers are not available " +
76 "and later, the entire document is available and examining " +
77 "this will usually give better results then relying on HTTP " +
78 "headers. See documentation for further details.");
79
80
81 CrawlURI.addAlistPersistentMember(A_LAST_DATESTAMP);
82 CrawlURI.addAlistPersistentMember(A_LAST_ETAG);
83 }
84
85 /***
86 * Constructor
87 *
88 * @param name Module name
89 * @param description A description of the modules functions
90 */
91 public HTTPMidFetchUnchangedFilter(String name, String description) {
92 super(name, description);
93 }
94
95 protected boolean innerAccepts(Object o) {
96
97
98 if(o instanceof CrawlURI == false){
99
100 if (logger.isLoggable(Level.INFO)) {
101 logger.info("Error: Object passed for evaluation was not a " +
102 "CrawlURI. " + o.toString());
103 }
104 return true;
105 }
106
107 CrawlURI curi = (CrawlURI)o;
108
109 if (curi.isHttpTransaction() == false) {
110
111 if (logger.isLoggable(Level.INFO)) {
112 logger.info("Error: Non HTTP CrawlURI was passed for evalution. "
113 + curi.toString());
114 }
115 return true;
116 }
117
118 if(curi.containsKey(A_HTTP_TRANSACTION) == false){
119
120 if (logger.isLoggable(Level.INFO)) {
121 logger.info("Error: Missing HttpMethod object in CrawlURI. "
122 + curi.toString());
123 }
124 return true;
125 }
126
127
128 int datestamp = HEADER_PREDICTS_MISSING;
129 int etag = HEADER_PREDICTS_MISSING;
130 HttpMethod method = (HttpMethod)curi.getObject(A_HTTP_TRANSACTION);
131
132
133 String newDatestamp = null;
134 if (method.getResponseHeader("last-modified") != null) {
135 newDatestamp = method.getResponseHeader("last-modified").getValue();
136 }
137
138 if (newDatestamp != null && newDatestamp.length() > 0) {
139 datestamp = HEADER_PREDICTS_CHANGED;
140 if (curi.containsKey(A_LAST_DATESTAMP)) {
141 if (newDatestamp.equals(curi.getString(A_LAST_DATESTAMP))) {
142
143
144 datestamp = HEADER_PREDICTS_UNCHANGED;
145 }
146 }
147 curi.putString(A_LAST_DATESTAMP, newDatestamp);
148 }
149
150
151 String newETag = null;
152 if(method.getResponseHeader("last-etag") != null){
153 newETag = method.getResponseHeader("last-etag").getValue();
154 }
155
156 if(newETag != null && newETag.length() > 0){
157 etag = HEADER_PREDICTS_CHANGED;
158 if(curi.containsKey(A_LAST_ETAG)){
159 if(newETag.equals(curi.getString(A_LAST_ETAG))){
160
161
162 etag = HEADER_PREDICTS_UNCHANGED;
163 }
164 }
165 curi.putString(A_LAST_ETAG, newETag);
166 }
167
168
169 if (datestamp == HEADER_PREDICTS_UNCHANGED
170 && etag == HEADER_PREDICTS_UNCHANGED) {
171
172 curi.putInt(A_CONTENT_STATE_KEY, CONTENT_UNCHANGED);
173 return false;
174 }
175
176 if (datestamp == HEADER_PREDICTS_MISSING
177 && etag == HEADER_PREDICTS_UNCHANGED) {
178
179 curi.putInt(A_CONTENT_STATE_KEY, CONTENT_UNCHANGED);
180 return false;
181 }
182 if (datestamp == HEADER_PREDICTS_UNCHANGED
183 && etag == HEADER_PREDICTS_MISSING) {
184
185 curi.putInt(A_CONTENT_STATE_KEY, CONTENT_UNCHANGED);
186 return false;
187 }
188 return true;
189 }
190 }