View Javadoc

1   /* HTTPMidFetchUnchangedFilter
2    * 
3    * $Id: HTTPMidFetchUnchangedFilter.java 4652 2006-09-25 18:41:10Z paul_jack $
4    * 
5    * Created on 4.2.2005
6    *
7    * Copyright (C) 2005 Kristinn Sigurðsson
8    * 
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   * 
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   * 
16   * Heritrix is distributed in the hope that it will be useful, 
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   * 
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.crawler.filter;
26  
27  
28  import java.util.logging.Level;
29  import java.util.logging.Logger;
30  
31  import org.apache.commons.httpclient.HttpMethod;
32  import org.archive.crawler.datamodel.CrawlURI;
33  import org.archive.crawler.framework.Filter;
34  import org.archive.crawler.frontier.AdaptiveRevisitAttributeConstants;
35  
36  /***
37   * A mid fetch filter for HTTP fetcher processors. It will evaluate the HTTP
38   * header to try and predict if the document has changed since it last passed
39   * through this filter. It does this by comparing the last-modified and etag 
40   * values with the same values stored during the last processing of the URI.
41   * <p>
42   * If both values are present, they must agree on predicting no change, 
43   * otherwise a change is predicted (return true).
44   * <p>
45   * If only one of the values is present, it alone is used to predict if a 
46   * change has occured.
47   * <p>
48   * If neither value is present the filter will return true (predict change) 
49   *
50   * @author Kristinn Sigurdsson
51   */
52  public class HTTPMidFetchUnchangedFilter extends Filter 
53  implements AdaptiveRevisitAttributeConstants {
54  
55      private static final long serialVersionUID = -7416477243375196980L;
56  
57      private static final Logger logger =
58          Logger.getLogger(HTTPMidFetchUnchangedFilter.class.getName());
59  
60      // Header predictor state constants
61      public static final int HEADER_PREDICTS_MISSING = -1;
62      public static final int HEADER_PREDICTS_UNCHANGED = 0;
63      public static final int HEADER_PREDICTS_CHANGED = 1;
64      
65      /***
66       * Constructor
67       * 
68       * @param name Module name
69       */
70      public HTTPMidFetchUnchangedFilter(String name){
71          this(name, "Filters out unchanged documents. " +
72                  "Examines HTTP Header timestamp and etags. " +
73                  "This filter should" +
74                  "only be used in the 'midfetch-filters' on the FetchHTTP " +
75                  "processor. Earlier then that, the headers are not available " +
76                  "and later, the entire document is available and examining " +
77                  "this will usually give better results then relying on HTTP " +
78                  "headers. See documentation for further details.");
79  
80          // Register persistent CrawlURI items 
81          CrawlURI.addAlistPersistentMember(A_LAST_DATESTAMP);
82          CrawlURI.addAlistPersistentMember(A_LAST_ETAG);
83      }
84      
85      /***
86       * Constructor
87       * 
88       * @param name Module name
89       * @param description A description of the modules functions
90       */
91      public HTTPMidFetchUnchangedFilter(String name, String description) {
92          super(name, description);
93      }
94  
95      protected boolean innerAccepts(Object o) {
96          // Return FALSE when the document has NOT changed!
97          // Return TRUE if the document has changed or we can't tell
98          if(o instanceof CrawlURI == false){
99              // Only handles CrawlURIs
100             if (logger.isLoggable(Level.INFO)) {
101                 logger.info("Error: Object passed for evaluation was not a " +
102                     "CrawlURI. " + o.toString());
103             }
104             return true;
105         }
106         
107         CrawlURI curi = (CrawlURI)o;
108         
109         if (curi.isHttpTransaction() == false) {
110             // Only handles HTTP
111             if (logger.isLoggable(Level.INFO)) {
112                 logger.info("Error: Non HTTP CrawlURI was passed for evalution. "
113                     + curi.toString());
114             }
115             return true;
116         }
117         
118         if(curi.containsKey(A_HTTP_TRANSACTION) == false){
119             // Missing header info, can't do anything.
120             if (logger.isLoggable(Level.INFO)) {
121                 logger.info("Error: Missing HttpMethod object in CrawlURI. "
122                         + curi.toString());
123             }
124             return true;
125         }
126         
127         // Intially assume header info is missing
128         int datestamp = HEADER_PREDICTS_MISSING;
129         int etag = HEADER_PREDICTS_MISSING;
130         HttpMethod method = (HttpMethod)curi.getObject(A_HTTP_TRANSACTION);
131 
132         // Compare datestamps (last-modified)
133         String newDatestamp = null;
134         if (method.getResponseHeader("last-modified") != null) {
135             newDatestamp = method.getResponseHeader("last-modified").getValue();
136         }
137         
138         if (newDatestamp != null && newDatestamp.length() > 0) {
139             datestamp = HEADER_PREDICTS_CHANGED; // Not missing, assume change
140             if (curi.containsKey(A_LAST_DATESTAMP)) {
141                 if (newDatestamp.equals(curi.getString(A_LAST_DATESTAMP))) {
142                     // Both new and old are present and equal, datestamp
143                     // predicts no change
144                     datestamp = HEADER_PREDICTS_UNCHANGED;
145                 }
146             }
147             curi.putString(A_LAST_DATESTAMP, newDatestamp);
148         }
149         
150         // Compare ETags
151         String newETag = null;
152         if(method.getResponseHeader("last-etag") != null){
153             newETag = method.getResponseHeader("last-etag").getValue();
154         }
155         
156         if(newETag != null && newETag.length() > 0){
157             etag = HEADER_PREDICTS_CHANGED; // Not missing, assume change
158             if(curi.containsKey(A_LAST_ETAG)){
159                 if(newETag.equals(curi.getString(A_LAST_ETAG))){
160                     // Both new and old are present and equal, etag 
161                     // predicts no change
162                     etag = HEADER_PREDICTS_UNCHANGED;
163                 }
164             }
165             curi.putString(A_LAST_ETAG, newETag);
166         }
167         
168         // If both are present, predict no change only if both agree
169         if (datestamp == HEADER_PREDICTS_UNCHANGED
170                 && etag == HEADER_PREDICTS_UNCHANGED) {
171             // Have both and they agree, no change
172             curi.putInt(A_CONTENT_STATE_KEY, CONTENT_UNCHANGED);
173             return false;
174         }
175         // If one or the other is missing, trust the one that is present
176         if (datestamp == HEADER_PREDICTS_MISSING
177                 && etag == HEADER_PREDICTS_UNCHANGED) {
178             // Only have etag, and it predicts no change
179             curi.putInt(A_CONTENT_STATE_KEY, CONTENT_UNCHANGED);
180             return false;
181         }
182         if (datestamp == HEADER_PREDICTS_UNCHANGED
183                 && etag == HEADER_PREDICTS_MISSING) {
184             // Only have last-modified, and it predicts no change
185             curi.putInt(A_CONTENT_STATE_KEY, CONTENT_UNCHANGED);
186             return false;
187         }
188         return true; // Default, assume change. 
189     }
190 }