View Javadoc

1   /* ChangeEvaluator
2    * 
3    * $Id: ChangeEvaluator.java 5792 2008-03-24 00:12:21Z gojomo $
4    * 
5    * Created on 11.11.2004
6    *
7    * Copyright (C) 2004 Kristinn Sigurdsson.
8    * 
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   * 
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   * 
16   * Heritrix is distributed in the hope that it will be useful, 
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   * 
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.crawler.extractor;
26  
27  import java.util.logging.Level;
28  import java.util.logging.Logger;
29  
30  import org.archive.crawler.datamodel.CrawlURI;
31  import org.archive.crawler.framework.Processor;
32  import org.archive.crawler.frontier.AdaptiveRevisitAttributeConstants;
33  import org.archive.util.Base32;
34  
35  /***
36   * This processor compares the CrawlURI's current 
37   * {@link org.archive.crawler.datamodel.CrawlURI#getContentDigest() content digest}
38   * with the one from a previous crawl. If they are equal, then further 
39   * processing is skipped (going straight to the post processor chain) and the
40   * CrawlURI is marked appropriately.
41   *
42   * @author Kristinn Sigurdsson
43   */
44  public class ChangeEvaluator extends Processor
45  implements AdaptiveRevisitAttributeConstants {
46  
47      private static final long serialVersionUID = 5547590621493534632L;
48      private static final Logger logger =
49          Logger.getLogger(ChangeEvaluator.class.getName());
50  
51      /***
52       * Constructor
53       * @param name The name of the module
54       */
55      public ChangeEvaluator(String name) {
56          super(name, "Compares CrawlURI's current " +
57                  "content digest with digest from previous crawl. If " +
58                  "equal, further processing is skipped (going " +
59                  "straight to the post processor chain) and the CrawlURI is " +
60                  "marked appropriately. Should be located at the start of " +
61                  "the Extractor chain.");
62  
63          // Register persistent CrawlURI items 
64          CrawlURI.addAlistPersistentMember(A_LAST_CONTENT_DIGEST);
65          CrawlURI.addAlistPersistentMember(A_NUMBER_OF_VISITS);
66          CrawlURI.addAlistPersistentMember(A_NUMBER_OF_VERSIONS);
67      }
68  
69      protected void innerProcess(CrawlURI curi) throws InterruptedException {
70          if (curi.isSuccess() == false) {
71              // Early return. No point in doing comparison on failed downloads.
72              if (logger.isLoggable(Level.FINEST)) {
73                  logger.finest("Not handling " + curi.toString()
74                          + ", did not " + "succeed.");
75              }
76              return;
77          }
78          
79          // If a mid fetch filter aborts the HTTP fetch because the headers
80          // predict no change, we can skip the whole comparing hashes.
81          if (!curi.containsKey(A_CONTENT_STATE_KEY) ||
82                  curi.getInt(A_CONTENT_STATE_KEY) != CONTENT_UNCHANGED) {
83              String currentDigest = null;
84              Object digest = curi.getContentDigest();
85              if (digest != null) {
86                  currentDigest = Base32.encode((byte[])digest);
87              }
88      
89              String oldDigest = null;
90              if (curi.containsKey(A_LAST_CONTENT_DIGEST)) {
91                  oldDigest = curi.getString(A_LAST_CONTENT_DIGEST);
92              }
93      
94              // Compare the String representation of the byte arrays.
95              if (currentDigest == null && oldDigest == null) {
96                  // Both are null, can't do a thing
97                  if (logger.isLoggable(Level.FINER)) {
98                      logger.finer("On " + curi.toString()
99                              + " both digest are null");
100                 }
101                 // NOTE! RETURN!
102                 return;
103             }
104             
105             if (currentDigest != null && oldDigest != null 
106                     && currentDigest.equals(oldDigest)) { 
107                 // If equal, we have just downloaded a duplicate.
108                 if (logger.isLoggable(Level.FINER)) {
109                     logger.finer("On " + curi.toString()
110                             + " both digest are " + "equal. Old: " + oldDigest
111                             + ", new: " + currentDigest);
112                 }
113                 curi.putInt(A_CONTENT_STATE_KEY, CONTENT_UNCHANGED);
114                 // TODO: In the future processors should take note of the content
115                 // state, removing the need for the following 'skip'
116                 curi.skipToProcessorChain(getController().getPostprocessorChain());
117                 // Since this is an unchanged page, no need to reschedule all of it's links.
118                 curi.clearOutlinks();
119                 // Make not in log
120                 curi.addAnnotation("unchanged");
121                 // Set content size to zero, we are not going to 'write it to disk'
122                 curi.setContentSize(0);
123             } else {
124                 // Document has changed
125                 if (logger.isLoggable(Level.FINER)) {
126                     logger.finer("On " + curi.toString()
127                             + " digest are not " + "equal. Old: "
128                             + (oldDigest == null? "null": oldDigest)
129                             + ", new: "
130                             + (currentDigest == null? "null": currentDigest));
131                 }
132                 // currentDigest may be null, that probably means a failed download
133                 curi.putInt(A_CONTENT_STATE_KEY, CONTENT_CHANGED);
134                 curi.putString(A_LAST_CONTENT_DIGEST, currentDigest); 
135             }
136         } else {
137             if (logger.isLoggable(Level.FINER)) {
138                 logger.finer("On " + curi.toString()
139                         + " content state was " + "already set as UNCHANGED.");
140             }
141             // Just like matching digests, there is no need to continue processing.
142             curi.skipToProcessorChain(getController().getPostprocessorChain());
143             // Since this is an unchanged page, no need to reschedule all of it's links.
144             curi.clearOutlinks();
145         }
146         
147         // Update visit and version counters
148         int visits = 1;
149         if(curi.containsKey(A_NUMBER_OF_VISITS)) {
150             visits = curi.getInt(A_NUMBER_OF_VISITS) + 1;
151         }
152         curi.putInt(A_NUMBER_OF_VISITS, visits);
153 
154         // Update versions.
155         if(curi.getInt(A_CONTENT_STATE_KEY) == CONTENT_CHANGED) {
156             int versions = 1;
157             if(curi.containsKey(A_NUMBER_OF_VERSIONS)) {
158                 versions = curi.getInt(A_NUMBER_OF_VERSIONS) + 1;
159             }
160             curi.putInt(A_NUMBER_OF_VERSIONS,versions);
161         }
162     }
163 }