View Javadoc

1   /* Constants
2    * 
3    * $Id: AdaptiveRevisitAttributeConstants.java 5796 2008-03-25 21:53:04Z gojomo $
4    * 
5    * Created on 26.11.2004
6    *
7    * Copyright (C) 2004 Internet Archive.
8    * 
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   * 
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   * 
16   * Heritrix is distributed in the hope that it will be useful, 
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   * 
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.crawler.frontier;
26  
27  import org.archive.crawler.datamodel.CoreAttributeConstants;
28  
29  /***
30   * Defines static constants for the Adaptive Revisiting module defining data
31   * keys in the CrawlURI AList. 
32   *
33   * @author Kristinn Sigurdsson
34   * 
35   * @see org.archive.crawler.datamodel.CoreAttributeConstants
36   */
37  public interface AdaptiveRevisitAttributeConstants
38  extends CoreAttributeConstants {
39  
40      /*** Designates a field in the CrawlURIs AList for the content digest of
41       *  an earlier visit. */
42      public static final String A_LAST_CONTENT_DIGEST = "last-content-digest";
43      public static final String A_TIME_OF_NEXT_PROCESSING = 
44          "time-of-next-processing";
45      public static final String A_WAIT_INTERVAL = "wait-interval";
46      public static final String A_NUMBER_OF_VISITS = "number-of-visits";
47      public static final String A_NUMBER_OF_VERSIONS = "number-of-versions";
48      public static final String A_FETCH_OVERDUE = "fetch-overdue";
49      
50      public static final String A_LAST_ETAG = "last-etag";
51      public static final String A_LAST_DATESTAMP = "last-datestamp";
52      
53      public static final String A_WAIT_REEVALUATED = "wait-reevaluated";
54      
55      /*** Mark a URI to be dropped from revisit handling. Used for custom 
56       * processors that want to implement more selective revisiting. 
57       * Actual effect depends on whether an alreadyIncluded structure
58       * is used. If an alreadyIncluded is used, dropping the URI from 
59       * revisit handling means it won't be visited again. If an
60       * alreadyIncluded is not used, this merely drops one discovery of 
61       * the URI, and it may be rediscovered and thus revisited that way.
62       */
63      public static final String A_DISCARD_REVISIT = "no-revisit";
64      
65      /*** No knowledge of URI content. Possibly not fetched yet, unable
66       *  to check if different or an error occurred on last fetch attempt. */
67      public static final int CONTENT_UNKNOWN = -1;
68      
69      /*** URI content has not changed between the two latest, successfully
70       *  completed fetches. */
71      public static final int CONTENT_UNCHANGED = 0;
72      
73      /*** URI content had changed between the two latest, successfully completed
74       *  fetches. By definition, content has changed if there has only been one
75       *  successful fetch made. */
76      public static final int CONTENT_CHANGED = 1;
77  
78      /***
79       * Key to use getting state of crawluri from the CrawlURI alist.
80       */
81      public static final String A_CONTENT_STATE_KEY = "ar-state";
82  }