View Javadoc

1   /* Copyright (C) 2003 Internet Archive.
2    *
3    * This file is part of the Heritrix web crawler (crawler.archive.org).
4    *
5    * Heritrix is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU Lesser Public License as published by
7    * the Free Software Foundation; either version 2.1 of the License, or
8    * any later version.
9    *
10   * Heritrix is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU Lesser Public License for more details.
14   *
15   * You should have received a copy of the GNU Lesser Public License
16   * along with Heritrix; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   *
19   * CoreAttributeConstants.java
20   * Created on Jun 17, 2003
21   *
22   * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src//**
23   * @author gojomo
24   *
25   */
26  package org.archive.crawler.datamodel;
27  
/**
 * CrawlURI attribute keys used by the core crawler classes.
 *
 * <p>Every field declared here is a {@code String} key (or annotation token).
 * Because this is an interface, each field is implicitly
 * {@code public static final}; the modifiers are spelled out uniformly only
 * for readability.
 *
 * @author gojomo
 */
public interface CoreAttributeConstants {

    /**
     * Extracted MIME type of fetched content; should be
     * set immediately by fetching module if possible
     * (rather than waiting for a later analyzer).
     */
    public static final String A_CONTENT_TYPE = "content-type";

    /**
     * Multiplier of last fetch duration to wait before
     * fetching another item of the same class (eg host).
     */
    public static final String A_DELAY_FACTOR = "delay-factor";

    /**
     * Minimum delay before fetching another item of the
     * same class (eg host). Even if lastFetchTime*delayFactor
     * is less than this, this period will be waited.
     */
    public static final String A_MINIMUM_DELAY = "minimum-delay";

    /*
     * DNS / fetch bookkeeping keys. Their exact value types are defined by
     * the modules that set them and are not visible from this file.
     */
    public static final String A_RRECORD_SET_LABEL = "dns-records";
    public static final String A_DNS_FETCH_TIME = "dns-fetch-time";
    public static final String A_DNS_SERVER_IP_LABEL = "dns-server-ip";
    public static final String A_FETCH_BEGAN_TIME = "fetch-began-time";
    public static final String A_FETCH_COMPLETED_TIME = "fetch-completed-time";
    public static final String A_HTTP_TRANSACTION = "http-transaction";
    public static final String A_FTP_CONTROL_CONVERSATION = "ftp-control-conversation";
    public static final String A_FTP_FETCH_STATUS = "ftp-fetch-status";

    /* Error-reporting keys (NOTE(review): semantics set by callers; confirm there). */
    public static final String A_RUNTIME_EXCEPTION = "runtime-exception";
    public static final String A_LOCALIZED_ERRORS = "localized-errors";

    /**
     * Shorthand string tokens indicating notable occurrences,
     * separated by commas.
     */
    public static final String A_ANNOTATIONS = "annotations";

    /* Link-context and scheduling keys (NOTE(review): semantics set by callers). */
    public static final String A_PREREQUISITE_URI = "prerequisite-uri";
    public static final String A_DISTANCE_FROM_SEED = "distance-from-seed";
    public static final String A_HTML_BASE = "html-base-href";
    public static final String A_RETRY_DELAY = "retry-delay";

    public static final String A_META_ROBOTS = "meta-robots";

    /**
     * Defined for org.archive.crawler.writer.MirrorWriterProcessor.
     */
    public static final String A_MIRROR_PATH = "mirror-path";

    /**
     * Key to get credential avatars from A_LIST.
     */
    public static final String A_CREDENTIAL_AVATARS_KEY =
        "credential-avatars";

    /** A 'source' (usu. URI) that's inherited by discovered URIs. */
    public static final String A_SOURCE_TAG = "source";

    /**
     * Key to (optional) attribute specifying a list of keys that
     * are passed to CandidateURIs that 'descend' (are discovered
     * via) this URI.
     */
    public static final String A_HERITABLE_KEYS = "heritable";

    /** Flag indicating the containing queue should be retired. */
    public static final String A_FORCE_RETIRE = "force-retire";

    /** Local override of proxy host. */
    public static final String A_HTTP_PROXY_HOST = "http-proxy-host";

    /** Local override of proxy port. */
    public static final String A_HTTP_PROXY_PORT = "http-proxy-port";

    /** Local override of origin bind address. */
    public static final String A_HTTP_BIND_ADDRESS = "http-bind-address";

    /**
     * Fetch truncation codes present in {@link CrawlURI} annotations.
     * All truncation annotations have a <code>TRUNC_SUFFIX</code> suffix (TODO:
     * Make for-sure unique or redo truncation so definitive flag marked
     * against {@link CrawlURI}).
     */
    public static final String TRUNC_SUFFIX = "Trunc";

    /** Truncation annotation token; evaluates to {@code "headerTrunc"}. */
    public static final String HEADER_TRUNC = "header" + TRUNC_SUFFIX;

    /** Truncation annotation token; evaluates to {@code "timeTrunc"}. */
    public static final String TIMER_TRUNC = "time" + TRUNC_SUFFIX;

    /** Truncation annotation token; evaluates to {@code "lenTrunc"}. */
    public static final String LENGTH_TRUNC = "len" + TRUNC_SUFFIX;

    /* Duplication-reduction / recrawl / history constants */

    /** Fetch history array. */
    public static final String A_FETCH_HISTORY = "fetch-history";

    /** Content digest. */
    public static final String A_CONTENT_DIGEST = "content-digest";

    /** Header name (and AList key) for last-modified timestamp. */
    public static final String A_LAST_MODIFIED_HEADER = "last-modified";

    /** Header name (and AList key) for ETag. */
    public static final String A_ETAG_HEADER = "etag";

    /** Key for status (when in history). */
    public static final String A_STATUS = "status";

    /** Reference length (content length or virtual length). */
    public static final String A_REFERENCE_LENGTH = "reference-length";

    /** Name of warc file where uri had records written. */
    public static final String A_WRITTEN_TO_WARC = "written-to-warc";

}