1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26 package org.archive.crawler.datamodel;
27
28 /***
29 * CrawlURI attribute keys used by the core crawler
30 * classes.
31 *
32 * @author gojomo
33 *
34 */
public interface CoreAttributeConstants {

    // NOTE: fields of an interface are implicitly public, static and final.
    // The modifiers are spelled out uniformly on every constant below so
    // that no key appears to be reassignable.

    /**
     * Extracted MIME type of fetched content; should be
     * set immediately by the fetching module if possible
     * (rather than waiting for a later analyzer).
     */
    public static final String A_CONTENT_TYPE = "content-type";

    /**
     * Multiplier of last fetch duration to wait before
     * fetching another item of the same class (e.g. host).
     */
    public static final String A_DELAY_FACTOR = "delay-factor";

    /**
     * Minimum delay before fetching another item of the
     * same class (e.g. host). Even if lastFetchTime*delayFactor
     * is less than this, this period will be waited.
     */
    public static final String A_MINIMUM_DELAY = "minimum-delay";

    /** Key "dns-records"; presumably the DNS record set of a lookup (TODO confirm). */
    public static final String A_RRECORD_SET_LABEL = "dns-records";
    /** Key "dns-fetch-time"; presumably when the DNS lookup happened (TODO confirm). */
    public static final String A_DNS_FETCH_TIME = "dns-fetch-time";
    /** Key "dns-server-ip"; presumably the address of the DNS server used (TODO confirm). */
    public static final String A_DNS_SERVER_IP_LABEL = "dns-server-ip";
    /** Key "fetch-began-time"; presumably the time a fetch started (TODO confirm units). */
    public static final String A_FETCH_BEGAN_TIME = "fetch-began-time";
    /** Key "fetch-completed-time"; presumably the time a fetch finished (TODO confirm units). */
    public static final String A_FETCH_COMPLETED_TIME = "fetch-completed-time";
    /** Key "http-transaction"; value type is not visible here (verify against the HTTP fetcher). */
    public static final String A_HTTP_TRANSACTION = "http-transaction";
    /** Key "ftp-control-conversation"; value type is not visible here (verify against the FTP fetcher). */
    public static final String A_FTP_CONTROL_CONVERSATION = "ftp-control-conversation";
    /** Key "ftp-fetch-status"; value type is not visible here (verify against the FTP fetcher). */
    public static final String A_FTP_FETCH_STATUS = "ftp-fetch-status";

    /** Key "runtime-exception"; presumably an exception raised while processing (TODO confirm). */
    public static final String A_RUNTIME_EXCEPTION = "runtime-exception";
    /** Key "localized-errors"; presumably errors handled locally by a processor (TODO confirm). */
    public static final String A_LOCALIZED_ERRORS = "localized-errors";

    /**
     * Shorthand string tokens indicating notable occurrences,
     * separated by commas.
     */
    public static final String A_ANNOTATIONS = "annotations";

    /** Key "prerequisite-uri"; presumably a URI that must be fetched first (TODO confirm). */
    public static final String A_PREREQUISITE_URI = "prerequisite-uri";
    /** Key "distance-from-seed"; presumably a hop count from a seed URI (TODO confirm). */
    public static final String A_DISTANCE_FROM_SEED = "distance-from-seed";
    /** Key "html-base-href"; presumably the base href found in an HTML document (TODO confirm). */
    public static final String A_HTML_BASE = "html-base-href";
    /** Key "retry-delay"; units are not visible here (TODO confirm). */
    public static final String A_RETRY_DELAY = "retry-delay";

    /** Key "meta-robots"; presumably the content of an HTML meta robots tag (TODO confirm). */
    public static final String A_META_ROBOTS = "meta-robots";

    /**
     * Defined for org.archive.crawler.writer.MirrorWriterProcessor.
     */
    public static final String A_MIRROR_PATH = "mirror-path";

    /**
     * Key to get credential avatars from A_LIST.
     */
    public static final String A_CREDENTIAL_AVATARS_KEY =
        "credential-avatars";

    /** A 'source' (usually a URI) that's inherited by discovered URIs. */
    public static final String A_SOURCE_TAG = "source";

    /**
     * Key to (optional) attribute specifying a list of keys that
     * are passed to CandidateURIs that 'descend' (are discovered
     * via) this URI.
     */
    public static final String A_HERITABLE_KEYS = "heritable";

    /** Flag indicating the containing queue should be retired. */
    public static final String A_FORCE_RETIRE = "force-retire";

    /** Local override of proxy host. */
    public static final String A_HTTP_PROXY_HOST = "http-proxy-host";
    /** Local override of proxy port. */
    public static final String A_HTTP_PROXY_PORT = "http-proxy-port";

    /** Local override of origin bind address. */
    public static final String A_HTTP_BIND_ADDRESS = "http-bind-address";

    /**
     * Fetch truncation codes present in {@link CrawlURI} annotations.
     * All truncation annotations have a <code>TRUNC_SUFFIX</code> suffix (TODO:
     * Make for-sure unique or redo truncation so definitive flag marked
     * against {@link CrawlURI}).
     */
    public static final String TRUNC_SUFFIX = "Trunc";

    /** Truncation annotation "headerTrunc" ("header" + {@link #TRUNC_SUFFIX}). */
    public static final String HEADER_TRUNC = "header" + TRUNC_SUFFIX;

    /** Truncation annotation "timeTrunc" ("time" + {@link #TRUNC_SUFFIX}). */
    public static final String TIMER_TRUNC = "time" + TRUNC_SUFFIX;

    /** Truncation annotation "lenTrunc" ("len" + {@link #TRUNC_SUFFIX}). */
    public static final String LENGTH_TRUNC = "len" + TRUNC_SUFFIX;

    /** Fetch history array. */
    public static final String A_FETCH_HISTORY = "fetch-history";
    /** Content digest. */
    public static final String A_CONTENT_DIGEST = "content-digest";
    /** Header name (and AList key) for last-modified timestamp. */
    public static final String A_LAST_MODIFIED_HEADER = "last-modified";
    /** Header name (and AList key) for ETag. */
    public static final String A_ETAG_HEADER = "etag";
    /** Key for status (when in history). */
    public static final String A_STATUS = "status";
    /** Reference length (content length or virtual length). */
    public static final String A_REFERENCE_LENGTH = "reference-length";
    /** Name of WARC file where URI had records written. */
    public static final String A_WRITTEN_TO_WARC = "written-to-warc";

}