1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26 package org.archive.io.warc;
27
28 import java.util.Arrays;
29 import java.util.List;
30
31 import org.archive.io.ArchiveFileConstants;
32
33 /***
34 * WARC Constants used by WARC readers and writers.
35 * Below constants are used WARC Reader/Writer.
36 * @author stack
37 * @version $Revision: 6528 $ $Date: 2009-09-29 21:52:33 +0000 (Tue, 29 Sep 2009) $
38 */
39 public interface WARCConstants extends ArchiveFileConstants {
40 /***
41 * Default maximum WARC file size.
42 * 1Gig.
43 */
44 public static final int DEFAULT_MAX_WARC_FILE_SIZE = 1024 * 1024 * 1024;
45
46 /***
47 * WARC MAGIC
48 * WARC files and records begin with this sequence.
49 */
50 public static final String WARC_MAGIC = "WARC/";
51 public static final String WARC_010_MAGIC = "WARC/";
52
53 /***
54 * Hard-coded version for WARC files made with this code.
55 * conforms to ISO 28500:2009 as of May 2009
56 */
57 public static final String WARC_VERSION = "1.0";
58
59 /***
60 * Assumed maximum size of a Header Line.
61 *
62 * This 100k which seems massive but its the same as the LINE_LENGTH from
63 * <code>alexa/include/a_arcio.h</code>:
64 * <pre>
65 * #define LINE_LENGTH (100*1024)
66 * </pre>
67 */
68 public static final int MAX_WARC_HEADER_LINE_LENGTH = 1024 * 100;
69 public static final int MAX_LINE_LENGTH = MAX_WARC_HEADER_LINE_LENGTH;
70
71 /***
72 * WARC file extention.
73 */
74 public static final String WARC_FILE_EXTENSION = "warc";
75
76 /***
77 * Dot WARC file extension.
78 */
79 public static final String DOT_WARC_FILE_EXTENSION =
80 "." + WARC_FILE_EXTENSION;
81
82 public static final String DOT_COMPRESSED_FILE_EXTENSION =
83 ArchiveFileConstants.DOT_COMPRESSED_FILE_EXTENSION;
84
85 /***
86 * Compressed WARC file extension.
87 */
88 public static final String COMPRESSED_WARC_FILE_EXTENSION =
89 WARC_FILE_EXTENSION + DOT_COMPRESSED_FILE_EXTENSION;
90
91 /***
92 * Compressed dot WARC file extension.
93 */
94 public static final String DOT_COMPRESSED_WARC_FILE_EXTENSION =
95 DOT_WARC_FILE_EXTENSION + DOT_COMPRESSED_FILE_EXTENSION;
96
97 /***
98 * Encoding to use getting bytes from strings.
99 *
100 * Specify an encoding rather than leave it to chance: i.e whatever the
101 * JVMs encoding. Use an encoding that gets the stream as bytes, not chars.
102 *
103 * ARC uses ISO-8859-1. By specification, WARC uses UTF-8.
104 */
105 public static final String DEFAULT_ENCODING = "UTF-8";
106 public static final String HEADER_LINE_ENCODING = DEFAULT_ENCODING;
107
108
109
110 public static final String WARC_HEADER_ENCODING = HEADER_LINE_ENCODING;
111
112 public static final String [] HEADER_FIELD_KEYS = {
113 VERSION_FIELD_KEY,
114 LENGTH_FIELD_KEY,
115 TYPE_FIELD_KEY,
116 URL_FIELD_KEY,
117 DATE_FIELD_KEY,
118 RECORD_IDENTIFIER_FIELD_KEY,
119 MIMETYPE_FIELD_KEY
120 };
121
122 /***
123 * WARC Record Types.
124 */
125 public static final String WARCINFO = "warcinfo";
126 public static final String RESPONSE = "response";
127 public static final String RESOURCE = "resource";
128 public static final String REQUEST = "request";
129 public static final String METADATA = "metadata";
130 public static final String REVISIT = "revisit";
131 public static final String CONVERSION = "conversion";
132 public static final String CONTINUATION = "continuation";
133
134 public static final String TYPE = "type";
135
136
137 public static final String [] TYPES = {WARCINFO, RESPONSE, RESOURCE,
138 REQUEST, METADATA, REVISIT, CONVERSION, CONTINUATION};
139
140
141 public static final int WARCINFO_INDEX = 0;
142 public static final int RESPONSE_INDEX = 1;
143 public static final int RESOURCE_INDEX = 2;
144 public static final int REQUEST_INDEX = 3;
145 public static final int METADATA_INDEX = 4;
146 public static final int REVISIT_INDEX = 5;
147 public static final int CONVERSION_INDEX = 6;
148 public static final int CONTINUATION_INDEX = 7;
149
150
151 public static final List TYPES_LIST = Arrays.asList(TYPES);
152
153 /***
154 * WARC-ID
155 */
156 public static final String WARC_ID = WARC_MAGIC + WARC_VERSION;
157 public static final String WARC_010_ID = WARC_010_MAGIC + "0.10";
158
159 /***
160 * Header field seperator character.
161 */
162 public static final char HEADER_FIELD_SEPARATOR = ' ';
163
164 /***
165 * WSP
166 * One of a space or horizontal tab character.
167 * TODO: WSP undefined. Fix.
168 */
169 public static final Character [] WSP = {HEADER_FIELD_SEPARATOR, '\t'};
170
171 /***
172 * Placeholder for length in Header line.
173 * Placeholder is same size as the fixed field size allocated for length,
174 * 12 characters. 12 characters allows records of size almost 1TB.
175 */
176 public static final String PLACEHOLDER_RECORD_LENGTH_STRING =
177 "000000000000";
178
179 public static final String NAMED_FIELD_IP_LABEL = "IP-Address";
180 public static final String NAMED_FIELD_CHECKSUM_LABEL = "Checksum";
181 public static final String NAMED_FIELD_RELATED_LABEL = "References";
182 public static final String NAMED_FIELD_WARCFILENAME = "Filename";
183 public static final String NAMED_FIELD_DESCRIPTION = "Description";
184 public static final String NAMED_FIELD_FILEDESC = "ARC-FileDesc";
185 public static final String NAMED_FIELD_TRUNCATED = "Truncated";
186 public static final String NAMED_FIELD_TRUNCATED_VALUE_TIME = "time";
187 public static final String NAMED_FIELD_TRUNCATED_VALUE_LENGTH = "length";
188 public static final String NAMED_FIELD_TRUNCATED_VALUE_HEAD =
189 "long-headers";
190 public static final String NAMED_FIELD_TRUNCATED_VALUE_UNSPECIFIED = null;
191
192
193 public static final String HEADER_KEY_DATE = "WARC-Date";
194 public static final String HEADER_KEY_TYPE = "WARC-Type";
195 public static final String HEADER_KEY_ID = "WARC-Record-ID";
196
197 public static final String HEADER_KEY_URI = "WARC-Target-URI";
198 public static final String HEADER_KEY_IP = "WARC-IP-Address";
199 public static final String HEADER_KEY_BLOCK_DIGEST = "WARC-Block-Digest";
200 public static final String HEADER_KEY_PAYLOAD_DIGEST = "WARC-Payload-Digest";
201 public static final String HEADER_KEY_CONCURRENT_TO =
202 "WARC-Concurrent-To";
203 public static final String HEADER_KEY_TRUNCATED = "WARC-Truncated";
204 public static final String HEADER_KEY_PROFILE = "WARC-Profile";
205 public static final String HEADER_KEY_FILENAME = "WARC-Filename";
206 public static final String HEADER_KEY_ETAG = "WARC-Etag";
207 public static final String HEADER_KEY_LAST_MODIFIED = "WARC-Last-Modified";
208
209 public static final String PROFILE_REVISIT_IDENTICAL_DIGEST =
210 "http://netpreserve.org/warc/1.0/revisit/identical-payload-digest";
211 public static final String PROFILE_REVISIT_NOT_MODIFIED =
212 "http://netpreserve.org/warc/1.0/revisit/server-not-modified";
213
214 public static final String CONTENT_LENGTH = "Content-Length";
215 public static final String CONTENT_TYPE = "Content-Type";
216 public static final String CONTENT_DESCRIPTION = "Content-Description";
217
218 public static final String COLON_SPACE = ": ";
219
220 public static final String TRUNCATED_VALUE_UNSPECIFIED = "unspecified";
221
222
223 /***
224 * To be safe, lets use application type rather than message. Regards
225 * 'message/http', RFC says "...provided that it obeys the MIME restrictions
226 * for all 'message' types regarding line length and encodings." This
227 * usually means lines of 1000 octets max (unless a
228 * 'Content-Transfer-Encoding: binary' mime header is present).
229 * @see <a href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec19.html#sec19.1">rfc2616 section 19.1</a>
230 */
231 public static final String HTTP_REQUEST_MIMETYPE =
232 "application/http; msgtype=request";
233 public static final String HTTP_RESPONSE_MIMETYPE =
234 "application/http; msgtype=response";
235 public static final String FTP_CONTROL_CONVERSATION_MIMETYPE =
236 "text/x-ftp-control-conversation";
237 }