View Javadoc

1   /*
2    * WARCConstants
3    *
4    * $Id: WARCConstants.java 6528 2009-09-29 21:52:33Z szznax $
5    *
6    * Created on July 27th, 2006
7    *
8    * Copyright (C) 2006 Internet Archive.
9    *
10   * This file is part of the Heritrix web crawler (crawler.archive.org).
11   *
12   * Heritrix is free software; you can redistribute it and/or modify
13   * it under the terms of the GNU Lesser Public License as published by
14   * the Free Software Foundation; either version 2.1 of the License, or
15   * any later version.
16   *
17   * Heritrix is distributed in the hope that it will be useful,
18   * but WITHOUT ANY WARRANTY; without even the implied warranty of
19   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20   * GNU Lesser Public License for more details.
21   *
22   * You should have received a copy of the GNU Lesser Public License
23   * along with Heritrix; if not, write to the Free Software
24   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
25   */
26  package org.archive.io.warc;
27  
28  import java.util.Arrays;
29  import java.util.List;
30  
31  import org.archive.io.ArchiveFileConstants;
32  
33  /***
34   * WARC Constants used by WARC readers and writers.
35   * Below constants are used WARC Reader/Writer.
36   * @author stack
37   * @version $Revision: 6528 $ $Date: 2009-09-29 21:52:33 +0000 (Tue, 29 Sep 2009) $
38   */
39  public interface WARCConstants extends ArchiveFileConstants {
40      /***
41       * Default maximum WARC file size.
42       * 1Gig.
43       */
44      public static final int DEFAULT_MAX_WARC_FILE_SIZE = 1024 * 1024 * 1024;
45      
46  	/***
47  	 * WARC MAGIC
48  	 * WARC files and records begin with this sequence.
49  	 */
50  	public static final String WARC_MAGIC = "WARC/";
51      public static final String WARC_010_MAGIC = "WARC/";
52      
53      /***
54       * Hard-coded version for WARC files made with this code.
55       * conforms to ISO 28500:2009 as of May 2009
56       */
57  	public static final String WARC_VERSION = "1.0";
58      
59      /***
60       * Assumed maximum size of a Header Line.
61       *
62       * This 100k which seems massive but its the same as the LINE_LENGTH from
63       * <code>alexa/include/a_arcio.h</code>:
64       * <pre>
65       * #define LINE_LENGTH     (100*1024)
66       * </pre>
67       */
68      public static final int MAX_WARC_HEADER_LINE_LENGTH = 1024 * 100;
69      public static final int MAX_LINE_LENGTH = MAX_WARC_HEADER_LINE_LENGTH;
70      
71      /***
72       * WARC file extention.
73       */
74      public static final String WARC_FILE_EXTENSION = "warc";
75      
76      /***
77       * Dot WARC file extension.
78       */
79      public static final String DOT_WARC_FILE_EXTENSION =
80          "." + WARC_FILE_EXTENSION;
81      
82      public static final String DOT_COMPRESSED_FILE_EXTENSION =
83          ArchiveFileConstants.DOT_COMPRESSED_FILE_EXTENSION;
84  
85      /***
86       * Compressed WARC file extension.
87       */
88      public static final String COMPRESSED_WARC_FILE_EXTENSION =
89          WARC_FILE_EXTENSION + DOT_COMPRESSED_FILE_EXTENSION;
90      
91      /***
92       * Compressed dot WARC file extension.
93       */
94      public static final String DOT_COMPRESSED_WARC_FILE_EXTENSION =
95          DOT_WARC_FILE_EXTENSION + DOT_COMPRESSED_FILE_EXTENSION;
96      
97      /***
98       * Encoding to use getting bytes from strings.
99       *
100      * Specify an encoding rather than leave it to chance: i.e whatever the
101      * JVMs encoding.  Use an encoding that gets the stream as bytes, not chars.
102      * 
103      * ARC uses ISO-8859-1. By specification, WARC uses UTF-8. 
104      */
105     public static final String DEFAULT_ENCODING = "UTF-8";
106     public static final String HEADER_LINE_ENCODING = DEFAULT_ENCODING;
107     
108     // TODO: Revisit. 8859 isn't correct, especially if we settle on RFC822
109     // headers
110     public static final String WARC_HEADER_ENCODING = HEADER_LINE_ENCODING;
111     
112     public static final String [] HEADER_FIELD_KEYS = {
113         VERSION_FIELD_KEY,
114         LENGTH_FIELD_KEY,
115         TYPE_FIELD_KEY,
116         URL_FIELD_KEY,
117         DATE_FIELD_KEY,
118         RECORD_IDENTIFIER_FIELD_KEY,
119         MIMETYPE_FIELD_KEY
120     };
121     
122     /***
123      * WARC Record Types.
124      */
125     public static final String WARCINFO = "warcinfo";
126     public static final String RESPONSE = "response";
127     public static final String RESOURCE = "resource";
128     public static final String REQUEST = "request";
129     public static final String METADATA = "metadata";
130     public static final String REVISIT = "revisit";
131     public static final String CONVERSION = "conversion";
132     public static final String CONTINUATION = "continuation";
133     
134     public static final String TYPE = "type";
135     
136     // List of all WARC Record TYPES
137     public static final String [] TYPES = {WARCINFO, RESPONSE, RESOURCE,
138     	REQUEST, METADATA, REVISIT, CONVERSION, CONTINUATION};
139     
140     // Indices into TYPES array.
141     public static final int WARCINFO_INDEX = 0;
142     public static final int RESPONSE_INDEX = 1;
143     public static final int RESOURCE_INDEX = 2;
144     public static final int REQUEST_INDEX = 3;
145     public static final int METADATA_INDEX = 4;
146     public static final int REVISIT_INDEX = 5;
147     public static final int CONVERSION_INDEX = 6;
148     public static final int CONTINUATION_INDEX = 7;
149     
150     // TYPES as List.
151     public static final List TYPES_LIST = Arrays.asList(TYPES);
152     
153     /***
154      * WARC-ID
155      */
156     public static final String WARC_ID = WARC_MAGIC + WARC_VERSION;
157     public static final String WARC_010_ID = WARC_010_MAGIC + "0.10";
158         
159     /***
160      * Header field seperator character.
161      */
162     public static final char HEADER_FIELD_SEPARATOR = ' ';
163     
164     /***
165      * WSP
166      * One of a space or horizontal tab character.
167      * TODO: WSP undefined.  Fix.
168      */
169     public static final Character [] WSP = {HEADER_FIELD_SEPARATOR, '\t'};
170 
171     /***
172      * Placeholder for length in Header line.
173      * Placeholder is same size as the fixed field size allocated for length,
174      * 12 characters.  12 characters allows records of size almost 1TB.
175      */
176     public static final String PLACEHOLDER_RECORD_LENGTH_STRING =
177         "000000000000";
178     
179     public static final String NAMED_FIELD_IP_LABEL = "IP-Address";
180     public static final String NAMED_FIELD_CHECKSUM_LABEL = "Checksum";
181     public static final String NAMED_FIELD_RELATED_LABEL = "References";
182     public static final String NAMED_FIELD_WARCFILENAME = "Filename";
183     public static final String NAMED_FIELD_DESCRIPTION = "Description";
184     public static final String NAMED_FIELD_FILEDESC = "ARC-FileDesc";
185     public static final String NAMED_FIELD_TRUNCATED = "Truncated";
186     public static final String NAMED_FIELD_TRUNCATED_VALUE_TIME = "time";
187     public static final String NAMED_FIELD_TRUNCATED_VALUE_LENGTH = "length";
188     public static final String NAMED_FIELD_TRUNCATED_VALUE_HEAD =
189         "long-headers";
190     public static final String NAMED_FIELD_TRUNCATED_VALUE_UNSPECIFIED = null;
191     
192     // Headers for version 0.17 of spec.
193     public static final String HEADER_KEY_DATE = "WARC-Date";
194     public static final String HEADER_KEY_TYPE = "WARC-Type";
195     public static final String HEADER_KEY_ID = "WARC-Record-ID";
196 
197     public static final String HEADER_KEY_URI = "WARC-Target-URI";   
198     public static final String HEADER_KEY_IP = "WARC-IP-Address";   
199     public static final String HEADER_KEY_BLOCK_DIGEST = "WARC-Block-Digest";
200     public static final String HEADER_KEY_PAYLOAD_DIGEST = "WARC-Payload-Digest";
201     public static final String HEADER_KEY_CONCURRENT_TO =
202         "WARC-Concurrent-To";
203     public static final String HEADER_KEY_TRUNCATED = "WARC-Truncated";
204     public static final String HEADER_KEY_PROFILE = "WARC-Profile";
205     public static final String HEADER_KEY_FILENAME = "WARC-Filename";
206     public static final String HEADER_KEY_ETAG = "WARC-Etag";
207     public static final String HEADER_KEY_LAST_MODIFIED = "WARC-Last-Modified";
208     
209     public static final String PROFILE_REVISIT_IDENTICAL_DIGEST = 
210     	"http://netpreserve.org/warc/1.0/revisit/identical-payload-digest";
211     public static final String PROFILE_REVISIT_NOT_MODIFIED = 
212     	"http://netpreserve.org/warc/1.0/revisit/server-not-modified";
213     
214     public static final String CONTENT_LENGTH = "Content-Length";
215     public static final String CONTENT_TYPE = "Content-Type";
216     public static final String CONTENT_DESCRIPTION = "Content-Description";
217     
218     public static final String COLON_SPACE = ": ";
219     // TODO: This is not in spec. Fix.
220     public static final String TRUNCATED_VALUE_UNSPECIFIED = "unspecified";
221     
222     
223     /***
224      * To be safe, lets use application type rather than message. Regards 
225      * 'message/http', RFC says "...provided that it obeys the MIME restrictions
226      * for all 'message' types regarding line length and encodings."  This
227      * usually means lines of 1000 octets max (unless a 
228      * 'Content-Transfer-Encoding: binary' mime header is present).
229      * @see <a href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec19.html#sec19.1">rfc2616 section 19.1</a>
230      */
231     public static final String HTTP_REQUEST_MIMETYPE =
232     	"application/http; msgtype=request";
233     public static final String HTTP_RESPONSE_MIMETYPE =
234     	"application/http; msgtype=response";
235     public static final String FTP_CONTROL_CONVERSATION_MIMETYPE =
236         "text/x-ftp-control-conversation";
237 }