View Javadoc

1   /*
2    * ARCConstants
3    *
4    * $Id: ARCConstants.java 5029 2007-03-29 23:53:50Z gojomo $
5    *
6    * Created on Dec 30, 2003.
7    *
8    * Copyright (C) 2003 Internet Archive.
9    *
10   * This file is part of the Heritrix web crawler (crawler.archive.org).
11   *
12   * Heritrix is free software; you can redistribute it and/or modify
13   * it under the terms of the GNU Lesser Public License as published by
14   * the Free Software Foundation; either version 2.1 of the License, or
15   * any later version.
16   *
17   * Heritrix is distributed in the hope that it will be useful,
18   * but WITHOUT ANY WARRANTY; without even the implied warranty of
19   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20   * GNU Lesser Public License for more details.
21   *
22   * You should have received a copy of the GNU Lesser Public License
23   * along with Heritrix; if not, write to the Free Software
24   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
25   */
26  package org.archive.io.arc;
27  
28  import java.util.Arrays;
29  import java.util.List;
30  import java.util.zip.Deflater;
31  import java.util.zip.GZIPInputStream;
32  
33  import org.archive.io.ArchiveFileConstants;
34  import org.archive.io.GzipHeader;
35  
36  /***
37   * Constants used by ARC files and in ARC file processing.
38   * 
39   * @author stack
40   */
41  public interface ARCConstants extends ArchiveFileConstants {
42      /***
43       * Default maximum ARC file size.
44       */
45      public static final long DEFAULT_MAX_ARC_FILE_SIZE = 100000000;
46      
47      /***
48       * Maximum length for a metadata line.
49       */
50      public static final int MAX_METADATA_LINE_LENGTH = (4 * 1024);
51  
52      /***
53       * ARC file extention.
54       */
55      public static final String ARC_FILE_EXTENSION = "arc";
56      
57      /***
58       * Dot ARC file extension.
59       */
60      public static final String DOT_ARC_FILE_EXTENSION =
61          "." + ARC_FILE_EXTENSION;
62      
63      public static final String DOT_COMPRESSED_FILE_EXTENSION =
64          ArchiveFileConstants.DOT_COMPRESSED_FILE_EXTENSION;
65  
66      /***
67       * Compressed arc file extension.
68       */
69      public static final String COMPRESSED_ARC_FILE_EXTENSION =
70          ARC_FILE_EXTENSION + DOT_COMPRESSED_FILE_EXTENSION;
71      
72      /***
73       * Compressed dot arc file extension.
74       */
75      public static final String DOT_COMPRESSED_ARC_FILE_EXTENSION =
76          DOT_ARC_FILE_EXTENSION + DOT_COMPRESSED_FILE_EXTENSION;
77  
78      /***
79       * Encoding to use getting bytes from strings.
80       *
81       * Specify an encoding rather than leave it to chance: i.e whatever the
82       * JVMs encoding.  Use an encoding that gets the stream as bytes, not chars.
83       */
84      public static final String DEFAULT_ENCODING = "ISO-8859-1";
85  
86      /***
87       * ARC file line seperator character.
88       * 
89       * This is what the alexa c-code looks for delimiting lines.
90       */
91      public static final char LINE_SEPARATOR = '\n';
92  
93      /***
94       * ARC header field seperator character.
95       */
96      public static final char HEADER_FIELD_SEPARATOR = ' ';
97  
98      /***
99       * ARC file *MAGIC NUMBER*.
100      * 
101      * Every ARC file must begin w/ this.
102      */
103     public static final String ARC_MAGIC_NUMBER = "filedesc://";
104 
105     /***
106      * The FLG.FEXTRA field that is added to ARC files. (See RFC1952 to
107      * understand FLG.FEXTRA).
108      */
109     public static final byte[] ARC_GZIP_EXTRA_FIELD = { 8, 0, 'L', 'X', 4, 0,
110             0, 0, 0, 0 };
111 
112     /***
113      * Key for the ARC Header IP field.
114      * 
115      * Lowercased.
116      */
117     public static final String IP_HEADER_FIELD_KEY = "ip-address";
118 
119     /***
120      * Key for the ARC Header Result Code field.
121      * 
122      * Lowercased.
123      */
124     public static final String CODE_HEADER_FIELD_KEY = "result-code";
125 
126     /***
127      * Key for the ARC Header Checksum field.
128      * 
129      * Lowercased.
130      */
131     public static final String CHECKSUM_HEADER_FIELD_KEY = "checksum";
132 
133     /***
134      * Key for the ARC Header Location field.
135      * 
136      * Lowercased.
137      */
138     public static final String LOCATION_HEADER_FIELD_KEY = "location";
139 
140     /***
141      * Key for the ARC Header Offset field.
142      * 
143      * Lowercased.
144      */
145     public static final String OFFSET_HEADER_FIELD_KEY = "offset";
146 
147     /***
148      * Key for the ARC Header filename field.
149      * 
150      * Lowercased.
151      */
152     public static final String FILENAME_HEADER_FIELD_KEY = "filename";
153     
154     /***
155      * Key for statuscode field.
156      */
157     public static final String STATUSCODE_FIELD_KEY = "statuscode";
158     
159     /***
160      * Key for offset field.
161      */
162     public static final String OFFSET_FIELD_KEY = OFFSET_HEADER_FIELD_KEY;
163     
164     /***
165      * Key for filename field.
166      */
167     public static final String FILENAME_FIELD_KEY = FILENAME_HEADER_FIELD_KEY;
168     
169     /***
170      * Key for checksum field.
171      */
172     public static final String CHECKSUM_FIELD_KEY = CHECKSUM_HEADER_FIELD_KEY;
173     
174     /***
175      * Tokenized field prefix.
176      * 
177      * Use this prefix for tokenized fields  when naming fields in
178      * an index.
179      */
180     public static final String TOKENIZED_PREFIX = "tokenized_";
181     
182 
183     /***
184      * Version 1 required metadata fields.
185      */
186     public static List REQUIRED_VERSION_1_HEADER_FIELDS = Arrays
187             .asList(new String[] { URL_FIELD_KEY, IP_HEADER_FIELD_KEY,
188                     DATE_FIELD_KEY, MIMETYPE_FIELD_KEY,
189                     LENGTH_FIELD_KEY, VERSION_FIELD_KEY,
190                     ABSOLUTE_OFFSET_KEY });
191 
192     /***
193      * Minimum possible record length.
194      * 
195      * This is a rough calc. When the header is data it will occupy less space.
196      */
197     public static int MINIMUM_RECORD_LENGTH = 1 + "://".length() + 1
198             + ARC_FILE_EXTENSION.length() + " ".length() + +1 + " ".length()
199             + 1 + " ".length() + 1 + "/".length() + 1 + " ".length() + 1;
200 
201     /***
202      * Start of a GZIP header that uses default deflater.
203      */
204     public static final byte[] GZIP_HEADER_BEGIN = {
205             (byte) GZIPInputStream.GZIP_MAGIC, // Magic number (short)
206             (byte) (GZIPInputStream.GZIP_MAGIC >> 8), // Magic number (short)
207             Deflater.DEFLATED // Compression method (CM)
208     };
209 
210     /***
211      * Length of minimual 'default GZIP header.
212      * 
213      * See RFC1952 for explaination of value of 10.
214      */
215     public static final int DEFAULT_GZIP_HEADER_LENGTH =
216     	GzipHeader.MINIMAL_GZIP_HEADER_LENGTH;
217 }