1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26 package org.archive.io.arc;
27
28 import java.util.Arrays;
29 import java.util.List;
30 import java.util.zip.Deflater;
31 import java.util.zip.GZIPInputStream;
32
33 import org.archive.io.ArchiveFileConstants;
34 import org.archive.io.GzipHeader;
35
36 /***
37 * Constants used by ARC files and in ARC file processing.
38 *
39 * @author stack
40 */
41 public interface ARCConstants extends ArchiveFileConstants {
42 /***
43 * Default maximum ARC file size.
44 */
45 public static final long DEFAULT_MAX_ARC_FILE_SIZE = 100000000;
46
47 /***
48 * Maximum length for a metadata line.
49 */
50 public static final int MAX_METADATA_LINE_LENGTH = (4 * 1024);
51
52 /***
53 * ARC file extention.
54 */
55 public static final String ARC_FILE_EXTENSION = "arc";
56
57 /***
58 * Dot ARC file extension.
59 */
60 public static final String DOT_ARC_FILE_EXTENSION =
61 "." + ARC_FILE_EXTENSION;
62
63 public static final String DOT_COMPRESSED_FILE_EXTENSION =
64 ArchiveFileConstants.DOT_COMPRESSED_FILE_EXTENSION;
65
66 /***
67 * Compressed arc file extension.
68 */
69 public static final String COMPRESSED_ARC_FILE_EXTENSION =
70 ARC_FILE_EXTENSION + DOT_COMPRESSED_FILE_EXTENSION;
71
72 /***
73 * Compressed dot arc file extension.
74 */
75 public static final String DOT_COMPRESSED_ARC_FILE_EXTENSION =
76 DOT_ARC_FILE_EXTENSION + DOT_COMPRESSED_FILE_EXTENSION;
77
78 /***
79 * Encoding to use getting bytes from strings.
80 *
81 * Specify an encoding rather than leave it to chance: i.e whatever the
82 * JVMs encoding. Use an encoding that gets the stream as bytes, not chars.
83 */
84 public static final String DEFAULT_ENCODING = "ISO-8859-1";
85
86 /***
87 * ARC file line seperator character.
88 *
89 * This is what the alexa c-code looks for delimiting lines.
90 */
91 public static final char LINE_SEPARATOR = '\n';
92
93 /***
94 * ARC header field seperator character.
95 */
96 public static final char HEADER_FIELD_SEPARATOR = ' ';
97
98 /***
99 * ARC file *MAGIC NUMBER*.
100 *
101 * Every ARC file must begin w/ this.
102 */
103 public static final String ARC_MAGIC_NUMBER = "filedesc://";
104
105 /***
106 * The FLG.FEXTRA field that is added to ARC files. (See RFC1952 to
107 * understand FLG.FEXTRA).
108 */
109 public static final byte[] ARC_GZIP_EXTRA_FIELD = { 8, 0, 'L', 'X', 4, 0,
110 0, 0, 0, 0 };
111
112 /***
113 * Key for the ARC Header IP field.
114 *
115 * Lowercased.
116 */
117 public static final String IP_HEADER_FIELD_KEY = "ip-address";
118
119 /***
120 * Key for the ARC Header Result Code field.
121 *
122 * Lowercased.
123 */
124 public static final String CODE_HEADER_FIELD_KEY = "result-code";
125
126 /***
127 * Key for the ARC Header Checksum field.
128 *
129 * Lowercased.
130 */
131 public static final String CHECKSUM_HEADER_FIELD_KEY = "checksum";
132
133 /***
134 * Key for the ARC Header Location field.
135 *
136 * Lowercased.
137 */
138 public static final String LOCATION_HEADER_FIELD_KEY = "location";
139
140 /***
141 * Key for the ARC Header Offset field.
142 *
143 * Lowercased.
144 */
145 public static final String OFFSET_HEADER_FIELD_KEY = "offset";
146
147 /***
148 * Key for the ARC Header filename field.
149 *
150 * Lowercased.
151 */
152 public static final String FILENAME_HEADER_FIELD_KEY = "filename";
153
154 /***
155 * Key for statuscode field.
156 */
157 public static final String STATUSCODE_FIELD_KEY = "statuscode";
158
159 /***
160 * Key for offset field.
161 */
162 public static final String OFFSET_FIELD_KEY = OFFSET_HEADER_FIELD_KEY;
163
164 /***
165 * Key for filename field.
166 */
167 public static final String FILENAME_FIELD_KEY = FILENAME_HEADER_FIELD_KEY;
168
169 /***
170 * Key for checksum field.
171 */
172 public static final String CHECKSUM_FIELD_KEY = CHECKSUM_HEADER_FIELD_KEY;
173
174 /***
175 * Tokenized field prefix.
176 *
177 * Use this prefix for tokenized fields when naming fields in
178 * an index.
179 */
180 public static final String TOKENIZED_PREFIX = "tokenized_";
181
182
183 /***
184 * Version 1 required metadata fields.
185 */
186 public static List REQUIRED_VERSION_1_HEADER_FIELDS = Arrays
187 .asList(new String[] { URL_FIELD_KEY, IP_HEADER_FIELD_KEY,
188 DATE_FIELD_KEY, MIMETYPE_FIELD_KEY,
189 LENGTH_FIELD_KEY, VERSION_FIELD_KEY,
190 ABSOLUTE_OFFSET_KEY });
191
192 /***
193 * Minimum possible record length.
194 *
195 * This is a rough calc. When the header is data it will occupy less space.
196 */
197 public static int MINIMUM_RECORD_LENGTH = 1 + "://".length() + 1
198 + ARC_FILE_EXTENSION.length() + " ".length() + +1 + " ".length()
199 + 1 + " ".length() + 1 + "/".length() + 1 + " ".length() + 1;
200
201 /***
202 * Start of a GZIP header that uses default deflater.
203 */
204 public static final byte[] GZIP_HEADER_BEGIN = {
205 (byte) GZIPInputStream.GZIP_MAGIC,
206 (byte) (GZIPInputStream.GZIP_MAGIC >> 8),
207 Deflater.DEFLATED
208 };
209
210 /***
211 * Length of minimual 'default GZIP header.
212 *
213 * See RFC1952 for explaination of value of 10.
214 */
215 public static final int DEFAULT_GZIP_HEADER_LENGTH =
216 GzipHeader.MINIMAL_GZIP_HEADER_LENGTH;
217 }