1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.io.arc;
26
27 import java.io.File;
28 import java.io.FileInputStream;
29 import java.io.IOException;
30 import java.io.InputStream;
31 import java.net.MalformedURLException;
32 import java.net.URL;
33 import java.util.Iterator;
34 import java.util.logging.Level;
35
36 import org.archive.io.ArchiveReader;
37 import org.archive.io.ArchiveReaderFactory;
38 import org.archive.io.ArchiveRecord;
39 import org.archive.io.ArchiveRecordHeader;
40 import org.archive.io.GzipHeader;
41 import org.archive.io.GzippedInputStream;
42 import org.archive.io.NoGzipMagicException;
43 import org.archive.util.FileUtils;
44
45
46 /***
47 * Factory that returns an ARCReader.
48 *
49 * Can handle compressed and uncompressed ARCs.
50 *
51 * @author stack
52 */
53 public class ARCReaderFactory extends ArchiveReaderFactory
54 implements ARCConstants {
55 /***
56 * This factory instance.
57 */
58 private static final ARCReaderFactory factory = new ARCReaderFactory();
59
60 /***
61 * Shutdown any access to default constructor.
62 */
63 protected ARCReaderFactory() {
64 super();
65 }
66
67 public static ARCReader get(String arcFileOrUrl)
68 throws MalformedURLException, IOException {
69 return (ARCReader)ARCReaderFactory.factory.
70 getArchiveReader(arcFileOrUrl);
71 }
72
73 public static ARCReader get(String arcFileOrUrl, final long offset)
74 throws MalformedURLException, IOException {
75 return (ARCReader)ARCReaderFactory.factory.
76 getArchiveReader(arcFileOrUrl, offset);
77 }
78
79 public static ARCReader get(final File f) throws IOException {
80 return (ARCReader)ARCReaderFactory.factory.getArchiveReader(f);
81 }
82
83 public static ARCReader get(final File f, final long offset)
84 throws IOException {
85 return (ARCReader)ARCReaderFactory.factory.getArchiveReader(f, offset);
86 }
87
88 protected ArchiveReader getArchiveReader(final File f, final long offset)
89 throws IOException {
90 return getArchiveReader(f, true, offset);
91 }
92
93 /***
94 * @param f An arcfile to read.
95 * @param skipSuffixTest Set to true if want to test that ARC has proper
96 * suffix. Use this method and pass <code>false</code> to open ARCs
97 * with the <code>.open</code> or otherwise suffix.
98 * @param offset Have returned ARCReader set to start reading at passed
99 * offset.
100 * @return An ARCReader.
101 * @throws IOException
102 */
103 public static ARCReader get(final File f,
104 final boolean skipSuffixTest, final long offset)
105 throws IOException {
106 return (ARCReader)ARCReaderFactory.factory.getArchiveReader(f,
107 skipSuffixTest, offset);
108 }
109
110 protected ArchiveReader getArchiveReader(final File arcFile,
111 final boolean skipSuffixTest, final long offset)
112 throws IOException {
113 boolean compressed = testCompressedARCFile(arcFile, skipSuffixTest);
114 if (!compressed) {
115 if (!FileUtils.isReadableWithExtensionAndMagic(arcFile,
116 ARC_FILE_EXTENSION, ARC_MAGIC_NUMBER)) {
117 throw new IOException(arcFile.getAbsolutePath() +
118 " is not an Internet Archive ARC file.");
119 }
120 }
121 return compressed?
122 (ARCReader)ARCReaderFactory.factory.
123 new CompressedARCReader(arcFile, offset):
124 (ARCReader)ARCReaderFactory.factory.
125 new UncompressedARCReader(arcFile, offset);
126 }
127
128 public static ArchiveReader get(final String s, final InputStream is,
129 final boolean atFirstRecord)
130 throws IOException {
131 return ARCReaderFactory.factory.getArchiveReader(s, is,
132 atFirstRecord);
133 }
134
135 protected ArchiveReader getArchiveReader(final String arc,
136 final InputStream is, final boolean atFirstRecord)
137 throws IOException {
138
139
140 return new CompressedARCReader(arc, asRepositionable(is),
141 atFirstRecord);
142 }
143
144 /***
145 * Get an ARCReader aligned at <code>offset</code>. This version of get
146 * will not bring the ARC local but will try to stream across the net making
147 * an HTTP 1.1 Range request on remote http server (RFC1435 Section 14.35).
148 *
149 * @param arcUrl HTTP URL for an ARC (All ARCs considered remote).
150 * @param offset Offset into ARC at which to start fetching.
151 * @return An ARCReader aligned at offset.
152 * @throws IOException
153 */
154 public static ARCReader get(final URL arcUrl, final long offset)
155 throws IOException {
156 return (ARCReader)ARCReaderFactory.factory.getArchiveReader(arcUrl,
157 offset);
158 }
159
160 /***
161 * Get an ARCReader.
162 * Pulls the ARC local into whereever the System Property
163 * <code>java.io.tmpdir</code> points. It then hands back an ARCReader that
164 * points at this local copy. A close on this ARCReader instance will
165 * remove the local copy.
166 * @param arcUrl An URL that points at an ARC.
167 * @return An ARCReader.
168 * @throws IOException
169 */
170 public static ARCReader get(final URL arcUrl)
171 throws IOException {
172 return (ARCReader)ARCReaderFactory.factory.getArchiveReader(arcUrl);
173 }
174
175 /***
176 * @param arcFile File to test.
177 * @return True if <code>arcFile</code> is compressed ARC.
178 * @throws IOException
179 */
180 public boolean isCompressed(File arcFile) throws IOException {
181 return testCompressedARCFile(arcFile);
182 }
183
184 /***
185 * Check file is compressed and in ARC GZIP format.
186 *
187 * @param arcFile File to test if its Internet Archive ARC file
188 * GZIP compressed.
189 *
190 * @return True if this is an Internet Archive GZIP'd ARC file (It begins
191 * w/ the Internet Archive GZIP header and has the
192 * COMPRESSED_ARC_FILE_EXTENSION suffix).
193 *
194 * @exception IOException If file does not exist or is not unreadable.
195 */
196 public static boolean testCompressedARCFile(File arcFile)
197 throws IOException {
198 return testCompressedARCFile(arcFile, false);
199 }
200
201 /***
202 * Check file is compressed and in ARC GZIP format.
203 *
204 * @param arcFile File to test if its Internet Archive ARC file
205 * GZIP compressed.
206 * @param skipSuffixCheck Set to true if we're not to test on the
207 * '.arc.gz' suffix.
208 *
209 * @return True if this is an Internet Archive GZIP'd ARC file (It begins
210 * w/ the Internet Archive GZIP header).
211 *
212 * @exception IOException If file does not exist or is not unreadable.
213 */
214 public static boolean testCompressedARCFile(File arcFile,
215 boolean skipSuffixCheck)
216 throws IOException {
217 boolean compressedARCFile = false;
218 FileUtils.isReadable(arcFile);
219 if(!skipSuffixCheck && !arcFile.getName().toLowerCase()
220 .endsWith(COMPRESSED_ARC_FILE_EXTENSION)) {
221 return compressedARCFile;
222 }
223
224 final InputStream is = new FileInputStream(arcFile);
225 try {
226 compressedARCFile = testCompressedARCStream(is);
227 } finally {
228 is.close();
229 }
230 return compressedARCFile;
231 }
232
233 public static boolean isARCSuffix(final String arcName) {
234 return (arcName == null)?
235 false:
236 (arcName.toLowerCase().endsWith(DOT_COMPRESSED_ARC_FILE_EXTENSION))?
237 true:
238 (arcName.toLowerCase().endsWith(DOT_ARC_FILE_EXTENSION))?
239 true: false;
240 }
241
242 /***
243 * Tests passed stream is gzip stream by reading in the HEAD.
244 * Does not reposition the stream. That is left up to the caller.
245 * @param is An InputStream.
246 * @return True if compressed stream.
247 * @throws IOException
248 */
249 public static boolean testCompressedARCStream(final InputStream is)
250 throws IOException {
251 boolean compressedARCFile = false;
252 GzipHeader gh = null;
253 try {
254 gh = new GzipHeader(is);
255 } catch (NoGzipMagicException e ) {
256 return compressedARCFile;
257 }
258
259 byte[] fextra = gh.getFextra();
260
261
262
263
264 if (fextra != null &&
265 ARC_GZIP_EXTRA_FIELD.length - 2 == fextra.length) {
266 compressedARCFile = true;
267 for (int i = 0; i < fextra.length; i++) {
268 if (fextra[i] != ARC_GZIP_EXTRA_FIELD[i + 2]) {
269 compressedARCFile = false;
270 break;
271 }
272 }
273 }
274 return compressedARCFile;
275 }
276
277 /***
278 * Uncompressed arc file reader.
279 * @author stack
280 */
281 public class UncompressedARCReader extends ARCReader {
282 /***
283 * Constructor.
284 * @param f Uncompressed arcfile to read.
285 * @throws IOException
286 */
287 public UncompressedARCReader(final File f)
288 throws IOException {
289 this(f, 0);
290 }
291
292 /***
293 * Constructor.
294 *
295 * @param f Uncompressed arcfile to read.
296 * @param offset Offset at which to position ARCReader.
297 * @throws IOException
298 */
299 public UncompressedARCReader(final File f, final long offset)
300 throws IOException {
301
302
303 setIn(getInputStream(f, offset));
304 initialize(f.getAbsolutePath());
305 }
306
307 /***
308 * Constructor.
309 *
310 * @param f Uncompressed arc to read.
311 * @param is InputStream.
312 */
313 public UncompressedARCReader(final String f, final InputStream is) {
314
315
316 setIn(is);
317 initialize(f);
318 }
319 }
320
321 /***
322 * Compressed arc file reader.
323 *
324 * @author stack
325 */
326 public class CompressedARCReader extends ARCReader {
327
328 /***
329 * Constructor.
330 *
331 * @param f
332 * Compressed arcfile to read.
333 * @throws IOException
334 */
335 public CompressedARCReader(final File f) throws IOException {
336 this(f, 0);
337 }
338
339 /***
340 * Constructor.
341 *
342 * @param f Compressed arcfile to read.
343 * @param offset Position at where to start reading file.
344 * @throws IOException
345 */
346 public CompressedARCReader(final File f, final long offset)
347 throws IOException {
348
349
350 setIn(new GzippedInputStream(getInputStream(f, offset)));
351 setCompressed((offset == 0));
352 initialize(f.getAbsolutePath());
353 }
354
355 /***
356 * Constructor.
357 *
358 * @param f Compressed arcfile.
359 * @param is InputStream to use.
360 * @throws IOException
361 */
362 public CompressedARCReader(final String f, final InputStream is,
363 final boolean atFirstRecord)
364 throws IOException {
365
366
367 setIn(new GzippedInputStream(is));
368 setCompressed(true);
369 setAlignedOnFirstRecord(atFirstRecord);
370 initialize(f);
371 }
372
373 /***
374 * Get record at passed <code>offset</code>.
375 *
376 * @param offset
377 * Byte index into arcfile at which a record starts.
378 * @return An ARCRecord reference.
379 * @throws IOException
380 */
381 public ARCRecord get(long offset) throws IOException {
382 cleanupCurrentRecord();
383 ((GzippedInputStream)getIn()).gzipMemberSeek(offset);
384 return createArchiveRecord(getIn(), offset);
385 }
386
387 public Iterator<ArchiveRecord> iterator() {
388 /***
389 * Override ARCRecordIterator so can base returned iterator on
390 * GzippedInputStream iterator.
391 */
392 return new ArchiveRecordIterator() {
393 private GzippedInputStream gis =
394 (GzippedInputStream)getInputStream();
395
396 private Iterator gzipIterator = this.gis.iterator();
397
398 protected boolean innerHasNext() {
399 return this.gzipIterator.hasNext();
400 }
401
402 protected ArchiveRecord innerNext() throws IOException {
403
404
405 long p = this.gis.position();
406 InputStream is = (InputStream) this.gzipIterator.next();
407 return createArchiveRecord(is, p);
408 }
409 };
410 }
411
412 protected void gotoEOR(ArchiveRecord rec) throws IOException {
413 long skipped = ((GzippedInputStream)getIn()).
414 gotoEOR(LINE_SEPARATOR);
415 if (skipped <= 0) {
416 return;
417 }
418
419
420 ArchiveRecordHeader meta = (getCurrentRecord() != null)?
421 rec.getHeader(): null;
422 String message = "Record ENDING at " +
423 ((GzippedInputStream)getIn()).position() +
424 " has " + skipped + " trailing byte(s): " +
425 ((meta != null)? meta.toString(): "");
426 if (isStrict()) {
427 throw new IOException(message);
428 }
429 logStdErr(Level.WARNING, message);
430 }
431 }
432 }