View Javadoc

1   /* ARCReaderFactory
2    *
3    * $Id: ARCReaderFactory.java 5950 2008-08-05 23:48:24Z gojomo $
4    *
5    * Created on May 1, 2004
6    *
7    * Copyright (C) 2004 Internet Archive.
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.io.arc;
26  
27  import java.io.File;
28  import java.io.FileInputStream;
29  import java.io.IOException;
30  import java.io.InputStream;
31  import java.net.MalformedURLException;
32  import java.net.URL;
33  import java.util.Iterator;
34  import java.util.logging.Level;
35  
36  import org.archive.io.ArchiveReader;
37  import org.archive.io.ArchiveReaderFactory;
38  import org.archive.io.ArchiveRecord;
39  import org.archive.io.ArchiveRecordHeader;
40  import org.archive.io.GzipHeader;
41  import org.archive.io.GzippedInputStream;
42  import org.archive.io.NoGzipMagicException;
43  import org.archive.util.FileUtils;
44  
45  
46  /***
47   * Factory that returns an ARCReader.
48   * 
49   * Can handle compressed and uncompressed ARCs.
50   *
51   * @author stack
52   */
53  public class ARCReaderFactory extends ArchiveReaderFactory
54  implements ARCConstants {
55      /***
56       * This factory instance.
57       */
58      private static final ARCReaderFactory factory = new ARCReaderFactory();
59  
60      /***
61       * Shutdown any access to default constructor.
62       */
63      protected ARCReaderFactory() {
64          super();
65      }
66      
67      public static ARCReader get(String arcFileOrUrl)
68      throws MalformedURLException, IOException {
69      	return (ARCReader)ARCReaderFactory.factory.
70      		getArchiveReader(arcFileOrUrl);
71      }
72      
73      public static ARCReader get(String arcFileOrUrl, final long offset)
74      throws MalformedURLException, IOException {
75      	return (ARCReader)ARCReaderFactory.factory.
76      		getArchiveReader(arcFileOrUrl, offset);
77      }
78      
79      public static ARCReader get(final File f) throws IOException {
80      	return (ARCReader)ARCReaderFactory.factory.getArchiveReader(f);
81      }
82      
83      public static ARCReader get(final File f, final long offset)
84      throws IOException {
85      	return (ARCReader)ARCReaderFactory.factory.getArchiveReader(f, offset);
86      }
87      
88      protected ArchiveReader getArchiveReader(final File f, final long offset)
89      throws IOException {
90      	return getArchiveReader(f, true, offset);
91  	}
92      
93      /***
94       * @param f An arcfile to read.
95       * @param skipSuffixTest Set to true if want to test that ARC has proper
96       * suffix. Use this method and pass <code>false</code> to open ARCs
97       * with the <code>.open</code> or otherwise suffix.
98       * @param offset Have returned ARCReader set to start reading at passed
99       * offset.
100      * @return An ARCReader.
101      * @throws IOException 
102      */
103     public static ARCReader get(final File f,
104             final boolean skipSuffixTest, final long offset)
105     throws IOException {
106     	return (ARCReader)ARCReaderFactory.factory.getArchiveReader(f,
107     		skipSuffixTest, offset);
108     }
109     
110     protected ArchiveReader getArchiveReader(final File arcFile,
111             final boolean skipSuffixTest, final long offset)
112     throws IOException {
113         boolean compressed = testCompressedARCFile(arcFile, skipSuffixTest);
114         if (!compressed) {
115             if (!FileUtils.isReadableWithExtensionAndMagic(arcFile,
116                     ARC_FILE_EXTENSION, ARC_MAGIC_NUMBER)) {
117                 throw new IOException(arcFile.getAbsolutePath() +
118                     " is not an Internet Archive ARC file.");
119             }
120         }
121         return compressed?
122             (ARCReader)ARCReaderFactory.factory.
123                 new CompressedARCReader(arcFile, offset):
124             (ARCReader)ARCReaderFactory.factory.
125                 new UncompressedARCReader(arcFile, offset);
126 	}
127     
128     public static ArchiveReader get(final String s, final InputStream is,
129             final boolean atFirstRecord)
130     throws IOException {
131         return ARCReaderFactory.factory.getArchiveReader(s, is,
132             atFirstRecord);
133     }
134     
135     protected ArchiveReader getArchiveReader(final String arc,
136 			final InputStream is, final boolean atFirstRecord)
137 			throws IOException {
138 		// For now, assume stream is compressed. Later add test of input
139 		// stream or handle exception thrown when figure not compressed stream.
140 		return new CompressedARCReader(arc, asRepositionable(is),
141             atFirstRecord);
142 	}
143     
144     /***
145 	 * Get an ARCReader aligned at <code>offset</code>. This version of get
146 	 * will not bring the ARC local but will try to stream across the net making
147 	 * an HTTP 1.1 Range request on remote http server (RFC1435 Section 14.35).
148 	 * 
149 	 * @param arcUrl HTTP URL for an ARC (All ARCs considered remote).
150 	 * @param offset Offset into ARC at which to start fetching.
151 	 * @return An ARCReader aligned at offset.
152 	 * @throws IOException
153 	 */
154     public static ARCReader get(final URL arcUrl, final long offset)
155     throws IOException {
156         return (ARCReader)ARCReaderFactory.factory.getArchiveReader(arcUrl,
157             offset);
158     }
159     
160     /***
161      * Get an ARCReader.
162      * Pulls the ARC local into whereever the System Property
163      * <code>java.io.tmpdir</code> points. It then hands back an ARCReader that
164      * points at this local copy.  A close on this ARCReader instance will
165      * remove the local copy.
166      * @param arcUrl An URL that points at an ARC.
167      * @return An ARCReader.
168      * @throws IOException 
169      */
170     public static ARCReader get(final URL arcUrl)
171     throws IOException {
172         return (ARCReader)ARCReaderFactory.factory.getArchiveReader(arcUrl);
173     }
174     
175     /***
176      * @param arcFile File to test.
177      * @return True if <code>arcFile</code> is compressed ARC.
178      * @throws IOException
179      */
180     public boolean isCompressed(File arcFile) throws IOException {
181         return testCompressedARCFile(arcFile);
182     }
183     
184     /***
185      * Check file is compressed and in ARC GZIP format.
186      *
187      * @param arcFile File to test if its Internet Archive ARC file
188      * GZIP compressed.
189      *
190      * @return True if this is an Internet Archive GZIP'd ARC file (It begins
191      * w/ the Internet Archive GZIP header and has the
192      * COMPRESSED_ARC_FILE_EXTENSION suffix).
193      *
194      * @exception IOException If file does not exist or is not unreadable.
195      */
196     public static boolean testCompressedARCFile(File arcFile)
197     throws IOException {
198         return testCompressedARCFile(arcFile, false);
199     }
200 
201     /***
202      * Check file is compressed and in ARC GZIP format.
203      *
204      * @param arcFile File to test if its Internet Archive ARC file
205      * GZIP compressed.
206      * @param skipSuffixCheck Set to true if we're not to test on the
207      * '.arc.gz' suffix.
208      *
209      * @return True if this is an Internet Archive GZIP'd ARC file (It begins
210      * w/ the Internet Archive GZIP header).
211      *
212      * @exception IOException If file does not exist or is not unreadable.
213      */
214     public static boolean testCompressedARCFile(File arcFile,
215             boolean skipSuffixCheck)
216     throws IOException {
217         boolean compressedARCFile = false;
218         FileUtils.isReadable(arcFile);
219         if(!skipSuffixCheck && !arcFile.getName().toLowerCase()
220                 .endsWith(COMPRESSED_ARC_FILE_EXTENSION)) {
221             return compressedARCFile;
222         }
223         
224         final InputStream is = new FileInputStream(arcFile);
225         try {
226             compressedARCFile = testCompressedARCStream(is);
227         } finally {
228             is.close();
229         }
230         return compressedARCFile;
231     }
232     
233     public static boolean isARCSuffix(final String arcName) {
234     	return (arcName == null)?
235     		false:
236     		(arcName.toLowerCase().endsWith(DOT_COMPRESSED_ARC_FILE_EXTENSION))?
237     		    true:
238     			(arcName.toLowerCase().endsWith(DOT_ARC_FILE_EXTENSION))?
239     			true: false;
240     }
241     
242     /***
243      * Tests passed stream is gzip stream by reading in the HEAD.
244      * Does not reposition the stream.  That is left up to the caller.
245      * @param is An InputStream.
246      * @return True if compressed stream.
247      * @throws IOException
248      */
249     public static boolean testCompressedARCStream(final InputStream is)
250             throws IOException {
251         boolean compressedARCFile = false;
252         GzipHeader gh = null;
253         try {
254             gh = new GzipHeader(is);
255         } catch (NoGzipMagicException e ) {
256             return compressedARCFile;
257         }
258         
259         byte[] fextra = gh.getFextra();
260         // Now make sure following bytes are IA GZIP comment.
261         // First check length. ARC_GZIP_EXTRA_FIELD includes length
262         // so subtract two and start compare to ARC_GZIP_EXTRA_FIELD
263         // at +2.
264         if (fextra != null &&
265                 ARC_GZIP_EXTRA_FIELD.length - 2 == fextra.length) {
266             compressedARCFile = true;
267             for (int i = 0; i < fextra.length; i++) {
268                 if (fextra[i] != ARC_GZIP_EXTRA_FIELD[i + 2]) {
269                     compressedARCFile = false;
270                     break;
271                 }
272             }
273         }
274         return compressedARCFile;
275     }
276 
277     /***
278      * Uncompressed arc file reader.
279      * @author stack
280      */
281     public class UncompressedARCReader extends ARCReader {
282         /***
283          * Constructor.
284          * @param f Uncompressed arcfile to read.
285          * @throws IOException
286          */
287         public UncompressedARCReader(final File f)
288         throws IOException {
289             this(f, 0);
290         }
291 
292         /***
293          * Constructor.
294          * 
295          * @param f Uncompressed arcfile to read.
296          * @param offset Offset at which to position ARCReader.
297          * @throws IOException
298          */
299         public UncompressedARCReader(final File f, final long offset)
300         throws IOException {
301             // Arc file has been tested for existence by time it has come
302             // to here.
303             setIn(getInputStream(f, offset));
304             initialize(f.getAbsolutePath());
305         }
306         
307         /***
308          * Constructor.
309          * 
310          * @param f Uncompressed arc to read.
311          * @param is InputStream.
312          */
313         public UncompressedARCReader(final String f, final InputStream is) {
314             // Arc file has been tested for existence by time it has come
315             // to here.
316             setIn(is);
317             initialize(f);
318         }
319     }
320     
321     /***
322      * Compressed arc file reader.
323      * 
324      * @author stack
325      */
326     public class CompressedARCReader extends ARCReader {
327 
328         /***
329          * Constructor.
330          * 
331          * @param f
332          *            Compressed arcfile to read.
333          * @throws IOException
334          */
335         public CompressedARCReader(final File f) throws IOException {
336             this(f, 0);
337         }
338 
339         /***
340          * Constructor.
341          * 
342          * @param f Compressed arcfile to read.
343          * @param offset Position at where to start reading file.
344          * @throws IOException
345          */
346         public CompressedARCReader(final File f, final long offset)
347                 throws IOException {
348             // Arc file has been tested for existence by time it has come
349             // to here.
350             setIn(new GzippedInputStream(getInputStream(f, offset)));
351             setCompressed((offset == 0));
352             initialize(f.getAbsolutePath());
353         }
354         
355         /***
356          * Constructor.
357          * 
358          * @param f Compressed arcfile.
359          * @param is InputStream to use.
360          * @throws IOException
361          */
362         public CompressedARCReader(final String f, final InputStream is,
363             final boolean atFirstRecord)
364         throws IOException {
365             // Arc file has been tested for existence by time it has come
366             // to here.
367             setIn(new GzippedInputStream(is));
368             setCompressed(true);
369             setAlignedOnFirstRecord(atFirstRecord);
370             initialize(f);
371         }
372         
373         /***
374          * Get record at passed <code>offset</code>.
375          * 
376          * @param offset
377          *            Byte index into arcfile at which a record starts.
378          * @return An ARCRecord reference.
379          * @throws IOException
380          */
381         public ARCRecord get(long offset) throws IOException {
382             cleanupCurrentRecord();
383             ((GzippedInputStream)getIn()).gzipMemberSeek(offset);
384             return createArchiveRecord(getIn(), offset);
385         }
386         
387         public Iterator<ArchiveRecord> iterator() {
388             /***
389              * Override ARCRecordIterator so can base returned iterator on
390              * GzippedInputStream iterator.
391              */
392             return new ArchiveRecordIterator() {
393                 private GzippedInputStream gis =
394                     (GzippedInputStream)getInputStream();
395 
396                 private Iterator gzipIterator = this.gis.iterator();
397 
398                 protected boolean innerHasNext() {
399                     return this.gzipIterator.hasNext();
400                 }
401 
402                 protected ArchiveRecord innerNext() throws IOException {
403                     // Get the position before gzipIterator.next moves
404                     // it on past the gzip header.
405                     long p = this.gis.position();
406                     InputStream is = (InputStream) this.gzipIterator.next();
407                     return createArchiveRecord(is, p);
408                 }
409             };
410         }
411         
412         protected void gotoEOR(ArchiveRecord rec) throws IOException {
413             long skipped = ((GzippedInputStream)getIn()).
414                 gotoEOR(LINE_SEPARATOR);
415             if (skipped <= 0) {
416                 return;
417             }
418             // Report on system error the number of unexpected characters
419             // at the end of this record.
420             ArchiveRecordHeader meta = (getCurrentRecord() != null)?
421                 rec.getHeader(): null;
422             String message = "Record ENDING at " +
423                 ((GzippedInputStream)getIn()).position() +
424                 " has " + skipped + " trailing byte(s): " +
425                 ((meta != null)? meta.toString(): "");
426             if (isStrict()) {
427                 throw new IOException(message);
428             }
429             logStdErr(Level.WARNING, message);
430         }
431     }
432 }