/* $Id: ARCReader.java 6786 2010-03-10 00:42:08Z szznax $
 *
 * Created on May 1, 2004
 *
 * Copyright (C) 2004 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package org.archive.io.arc;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveRecord;
import org.archive.io.ArchiveRecordHeader;
import org.archive.io.RecoverableIOException;
import org.archive.io.WriterPoolMember;
import org.archive.util.ArchiveUtils;
import org.archive.util.InetAddressUtil;
import org.archive.util.TextUtils;


/**
 * Get an iterator on an ARC file or get a record by absolute position.
 *
 * ARC files are described here:
 * <a href="http://www.archive.org/web/researcher/ArcFileFormat.php">Arc
 * File Format</a>.
 *
 * <p>This class knows how to parse an ARC file.  Pass it a file path
 * or a URL to an ARC.  It can parse ARC versions 1 and 2.
 *
 * <p>The iterator returns <code>ARCRecord</code>s even though
 * {@link Iterator#next()} is declared to return java.lang.Object;
 * cast the result.
 *
 * <p>Profiling java.io vs. memory-mapped ByteBufferInputStream shows the
 * latter slightly slower -- but not by much.  TODO: Test more.  Just
 * change {@link #getInputStream(File, long)}.
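 *
 * <p>A minimal usage sketch (the file name is hypothetical); readers are
 * obtained from <code>ARCReaderFactory</code>:
 * <pre>
 * ARCReader reader = ARCReaderFactory.get("/tmp/IAH-20040101.arc.gz");
 * for (Iterator&lt;ArchiveRecord&gt; i = reader.iterator(); i.hasNext();) {
 *     ARCRecord record = (ARCRecord)i.next();
 *     // ... inspect record.getMetaData(), read record content ...
 * }
 * reader.close();
 * </pre>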
 *
 * @author stack
 * @version $Date: 2010-03-10 00:42:08 +0000 (Wed, 10 Mar 2010) $ $Revision: 6786 $
 */
public abstract class ARCReader extends ArchiveReader
implements ARCConstants {
    Logger logger = Logger.getLogger(ARCReader.class.getName());

    /**
     * Set to true if we are aligned on the first record of the Archive file.
     * We used to depend on offset: if offset was zero, then we were
     * aligned on the first record.  This is no longer necessarily the case
     * when a Reader is created at an offset into an Archive file: the offset
     * is zero but it's relative to where we started reading.
     */
    private boolean alignedOnFirstRecord = true;

    /**
     * Assumed maximum size of a record meta header line.
     *
     * This is 100k, which seems massive, but it's the same as LINE_LENGTH
     * from <code>alexa/include/a_arcio.h</code>:
     * <pre>
     * #define LINE_LENGTH     (100*1024)
     * </pre>
     */
    private static final int MAX_HEADER_LINE_LENGTH = 1024 * 100;

    /**
     * An array of the header field names found on the 3rd line of the ARC
     * file header.
     * 
     * We used to read these from the third line of the ARC file's first
     * record, but now we hardcode them for the sake of improved performance.
     */
    public final static String [] HEADER_FIELD_NAME_KEYS = {
        URL_FIELD_KEY,
        IP_HEADER_FIELD_KEY,
        DATE_FIELD_KEY,
        MIMETYPE_FIELD_KEY,
        LENGTH_FIELD_KEY
    };
    
    private boolean parseHttpHeaders = true;

    ARCReader() {
        super();
    }
    
    /**
     * Skip over any trailing new lines at end of the record so we're lined up
     * ready to read the next.
     * @param record Record to read past the end of.
     * @throws IOException
     */
    protected void gotoEOR(ArchiveRecord record) throws IOException {
        if (getIn().available() <= 0) {
            return;
        }

        // Remove any trailing LINE_SEPARATOR
        int c = -1;
        while (getIn().available() > 0) {
            if (getIn().markSupported()) {
                getIn().mark(1);
            }
            c = getIn().read();
            if (c != -1) {
                if (c == LINE_SEPARATOR) {
                    continue;
                }
                if (getIn().markSupported()) {
                    // We've overread.  We're probably in the next record.
                    // There is no way of telling for sure.  It may be dross
                    // at the end of the current record.  Back up.
                    getIn().reset();
                    break;
                }
                ArchiveRecordHeader h = (getCurrentRecord() != null)?
                    record.getHeader(): null;
                throw new IOException("Read " + (char)c +
                    " when only " + LINE_SEPARATOR + " expected. " + 
                    getReaderIdentifier() + ((h != null)?
                        h.getHeaderFields().toString(): ""));
            }
        }
    }
    
    /**
     * Create a new ARC record.
     *
     * Encapsulates the housekeeping that has to do with creating a new record.
     *
     * <p>Call this method at the end of the constructor to read in the
     * arcfile header.  There will be problems reading subsequent arc records
     * if you don't, since the arcfile header has the list of metadata fields
     * for all records that follow.
     * 
     * <p>When parsing through ARCs writing out CDX info, we spend about
     * 38% of CPU in here -- about 30% of which is in getTokenizedHeaderLine
     * -- of which 16% is reading.
     *
     * @param is InputStream to use.
     * @param offset Absolute offset into arc file.
     * @return An arc record.
     * @throws IOException
     */
    protected ARCRecord createArchiveRecord(InputStream is, long offset)
    throws IOException {
        ArrayList<String> firstLineValues = new ArrayList<String>(20);
        getTokenizedHeaderLine(is, firstLineValues);
        int bodyOffset = 0;
        if (offset == 0 && isAlignedOnFirstRecord()) {
            // If offset is zero and we were aligned at first record on
            // creation (See #alignedOnFirstRecord for more on this), then no
            // records have been read yet and we're reading our first one, the
            // record of ARC file meta info.  It's special.  In ARC versions
            // 1.x, the first record has three lines of meta info.  We've just
            // read the first line.  There are two more.  The second line has
            // misc. info.  We're only interested in the first field, the
            // version number.  The third line is the list of field names.
            // Here's what ARC file version 1.x meta content looks like:
            //
            // filedesc://testIsBoundary-JunitIAH200401070157520.arc 0.0.0.0 \
            //      20040107015752 text/plain 77
            // 1 0 InternetArchive
            // URL IP-address Archive-date Content-type Archive-length
            //
            ArrayList<String> secondLineValues = new ArrayList<String>(20);
            bodyOffset += getTokenizedHeaderLine(is, secondLineValues);
            setVersion(secondLineValues.get(0) +
                "." + secondLineValues.get(1));
            // Just read over the 3rd line.  We used to parse it and use
            // values found here but now we just hardcode them to avoid
            // having to read this 3rd line even for random arc file accesses.
            bodyOffset += getTokenizedHeaderLine(is, null);
        }

        try {
            currentRecord(new ARCRecord(is, 
                    (ArchiveRecordHeader)computeMetaData(
                            Arrays.asList(HEADER_FIELD_NAME_KEYS),
                            firstLineValues, getVersion(), offset), 
                            bodyOffset, isDigest(), isStrict(), 
                            isParseHttpHeaders()));
        } catch (IOException e) {
            if (e instanceof RecoverableIOException) {
                // Don't mess with RecoverableIOExceptions.  Let them out.
                throw e;
            }
            IOException newE = new IOException(e.getMessage() + " (Offset " +
                    offset + ").");
            newE.setStackTrace(e.getStackTrace());
            throw newE;
        }
        return (ARCRecord)getCurrentRecord();
    }

    /**
     * Returns the version of this ARC file.  Usually read from the first
     * record of the ARC.  If we're reading without having first read the
     * first record -- e.g. random access into the middle of an ARC -- then
     * the version will not have been set.  For now, we return a default,
     * version 1.1.  Later, if there is more than one version of ARC, we
     * could look at the meta line to see what version of ARC this is.
     * @return Version of this ARC file.
     */
    public String getVersion() {
        return (super.getVersion() == null)? "1.1": super.getVersion();
    }

    /**
     * Get a record header line as a list of tokens.
     *
     * We keep reading until we find a LINE_SEPARATOR, reach the end of file
     * without finding one, or the line length exceeds the allowed maximum.
     *
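     * <p>For example, a version-1 record header line like the following
     * (the URL and values are hypothetical) tokenizes into five fields --
     * url, ip, date, mimetype, and length:
     * <pre>
     * http://example.com/index.html 127.0.0.1 20040101000000 text/html 2153
     * </pre>
     *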
     * @param stream InputStream to read from.
     * @param list Empty list that gets filled with string tokens.
     * @return Count of characters read.
     * @exception IOException If problem reading stream, no line separator
     * found, EOF before EOL, or we didn't get minimum header fields.
     */
    private int getTokenizedHeaderLine(final InputStream stream,
            List<String> list) throws IOException {
        // Preallocate usual line size.
        StringBuilder buffer = new StringBuilder(2048 + 20);
        int read = 0;
        int previous = -1;
        for (int c = -1; true;) {
            previous = c;
            c = stream.read();
            if (c == -1) {
                throw new RecoverableIOException("Hit EOF before header EOL.");
            }
            c &= 0xff; 
            read++;
            if (read > MAX_HEADER_LINE_LENGTH) {
                throw new IOException("Header line longer than max allowed " +
                    "-- " + String.valueOf(MAX_HEADER_LINE_LENGTH) +
                    " -- or passed buffer doesn't contain a line (Read: " +
                    buffer.length() + ").  Here's" +
                    " some of what was read: " +
                    buffer.substring(0, Math.min(buffer.length(), 256)));
            }

            if (c == LINE_SEPARATOR) {
                if (buffer.length() == 0) {
                    // Empty line at start of buffer.  Skip it and try again.
                    continue;
                }

                if (list != null) {
                    list.add(buffer.toString());
                }
                // LOOP TERMINATION.
                break;
            } else if (c == HEADER_FIELD_SEPARATOR) {
                if (!isStrict() && previous == HEADER_FIELD_SEPARATOR) {
                    // Early ARCs sometimes had multiple spaces between fields.
                    continue;
                }
                if (list != null) {
                    list.add(buffer.toString());
                }
                // Reset to empty.
                buffer.setLength(0);
            } else {
                buffer.append((char)c);
            }
        }

        // List must have at least 3 elements in it and no more than 100.  If
        // it has other than this, then bogus parse.
        if (list != null && (list.size() < 3 || list.size() > 100)) {
            throw new IOException("Unparseable header line: " + list);
        }

        return read;
    }

    /**
     * Compute metadata fields.
     *
     * Here we check that the meta line has the right number of items in it.
     *
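     * <p>For example (the values are hypothetical), a non-strict read of an
     * early-ARC meta line whose mimetype was written with a space in it:
     * <pre>
     * [url, 127.0.0.1, 20040101000000, text/html;, charset=iso-8859-1, 202]
     * </pre>
     * is repaired by merging the split mimetype back into one field:
     * <pre>
     * [url, 127.0.0.1, 20040101000000, text/html;charset=iso-8859-1, 202]
     * </pre>
     *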
     * @param keys Keys to use composing headerFields map.
     * @param values Values to set into the headerFields map.
     * @param v The version of this ARC file.
     * @param offset Offset into arc file.
     *
     * @return Metadata structure for this record.
     *
     * @exception IOException If the number of keys doesn't match the number
     * of values.
     */
    private ARCRecordMetaData computeMetaData(List<String> keys,
            List<String> values, String v, long offset)
    throws IOException {
        if (keys.size() != values.size()) {
            List<String> originalValues = values;
            if (!isStrict()) {
                values = fixSpaceInURL(values, keys.size());
                // If values still doesn't match key size, try and do
                // further repair.
                if (keys.size() != values.size()) {
                    // Early ARCs had a space in mimetype.
                    if (values.size() == (keys.size() + 1) &&
                            values.get(4).toLowerCase().startsWith("charset=")) {
                        List<String> nuvalues =
                            new ArrayList<String>(keys.size());
                        nuvalues.add(0, values.get(0));
                        nuvalues.add(1, values.get(1));
                        nuvalues.add(2, values.get(2));
                        nuvalues.add(3, values.get(3) + values.get(4));
                        nuvalues.add(4, values.get(5));
                        values = nuvalues;
                    } else if ((values.size() + 1) == keys.size() &&
                            isLegitimateIPValue(values.get(1)) &&
                            isDate(values.get(2)) && isNumber(values.get(3))) {
                        // Mimetype is empty.
                        List<String> nuvalues =
                            new ArrayList<String>(keys.size());
                        nuvalues.add(0, values.get(0));
                        nuvalues.add(1, values.get(1));
                        nuvalues.add(2, values.get(2));
                        nuvalues.add(3, "-");
                        nuvalues.add(4, values.get(3));
                        values = nuvalues;
                    }
                }
            }
            if (keys.size() != values.size()) {
                throw new IOException("Size of field name keys does" +
                    " not match count of field values: " + values);
            }
            // Note on stderr that the field was fixed.
            logStdErr(Level.WARNING, "Fixed spaces in metadata line at " +
                "offset " + offset +
                " Original: " + originalValues + ", New: " + values);
        }
        
        Map<Object, Object> headerFields =
            new HashMap<Object, Object>(keys.size() + 2);
        for (int i = 0; i < keys.size(); i++) {
            headerFields.put(keys.get(i), values.get(i));
        }
        
        // Add a check for tabs in URLs.  If any, replace with '%09'.
        // See https://sourceforge.net/tracker/?group_id=73833&atid=539099&func=detail&aid=1010966,
        // [ 1010966 ] crawl.log has URIs with spaces in them.
        String url = (String)headerFields.get(URL_FIELD_KEY);
        if (url != null && url.indexOf('\t') >= 0) {
            headerFields.put(URL_FIELD_KEY,
                TextUtils.replaceAll("\t", url, "%09"));
        }

        headerFields.put(VERSION_FIELD_KEY, v);
        headerFields.put(ABSOLUTE_OFFSET_KEY, Long.valueOf(offset));

        return new ARCRecordMetaData(getReaderIdentifier(), headerFields);
    }
    
    protected boolean isDate(final String date) {
        if (date.length() != 14) {
            return false;
        }
        return isNumber(date);
    }
    
    protected boolean isNumber(final String n) {
        for (int i = 0; i < n.length(); i++) {
            if (!Character.isDigit(n.charAt(i))) {
                return false;
            }
        }
        return true;
    }
    
    protected boolean isLegitimateIPValue(final String ip) {
        if ("-".equals(ip)) {
            return true;
        }
        Matcher m = InetAddressUtil.IPV4_QUADS.matcher(ip);
        return m != null && m.matches();
    }
    
    /**
     * Fix spaces in URLs.
     * The ARCWriter used to write URLs with spaces in them into the ARC.
     * See <a
     * href="https://sourceforge.net/tracker/?group_id=73833&atid=539099&func=detail&aid=1010966">[ 1010966 ]
     * crawl.log has URIs with spaces in them</a>.
     * This method fixes up such headers, converting all spaces found
     * to '%20'.
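     *
     * <p>For example (the values are hypothetical), a six-token line whose
     * URL was written with an embedded space:
     * <pre>
     * [http://example.com/a, b.html, 127.0.0.1, 20040101000000, text/html, 440]
     * </pre>
     * is rejoined to the expected five tokens:
     * <pre>
     * [http://example.com/a%20b.html, 127.0.0.1, 20040101000000, text/html, 440]
     * </pre>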
     * @param values List of metadata values.
     * @param requiredSize Expected size of resultant values list.
     * @return New list if we successfully fixed up values or original if
     * fixup failed.
     */
    protected List<String> fixSpaceInURL(List<String> values, int requiredSize) {
        // Do a validity check.  The 3rd field from the end is a date of 14
        // numeric characters, and the 4th field from the end is an IP.
        // Everything before the IP should be concatenated together with a
        // '%20' joiner to reconstitute the URL.
        if (!(values.size() > requiredSize) || values.size() < 4) {
            return values;
        }
        // Test the 3rd field from the end is a valid date.
        if (!isDate(values.get(values.size() - 3))) {
            return values;
        }

        // Test the 4th field from the end is a valid IP.
        if (!isLegitimateIPValue(values.get(values.size() - 4))) {
            return values;
        }

        List<String> newValues = new ArrayList<String>(requiredSize);
        StringBuffer url = new StringBuffer();
        for (int i = 0; i < (values.size() - 4); i++) {
            if (i > 0) {
                url.append("%20");
            }
            url.append(values.get(i));
        }
        newValues.add(url.toString());
        for (int i = values.size() - 4; i < values.size(); i++) {
            newValues.add(values.get(i));
        }
        return newValues;
    }
    
    protected boolean isAlignedOnFirstRecord() {
        return alignedOnFirstRecord;
    }

    protected void setAlignedOnFirstRecord(boolean alignedOnFirstRecord) {
        this.alignedOnFirstRecord = alignedOnFirstRecord;
    }

    /**
     * @return Returns the parseHttpHeaders.
     */
    public boolean isParseHttpHeaders() {
        return this.parseHttpHeaders;
    }
    
    /**
     * @param parse The parseHttpHeaders to set.
     */
    public void setParseHttpHeaders(boolean parse) {
        this.parseHttpHeaders = parse;
    }
    
    public String getFileExtension() {
        return ARC_FILE_EXTENSION;
    }

    public String getDotFileExtension() {
        return DOT_ARC_FILE_EXTENSION;
    }

    protected boolean output(final String format) 
    throws IOException, java.text.ParseException {
        boolean result = super.output(format);
        if (!result && (format.equals(NOHEAD) || format.equals(HEADER))) {
            throw new IOException(format +
                " format only supported for single Records");
        }
        return result;
    }

    public boolean outputRecord(final String format) throws IOException {
        boolean result = super.outputRecord(format);
        if (result) {
            return result;
        }
        if (format.equals(NOHEAD)) {
            // No point digesting if dumping content.
            setDigest(false);
            ARCRecord r = (ARCRecord) get();
            r.skipHttpHeader();
            r.dump();
            result = true;
        } else if (format.equals(HEADER)) {
            // No point digesting if dumping content.
            setDigest(false);
            ARCRecord r = (ARCRecord) get();
            r.dumpHttpHeader();
            result = true;
        }

        return result;
    }

    public void dump(final boolean compress)
    throws IOException, java.text.ParseException {
        // No point digesting if we're doing a dump.
        setDigest(false);
        boolean firstRecord = true;
        ARCWriter writer = null;
        for (Iterator<ArchiveRecord> ii = iterator(); ii.hasNext();) {
            ARCRecord r = (ARCRecord)ii.next();
            // We're to dump the arc on stdout.
            // Get the first record's data if any.
            ARCRecordMetaData meta = r.getMetaData();
            if (firstRecord) {
                firstRecord = false;
                // Get an ARCWriter.
                ByteArrayOutputStream baos =
                    new ByteArrayOutputStream(r.available());
                // This is slow but done only once at top of ARC.
                while (r.available() > 0) {
                    baos.write(r.read());
                }
                List<String> listOfMetadata = new ArrayList<String>();
                listOfMetadata.add(baos.toString(WriterPoolMember.UTF8));
                // Assume getArc returns full path to file.  ARCWriter
                // or new File will complain if it is otherwise.
                writer = new ARCWriter(new AtomicInteger(), System.out,
                    new File(meta.getArc()),
                    compress, meta.getDate(), listOfMetadata);
                continue;
            }
            
            writer.write(meta.getUrl(), meta.getMimetype(), meta.getIp(),
                ArchiveUtils.parse14DigitDate(meta.getDate()).getTime(),
                (int)meta.getLength(), r);
        }
    }
    
    /**
     * Get an ARCReader that will delete the passed local file on close.
     * Used when we bring Archive files local and need to clean up afterward.
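     *
     * <p>A sketch of intended use (how the file gets local is up to the
     * caller and hypothetical here):
     * <pre>
     * File local = ...; // a remote ARC copied to a local temporary file
     * ARCReader reader =
     *     ARCReaderFactory.get(local).getDeleteFileOnCloseReader(local);
     * // ... iterate records ...
     * reader.close(); // closes and deletes the local copy
     * </pre>
     *
     * @param f Local file to delete on close.
     * @return An ARCReader that deletes <code>f</code> when closed.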
     */
    public ARCReader getDeleteFileOnCloseReader(final File f) {
        final ARCReader d = this;
        return new ARCReader() {
            private final ARCReader delegate = d;
            private File archiveFile = f;
            
            public void close() throws IOException {
                this.delegate.close();
                if (this.archiveFile != null) {
                    if (archiveFile.exists()) {
                        archiveFile.delete();
                    }
                    this.archiveFile = null;
                }
            }
            
            public ArchiveRecord get(long o) throws IOException {
                return this.delegate.get(o);
            }
            
            public boolean isDigest() {
                return this.delegate.isDigest();
            }
            
            public boolean isStrict() {
                return this.delegate.isStrict();
            }
            
            public Iterator<ArchiveRecord> iterator() {
                return this.delegate.iterator();
            }
            
            public void setDigest(boolean d) {
                this.delegate.setDigest(d);
            }
            
            public void setStrict(boolean s) {
                this.delegate.setStrict(s);
            }
            
            public List validate() throws IOException {
                return this.delegate.validate();
            }

            @Override
            public ArchiveRecord get() throws IOException {
                return this.delegate.get();
            }

            @Override
            public String getVersion() {
                return this.delegate.getVersion();
            }

            @Override
            public List validate(int noRecords) throws IOException {
                return this.delegate.validate(noRecords);
            }

            @Override
            protected ARCRecord createArchiveRecord(InputStream is,
                    long offset)
            throws IOException {
                return this.delegate.createArchiveRecord(is, offset);
            }

            @Override
            protected void gotoEOR(ArchiveRecord record) throws IOException {
                this.delegate.gotoEOR(record);
            }

            @Override
            public void dump(boolean compress)
            throws IOException, java.text.ParseException {
                this.delegate.dump(compress);
            }

            @Override
            public String getDotFileExtension() {
                return this.delegate.getDotFileExtension();
            }

            @Override
            public String getFileExtension() {
                return this.delegate.getFileExtension();
            }
        };
    }
    
    // Static methods follow.

    /**
     * @param formatter Help formatter instance.
     * @param options Usage options.
     * @param exitCode Exit code.
     */
    private static void usage(HelpFormatter formatter, Options options,
            int exitCode) {
        formatter.printHelp("java org.archive.io.arc.ARCReader" +
            " [--digest=true|false] \\\n" +
            " [--format=cdx|cdxfile|dump|gzipdump|header|nohead]" +
            " [--offset=#] \\\n[--strict] [--parse] ARC_FILE|ARC_URL",
                options);
        System.exit(exitCode);
    }

    /**
     * Write out the arcfile.
     * 
     * @param reader ARCReader to output.
     * @param format Format to use outputting.
     * @throws IOException
     * @throws java.text.ParseException
     */
    protected static void output(ARCReader reader, String format)
    throws IOException, java.text.ParseException {
        if (!reader.output(format)) {
            throw new IOException("Unsupported format: " + format);
        }
    }

    /**
     * Generate a CDX index file for an ARC file.
     *
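     * <p>For example (the path is hypothetical):
     * <pre>
     * ARCReader.createCDXIndexFile("/tmp/IAH-20040101.arc.gz");
     * </pre>
     *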
     * @param urlOrPath The ARC file to generate a CDX index for.
     * @throws IOException
     * @throws java.text.ParseException
     */
    public static void createCDXIndexFile(String urlOrPath)
    throws IOException, java.text.ParseException {
        ARCReader r = ARCReaderFactory.get(urlOrPath);
        r.setStrict(false);
        r.setParseHttpHeaders(true);
        r.setDigest(true);
        output(r, CDX_FILE);
    }

    /**
     * Command-line interface to ARCReader.
     *
     * Here is the command-line interface:
     * <pre>
     * usage: java org.archive.io.arc.ARCReader [--offset=#] ARCFILE
     *  -h,--help      Prints this message and exits.
     *  -o,--offset    Outputs record at this offset into arc file.</pre>
     *
     * <p>See <code>$HERITRIX_HOME/bin/arcreader</code> for a script that'll
     * take care of classpaths and the calling of ARCReader.
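     *
     * <p>Example invocations (the file name is hypothetical):
     * <pre>
     * % java org.archive.io.arc.ARCReader --format=cdxfile IAH-20040101.arc.gz
     * % java org.archive.io.arc.ARCReader --offset=1234 --format=nohead IAH-20040101.arc.gz
     * </pre>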
     *
     * <p>Outputs using a pseudo-CDX format as described here:
     * <a href="http://www.archive.org/web/researcher/cdx_legend.php">CDX
     * Legend</a> and here
     * <a href="http://www.archive.org/web/researcher/example_cdx.php">Example</a>.
     * Legend used below is: 'CDX b e a m s c V (or v if uncompressed) n g'.
     * Hash is a hard-coded straight SHA-1 hash of content.
     *
     * @param args Command-line arguments.
     * @throws ParseException Failed parse of the command line.
     * @throws IOException
     * @throws java.text.ParseException
     */
    public static void main(String [] args)
    throws ParseException, IOException, java.text.ParseException {
        Options options = getOptions();
        options.addOption(new Option("p", "parse", false, "Parse headers."));
        PosixParser parser = new PosixParser();
        CommandLine cmdline = parser.parse(options, args, false);
        List cmdlineArgs = cmdline.getArgList();
        Option [] cmdlineOptions = cmdline.getOptions();
        HelpFormatter formatter = new HelpFormatter();

        // If no args, print help.
        if (cmdlineArgs.size() <= 0) {
            usage(formatter, options, 0);
        }

        // Now look at options passed.
        long offset = -1;
        boolean digest = false;
        boolean strict = false;
        boolean parse = false;
        String format = CDX;
        for (int i = 0; i < cmdlineOptions.length; i++) {
            switch(cmdlineOptions[i].getId()) {
                case 'h':
                    usage(formatter, options, 0);
                    break;

                case 'o':
                    offset =
                        Long.parseLong(cmdlineOptions[i].getValue());
                    break;
                    
                case 's':
                    strict = true;
                    break;
                    
                case 'p':
                    parse = true;
                    break;
                    
                case 'd':
                    digest = getTrueOrFalse(cmdlineOptions[i].getValue());
                    break;
                    
                case 'f':
                    format = cmdlineOptions[i].getValue().toLowerCase();
                    boolean match = false;
                    // List of supported formats.
                    final String [] supportedFormats =
                        {CDX, DUMP, GZIP_DUMP, HEADER, NOHEAD, CDX_FILE};
                    for (int ii = 0; ii < supportedFormats.length; ii++) {
                        if (supportedFormats[ii].equals(format)) {
                            match = true;
                            break;
                        }
                    }
                    if (!match) {
                        usage(formatter, options, 1);
                    }
                    break;

                default:
                    throw new RuntimeException("Unexpected option: " +
                        cmdlineOptions[i].getId());
            }
        }
        
        if (offset >= 0) {
            if (cmdlineArgs.size() != 1) {
                System.out.println("Error: Pass one arcfile only.");
                usage(formatter, options, 1);
            }
            ARCReader arc = ARCReaderFactory.get((String)cmdlineArgs.get(0),
                offset);
            arc.setStrict(strict);
            // We must parse headers if we need to skip them.
            if (format.equals(NOHEAD) || format.equals(HEADER)) {
                parse = true;
            }
            arc.setParseHttpHeaders(parse);
            outputRecord(arc, format);
        } else {
            for (Iterator i = cmdlineArgs.iterator(); i.hasNext();) {
                String urlOrPath = (String)i.next();
                try {
                    ARCReader r = ARCReaderFactory.get(urlOrPath);
                    r.setStrict(strict);
                    r.setParseHttpHeaders(parse);
                    r.setDigest(digest);
                    output(r, format);
                } catch (RuntimeException e) {
                    // Write out the name of the file we failed on to help
                    // with debugging, print the stack trace, and exit
                    // non-zero.  When we're being fed a bunch of ARCs, this
                    // at least identifies the bad one.
                    System.err.println("Exception processing " + urlOrPath +
                        ": " + e.getMessage());
                    e.printStackTrace(System.err);
                    System.exit(1);
                }
            }
        }
    }
}