View Javadoc

1   /*
2    * ARCWriter
3    *
4    * $Id: ARCWriter.java 5908 2008-07-28 22:01:46Z gojomo $
5    *
6    * Created on Jun 5, 2003
7    *
8    * Copyright (C) 2003 Internet Archive.
9    *
10   * This file is part of the Heritrix web crawler (crawler.archive.org).
11   *
12   * Heritrix is free software; you can redistribute it and/or modify
13   * it under the terms of the GNU Lesser Public License as published by
14   * the Free Software Foundation; either version 2.1 of the License, or
15   * any later version.
16   *
17   * Heritrix is distributed in the hope that it will be useful,
18   * but WITHOUT ANY WARRANTY; without even the implied warranty of
19   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20   * GNU Lesser Public License for more details.
21   *
22   * You should have received a copy of the GNU Lesser Public License
23   * along with Heritrix; if not, write to the Free Software
24   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
25   */
26  package org.archive.io.arc;
27  
28  import java.io.BufferedInputStream;
29  import java.io.ByteArrayInputStream;
30  import java.io.ByteArrayOutputStream;
31  import java.io.File;
32  import java.io.FileInputStream;
33  import java.io.IOException;
34  import java.io.InputStream;
35  import java.io.PrintStream;
36  import java.io.UnsupportedEncodingException;
37  import java.util.Iterator;
38  import java.util.List;
39  import java.util.concurrent.atomic.AtomicInteger;
40  import java.util.logging.Logger;
41  import java.util.regex.Matcher;
42  import java.util.regex.Pattern;
43  
44  import org.archive.io.GzippedInputStream;
45  import org.archive.io.ReplayInputStream;
46  import org.archive.io.WriterPoolMember;
47  import org.archive.util.ArchiveUtils;
48  import org.archive.util.DevUtils;
49  import org.archive.util.MimetypeUtils;
50  
51  
52  /***
53   * Write ARC files.
54   *
55   * Assumption is that the caller is managing access to this ARCWriter ensuring
56   * only one thread of control accessing this ARC file instance at any one time.
57   *
58   * <p>ARC files are described here:
59   * <a href="http://www.archive.org/web/researcher/ArcFileFormat.php">Arc
60   * File Format</a>.  This class does version 1 of the ARC file format.  It also
61   * writes version 1.1 which is version 1 with data stuffed into the body of the
62   * first arc record in the file, the arc file meta record itself.
63   *
64   * <p>An ARC file is three lines of meta data followed by an optional 'body' and
65   * then a couple of '\n' and then: record, '\n', record, '\n', record, etc.
66   * If we are writing compressed ARC files, then each of the ARC file records is
67   * individually gzipped and concatenated together to make up a single ARC file.
68   * In GZIP terms, each ARC record is a GZIP <i>member</i> of a total gzip'd
69   * file.
70   *
71   * <p>The GZIPping of the ARC file meta data is exceptional.  It is GZIPped
72   * w/ an extra GZIP header, a special Internet Archive (IA) extra header field
73   * (e.g. FEXTRA is set in the GZIP header FLG field and an extra field is
74   * appended to the GZIP header).  The extra field has little in it but its
75   * presence denotes this GZIP as an Internet Archive gzipped ARC.  See RFC1952
76   * to learn about the GZIP header structure.
77   *
 * <p>This class then does its GZIPping in the following fashion.  Each GZIP
 * member is written w/ a new instance of GZIPOutputStream -- actually
 * ARCWriterGZIPOutputStream so we can get access to the underlying stream.
81   * The underlying stream stays open across GZIPoutputStream instantiations.
82   * For the 'special' GZIPing of the ARC file meta data, we cheat by catching the
83   * GZIPOutputStream output into a byte array, manipulating it adding the
84   * IA GZIP header, before writing to the stream.
85   *
86   * <p>I tried writing a resettable GZIPOutputStream and could make it work w/
87   * the SUN JDK but the IBM JDK threw NPE inside in the deflate.reset -- its zlib
88   * native call doesn't seem to like the notion of resetting -- so I gave up on
89   * it.
90   *
 * <p>Because of issues such as the above and troubles with GZIPInputStream, we
 * should write our own GZIP*Streams, ones that are resettable and conscious of
 * gzip members.
94   *
95   * <p>This class will write until we hit >= maxSize.  The check is done at
96   * record boundary.  Records do not span ARC files.  We will then close current
97   * file and open another and then continue writing.
98   *
99   * <p><b>TESTING: </b>Here is how to test that produced ARC files are good
100  * using the
101  * <a href="http://www.archive.org/web/researcher/tool_documentation.php">alexa
102  * ARC c-tools</a>:
103  * <pre>
104  * % av_procarc hx20040109230030-0.arc.gz | av_ziparc > \
105  *     /tmp/hx20040109230030-0.dat.gz
106  * % av_ripdat /tmp/hx20040109230030-0.dat.gz > /tmp/hx20040109230030-0.cdx
107  * </pre>
108  * Examine the produced cdx file to make sure it makes sense.  Search
109  * for 'no-type 0'.  If found, then we're opening a gzip record w/o data to
110  * write.  This is bad.
111  *
112  * <p>You can also do <code>gzip -t FILENAME</code> and it will tell you if the
113  * ARC makes sense to GZIP.
114  * 
115  * <p>While being written, ARCs have a '.open' suffix appended.
116  *
117  * @author stack
118  */
119 public class ARCWriter extends WriterPoolMember implements ARCConstants {
120     private static final Logger logger =
121         Logger.getLogger(ARCWriter.class.getName());
122     
123     /***
124      * Metadata line pattern.
125      */
126     private static final Pattern METADATA_LINE_PATTERN =
127         Pattern.compile("^//S+ //S+ //S+ //S+ //S+(" + LINE_SEPARATOR + "?)$");
128     
129     private List metadata = null;
130     
131     
132     /***
133      * Constructor.
134      * Takes a stream. Use with caution. There is no upperbound check on size.
135      * Will just keep writing.
136      * 
137      * @param serialNo  used to generate unique file name sequences
138      * @param out Where to write.
139      * @param arc File the <code>out</code> is connected to.
140      * @param cmprs Compress the content written.
141      * @param metadata File meta data.  Can be null.  Is list of File and/or
142      * String objects.
143      * @param a14DigitDate If null, we'll write current time.
144      * @throws IOException
145      */
146     public ARCWriter(final AtomicInteger serialNo, final PrintStream out,
147     	final File arc, final boolean cmprs, String a14DigitDate,
148     	final List metadata)
149     throws IOException {
150         super(serialNo, out, arc, cmprs, a14DigitDate);
151         this.metadata = metadata;
152         writeFirstRecord(a14DigitDate);
153     }
154     
155     /***
156      * Constructor.
157      *
158      * @param serialNo  used to generate unique file name sequences
159      * @param dirs Where to drop the ARC files.
160      * @param prefix ARC file prefix to use.  If null, we use
161      * DEFAULT_ARC_FILE_PREFIX.
162      * @param cmprs Compress the ARC files written.  The compression is done
163      * by individually gzipping each record added to the ARC file: i.e. the
164      * ARC file is a bunch of gzipped records concatenated together.
165      * @param maxSize Maximum size for ARC files written.
166      */
167     public ARCWriter(final AtomicInteger serialNo, final List<File> dirs,
168     		final String prefix, final boolean cmprs, final long maxSize) {
169         this(serialNo, dirs, prefix, "", cmprs, maxSize, null);
170     }
171             
172     /***
173      * Constructor.
174      *
175      * @param serialNo  used to generate unique file name sequences
176      * @param dirs Where to drop files.
177      * @param prefix File prefix to use.
178      * @param cmprs Compress the records written. 
179      * @param maxSize Maximum size for ARC files written.
180      * @param suffix File tail to use.  If null, unused.
181      * @param meta File meta data.  Can be null.  Is list of File and/or
182      * String objects.
183      */
184     public ARCWriter(final AtomicInteger serialNo, final List<File> dirs,
185     		final String prefix, final String suffix, final boolean cmprs,
186             final long maxSize, final List meta) {
187         super(serialNo, dirs, prefix, suffix, cmprs, maxSize,
188         	ARC_FILE_EXTENSION);
189         this.metadata = meta;
190     }
191 
192     protected String createFile()
193     throws IOException {
194         String name = super.createFile();
195         writeFirstRecord(getCreateTimestamp());
196         return name;
197     }
198     
199     private void writeFirstRecord(final String ts)
200     throws IOException {
201         write(generateARCFileMetaData(ts));
202     }
203         
204 	/***
205      * Write out the ARCMetaData.
206      *
207      * <p>Generate ARC file meta data.  Currently we only do version 1 of the
208      * ARC file formats or version 1.1 when metadata has been supplied (We
209      * write it into the body of the first record in the arc file).
210      *
211      * <p>Version 1 metadata looks roughly like this:
212      *
213      * <pre>filedesc://testWriteRecord-JunitIAH20040110013326-2.arc 0.0.0.0 //
214      *  20040110013326 text/plain 77
215      * 1 0 InternetArchive
216      * URL IP-address Archive-date Content-type Archive-length
217      * </pre>
218      *
219      * <p>If compress is set, then we generate a header that has been gzipped
220      * in the Internet Archive manner.   Such a gzipping enables the FEXTRA
221      * flag in the FLG field of the gzip header.  It then appends an extra
222      * header field: '8', '0', 'L', 'X', '0', '0', '0', '0'.  The first two
223      * bytes are the length of the field and the last 6 bytes the Internet
224      * Archive header.  To learn about GZIP format, see RFC1952.  To learn
225      * about the Internet Archive extra header field, read the source for
226      * av_ziparc which can be found at
227      * <code>alexa/vista/alexa-tools-1.2/src/av_ziparc.cc</code>.
228      *
229      * <p>We do things in this roundabout manner because the java
230      * GZIPOutputStream does not give access to GZIP header fields.
231      *
232      * @param date Date to put into the ARC metadata.
233      *
234      * @return Byte array filled w/ the arc header.
235 	 * @throws IOException
236      */
237     private byte [] generateARCFileMetaData(String date)
238     throws IOException {
239         int metadataBodyLength = getMetadataLength();
240         // If metadata body, then the minor part of the version is '1' rather
241         // than '0'.
242         String metadataHeaderLinesTwoAndThree =
243             getMetadataHeaderLinesTwoAndThree("1 " +
244                 ((metadataBodyLength > 0)? "1": "0"));
245         int recordLength = metadataBodyLength +
246             metadataHeaderLinesTwoAndThree.getBytes(DEFAULT_ENCODING).length;
247         String metadataHeaderStr = ARC_MAGIC_NUMBER + getBaseFilename() +
248             " 0.0.0.0 " + date + " text/plain " + recordLength +
249             metadataHeaderLinesTwoAndThree;
250         ByteArrayOutputStream metabaos =
251             new ByteArrayOutputStream(recordLength);
252         // Write the metadata header.
253         metabaos.write(metadataHeaderStr.getBytes(DEFAULT_ENCODING));
254         // Write the metadata body, if anything to write.
255         if (metadataBodyLength > 0) {
256             writeMetaData(metabaos);
257         }
258         
259         // Write out a LINE_SEPARATORs to end this record.
260         metabaos.write(LINE_SEPARATOR);
261         
262         // Now get bytes of all just written and compress if flag set.
263         byte [] bytes = metabaos.toByteArray();
264         
265         if(isCompressed()) {
266             // GZIP the header but catch the gzipping into a byte array so we
267             // can add the special IA GZIP header to the product.  After
268             // manipulations, write to the output stream (The JAVA GZIP
269             // implementation does not give access to GZIP header. It
270             // produces a 'default' header only).  We can get away w/ these
271             // maniupulations because the GZIP 'default' header doesn't
272             // do the 'optional' CRC'ing of the header.
273             byte [] gzippedMetaData = GzippedInputStream.gzip(bytes);
274             if (gzippedMetaData[3] != 0) {
275                 throw new IOException("The GZIP FLG header is unexpectedly " +
276                     " non-zero.  Need to add smarter code that can deal " +
277                     " when already extant extra GZIP header fields.");
278             }
279             // Set the GZIP FLG header to '4' which says that the GZIP header
280             // has extra fields.  Then insert the alex {'L', 'X', '0', '0', '0,
281             // '0'} 'extra' field.  The IA GZIP header will also set byte
282             // 9 (zero-based), the OS byte, to 3 (Unix).  We'll do the same.
283             gzippedMetaData[3] = 4;
284             gzippedMetaData[9] = 3;
285             byte [] assemblyBuffer = new byte[gzippedMetaData.length +
286                 ARC_GZIP_EXTRA_FIELD.length];
287             // '10' in the below is a pointer past the following bytes of the
288             // GZIP header: ID1 ID2 CM FLG + MTIME(4-bytes) XFL OS.  See
289             // RFC1952 for explaination of the abbreviations just used.
290             System.arraycopy(gzippedMetaData, 0, assemblyBuffer, 0, 10);
291             System.arraycopy(ARC_GZIP_EXTRA_FIELD, 0, assemblyBuffer, 10,
292                 ARC_GZIP_EXTRA_FIELD.length);
293             System.arraycopy(gzippedMetaData, 10, assemblyBuffer,
294                 10 + ARC_GZIP_EXTRA_FIELD.length, gzippedMetaData.length - 10);
295             bytes = assemblyBuffer;
296         }
297         return bytes;
298     }
299     
300     public String getMetadataHeaderLinesTwoAndThree(String version) {
301         StringBuffer buffer = new StringBuffer();
302         buffer.append(LINE_SEPARATOR);
303         buffer.append(version);
304         buffer.append(" InternetArchive");
305         buffer.append(LINE_SEPARATOR);
306         buffer.append("URL IP-address Archive-date Content-type Archive-length");
307         buffer.append(LINE_SEPARATOR);
308         return buffer.toString();
309     }
310 
311     /***
312      * Write all metadata to passed <code>baos</code>.
313      *
314      * @param baos Byte array to write to.
315      * @throws UnsupportedEncodingException
316      * @throws IOException
317      */
318     private void writeMetaData(ByteArrayOutputStream baos)
319             throws UnsupportedEncodingException, IOException {
320         if (this.metadata == null) {
321             return;
322         }
323 
324         for (Iterator i = this.metadata.iterator();
325                 i.hasNext();) {
326             Object obj = i.next();
327             if (obj instanceof String) {
328                 baos.write(((String)obj).getBytes(DEFAULT_ENCODING));
329             } else if (obj instanceof File) {
330                 InputStream is = null;
331                 try {
332                     is = new BufferedInputStream(
333                         new FileInputStream((File)obj));
334                     byte [] buffer = new byte[4096];
335                     for (int read = -1; (read = is.read(buffer)) != -1;) {
336                         baos.write(buffer, 0, read);
337                     }
338                 } finally {
339                     if (is != null) {
340                         is.close();
341                     }
342                 }
343             } else if (obj != null) {
344                 logger.severe("Unsupported metadata type: " + obj);
345             }
346         }
347         return;
348     }
349 
350     /***
351      * @return Total length of metadata.
352      * @throws UnsupportedEncodingException
353      */
354     private int getMetadataLength()
355     throws UnsupportedEncodingException {
356         int result = -1;
357         if (this.metadata == null) {
358             result = 0;
359         } else {
360             for (Iterator i = this.metadata.iterator();
361                     i.hasNext();) {
362                 Object obj = i.next();
363                 if (obj instanceof String) {
364                     result += ((String)obj).getBytes(DEFAULT_ENCODING).length;
365                 } else if (obj instanceof File) {
366                     result += ((File)obj).length();
367                 } else {
368                     logger.severe("Unsupported metadata type: " + obj);
369                 }
370             }
371         }
372         return result;
373     }
374 
375     /***
376      * @deprecated use input-stream version directly instead
377      */
378     public void write(String uri, String contentType, String hostIP,
379             long fetchBeginTimeStamp, long recordLength,
380             ByteArrayOutputStream baos)
381     throws IOException {
382         write(uri, contentType, hostIP, fetchBeginTimeStamp, recordLength, 
383                 new ByteArrayInputStream(baos.toByteArray()), false);
384     }
385     
386     public void write(String uri, String contentType, String hostIP,
387             long fetchBeginTimeStamp, long recordLength, InputStream in)
388     throws IOException {
389         write(uri,contentType,hostIP,fetchBeginTimeStamp,recordLength,in,true);
390     }
391     
392     /***
393      * Write a record with the given metadata/content.
394      * 
395      * @param uri
396      *            URI for metadata-line
397      * @param contentType
398      *            MIME content-type for metadata-line
399      * @param hostIP
400      *            IP for metadata-line
401      * @param fetchBeginTimeStamp
402      *            timestamp for metadata-line
403      * @param recordLength
404      *            length for metadata-line; also may be enforced
405      * @param in
406      *            source InputStream for record content
407      * @param enforceLength
408      *            whether to enforce the declared length; should be true
409      *            unless intentionally writing bad records for testing
410      * @throws IOException
411      */
412     public void write(String uri, String contentType, String hostIP,
413             long fetchBeginTimeStamp, long recordLength, InputStream in,
414             boolean enforceLength) throws IOException {
415         preWriteRecordTasks();
416         try {
417             write(getMetaLine(uri, contentType, hostIP, fetchBeginTimeStamp,
418                     recordLength).getBytes(UTF8));
419             copyFrom(in, recordLength, enforceLength);
420             if (in instanceof ReplayInputStream) {
421                 // check for consumption of entire recorded material
422                 long remaining = ((ReplayInputStream) in).remaining();
423                 // Should be zero at this stage. If not, something is
424                 // wrong.
425                 if (remaining != 0) {
426                     String message = "Gap between expected and actual: "
427                             + remaining + LINE_SEPARATOR + DevUtils.extraInfo()
428                             + " writing arc "
429                             + this.getFile().getAbsolutePath();
430                     DevUtils.warnHandle(new Throwable(message), message);
431                     throw new IOException(message);
432                 }
433             }
434             write(LINE_SEPARATOR);
435         } finally {
436             postWriteRecordTasks();
437         }
438     }
439     
440     /***
441      * @param uri
442      * @param contentType
443      * @param hostIP
444      * @param fetchBeginTimeStamp
445      * @param recordLength
446      * @return Metadata line for an ARCRecord made of passed components.
447      * @exception IOException
448      */
449     protected String getMetaLine(String uri, String contentType, String hostIP,
450         long fetchBeginTimeStamp, long recordLength)
451     throws IOException {
452         if (fetchBeginTimeStamp <= 0) {
453             throw new IOException("Bogus fetchBeginTimestamp: " +
454                 Long.toString(fetchBeginTimeStamp));
455         }
456 
457         return validateMetaLine(createMetaline(uri, hostIP, 
458             ArchiveUtils.get14DigitDate(fetchBeginTimeStamp),
459             MimetypeUtils.truncate(contentType),
460             Long.toString(recordLength)));
461     }
462     
463     public String createMetaline(String uri, String hostIP,
464             String timeStamp, String mimetype, String recordLength) {
465         return uri + HEADER_FIELD_SEPARATOR + hostIP +
466             HEADER_FIELD_SEPARATOR + timeStamp +
467             HEADER_FIELD_SEPARATOR + mimetype +
468             HEADER_FIELD_SEPARATOR + recordLength + LINE_SEPARATOR;
469     }
470     
471     /***
472      * Test that the metadata line is valid before writing.
473      * @param metaLineStr
474      * @throws IOException
475      * @return The passed in metaline.
476      */
477     protected String validateMetaLine(String metaLineStr)
478     throws IOException {
479         if (metaLineStr.length() > MAX_METADATA_LINE_LENGTH) {
480         	throw new IOException("Metadata line too long ("
481                 + metaLineStr.length() + ">" + MAX_METADATA_LINE_LENGTH 
482                 + "): " + metaLineStr);
483         }
484      	Matcher m = METADATA_LINE_PATTERN.matcher(metaLineStr);
485         if (!m.matches()) {
486             throw new IOException("Metadata line doesn't match expected" +
487                 " pattern: " + metaLineStr);
488         }
489         return metaLineStr;
490     }
491 }