1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26 package org.archive.io.arc;
27
28 import java.io.BufferedInputStream;
29 import java.io.ByteArrayInputStream;
30 import java.io.ByteArrayOutputStream;
31 import java.io.File;
32 import java.io.FileInputStream;
33 import java.io.IOException;
34 import java.io.InputStream;
35 import java.io.PrintStream;
36 import java.io.UnsupportedEncodingException;
37 import java.util.Iterator;
38 import java.util.List;
39 import java.util.concurrent.atomic.AtomicInteger;
40 import java.util.logging.Logger;
41 import java.util.regex.Matcher;
42 import java.util.regex.Pattern;
43
44 import org.archive.io.GzippedInputStream;
45 import org.archive.io.ReplayInputStream;
46 import org.archive.io.WriterPoolMember;
47 import org.archive.util.ArchiveUtils;
48 import org.archive.util.DevUtils;
49 import org.archive.util.MimetypeUtils;
50
51
52 /***
53 * Write ARC files.
54 *
55 * Assumption is that the caller is managing access to this ARCWriter ensuring
56 * only one thread of control accessing this ARC file instance at any one time.
57 *
58 * <p>ARC files are described here:
59 * <a href="http://www.archive.org/web/researcher/ArcFileFormat.php">Arc
60 * File Format</a>. This class does version 1 of the ARC file format. It also
61 * writes version 1.1 which is version 1 with data stuffed into the body of the
62 * first arc record in the file, the arc file meta record itself.
63 *
64 * <p>An ARC file is three lines of meta data followed by an optional 'body' and
65 * then a couple of '\n' and then: record, '\n', record, '\n', record, etc.
66 * If we are writing compressed ARC files, then each of the ARC file records is
67 * individually gzipped and concatenated together to make up a single ARC file.
68 * In GZIP terms, each ARC record is a GZIP <i>member</i> of a total gzip'd
69 * file.
70 *
71 * <p>The GZIPping of the ARC file meta data is exceptional. It is GZIPped
72 * w/ an extra GZIP header, a special Internet Archive (IA) extra header field
73 * (e.g. FEXTRA is set in the GZIP header FLG field and an extra field is
74 * appended to the GZIP header). The extra field has little in it but its
75 * presence denotes this GZIP as an Internet Archive gzipped ARC. See RFC1952
76 * to learn about the GZIP header structure.
77 *
78 * <p>This class then does its GZIPping in the following fashion. Each GZIP
79 * member is written w/ a new instance of GZIPOutputStream -- actually
 * ARCWriterGZIPOutputStream so we can get access to the underlying stream.
81 * The underlying stream stays open across GZIPoutputStream instantiations.
82 * For the 'special' GZIPing of the ARC file meta data, we cheat by catching the
83 * GZIPOutputStream output into a byte array, manipulating it adding the
84 * IA GZIP header, before writing to the stream.
85 *
86 * <p>I tried writing a resettable GZIPOutputStream and could make it work w/
87 * the SUN JDK but the IBM JDK threw NPE inside in the deflate.reset -- its zlib
88 * native call doesn't seem to like the notion of resetting -- so I gave up on
89 * it.
90 *
 * <p>Because of issues such as the above and troubles with GZIPInputStream, we
 * should write our own GZIP*Streams, ones that are resettable and conscious of
 * gzip members.
94 *
95 * <p>This class will write until we hit >= maxSize. The check is done at
96 * record boundary. Records do not span ARC files. We will then close current
97 * file and open another and then continue writing.
98 *
99 * <p><b>TESTING: </b>Here is how to test that produced ARC files are good
100 * using the
101 * <a href="http://www.archive.org/web/researcher/tool_documentation.php">alexa
102 * ARC c-tools</a>:
103 * <pre>
104 * % av_procarc hx20040109230030-0.arc.gz | av_ziparc > \
105 * /tmp/hx20040109230030-0.dat.gz
106 * % av_ripdat /tmp/hx20040109230030-0.dat.gz > /tmp/hx20040109230030-0.cdx
107 * </pre>
108 * Examine the produced cdx file to make sure it makes sense. Search
109 * for 'no-type 0'. If found, then we're opening a gzip record w/o data to
110 * write. This is bad.
111 *
112 * <p>You can also do <code>gzip -t FILENAME</code> and it will tell you if the
113 * ARC makes sense to GZIP.
114 *
115 * <p>While being written, ARCs have a '.open' suffix appended.
116 *
117 * @author stack
118 */
119 public class ARCWriter extends WriterPoolMember implements ARCConstants {
120 private static final Logger logger =
121 Logger.getLogger(ARCWriter.class.getName());
122
123 /***
124 * Metadata line pattern.
125 */
126 private static final Pattern METADATA_LINE_PATTERN =
127 Pattern.compile("^//S+ //S+ //S+ //S+ //S+(" + LINE_SEPARATOR + "?)$");
128
129 private List metadata = null;
130
131
132 /***
133 * Constructor.
134 * Takes a stream. Use with caution. There is no upperbound check on size.
135 * Will just keep writing.
136 *
137 * @param serialNo used to generate unique file name sequences
138 * @param out Where to write.
139 * @param arc File the <code>out</code> is connected to.
140 * @param cmprs Compress the content written.
141 * @param metadata File meta data. Can be null. Is list of File and/or
142 * String objects.
143 * @param a14DigitDate If null, we'll write current time.
144 * @throws IOException
145 */
146 public ARCWriter(final AtomicInteger serialNo, final PrintStream out,
147 final File arc, final boolean cmprs, String a14DigitDate,
148 final List metadata)
149 throws IOException {
150 super(serialNo, out, arc, cmprs, a14DigitDate);
151 this.metadata = metadata;
152 writeFirstRecord(a14DigitDate);
153 }
154
155 /***
156 * Constructor.
157 *
158 * @param serialNo used to generate unique file name sequences
159 * @param dirs Where to drop the ARC files.
160 * @param prefix ARC file prefix to use. If null, we use
161 * DEFAULT_ARC_FILE_PREFIX.
162 * @param cmprs Compress the ARC files written. The compression is done
163 * by individually gzipping each record added to the ARC file: i.e. the
164 * ARC file is a bunch of gzipped records concatenated together.
165 * @param maxSize Maximum size for ARC files written.
166 */
167 public ARCWriter(final AtomicInteger serialNo, final List<File> dirs,
168 final String prefix, final boolean cmprs, final long maxSize) {
169 this(serialNo, dirs, prefix, "", cmprs, maxSize, null);
170 }
171
172 /***
173 * Constructor.
174 *
175 * @param serialNo used to generate unique file name sequences
176 * @param dirs Where to drop files.
177 * @param prefix File prefix to use.
178 * @param cmprs Compress the records written.
179 * @param maxSize Maximum size for ARC files written.
180 * @param suffix File tail to use. If null, unused.
181 * @param meta File meta data. Can be null. Is list of File and/or
182 * String objects.
183 */
184 public ARCWriter(final AtomicInteger serialNo, final List<File> dirs,
185 final String prefix, final String suffix, final boolean cmprs,
186 final long maxSize, final List meta) {
187 super(serialNo, dirs, prefix, suffix, cmprs, maxSize,
188 ARC_FILE_EXTENSION);
189 this.metadata = meta;
190 }
191
192 protected String createFile()
193 throws IOException {
194 String name = super.createFile();
195 writeFirstRecord(getCreateTimestamp());
196 return name;
197 }
198
199 private void writeFirstRecord(final String ts)
200 throws IOException {
201 write(generateARCFileMetaData(ts));
202 }
203
204 /***
205 * Write out the ARCMetaData.
206 *
207 * <p>Generate ARC file meta data. Currently we only do version 1 of the
208 * ARC file formats or version 1.1 when metadata has been supplied (We
209 * write it into the body of the first record in the arc file).
210 *
211 * <p>Version 1 metadata looks roughly like this:
212 *
213 * <pre>filedesc://testWriteRecord-JunitIAH20040110013326-2.arc 0.0.0.0 //
214 * 20040110013326 text/plain 77
215 * 1 0 InternetArchive
216 * URL IP-address Archive-date Content-type Archive-length
217 * </pre>
218 *
219 * <p>If compress is set, then we generate a header that has been gzipped
220 * in the Internet Archive manner. Such a gzipping enables the FEXTRA
221 * flag in the FLG field of the gzip header. It then appends an extra
222 * header field: '8', '0', 'L', 'X', '0', '0', '0', '0'. The first two
223 * bytes are the length of the field and the last 6 bytes the Internet
224 * Archive header. To learn about GZIP format, see RFC1952. To learn
225 * about the Internet Archive extra header field, read the source for
226 * av_ziparc which can be found at
227 * <code>alexa/vista/alexa-tools-1.2/src/av_ziparc.cc</code>.
228 *
229 * <p>We do things in this roundabout manner because the java
230 * GZIPOutputStream does not give access to GZIP header fields.
231 *
232 * @param date Date to put into the ARC metadata.
233 *
234 * @return Byte array filled w/ the arc header.
235 * @throws IOException
236 */
237 private byte [] generateARCFileMetaData(String date)
238 throws IOException {
239 int metadataBodyLength = getMetadataLength();
240
241
242 String metadataHeaderLinesTwoAndThree =
243 getMetadataHeaderLinesTwoAndThree("1 " +
244 ((metadataBodyLength > 0)? "1": "0"));
245 int recordLength = metadataBodyLength +
246 metadataHeaderLinesTwoAndThree.getBytes(DEFAULT_ENCODING).length;
247 String metadataHeaderStr = ARC_MAGIC_NUMBER + getBaseFilename() +
248 " 0.0.0.0 " + date + " text/plain " + recordLength +
249 metadataHeaderLinesTwoAndThree;
250 ByteArrayOutputStream metabaos =
251 new ByteArrayOutputStream(recordLength);
252
253 metabaos.write(metadataHeaderStr.getBytes(DEFAULT_ENCODING));
254
255 if (metadataBodyLength > 0) {
256 writeMetaData(metabaos);
257 }
258
259
260 metabaos.write(LINE_SEPARATOR);
261
262
263 byte [] bytes = metabaos.toByteArray();
264
265 if(isCompressed()) {
266
267
268
269
270
271
272
273 byte [] gzippedMetaData = GzippedInputStream.gzip(bytes);
274 if (gzippedMetaData[3] != 0) {
275 throw new IOException("The GZIP FLG header is unexpectedly " +
276 " non-zero. Need to add smarter code that can deal " +
277 " when already extant extra GZIP header fields.");
278 }
279
280
281
282
283 gzippedMetaData[3] = 4;
284 gzippedMetaData[9] = 3;
285 byte [] assemblyBuffer = new byte[gzippedMetaData.length +
286 ARC_GZIP_EXTRA_FIELD.length];
287
288
289
290 System.arraycopy(gzippedMetaData, 0, assemblyBuffer, 0, 10);
291 System.arraycopy(ARC_GZIP_EXTRA_FIELD, 0, assemblyBuffer, 10,
292 ARC_GZIP_EXTRA_FIELD.length);
293 System.arraycopy(gzippedMetaData, 10, assemblyBuffer,
294 10 + ARC_GZIP_EXTRA_FIELD.length, gzippedMetaData.length - 10);
295 bytes = assemblyBuffer;
296 }
297 return bytes;
298 }
299
300 public String getMetadataHeaderLinesTwoAndThree(String version) {
301 StringBuffer buffer = new StringBuffer();
302 buffer.append(LINE_SEPARATOR);
303 buffer.append(version);
304 buffer.append(" InternetArchive");
305 buffer.append(LINE_SEPARATOR);
306 buffer.append("URL IP-address Archive-date Content-type Archive-length");
307 buffer.append(LINE_SEPARATOR);
308 return buffer.toString();
309 }
310
311 /***
312 * Write all metadata to passed <code>baos</code>.
313 *
314 * @param baos Byte array to write to.
315 * @throws UnsupportedEncodingException
316 * @throws IOException
317 */
318 private void writeMetaData(ByteArrayOutputStream baos)
319 throws UnsupportedEncodingException, IOException {
320 if (this.metadata == null) {
321 return;
322 }
323
324 for (Iterator i = this.metadata.iterator();
325 i.hasNext();) {
326 Object obj = i.next();
327 if (obj instanceof String) {
328 baos.write(((String)obj).getBytes(DEFAULT_ENCODING));
329 } else if (obj instanceof File) {
330 InputStream is = null;
331 try {
332 is = new BufferedInputStream(
333 new FileInputStream((File)obj));
334 byte [] buffer = new byte[4096];
335 for (int read = -1; (read = is.read(buffer)) != -1;) {
336 baos.write(buffer, 0, read);
337 }
338 } finally {
339 if (is != null) {
340 is.close();
341 }
342 }
343 } else if (obj != null) {
344 logger.severe("Unsupported metadata type: " + obj);
345 }
346 }
347 return;
348 }
349
350 /***
351 * @return Total length of metadata.
352 * @throws UnsupportedEncodingException
353 */
354 private int getMetadataLength()
355 throws UnsupportedEncodingException {
356 int result = -1;
357 if (this.metadata == null) {
358 result = 0;
359 } else {
360 for (Iterator i = this.metadata.iterator();
361 i.hasNext();) {
362 Object obj = i.next();
363 if (obj instanceof String) {
364 result += ((String)obj).getBytes(DEFAULT_ENCODING).length;
365 } else if (obj instanceof File) {
366 result += ((File)obj).length();
367 } else {
368 logger.severe("Unsupported metadata type: " + obj);
369 }
370 }
371 }
372 return result;
373 }
374
375 /***
376 * @deprecated use input-stream version directly instead
377 */
378 public void write(String uri, String contentType, String hostIP,
379 long fetchBeginTimeStamp, long recordLength,
380 ByteArrayOutputStream baos)
381 throws IOException {
382 write(uri, contentType, hostIP, fetchBeginTimeStamp, recordLength,
383 new ByteArrayInputStream(baos.toByteArray()), false);
384 }
385
386 public void write(String uri, String contentType, String hostIP,
387 long fetchBeginTimeStamp, long recordLength, InputStream in)
388 throws IOException {
389 write(uri,contentType,hostIP,fetchBeginTimeStamp,recordLength,in,true);
390 }
391
392 /***
393 * Write a record with the given metadata/content.
394 *
395 * @param uri
396 * URI for metadata-line
397 * @param contentType
398 * MIME content-type for metadata-line
399 * @param hostIP
400 * IP for metadata-line
401 * @param fetchBeginTimeStamp
402 * timestamp for metadata-line
403 * @param recordLength
404 * length for metadata-line; also may be enforced
405 * @param in
406 * source InputStream for record content
407 * @param enforceLength
408 * whether to enforce the declared length; should be true
409 * unless intentionally writing bad records for testing
410 * @throws IOException
411 */
412 public void write(String uri, String contentType, String hostIP,
413 long fetchBeginTimeStamp, long recordLength, InputStream in,
414 boolean enforceLength) throws IOException {
415 preWriteRecordTasks();
416 try {
417 write(getMetaLine(uri, contentType, hostIP, fetchBeginTimeStamp,
418 recordLength).getBytes(UTF8));
419 copyFrom(in, recordLength, enforceLength);
420 if (in instanceof ReplayInputStream) {
421
422 long remaining = ((ReplayInputStream) in).remaining();
423
424
425 if (remaining != 0) {
426 String message = "Gap between expected and actual: "
427 + remaining + LINE_SEPARATOR + DevUtils.extraInfo()
428 + " writing arc "
429 + this.getFile().getAbsolutePath();
430 DevUtils.warnHandle(new Throwable(message), message);
431 throw new IOException(message);
432 }
433 }
434 write(LINE_SEPARATOR);
435 } finally {
436 postWriteRecordTasks();
437 }
438 }
439
440 /***
441 * @param uri
442 * @param contentType
443 * @param hostIP
444 * @param fetchBeginTimeStamp
445 * @param recordLength
446 * @return Metadata line for an ARCRecord made of passed components.
447 * @exception IOException
448 */
449 protected String getMetaLine(String uri, String contentType, String hostIP,
450 long fetchBeginTimeStamp, long recordLength)
451 throws IOException {
452 if (fetchBeginTimeStamp <= 0) {
453 throw new IOException("Bogus fetchBeginTimestamp: " +
454 Long.toString(fetchBeginTimeStamp));
455 }
456
457 return validateMetaLine(createMetaline(uri, hostIP,
458 ArchiveUtils.get14DigitDate(fetchBeginTimeStamp),
459 MimetypeUtils.truncate(contentType),
460 Long.toString(recordLength)));
461 }
462
463 public String createMetaline(String uri, String hostIP,
464 String timeStamp, String mimetype, String recordLength) {
465 return uri + HEADER_FIELD_SEPARATOR + hostIP +
466 HEADER_FIELD_SEPARATOR + timeStamp +
467 HEADER_FIELD_SEPARATOR + mimetype +
468 HEADER_FIELD_SEPARATOR + recordLength + LINE_SEPARATOR;
469 }
470
471 /***
472 * Test that the metadata line is valid before writing.
473 * @param metaLineStr
474 * @throws IOException
475 * @return The passed in metaline.
476 */
477 protected String validateMetaLine(String metaLineStr)
478 throws IOException {
479 if (metaLineStr.length() > MAX_METADATA_LINE_LENGTH) {
480 throw new IOException("Metadata line too long ("
481 + metaLineStr.length() + ">" + MAX_METADATA_LINE_LENGTH
482 + "): " + metaLineStr);
483 }
484 Matcher m = METADATA_LINE_PATTERN.matcher(metaLineStr);
485 if (!m.matches()) {
486 throw new IOException("Metadata line doesn't match expected" +
487 " pattern: " + metaLineStr);
488 }
489 return metaLineStr;
490 }
491 }