View Javadoc

1   /*
2    * WARCWriterTest
3    *
4    * $Id: ExperimentalWARCWriterTest.java 4554 2006-08-30 02:35:48Z stack-sf $
5    *
6    * Created on July 27th, 2006
7    *
8    * Copyright (C) 2006 Internet Archive.
9    *
10   * This file is part of the Heritrix web crawler (crawler.archive.org).
11   *
12   * Heritrix is free software; you can redistribute it and/or modify
13   * it under the terms of the GNU Lesser Public License as published by
14   * the Free Software Foundation; either version 2.1 of the License, or
15   * any later version.
16   *
17   * Heritrix is distributed in the hope that it will be useful,
18   * but WITHOUT ANY WARRANTY; without even the implied warranty of
19   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20   * GNU Lesser Public License for more details.
21   *
22   * You should have received a copy of the GNU Lesser Public License
23   * along with Heritrix; if not, write to the Free Software
24   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
25   */
26  package org.archive.io.warc;
27  
28  import java.io.ByteArrayInputStream;
29  import java.io.ByteArrayOutputStream;
30  import java.io.File;
31  import java.io.FileNotFoundException;
32  import java.io.IOException;
33  import java.net.URI;
34  import java.net.URISyntaxException;
35  import java.util.Arrays;
36  import java.util.Iterator;
37  import java.util.List;
38  import java.util.concurrent.atomic.AtomicInteger;
39  
40  import org.archive.io.ArchiveRecord;
41  import org.archive.io.ArchiveRecordHeader;
42  import org.archive.io.UTF8Bytes;
43  import org.archive.io.WriterPoolMember;
44  import org.archive.io.warc.WARCConstants;
45  import org.archive.uid.GeneratorFactory;
46  import org.archive.util.ArchiveUtils;
47  import org.archive.util.TmpDirTestCase;
48  import org.archive.util.anvl.ANVLRecord;
49  
50  /***
51   * Test Writer and Reader.
52   * @author stack
53   * @version $Date: 2006-08-29 19:35:48 -0700 (Tue, 29 Aug 2006) $ $Version$
54   */
55  public class WARCWriterTest
56  extends TmpDirTestCase implements WARCConstants {
57      private static final AtomicInteger SERIAL_NO = new AtomicInteger();
58      
59      /***
60       * Prefix to use for ARC files made by JUNIT.
61       */
62      private static final String PREFIX = "IAH";
63      
64      private static final String SOME_URL = "http://www.archive.org/test/";
65      
66      public void testCheckHeaderLineValue() throws Exception {
67          WARCWriter writer = new WARCWriter();
68          writer.checkHeaderValue("one");
69          IllegalArgumentException exception = null;
70          try {
71              writer.checkHeaderValue("with space");
72          } catch(IllegalArgumentException e) {
73              exception = e;
74          }
75         assertNotNull(exception);
76         exception = null;
77         try {
78             writer.checkHeaderValue("with\0x0000controlcharacter");
79         } catch(IllegalArgumentException e) {
80             exception = e;
81         }
82        assertNotNull(exception);
83      }
84  
85      public void testMimetypes() throws IOException {
86          WARCWriter writer = new WARCWriter();
87          writer.checkHeaderLineMimetypeParameter("text/xml");
88          writer.checkHeaderLineMimetypeParameter("text/xml+rdf");
89          assertEquals(writer.checkHeaderLineMimetypeParameter(
90          	"text/plain; charset=SHIFT-JIS"), "text/plain; charset=SHIFT-JIS");
91          assertEquals(writer.checkHeaderLineMimetypeParameter(
92      		"multipart/mixed; \r\n        boundary=\"simple boundary\""),
93              "multipart/mixed; boundary=\"simple boundary\"");
94      }
95      
96      public void testWriteRecord() throws IOException {
97      	File [] files = {getTmpDir()};
98          
99      	// Write uncompressed.
100         WARCWriter writer =
101         	new WARCWriter(SERIAL_NO, Arrays.asList(files),
102         			this.getClass().getName(), "suffix", false, -1, null);
103         writeFile(writer);
104         
105         // Write compressed.
106         writer = new WARCWriter(SERIAL_NO, Arrays.asList(files),
107         		this.getClass().getName(), "suffix", true, -1, null);
108         writeFile(writer);
109     }
110     
111     private void writeFile(final WARCWriter writer)
112     throws IOException {
113         try {
114             writeWarcinfoRecord(writer);
115             writeBasicRecords(writer);
116         } finally {
117             writer.close();
118             writer.getFile().delete();
119         }
120     }
121     
122     private void writeWarcinfoRecord(WARCWriter writer)
123     throws IOException {
124     	ANVLRecord meta = new ANVLRecord();
125     	meta.addLabelValue("size", "1G");
126     	meta.addLabelValue("operator", "igor");
127     	byte [] bytes = meta.getUTF8Bytes();
128     	writer.writeWarcinfoRecord(ANVLRecord.MIMETYPE, null,
129     		new ByteArrayInputStream(bytes), bytes.length);
130 	}
131 
132     protected void writeBasicRecords(final WARCWriter writer)
133     throws IOException {
134         ANVLRecord headerFields = new ANVLRecord();
135         headerFields.addLabelValue("x", "y");
136         headerFields.addLabelValue("a", "b");
137 
138         URI rid = null;
139         try {
140             rid = GeneratorFactory.getFactory().
141             getQualifiedRecordID(TYPE, METADATA);
142         } catch (URISyntaxException e) {
143             // Convert to IOE so can let it out.
144             throw new IOException(e.getMessage());
145         }
146         final String content = "Any old content.";
147         for (int i = 0; i < 10; i++) {
148             String body = i + ". " + content;
149             byte [] bodyBytes = body.getBytes(UTF8Bytes.UTF8);
150             writer.writeRecord(METADATA, "http://www.archive.org/",
151                     ArchiveUtils.get14DigitDate(), "no/type",
152                     rid, headerFields, new ByteArrayInputStream(bodyBytes),
153                     (long)bodyBytes.length, true);
154         }
155     }
156 
157     /***
158      * @return Generic HTML Content.
159      */
160     protected static String getContent() {
161         return getContent(null);
162     }
163     
164     /***
165      * @return Generic HTML Content with mention of passed <code>indexStr</code>
166      * in title and body.
167      */
168     protected static String getContent(String indexStr) {
169         String page = (indexStr != null)? "Page #" + indexStr: "Some Page";
170         return "HTTP/1.1 200 OK\r\n" +
171         "Content-Type: text/html\r\n\r\n" +
172         "<html><head><title>" + page +
173         "</title></head>" +
174         "<body>" + page +
175         "</body></html>";
176     }
177 
178     /***
179      * Write random HTML Record.
180      * @param w Where to write.
181      * @param index An index to put into content.
182      * @return Length of record written.
183      * @throws IOException
184      */
185     protected int writeRandomHTTPRecord(WARCWriter w, int index)
186     throws IOException {
187         ByteArrayOutputStream baos = new ByteArrayOutputStream();
188         String indexStr = Integer.toString(index);
189         byte[] record = (getContent(indexStr)).getBytes();
190         int recordLength = record.length;
191         baos.write(record);
192         // Add named fields for ip, checksum, and relate the metadata
193         // and request to the resource field.
194         ANVLRecord r = new ANVLRecord(1);
195         r.addLabelValue(NAMED_FIELD_IP_LABEL, "127.0.0.1");
196         w.writeResourceRecord(
197             "http://www.one.net/id=" + indexStr,
198             ArchiveUtils.get14DigitDate(),
199             "text/html; charset=UTF-8",
200             r,
201             new ByteArrayInputStream(baos.toByteArray()),
202             recordLength);
203         return recordLength;
204     }
205 
206     /***
207      * Fill a WARC with HTML Records.
208      * @param baseName WARC basename.
209      * @param compress Whether to compress or not.
210      * @param maxSize Maximum WARC size.
211      * @param recordCount How many records.
212      * @return The written file.
213      * @throws IOException
214      */
215     private File writeRecords(String baseName, boolean compress,
216         int maxSize, int recordCount)
217     throws IOException {
218         cleanUpOldFiles(baseName);
219         File [] files = {getTmpDir()};
220         WARCWriter w = new WARCWriter(SERIAL_NO,
221             Arrays.asList(files), baseName + '-' + PREFIX, "", compress,
222             maxSize, null);
223         assertNotNull(w);
224         for (int i = 0; i < recordCount; i++) {
225             writeRandomHTTPRecord(w, i);
226         }
227         w.close();
228         assertTrue("Doesn't exist: " +  w.getFile().getAbsolutePath(), 
229             w.getFile().exists());
230         return w.getFile();
231     }
232 
233     /***
234      * Run validation of passed file.
235      * @param f File to validate.
236      * @param recordCount Expected count of records.
237      * @throws FileNotFoundException
238      * @throws IOException
239      */
240     private void validate(File f, int recordCount)
241     throws FileNotFoundException, IOException {
242         WARCReader reader = WARCReaderFactory.get(f);
243         assertNotNull(reader);
244         List<ArchiveRecordHeader> headers = null;
245         if (recordCount == -1) {
246             headers = reader.validate();
247         } else {
248             headers = reader.validate(recordCount);
249         }
250         reader.close();
251         
252         // Now, run through each of the records doing absolute get going from
253         // the end to start.  Reopen the arc so no context between this test
254         // and the previous.
255         reader = WARCReaderFactory.get(f);
256         for (int i = headers.size() - 1; i >= 0; i--) {
257             ArchiveRecordHeader h = (ArchiveRecordHeader)headers.get(i);
258             ArchiveRecord r = reader.get(h.getOffset());
259             String mimeType = r.getHeader().getMimetype();
260             assertTrue("Record is bogus, bad mimetype "+mimeType,
261                 mimeType != null && mimeType.length() > 0);
262         }
263         reader.close();
264         
265         assertTrue("Metadatas not equal", headers.size() == recordCount);
266         for (Iterator<ArchiveRecordHeader> i = headers.iterator(); i.hasNext();) {
267             ArchiveRecordHeader r = i.next();
268             assertTrue("Record is empty", r.getLength() > 0);
269         }
270     }
271 
272     public void testWriteRecords() throws IOException {
273         final int recordCount = 2;
274         File f = writeRecords("writeRecord", false, DEFAULT_MAX_WARC_FILE_SIZE,
275             recordCount);
276      	validate(f, recordCount  + 1); // Header record.
277     }
278 
279     public void testRandomAccess() throws IOException {
280         final int recordCount = 3;
281         File f = writeRecords("writeRecord", true, DEFAULT_MAX_WARC_FILE_SIZE,
282             recordCount);
283         WARCReader reader = WARCReaderFactory.get(f);
284         // Get to second record.  Get its offset for later use.
285         boolean readFirst = false;
286         String url = null;
287         long offset = -1;
288         long totalRecords = 0;
289         boolean readSecond = false;
290         for (final Iterator<ArchiveRecord> i = reader.iterator(); i.hasNext();
291                 totalRecords++) {
292             WARCRecord ar = (WARCRecord)i.next();
293             if (!readFirst) {
294                 readFirst = true;
295                 continue;
296             }
297             if (!readSecond) {
298                 url = ar.getHeader().getUrl();
299                 offset = ar.getHeader().getOffset();
300                 readSecond = true;
301             }
302         }
303         
304         reader = WARCReaderFactory.get(f, offset);
305         ArchiveRecord ar = reader.get();
306         assertEquals(ar.getHeader().getUrl(), url);
307         ar.close();
308         
309         // Get reader again.  See how iterator works with offset
310         reader = WARCReaderFactory.get(f, offset);
311         int count = 0;
312         for (final Iterator<ArchiveRecord> i = reader.iterator(); i.hasNext(); i.next()) {
313             count++;
314         }
315         reader.close();
316         assertEquals(totalRecords - 1, count);
317     }
318     
319     public void testWriteRecordCompressed() throws IOException {
320         final int recordCount = 2;
321         File arcFile = writeRecords("writeRecordCompressed", true,
322             DEFAULT_MAX_WARC_FILE_SIZE, recordCount);
323         validate(arcFile, recordCount + 1 /*Header record*/);
324     }
325     
326     protected WARCWriter createWARCWriter(String NAME,
327             boolean compress) {
328         File [] files = {getTmpDir()};
329         return new WARCWriter(SERIAL_NO,
330         	Arrays.asList(files), NAME, "",
331             compress, DEFAULT_MAX_WARC_FILE_SIZE, null);
332     }
333     
334     protected static ByteArrayOutputStream getBaos(String str)
335     throws IOException {
336         ByteArrayOutputStream baos = new ByteArrayOutputStream();
337         baos.write(str.getBytes());
338         return baos;
339     }
340     
341     protected static void writeRecord(WARCWriter w, String url,
342         String mimetype, int len, ByteArrayOutputStream baos)
343     throws IOException {
344         w.writeResourceRecord(url,
345             ArchiveUtils.get14DigitDate(),
346             mimetype,
347             null,
348             new ByteArrayInputStream(baos.toByteArray()),
349             len);
350     }
351     
352     protected int iterateRecords(WARCReader r)
353     throws IOException {
354         int count = 0;
355         for (Iterator<ArchiveRecord> i = r.iterator(); i.hasNext();) {
356             ArchiveRecord ar = i.next();
357             ar.close();
358             if (count != 0) {
359                 assertTrue("Unexpected URL " + ar.getHeader().getUrl(),
360                     ar.getHeader().getUrl().equals(SOME_URL));
361             }
362             count++;
363         }
364         return count;
365     }
366     
367     protected WARCWriter createWithOneRecord(String name,
368         boolean compressed)
369     throws IOException {
370         WARCWriter writer = createWARCWriter(name, compressed);
371         String content = getContent();
372         writeRecord(writer, SOME_URL, "text/html",
373             content.length(), getBaos(content));
374         return writer;
375     }
376     
377     public void testSpaceInURL() throws Exception {
378         long bytesWritten = holeyUrl("testSpaceInURL-" + PREFIX, false, " ");
379         assertEquals("Unexpected successful writing occurred",0,bytesWritten);
380     }
381 
382     public void testTabInURL() throws Exception {
383         long bytesWritten = holeyUrl("testTabInURL-" + PREFIX, false, "\t");
384         assertEquals("Unexpected successful writing occurred",0,bytesWritten);
385     }
386     
387     protected long holeyUrl(String name, boolean compress, String urlInsert)
388     throws IOException {
389         WARCWriter writer = createWithOneRecord(name, compress);
390         // Add some bytes on the end to mess up the record.
391         long startPos = writer.getPosition();
392         String content = getContent();
393         ByteArrayOutputStream baos = getBaos(content);
394         writeRecord(writer, SOME_URL + urlInsert + "/index.html", "text/html",
395             content.length(), baos);
396         long endPos = writer.getPosition();
397         writer.close();
398         return endPos-startPos;
399     }
400     
401     /***
402      * Write an arc file for other tests to use.
403      * @param arcdir Directory to write to.
404      * @param compress True if file should be compressed.
405      * @return ARC written.
406      * @throws IOException 
407      */
408     public static File createWARCFile(File arcdir, boolean compress)
409     throws IOException {
410         File [] files = {arcdir};
411         WARCWriter writer =
412             new WARCWriter(SERIAL_NO, Arrays.asList(files),
413             "test", "", compress, DEFAULT_MAX_WARC_FILE_SIZE, null);
414         String content = getContent();
415         writeRecord(writer, SOME_URL, "text/html", content.length(),
416             getBaos(content));
417         writer.close();
418         return writer.getFile();
419     }
420     
421 //    public void testSpeed() throws IOException {
422 //        ARCWriter writer = createArcWithOneRecord("speed", true);
423 //        // Add a record with a length that is too long.
424 //        String content = getContent();
425 //        final int count = 100000;
426 //        logger.info("Starting speed write of " + count + " records.");
427 //        for (int i = 0; i < count; i++) {
428 //            writeRecord(writer, SOME_URL, "text/html", content.length(),
429 //                    getBaos(content));
430 //        }
431 //        writer.close();
432 //        logger.info("Finished speed write test.");
433 //    }
434     
435     public void testArcRecordOffsetReads() throws Exception {
436         // Get an ARC with one record.
437         WriterPoolMember w =
438             createWithOneRecord("testArcRecordInBufferStream", true);
439         w.close();
440         // Get reader on said ARC.
441         WARCReader r = WARCReaderFactory.get(w.getFile());
442         final Iterator<ArchiveRecord> i = r.iterator();
443         // Skip first ARC meta record.
444         ArchiveRecord ar = i.next();
445         i.hasNext();
446         // Now we're at first and only record in ARC.
447         ar = (WARCRecord) i.next();
448         // Now try getting some random set of bytes out of it 
449         // at an odd offset (used to fail because we were
450         // doing bad math to find where in buffer to read).
451         final byte[] buffer = new byte[17];
452         final int maxRead = 4;
453         int totalRead = 0;
454         while (totalRead < maxRead) {
455             totalRead = totalRead
456             + ar.read(buffer, 13 + totalRead, maxRead - totalRead);
457             assertTrue(totalRead > 0);
458         }
459     }
460 }