View Javadoc

1   /* ARCWriterTest
2    *
3    * $Id: ARCWriterTest.java 6813 2010-04-09 21:19:06Z gojomo $
4    *
5    * Created on Dec 31, 2003.
6    *
7    * Copyright (C) 2003 Internet Archive.
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.io.arc;
26  
27  import java.io.BufferedInputStream;
28  import java.io.ByteArrayInputStream;
29  import java.io.ByteArrayOutputStream;
30  import java.io.File;
31  import java.io.FileNotFoundException;
32  import java.io.IOException;
33  import java.io.InputStream;
34  import java.io.PrintStream;
35  import java.util.Arrays;
36  import java.util.Date;
37  import java.util.Iterator;
38  import java.util.List;
39  import java.util.concurrent.atomic.AtomicInteger;
40  
41  import org.apache.commons.io.IOUtils;
42  import org.apache.commons.io.input.NullInputStream;
43  import org.apache.commons.io.output.NullOutputStream;
44  import org.archive.io.ArchiveRecord;
45  import org.archive.io.ReplayInputStream;
46  import org.archive.io.WriterPoolMember;
47  import org.archive.util.ArchiveUtils;
48  import org.archive.util.FileUtils;
49  import org.archive.util.TmpDirTestCase;
50  
51  
52  /***
53   * Test ARCWriter class.
54   *
55   * This code exercises ARCWriter AND ARCReader.  First it writes ARCs w/
56   * ARCWriter.  Then it validates what was written w/ ARCReader.
57   *
58   * @author stack
59   */
60  public class ARCWriterTest
61  extends TmpDirTestCase implements ARCConstants {
62      /*** Utility class for writing bad ARCs (with trailing junk)
63        */
64      public class CorruptibleARCWriter extends ARCWriter {
65          byte[] endJunk = null;
66          public CorruptibleARCWriter(AtomicInteger serial_no, List<File> name, String name2, boolean compress, long default_max_arc_file_size) {
67              super(serial_no,name,name2,compress,default_max_arc_file_size);
68          }    
69          @Override
70          protected void postWriteRecordTasks() throws IOException {
71              if(endJunk!=null) {
72                  this.write(endJunk);
73              }
74              super.postWriteRecordTasks();
75          }
76          public void setEndJunk(byte[] b) throws IOException {
77              this.endJunk = b;
78          }
79      }
80  
    /**
     * Marker string for ARC files made by JUNIT.  Despite the constant's
     * name, this class appends it (after a '-') to a base name and passes
     * the result to the writer as its name argument; it is also passed to
     * FileUtils.getFilesWithPrefix when looking up written files.
     */
    private static final String SUFFIX =
        /* TODO DEFAULT_ARC_FILE_PREFIX*/ "JUNIT";
    
    /** URL used for the records written through the writeRecord helper. */
    private static final String SOME_URL = "http://www.archive.org/test/";

    
    /** Shared counter handed to every ARCWriter constructed by this class. */
    private static final AtomicInteger SERIAL_NO = new AtomicInteger();
91  
92      /*
93       * @see TestCase#setUp()
94       */
95      protected void setUp() throws Exception {
96          super.setUp();
97      }
98  
99      /*
100      * @see TestCase#tearDown()
101      */
102     protected void tearDown() throws Exception {
103         super.tearDown();
104     }
105     
106     protected static String getContent() {
107         return getContent(null);
108     }
109     
110     protected static String getContent(String indexStr) {
111         String page = (indexStr != null)? "Page #" + indexStr: "Some Page";
112         return "HTTP/1.1 200 OK\r\n" +
113         "Content-Type: text/html\r\n\r\n" +
114         "<html><head><title>" + page +
115         "</title></head>" +
116         "<body>" + page +
117         "</body></html>";
118     }
119 
120     protected int writeRandomHTTPRecord(ARCWriter arcWriter, int index)
121     throws IOException {
122         String indexStr = Integer.toString(index);
123         ByteArrayOutputStream baos = new ByteArrayOutputStream();
124         // Start the record with an arbitrary 14-digit date per RFC2540
125         String now = ArchiveUtils.get14DigitDate();
126         int recordLength = 0;
127         byte[] record = (getContent(indexStr)).getBytes();
128         recordLength += record.length;
129         baos.write(record);
130         // Add the newline between records back in
131         baos.write("\n".getBytes());
132         recordLength += 1;
133         arcWriter.write("http://www.one.net/id=" + indexStr, "text/html",
134             "0.1.2.3", Long.parseLong(now), recordLength, baos);
135         return recordLength;
136     }
137 
138     private File writeRecords(String baseName, boolean compress,
139         long maxSize, int recordCount)
140     throws IOException {
141         cleanUpOldFiles(baseName);
142         File [] files = {getTmpDir()};
143         ARCWriter arcWriter = new ARCWriter(SERIAL_NO, Arrays.asList(files),
144             baseName + '-' + SUFFIX, compress, maxSize);
145         assertNotNull(arcWriter);
146         for (int i = 0; i < recordCount; i++) {
147             writeRandomHTTPRecord(arcWriter, i);
148         }
149         arcWriter.close();
150         assertTrue("Doesn't exist: " +
151                 arcWriter.getFile().getAbsolutePath(), 
152             arcWriter.getFile().exists());
153         return arcWriter.getFile();
154     }
155 
156     private void validate(File arcFile, int recordCount)
157     throws FileNotFoundException, IOException {
158         ARCReader reader = ARCReaderFactory.get(arcFile);
159         assertNotNull(reader);
160         List metaDatas = null;
161         if (recordCount == -1) {
162             metaDatas = reader.validate();
163         } else {
164             metaDatas = reader.validate(recordCount);
165         }
166         reader.close();
167         // Now, run through each of the records doing absolute get going from
168         // the end to start.  Reopen the arc so no context between this test
169         // and the previous.
170         reader = ARCReaderFactory.get(arcFile);
171         for (int i = metaDatas.size() - 1; i >= 0; i--) {
172             ARCRecordMetaData meta = (ARCRecordMetaData)metaDatas.get(i);
173             ArchiveRecord r = reader.get(meta.getOffset());
174             String mimeType = r.getHeader().getMimetype();
175             assertTrue("Record is bogus",
176                 mimeType != null && mimeType.length() > 0);
177         }
178         reader.close();
179         assertTrue("Metadatas not equal", metaDatas.size() == recordCount);
180         for (Iterator i = metaDatas.iterator(); i.hasNext();) {
181                 ARCRecordMetaData r = (ARCRecordMetaData)i.next();
182                 assertTrue("Record is empty", r.getLength() > 0);
183         }
184     }
185 
186     public void testCheckARCFileSize()
187     throws IOException {
188         runCheckARCFileSizeTest("checkARCFileSize", false);
189     }
190 
191     public void testCheckARCFileSizeCompressed()
192     throws IOException {
193         runCheckARCFileSizeTest("checkARCFileSize", true);
194     }
195 
196     public void testWriteRecord() throws IOException {
197         final int recordCount = 2;
198         File arcFile = writeRecords("writeRecord", false,
199                 DEFAULT_MAX_ARC_FILE_SIZE, recordCount);
200         validate(arcFile, recordCount  + 1); // Header record.
201     }
202     
203     public void testRandomAccess() throws IOException {
204         final int recordCount = 3;
205         File arcFile = writeRecords("writeRecord", true,
206             DEFAULT_MAX_ARC_FILE_SIZE, recordCount);
207         ARCReader reader = ARCReaderFactory.get(arcFile);
208         // Get to second record.  Get its offset for later use.
209         boolean readFirst = false;
210         String url = null;
211         long offset = -1;
212         long totalRecords = 0;
213         boolean readSecond = false;
214         for (final Iterator i = reader.iterator(); i.hasNext(); totalRecords++) {
215             ARCRecord ar = (ARCRecord)i.next();
216             if (!readFirst) {
217                 readFirst = true;
218                 continue;
219             }
220             if (!readSecond) {
221                 url = ar.getMetaData().getUrl();
222                 offset = ar.getMetaData().getOffset();
223                 readSecond = true;
224             }
225         }
226         
227         reader = ARCReaderFactory.get(arcFile, offset);
228         ArchiveRecord ar = reader.get();
229         assertEquals(ar.getHeader().getUrl(), url);
230         ar.close();
231         
232         // Get reader again.  See how iterator works with offset
233         reader = ARCReaderFactory.get(arcFile, offset);
234         int count = 0;
235         for (final Iterator i = reader.iterator(); i.hasNext(); i.next()) {
236             count++;
237         }
238         reader.close();
239         assertEquals(totalRecords - 1, count);
240     }
241 
242     public void testWriteRecordCompressed() throws IOException {
243         final int recordCount = 2;
244         File arcFile = writeRecords("writeRecordCompressed", true,
245                 DEFAULT_MAX_ARC_FILE_SIZE, recordCount);
246         validate(arcFile, recordCount + 1 /*Header record*/);
247     }
248     
249     public void testWriteGiantRecord() throws IOException {
250         File [] files = {getTmpDir()};
251         PrintStream dummyStream = new PrintStream(new NullOutputStream());
252         ARCWriter arcWriter = new ARCWriter(SERIAL_NO, dummyStream,
253                 new File("dummy"),
254                 false, null, null);
255         assertNotNull(arcWriter);
256 
257         // Start the record with an arbitrary 14-digit date per RFC2540
258         long now = System.currentTimeMillis();
259         long recordLength = org.apache.commons.io.FileUtils.ONE_GB * 3;
260        
261         arcWriter.write("dummy:uri", "application/octet-stream",
262             "0.1.2.3", now, recordLength, new NullInputStream(recordLength));
263         arcWriter.close();
264         }
265     
266     private void runCheckARCFileSizeTest(String baseName, boolean compress)
267     throws FileNotFoundException, IOException  {
268         writeRecords(baseName, compress, 1024, 15);
269         // Now validate all files just created.
270         File [] files = FileUtils.getFilesWithPrefix(getTmpDir(), SUFFIX);
271         for (int i = 0; i < files.length; i++) {
272             validate(files[i], -1);
273         }
274     }
275     
276     protected CorruptibleARCWriter createARCWriter(String NAME, boolean compress) {
277         File [] files = {getTmpDir()};
278         return new CorruptibleARCWriter(SERIAL_NO, Arrays.asList(files), NAME,
279             compress, DEFAULT_MAX_ARC_FILE_SIZE);
280     }
281     
282     protected static ByteArrayInputStream getBais(String str)
283     throws IOException {
284         return new ByteArrayInputStream(str.getBytes());
285     }
286     
287     /***
288      * Writes a record, suppressing normal length-checks (so that 
289      * intentionally malformed records may be written). 
290      */
291     protected static void writeRecord(ARCWriter writer, String url,
292         String type, int len, ByteArrayInputStream bais)
293     throws IOException {
294         writer.write(url, type, "192.168.1.1", (new Date()).getTime(), len,
295             bais, false);
296     }
297     
298     protected int iterateRecords(ARCReader r)
299     throws IOException {
300         int count = 0;
301         for (Iterator i = r.iterator(); i.hasNext();) {
302             ARCRecord rec = (ARCRecord)i.next();
303             rec.close();
304             if (count != 0) {
305                 assertTrue("Unexpected URL " + rec.getMetaData().getUrl(),
306                     rec.getMetaData().getUrl().equals(SOME_URL));
307             }
308             count++;
309         }
310         return count;
311     }
312     
313     protected CorruptibleARCWriter createArcWithOneRecord(String name,
314         boolean compressed)
315     throws IOException {
316     	CorruptibleARCWriter writer = createARCWriter(name, compressed);
317         String content = getContent();
318         writeRecord(writer, SOME_URL, "text/html",
319             content.length(), getBais(content));
320         return writer;
321     }
322     
323     public void testSpaceInURL() {
324         String eMessage = null;
325         try {
326             holeyUrl("testSpaceInURL-" + SUFFIX, false, " ");
327         } catch (IOException e) {
328             eMessage = e.getMessage();
329         }
330         assertTrue("Didn't get expected exception: " + eMessage,
331             eMessage.startsWith("Metadata line doesn't match"));
332     }
333 
334     public void testTabInURL() {        
335         String eMessage = null;
336         try {
337             holeyUrl("testTabInURL-" + SUFFIX, false, "\t");
338         } catch (IOException e) {
339             eMessage = e.getMessage();
340         }
341         assertTrue("Didn't get expected exception: " + eMessage,
342             eMessage.startsWith("Metadata line doesn't match"));
343     }
344     
345     protected void holeyUrl(String name, boolean compress, String urlInsert)
346     throws IOException {
347     	ARCWriter writer = createArcWithOneRecord(name, compress);
348         // Add some bytes on the end to mess up the record.
349         String content = getContent();
350         writeRecord(writer, SOME_URL + urlInsert + "/index.html", "text/html",
351             content.length(), getBais(content));
352         writer.close();
353     }
354     
355 // If uncompressed, length has to be right or parse will fail.
356 //
357 //    public void testLengthTooShort() throws IOException {
358 //        lengthTooShort("testLengthTooShort-" + PREFIX, false);
359 //    }
360     
361     public void testLengthTooShortCompressed() throws IOException {
362         lengthTooShort("testLengthTooShortCompressed-" + SUFFIX, true, false);
363     }
364     
365     public void testLengthTooShortCompressedStrict()
366     throws IOException {      
367         String eMessage = null;
368         try {
369             lengthTooShort("testLengthTooShortCompressedStrict-" + SUFFIX,
370                 true, true);
371         } catch (RuntimeException e) {
372             eMessage = e.getMessage();
373         }
374         assertTrue("Didn't get expected exception: " + eMessage,
375             eMessage.startsWith("java.io.IOException: Record ENDING at"));
376     }
377      
378     protected void lengthTooShort(String name, boolean compress, boolean strict)
379     throws IOException {
380     	CorruptibleARCWriter writer = createArcWithOneRecord(name, compress);
381         // Add some bytes on the end to mess up the record.
382         String content = getContent();
383         ByteArrayInputStream bais = getBais(content+"SOME TRAILING BYTES");
384         writeRecord(writer, SOME_URL, "text/html",
385             content.length(), bais);
386         writer.setEndJunk("SOME TRAILING BYTES".getBytes());
387         writeRecord(writer, SOME_URL, "text/html",
388             content.length(), getBais(content));
389         writer.close();
390         
391         // Catch System.err into a byte stream.
392         ByteArrayOutputStream os = new ByteArrayOutputStream();
393         System.setErr(new PrintStream(os));
394         
395         ARCReader r = ARCReaderFactory.get(writer.getFile());
396         r.setStrict(strict);
397         int count = iterateRecords(r);
398         assertTrue("Count wrong " + count, count == 4);
399 
400         // Make sure we get the warning string which complains about the
401         // trailing bytes.
402         String err = os.toString();
403         assertTrue("No message " + err, err.startsWith("WARNING") &&
404             (err.indexOf("Record ENDING at") > 0));
405     }
406     
407 //  If uncompressed, length has to be right or parse will fail.
408 //
409 //    public void testLengthTooLong()
410 //    throws IOException {
411 //        lengthTooLong("testLengthTooLongCompressed-" + PREFIX,
412 //            false, false);
413 //    }
414     
415     public void testLengthTooLongCompressed()
416     throws IOException {
417         lengthTooLong("testLengthTooLongCompressed-" + SUFFIX,
418             true, false);
419     }
420     
421     public void testLengthTooLongCompressedStrict() {
422         String eMessage = null;
423         try {
424             lengthTooLong("testLengthTooLongCompressed-" + SUFFIX,
425                 true, true);
426         } catch (IOException e) {
427             eMessage = e.getMessage();
428         }
429         assertTrue("Didn't get expected exception: " + eMessage,
430             eMessage.startsWith("Premature EOF before end-of-record"));
431     }
432     
433     protected void lengthTooLong(String name, boolean compress,
434             boolean strict)
435     throws IOException {
436     	ARCWriter writer = createArcWithOneRecord(name, compress);
437         // Add a record with a length that is too long.
438         String content = getContent();
439         writeRecord(writer, SOME_URL, "text/html",
440             content.length() + 10, getBais(content));
441         writeRecord(writer, SOME_URL, "text/html",
442             content.length(), getBais(content));
443         writer.close();
444         
445         // Catch System.err.
446         ByteArrayOutputStream os = new ByteArrayOutputStream();
447         System.setErr(new PrintStream(os));
448         
449         ARCReader r = ARCReaderFactory.get(writer.getFile());
450         r.setStrict(strict);
451         int count = iterateRecords(r);
452         assertTrue("Count wrong " + count, count == 4);
453         
454         // Make sure we get the warning string which complains about the
455         // trailing bytes.
456         String err = os.toString();
457         assertTrue("No message " + err, 
458             err.startsWith("WARNING Premature EOF before end-of-record"));
459     }
460     
461     public void testGapError() throws IOException {
462     	ARCWriter writer = createArcWithOneRecord("testGapError", true);
463         String content = getContent();
464         // Make a 'weird' RIS that returns bad 'remaining' length
465         // awhen remaining should be 0
466         ReplayInputStream ris = new ReplayInputStream(content.getBytes(),
467                 content.length(), null) {
468             public long remaining() {
469                 return (super.remaining()==0) ? -1 : super.remaining();
470             }
471         };
472         String message = null;
473         try {
474         writer.write(SOME_URL, "text/html", "192.168.1.1",
475             (new Date()).getTime(), content.length(), ris);
476         } catch (IOException e) {
477             message = e.getMessage();
478         } finally {
479             IOUtils.closeQuietly(ris);
480         }
481         writer.close();
482         assertTrue("No gap when should be",
483             message != null &&
484             message.indexOf("Gap between expected and actual") >= 0);
485     }
486     
487     /***
488      * Write an arc file for other tests to use.
489      * @param arcdir Directory to write to.
490      * @param compress True if file should be compressed.
491      * @return ARC written.
492      * @throws IOException 
493      */
494     public static File createARCFile(File arcdir, boolean compress)
495     throws IOException {
496         File [] files = {arcdir};
497         ARCWriter writer = new ARCWriter(SERIAL_NO, Arrays.asList(files),
498             "test", compress, DEFAULT_MAX_ARC_FILE_SIZE);
499         String content = getContent();
500         writeRecord(writer, SOME_URL, "text/html", content.length(),
501             getBais(content));
502         writer.close();
503         return writer.getFile();
504     }
505     
506 //    public void testSpeed() throws IOException {
507 //        ARCWriter writer = createArcWithOneRecord("speed", true);
508 //        // Add a record with a length that is too long.
509 //        String content = getContent();
510 //        final int count = 100000;
511 //        logger.info("Starting speed write of " + count + " records.");
512 //        for (int i = 0; i < count; i++) {
513 //            writeRecord(writer, SOME_URL, "text/html", content.length(),
514 //                    getBaos(content));
515 //        }
516 //        writer.close();
517 //        logger.info("Finished speed write test.");
518 //    }
519     
520     
521     public void testValidateMetaLine() throws Exception {
522         final String line = "http://www.aandw.net/images/walden2.png " +
523             "128.197.34.86 20060111174224 image/png 2160";
524         ARCWriter w = createARCWriter("testValidateMetaLine", true);
525         try {
526             w.validateMetaLine(line);
527             w.validateMetaLine(line + LINE_SEPARATOR);
528             w.validateMetaLine(line + "//r//n");
529         } finally {
530             w.close();
531         }
532     }
533     
534     public void testArcRecordOffsetReads() throws Exception {
535 		ARCRecord ar = getSingleRecord("testArcRecordInBufferStream");
536 		// Now try getting some random set of bytes out of it 
537 		// at an odd offset (used to fail because we were
538 		// doing bad math to find where in buffer to read).
539 		final byte[] buffer = new byte[17];
540 		final int maxRead = 4;
541 		int totalRead = 0;
542 		while (totalRead < maxRead) {
543 			totalRead = totalRead
544 			    + ar.read(buffer, 13 + totalRead, maxRead - totalRead);
545 			assertTrue(totalRead > 0);
546 		}
547 	}
548     
549     // available should always be >= 0; extra read()s should all give EOF
550     public void testArchiveRecordAvailableConsistent() throws Exception {
551     	// first test reading byte-at-a-time via no-param read()
552         ARCRecord record = getSingleRecord("testArchiveRecordAvailableConsistent");
553         int c = record.read(); 
554         while(c>=0) {
555         	c = record.read(); 
556         }
557         // consecutive reads after EOR should always give -1, still show zero available()
558         for (int i=0; i<5; i++) {
559         	assertTrue("available negative:"+record.available(), record.available()>=0);
560             assertEquals(-1, record.read());            
561         }
562     }
563         
564     // should always give -1 on repeated reads past EOR
565     public void testArchiveRecordEORConsistent() throws Exception {
566         ARCRecord record = getSingleRecord("testArchiveRecordEORConsistent");
567         this.readToEOS(record);
568         // consecutive reads after EOR should always give -1
569         for (int i=0; i<5; i++) {
570             assertEquals(-1, record.read(new byte[1]));            
571         }
572     }
573     
574     // should not throw premature EOF when wrapped with BufferedInputStream
575     // [HER-1450] showed this was the case using Apache Tika
576     public void testArchiveRecordMarkSupport() throws Exception {
577         ARCRecord record = getSingleRecord("testArchiveRecordMarkSupport");
578         record.setStrict(true);
579         // ensure mark support
580         InputStream stream = new BufferedInputStream(record);
581         if (stream.markSupported()) {
582             for (int i=0; i<3; i++) {
583                 this.readToEOS(stream);
584                 stream.mark(stream.available());
585                 stream.reset();
586             }
587             stream.close();
588         }
589     }
590 
591     protected void readToEOS(InputStream in) throws Exception {
592         byte [] buf = new byte[1024];
593         int read = 0;
594         while (read >= 0) {
595             read = in.read(buf);
596             // System.out.println("readToEOS read " + read + " bytes");
597         }
598     }
599     
600     protected ARCRecord getSingleRecord(String name) throws Exception {
601         // Get an ARC with one record.
602         WriterPoolMember w = createArcWithOneRecord(name, true);
603         w.close();
604         // Get reader on said ARC.
605         ARCReader r = ARCReaderFactory.get(w.getFile());
606         final Iterator<ArchiveRecord> i = r.iterator();
607         // Skip first ARC meta record.
608         i.next();
609         i.hasNext();
610         // Now we're at first and only record in ARC.
611         return (ARCRecord) i.next();
612     }
613     
614 }