1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26 package org.archive.io.warc;
27
28 import java.io.ByteArrayInputStream;
29 import java.io.ByteArrayOutputStream;
30 import java.io.File;
31 import java.io.FileNotFoundException;
32 import java.io.IOException;
33 import java.net.URI;
34 import java.net.URISyntaxException;
35 import java.util.Arrays;
36 import java.util.Iterator;
37 import java.util.List;
38 import java.util.concurrent.atomic.AtomicInteger;
39
40 import org.archive.io.ArchiveRecord;
41 import org.archive.io.ArchiveRecordHeader;
42 import org.archive.io.UTF8Bytes;
43 import org.archive.io.WriterPoolMember;
44 import org.archive.io.warc.WARCConstants;
45 import org.archive.uid.GeneratorFactory;
46 import org.archive.util.ArchiveUtils;
47 import org.archive.util.TmpDirTestCase;
48 import org.archive.util.anvl.ANVLRecord;
49
50 /***
51 * Test Writer and Reader.
52 * @author stack
53 * @version $Date: 2006-08-29 19:35:48 -0700 (Tue, 29 Aug 2006) $ $Version$
54 */
55 public class WARCWriterTest
56 extends TmpDirTestCase implements WARCConstants {
57 private static final AtomicInteger SERIAL_NO = new AtomicInteger();
58
59 /***
60 * Prefix to use for ARC files made by JUNIT.
61 */
62 private static final String PREFIX = "IAH";
63
64 private static final String SOME_URL = "http://www.archive.org/test/";
65
66 public void testCheckHeaderLineValue() throws Exception {
67 WARCWriter writer = new WARCWriter();
68 writer.checkHeaderValue("one");
69 IllegalArgumentException exception = null;
70 try {
71 writer.checkHeaderValue("with space");
72 } catch(IllegalArgumentException e) {
73 exception = e;
74 }
75 assertNotNull(exception);
76 exception = null;
77 try {
78 writer.checkHeaderValue("with\0x0000controlcharacter");
79 } catch(IllegalArgumentException e) {
80 exception = e;
81 }
82 assertNotNull(exception);
83 }
84
85 public void testMimetypes() throws IOException {
86 WARCWriter writer = new WARCWriter();
87 writer.checkHeaderLineMimetypeParameter("text/xml");
88 writer.checkHeaderLineMimetypeParameter("text/xml+rdf");
89 assertEquals(writer.checkHeaderLineMimetypeParameter(
90 "text/plain; charset=SHIFT-JIS"), "text/plain; charset=SHIFT-JIS");
91 assertEquals(writer.checkHeaderLineMimetypeParameter(
92 "multipart/mixed; \r\n boundary=\"simple boundary\""),
93 "multipart/mixed; boundary=\"simple boundary\"");
94 }
95
96 public void testWriteRecord() throws IOException {
97 File [] files = {getTmpDir()};
98
99
100 WARCWriter writer =
101 new WARCWriter(SERIAL_NO, Arrays.asList(files),
102 this.getClass().getName(), "suffix", false, -1, null);
103 writeFile(writer);
104
105
106 writer = new WARCWriter(SERIAL_NO, Arrays.asList(files),
107 this.getClass().getName(), "suffix", true, -1, null);
108 writeFile(writer);
109 }
110
111 private void writeFile(final WARCWriter writer)
112 throws IOException {
113 try {
114 writeWarcinfoRecord(writer);
115 writeBasicRecords(writer);
116 } finally {
117 writer.close();
118 writer.getFile().delete();
119 }
120 }
121
122 private void writeWarcinfoRecord(WARCWriter writer)
123 throws IOException {
124 ANVLRecord meta = new ANVLRecord();
125 meta.addLabelValue("size", "1G");
126 meta.addLabelValue("operator", "igor");
127 byte [] bytes = meta.getUTF8Bytes();
128 writer.writeWarcinfoRecord(ANVLRecord.MIMETYPE, null,
129 new ByteArrayInputStream(bytes), bytes.length);
130 }
131
132 protected void writeBasicRecords(final WARCWriter writer)
133 throws IOException {
134 ANVLRecord headerFields = new ANVLRecord();
135 headerFields.addLabelValue("x", "y");
136 headerFields.addLabelValue("a", "b");
137
138 URI rid = null;
139 try {
140 rid = GeneratorFactory.getFactory().
141 getQualifiedRecordID(TYPE, METADATA);
142 } catch (URISyntaxException e) {
143
144 throw new IOException(e.getMessage());
145 }
146 final String content = "Any old content.";
147 for (int i = 0; i < 10; i++) {
148 String body = i + ". " + content;
149 byte [] bodyBytes = body.getBytes(UTF8Bytes.UTF8);
150 writer.writeRecord(METADATA, "http://www.archive.org/",
151 ArchiveUtils.get14DigitDate(), "no/type",
152 rid, headerFields, new ByteArrayInputStream(bodyBytes),
153 (long)bodyBytes.length, true);
154 }
155 }
156
157 /***
158 * @return Generic HTML Content.
159 */
160 protected static String getContent() {
161 return getContent(null);
162 }
163
164 /***
165 * @return Generic HTML Content with mention of passed <code>indexStr</code>
166 * in title and body.
167 */
168 protected static String getContent(String indexStr) {
169 String page = (indexStr != null)? "Page #" + indexStr: "Some Page";
170 return "HTTP/1.1 200 OK\r\n" +
171 "Content-Type: text/html\r\n\r\n" +
172 "<html><head><title>" + page +
173 "</title></head>" +
174 "<body>" + page +
175 "</body></html>";
176 }
177
178 /***
179 * Write random HTML Record.
180 * @param w Where to write.
181 * @param index An index to put into content.
182 * @return Length of record written.
183 * @throws IOException
184 */
185 protected int writeRandomHTTPRecord(WARCWriter w, int index)
186 throws IOException {
187 ByteArrayOutputStream baos = new ByteArrayOutputStream();
188 String indexStr = Integer.toString(index);
189 byte[] record = (getContent(indexStr)).getBytes();
190 int recordLength = record.length;
191 baos.write(record);
192
193
194 ANVLRecord r = new ANVLRecord(1);
195 r.addLabelValue(NAMED_FIELD_IP_LABEL, "127.0.0.1");
196 w.writeResourceRecord(
197 "http://www.one.net/id=" + indexStr,
198 ArchiveUtils.get14DigitDate(),
199 "text/html; charset=UTF-8",
200 r,
201 new ByteArrayInputStream(baos.toByteArray()),
202 recordLength);
203 return recordLength;
204 }
205
206 /***
207 * Fill a WARC with HTML Records.
208 * @param baseName WARC basename.
209 * @param compress Whether to compress or not.
210 * @param maxSize Maximum WARC size.
211 * @param recordCount How many records.
212 * @return The written file.
213 * @throws IOException
214 */
215 private File writeRecords(String baseName, boolean compress,
216 int maxSize, int recordCount)
217 throws IOException {
218 cleanUpOldFiles(baseName);
219 File [] files = {getTmpDir()};
220 WARCWriter w = new WARCWriter(SERIAL_NO,
221 Arrays.asList(files), baseName + '-' + PREFIX, "", compress,
222 maxSize, null);
223 assertNotNull(w);
224 for (int i = 0; i < recordCount; i++) {
225 writeRandomHTTPRecord(w, i);
226 }
227 w.close();
228 assertTrue("Doesn't exist: " + w.getFile().getAbsolutePath(),
229 w.getFile().exists());
230 return w.getFile();
231 }
232
233 /***
234 * Run validation of passed file.
235 * @param f File to validate.
236 * @param recordCount Expected count of records.
237 * @throws FileNotFoundException
238 * @throws IOException
239 */
240 private void validate(File f, int recordCount)
241 throws FileNotFoundException, IOException {
242 WARCReader reader = WARCReaderFactory.get(f);
243 assertNotNull(reader);
244 List<ArchiveRecordHeader> headers = null;
245 if (recordCount == -1) {
246 headers = reader.validate();
247 } else {
248 headers = reader.validate(recordCount);
249 }
250 reader.close();
251
252
253
254
255 reader = WARCReaderFactory.get(f);
256 for (int i = headers.size() - 1; i >= 0; i--) {
257 ArchiveRecordHeader h = (ArchiveRecordHeader)headers.get(i);
258 ArchiveRecord r = reader.get(h.getOffset());
259 String mimeType = r.getHeader().getMimetype();
260 assertTrue("Record is bogus, bad mimetype "+mimeType,
261 mimeType != null && mimeType.length() > 0);
262 }
263 reader.close();
264
265 assertTrue("Metadatas not equal", headers.size() == recordCount);
266 for (Iterator<ArchiveRecordHeader> i = headers.iterator(); i.hasNext();) {
267 ArchiveRecordHeader r = i.next();
268 assertTrue("Record is empty", r.getLength() > 0);
269 }
270 }
271
272 public void testWriteRecords() throws IOException {
273 final int recordCount = 2;
274 File f = writeRecords("writeRecord", false, DEFAULT_MAX_WARC_FILE_SIZE,
275 recordCount);
276 validate(f, recordCount + 1);
277 }
278
279 public void testRandomAccess() throws IOException {
280 final int recordCount = 3;
281 File f = writeRecords("writeRecord", true, DEFAULT_MAX_WARC_FILE_SIZE,
282 recordCount);
283 WARCReader reader = WARCReaderFactory.get(f);
284
285 boolean readFirst = false;
286 String url = null;
287 long offset = -1;
288 long totalRecords = 0;
289 boolean readSecond = false;
290 for (final Iterator<ArchiveRecord> i = reader.iterator(); i.hasNext();
291 totalRecords++) {
292 WARCRecord ar = (WARCRecord)i.next();
293 if (!readFirst) {
294 readFirst = true;
295 continue;
296 }
297 if (!readSecond) {
298 url = ar.getHeader().getUrl();
299 offset = ar.getHeader().getOffset();
300 readSecond = true;
301 }
302 }
303
304 reader = WARCReaderFactory.get(f, offset);
305 ArchiveRecord ar = reader.get();
306 assertEquals(ar.getHeader().getUrl(), url);
307 ar.close();
308
309
310 reader = WARCReaderFactory.get(f, offset);
311 int count = 0;
312 for (final Iterator<ArchiveRecord> i = reader.iterator(); i.hasNext(); i.next()) {
313 count++;
314 }
315 reader.close();
316 assertEquals(totalRecords - 1, count);
317 }
318
319 public void testWriteRecordCompressed() throws IOException {
320 final int recordCount = 2;
321 File arcFile = writeRecords("writeRecordCompressed", true,
322 DEFAULT_MAX_WARC_FILE_SIZE, recordCount);
323 validate(arcFile, recordCount + 1
324 }
325
326 protected WARCWriter createWARCWriter(String NAME,
327 boolean compress) {
328 File [] files = {getTmpDir()};
329 return new WARCWriter(SERIAL_NO,
330 Arrays.asList(files), NAME, "",
331 compress, DEFAULT_MAX_WARC_FILE_SIZE, null);
332 }
333
334 protected static ByteArrayOutputStream getBaos(String str)
335 throws IOException {
336 ByteArrayOutputStream baos = new ByteArrayOutputStream();
337 baos.write(str.getBytes());
338 return baos;
339 }
340
341 protected static void writeRecord(WARCWriter w, String url,
342 String mimetype, int len, ByteArrayOutputStream baos)
343 throws IOException {
344 w.writeResourceRecord(url,
345 ArchiveUtils.get14DigitDate(),
346 mimetype,
347 null,
348 new ByteArrayInputStream(baos.toByteArray()),
349 len);
350 }
351
352 protected int iterateRecords(WARCReader r)
353 throws IOException {
354 int count = 0;
355 for (Iterator<ArchiveRecord> i = r.iterator(); i.hasNext();) {
356 ArchiveRecord ar = i.next();
357 ar.close();
358 if (count != 0) {
359 assertTrue("Unexpected URL " + ar.getHeader().getUrl(),
360 ar.getHeader().getUrl().equals(SOME_URL));
361 }
362 count++;
363 }
364 return count;
365 }
366
367 protected WARCWriter createWithOneRecord(String name,
368 boolean compressed)
369 throws IOException {
370 WARCWriter writer = createWARCWriter(name, compressed);
371 String content = getContent();
372 writeRecord(writer, SOME_URL, "text/html",
373 content.length(), getBaos(content));
374 return writer;
375 }
376
377 public void testSpaceInURL() throws Exception {
378 long bytesWritten = holeyUrl("testSpaceInURL-" + PREFIX, false, " ");
379 assertEquals("Unexpected successful writing occurred",0,bytesWritten);
380 }
381
382 public void testTabInURL() throws Exception {
383 long bytesWritten = holeyUrl("testTabInURL-" + PREFIX, false, "\t");
384 assertEquals("Unexpected successful writing occurred",0,bytesWritten);
385 }
386
387 protected long holeyUrl(String name, boolean compress, String urlInsert)
388 throws IOException {
389 WARCWriter writer = createWithOneRecord(name, compress);
390
391 long startPos = writer.getPosition();
392 String content = getContent();
393 ByteArrayOutputStream baos = getBaos(content);
394 writeRecord(writer, SOME_URL + urlInsert + "/index.html", "text/html",
395 content.length(), baos);
396 long endPos = writer.getPosition();
397 writer.close();
398 return endPos-startPos;
399 }
400
401 /***
402 * Write an arc file for other tests to use.
403 * @param arcdir Directory to write to.
404 * @param compress True if file should be compressed.
405 * @return ARC written.
406 * @throws IOException
407 */
408 public static File createWARCFile(File arcdir, boolean compress)
409 throws IOException {
410 File [] files = {arcdir};
411 WARCWriter writer =
412 new WARCWriter(SERIAL_NO, Arrays.asList(files),
413 "test", "", compress, DEFAULT_MAX_WARC_FILE_SIZE, null);
414 String content = getContent();
415 writeRecord(writer, SOME_URL, "text/html", content.length(),
416 getBaos(content));
417 writer.close();
418 return writer.getFile();
419 }
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435 public void testArcRecordOffsetReads() throws Exception {
436
437 WriterPoolMember w =
438 createWithOneRecord("testArcRecordInBufferStream", true);
439 w.close();
440
441 WARCReader r = WARCReaderFactory.get(w.getFile());
442 final Iterator<ArchiveRecord> i = r.iterator();
443
444 ArchiveRecord ar = i.next();
445 i.hasNext();
446
447 ar = (WARCRecord) i.next();
448
449
450
451 final byte[] buffer = new byte[17];
452 final int maxRead = 4;
453 int totalRead = 0;
454 while (totalRead < maxRead) {
455 totalRead = totalRead
456 + ar.read(buffer, 13 + totalRead, maxRead - totalRead);
457 assertTrue(totalRead > 0);
458 }
459 }
460 }