1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.crawler.writer;
24
25 import java.io.ByteArrayInputStream;
26 import java.io.File;
27 import java.io.IOException;
28 import java.net.InetAddress;
29 import java.net.URI;
30 import java.net.URISyntaxException;
31 import java.net.UnknownHostException;
32 import java.util.Collection;
33 import java.util.HashMap;
34 import java.util.Map;
35 import java.util.concurrent.atomic.AtomicInteger;
36 import java.util.logging.Level;
37 import java.util.logging.Logger;
38
39 import org.apache.commons.httpclient.Header;
40 import org.apache.commons.httpclient.HttpMethodBase;
41 import org.apache.commons.httpclient.HttpStatus;
42 import org.apache.commons.lang.StringUtils;
43 import org.archive.crawler.Heritrix;
44 import org.archive.crawler.datamodel.CoreAttributeConstants;
45 import org.archive.crawler.datamodel.CrawlURI;
46 import org.archive.crawler.datamodel.FetchStatusCodes;
47 import org.archive.crawler.deciderules.recrawl.IdenticalDigestDecideRule;
48 import org.archive.crawler.event.CrawlStatusListener;
49 import org.archive.crawler.extractor.Link;
50 import org.archive.crawler.framework.WriterPoolProcessor;
51 import org.archive.crawler.settings.SimpleType;
52 import org.archive.crawler.settings.Type;
53 import org.archive.io.ReplayInputStream;
54 import org.archive.io.WriterPoolMember;
55 import org.archive.io.WriterPoolSettings;
56 import org.archive.io.warc.WARCConstants;
57 import org.archive.io.warc.WARCWriter;
58 import org.archive.io.warc.WARCWriterPool;
59 import org.archive.uid.GeneratorFactory;
60 import org.archive.util.ArchiveUtils;
61 import org.archive.util.XmlUtils;
62 import org.archive.util.anvl.ANVLRecord;
63 import org.w3c.dom.Document;
64
65 /***
66 * WARCWriterProcessor.
67 * Goes against the 0.18 version of the WARC specification (which
68 * is functionally identical to 0.17 except in the protocol
69 * identifier string).
70 * See http://archive-access.sourceforge.net/warc/
71 *
72 * <p>TODO: Remove ANVLRecord. Rename NameValue or use RFC822
73 * (commons-httpclient?) or find something else.
74 *
75 * @author stack
76 */
77 public class WARCWriterProcessor extends WriterPoolProcessor
78 implements CoreAttributeConstants, CrawlStatusListener,
79 WriterPoolSettings, FetchStatusCodes, WARCConstants {
80 private static final long serialVersionUID = 6182850087635847443L;
81
82 private final Logger logger = Logger.getLogger(this.getClass().getName());
83
84 private HashMap<String,Map<String,Long>> stats;
85 private int urlsWritten;
86
87 public long getDefaultMaxFileSize() {
88 return 1000000000L;
89 }
90
91 /***
92 * Key for whether to write 'request' type records where possible
93 */
94 public static final String ATTR_WRITE_REQUESTS =
95 "write-requests";
96
97 /***
98 * Key for whether to write 'metadata' type records where possible
99 */
100 public static final String ATTR_WRITE_METADATA =
101 "write-metadata";
102
103 /***
 * Key for whether to write 'revisit' type records when a URI's
 * content digest is identical to that of its previous fetch
106 */
107 public static final String ATTR_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS =
108 "write-revisit-for-identical-digests";
109
110 /***
111 * Key for whether to write 'revisit' type records for server
112 * "304 not modified" responses
113 */
114 public static final String ATTR_WRITE_REVISIT_FOR_NOT_MODIFIED =
115 "write-revisit-for-not-modified";
116
117 /***
118 * Default path list.
119 */
120 private static final String [] DEFAULT_PATH = {"warcs"};
121
122 protected String [] getDefaultPath() {
123 return DEFAULT_PATH;
124 }
125
126 /***
127 * @param name Name of this writer.
128 */
129 public WARCWriterProcessor(final String name) {
130 super(name, "Experimental WARCWriter processor (Version 0.17)");
131 Type e = addElementToDefinition(
132 new SimpleType(ATTR_WRITE_REQUESTS,
133 "Whether to write 'request' type records. " +
134 "Default is true.", new Boolean(true)));
135 e.setOverrideable(true);
136 e.setExpertSetting(true);
137 e = addElementToDefinition(
138 new SimpleType(ATTR_WRITE_METADATA,
139 "Whether to write 'metadata' type records. " +
140 "Default is true.", new Boolean(true)));
141 e.setOverrideable(true);
142 e.setExpertSetting(true);
143 e = addElementToDefinition(
144 new SimpleType(ATTR_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS,
145 "Whether to write 'revisit' type records when a URI's " +
146 "history indicates the previous fetch had an identical " +
147 "content digest. " +
148 "Default is true.", new Boolean(true)));
149 e.setOverrideable(true);
150 e.setExpertSetting(true);
151 e = addElementToDefinition(
152 new SimpleType(ATTR_WRITE_REVISIT_FOR_NOT_MODIFIED,
153 "Whether to write 'revisit' type records when a " +
154 "304-Not Modified response is received. " +
155 "Default is true.", new Boolean(true)));
156 e.setOverrideable(true);
157 e.setExpertSetting(true);
158 }
159
160 protected void setupPool(final AtomicInteger serialNo) {
161 setPool(new WARCWriterPool(serialNo, this, getPoolMaximumActive(),
162 getPoolMaximumWait()));
163 }
164
165 /***
166 * Writes a CrawlURI and its associated data to store file.
167 *
168 * Currently this method understands the following uri types: dns, http, and
169 * https.
170 *
171 * @param curi CrawlURI to process.
172 *
173 */
174 protected void innerProcess(CrawlURI curi) {
175
176 if (curi.getFetchStatus() <= 0) {
177 return;
178 }
179
180
181
182 String scheme = curi.getUURI().getScheme().toLowerCase();
183 long recordLength = curi.getContentSize();
184 if (recordLength <= 0 && !scheme.equals("ftp")) {
185
186
187 return;
188 }
189
190 try {
191 if (shouldWrite(curi)) {
192 write(scheme, curi);
193 } else {
194 logger.info("This writer does not write out scheme " +
195 scheme + " content");
196 }
197 } catch (IOException e) {
198 curi.addLocalizedError(this.getName(), e, "WriteRecord: " +
199 curi.toString());
200 logger.log(Level.SEVERE, "Failed write of Record: " +
201 curi.toString(), e);
202 }
203 }
204
205 protected void write(final String lowerCaseScheme, final CrawlURI curi)
206 throws IOException {
207 logger.finer("writing warc record for " + curi);
208 WriterPoolMember writer = getPool().borrowFile();
209 WARCWriter w = (WARCWriter) writer;
210 w.resetStats();
211
212 try {
213
214
215 final URI baseid = getRecordID();
216 final String timestamp =
217 ArchiveUtils.getLog14Date(curi.getLong(A_FETCH_BEGAN_TIME));
218 if (lowerCaseScheme.startsWith("http")) {
219 writeHttpRecords(w, curi, baseid, timestamp);
220 } else if (lowerCaseScheme.equals("dns")) {
221 writeDnsRecords(w, curi, baseid, timestamp);
222 } else if (lowerCaseScheme.equals("ftp")) {
223 writeFtpRecords(w, curi, baseid, timestamp);
224 } else {
225 logger.warning("No handler for scheme " + lowerCaseScheme);
226 }
227 } catch (IOException e) {
228
229 getPool().invalidateFile(writer);
230
231
232
233 writer = null;
234 throw e;
235 } finally {
236 if (writer != null) {
237 if (WARCWriter.getStat(w.getStats(), WARCWriter.TOTALS, WARCWriter.NUM_RECORDS) > 0) {
238 addStats(w.getStats());
239 urlsWritten++;
240
241 String filename = w.getFile().getName();
242 if (filename.endsWith(WriterPoolMember.OCCUPIED_SUFFIX)) {
243 filename = filename.substring(0,
244 filename.length() - WriterPoolMember.OCCUPIED_SUFFIX.length());
245 }
246
247 curi.getAList().putString(CoreAttributeConstants.A_WRITTEN_TO_WARC, filename);
248 }
249 logger.fine("wrote " + WARCWriter.getStat(w.getStats(), WARCWriter.TOTALS, WARCWriter.SIZE_ON_DISK) + " bytes to " + w.getFile().getName() + " for " + curi);
250 setTotalBytesWritten(getTotalBytesWritten() + WARCWriter.getStat(w.getStats(), WARCWriter.TOTALS, WARCWriter.SIZE_ON_DISK));
251
252 getPool().returnFile(writer);
253 }
254 }
255 checkBytesWritten();
256 }
257
258 protected void addStats(Map<String,Map<String,Long>> statz) {
259 if (stats == null) {
260 stats = new HashMap<String,Map<String,Long>>();
261 }
262
263 for (String key: statz.keySet()) {
264 if (stats.get(key) == null) {
265 stats.put(key, new HashMap<String,Long>());
266 }
267 for (String subkey: statz.get(key).keySet()) {
268 if (stats.get(key).get(subkey) == null) {
269 stats.get(key).put(subkey, statz.get(key).get(subkey));
270 } else {
271 stats.get(key).put(subkey, stats.get(key).get(subkey) + statz.get(key).get(subkey));
272 }
273 }
274 }
275 }
276
277 private void writeFtpRecords(WARCWriter w, final CrawlURI curi, final URI baseid,
278 final String timestamp) throws IOException {
279 ANVLRecord headers = new ANVLRecord(3);
280 headers.addLabelValue(HEADER_KEY_IP, getHostAddress(curi));
281 String controlConversation = curi.getString(A_FTP_CONTROL_CONVERSATION);
282 URI rid = writeFtpControlConversation(w, timestamp, baseid, curi, headers, controlConversation);
283
284 if (curi.getContentDigest() != null) {
285 headers.addLabelValue(HEADER_KEY_PAYLOAD_DIGEST,
286 curi.getContentDigestSchemeString());
287 }
288
289 if (curi.getHttpRecorder() != null) {
290 if (IdenticalDigestDecideRule.hasIdenticalDigest(curi) &&
291 ((Boolean)getUncheckedAttribute(curi,
292 ATTR_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS))) {
293 rid = writeRevisitDigest(w, timestamp, null,
294 baseid, curi, headers);
295 } else {
296 headers = new ANVLRecord(3);
297 if (curi.isTruncatedFetch()) {
298 String value = curi.isTimeTruncatedFetch()?
299 NAMED_FIELD_TRUNCATED_VALUE_TIME:
300 curi.isLengthTruncatedFetch()?
301 NAMED_FIELD_TRUNCATED_VALUE_LENGTH:
302 curi.isHeaderTruncatedFetch()?
303 NAMED_FIELD_TRUNCATED_VALUE_HEAD:
304
305 TRUNCATED_VALUE_UNSPECIFIED;
306 headers.addLabelValue(HEADER_KEY_TRUNCATED, value);
307 }
308 if (curi.getContentDigest() != null) {
309 headers.addLabelValue(HEADER_KEY_PAYLOAD_DIGEST,
310 curi.getContentDigestSchemeString());
311 }
312 headers.addLabelValue(HEADER_KEY_CONCURRENT_TO, '<' + rid.toString() + '>');
313 rid = writeResource(w, timestamp, curi.getContentType(), baseid, curi, headers);
314 }
315 }
316 if(((Boolean)getUncheckedAttribute(curi, ATTR_WRITE_METADATA))) {
317 headers = new ANVLRecord(1);
318 headers.addLabelValue(HEADER_KEY_CONCURRENT_TO, '<' + rid.toString() + '>');
319 writeMetadata(w, timestamp, baseid, curi, headers);
320 }
321 }
322
323 private void writeDnsRecords(WARCWriter w, final CrawlURI curi, final URI baseid,
324 final String timestamp) throws IOException {
325 ANVLRecord headers = null;
326 String ip = curi.getString(A_DNS_SERVER_IP_LABEL);
327 if (ip != null && ip.length() > 0) {
328 headers = new ANVLRecord(1);
329 headers.addLabelValue(HEADER_KEY_IP, ip);
330 }
331 writeResponse(w, timestamp, curi.getContentType(), baseid,
332 curi, headers);
333 }
334
335 private void writeHttpRecords(WARCWriter w, final CrawlURI curi, final URI baseid,
336 final String timestamp) throws IOException {
337
338
339
340
341 ANVLRecord headers = new ANVLRecord(5);
342 if (curi.getContentDigest() != null) {
343 headers.addLabelValue(HEADER_KEY_PAYLOAD_DIGEST,
344 curi.getContentDigestSchemeString());
345 }
346 headers.addLabelValue(HEADER_KEY_IP, getHostAddress(curi));
347 URI rid;
348
349 if (IdenticalDigestDecideRule.hasIdenticalDigest(curi) &&
350 ((Boolean)getUncheckedAttribute(curi,
351 ATTR_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS))) {
352 rid = writeRevisitDigest(w, timestamp, HTTP_RESPONSE_MIMETYPE,
353 baseid, curi, headers);
354 } else if (curi.getFetchStatus() == HttpStatus.SC_NOT_MODIFIED &&
355 ((Boolean)getUncheckedAttribute(curi,
356 ATTR_WRITE_REVISIT_FOR_NOT_MODIFIED))) {
357 rid = writeRevisitNotModified(w, timestamp,
358 baseid, curi, headers);
359 } else {
360 if (curi.isTruncatedFetch()) {
361 String value = curi.isTimeTruncatedFetch()?
362 NAMED_FIELD_TRUNCATED_VALUE_TIME:
363 curi.isLengthTruncatedFetch()?
364 NAMED_FIELD_TRUNCATED_VALUE_LENGTH:
365 curi.isHeaderTruncatedFetch()?
366 NAMED_FIELD_TRUNCATED_VALUE_HEAD:
367
368 TRUNCATED_VALUE_UNSPECIFIED;
369 headers.addLabelValue(HEADER_KEY_TRUNCATED, value);
370 }
371 rid = writeResponse(w, timestamp, HTTP_RESPONSE_MIMETYPE,
372 baseid, curi, headers);
373 }
374
375 headers = new ANVLRecord(1);
376 headers.addLabelValue(HEADER_KEY_CONCURRENT_TO,
377 '<' + rid.toString() + '>');
378
379 if(((Boolean)getUncheckedAttribute(curi, ATTR_WRITE_REQUESTS))) {
380 writeRequest(w, timestamp, HTTP_REQUEST_MIMETYPE,
381 baseid, curi, headers);
382 }
383 if(((Boolean)getUncheckedAttribute(curi, ATTR_WRITE_METADATA))) {
384 writeMetadata(w, timestamp, baseid, curi, headers);
385 }
386 }
387
388 protected URI writeFtpControlConversation(WARCWriter w, String timestamp, URI baseid,
389 CrawlURI curi, ANVLRecord headers, String controlConversation)
390 throws IOException {
391 final URI uid = qualifyRecordID(baseid, TYPE, METADATA);
392 byte[] b = controlConversation.getBytes("UTF-8");
393 w.writeMetadataRecord(curi.toString(), timestamp, FTP_CONTROL_CONVERSATION_MIMETYPE,
394 uid, headers, new ByteArrayInputStream(b), b.length);
395 return uid;
396 }
397
398 protected URI writeRequest(final WARCWriter w,
399 final String timestamp, final String mimetype,
400 final URI baseid, final CrawlURI curi,
401 final ANVLRecord namedFields)
402 throws IOException {
403 final URI uid = qualifyRecordID(baseid, TYPE, REQUEST);
404 ReplayInputStream ris =
405 curi.getHttpRecorder().getRecordedOutput().getReplayInputStream();
406 try {
407 w.writeRequestRecord(curi.toString(), timestamp, mimetype, uid,
408 namedFields, ris,
409 curi.getHttpRecorder().getRecordedOutput().getSize());
410 } finally {
411 if (ris != null) {
412 ris.close();
413 }
414 }
415 return uid;
416 }
417
418 protected URI writeResponse(final WARCWriter w,
419 final String timestamp, final String mimetype,
420 final URI baseid, final CrawlURI curi,
421 final ANVLRecord namedFields)
422 throws IOException {
423 ReplayInputStream ris =
424 curi.getHttpRecorder().getRecordedInput().getReplayInputStream();
425 try {
426 w.writeResponseRecord(curi.toString(), timestamp, mimetype, baseid,
427 namedFields, ris,
428 curi.getHttpRecorder().getRecordedInput().getSize());
429 } finally {
430 if (ris != null) {
431 ris.close();
432 }
433 }
434 return baseid;
435 }
436
437 protected URI writeResource(final WARCWriter w,
438 final String timestamp, final String mimetype,
439 final URI baseid, final CrawlURI curi,
440 final ANVLRecord namedFields)
441 throws IOException {
442 ReplayInputStream ris =
443 curi.getHttpRecorder().getRecordedInput().getReplayInputStream();
444 try {
445 w.writeResourceRecord(curi.toString(), timestamp, mimetype, baseid,
446 namedFields, ris,
447 curi.getHttpRecorder().getRecordedInput().getSize());
448 } finally {
449 if (ris != null) {
450 ris.close();
451 }
452 }
453 return baseid;
454 }
455
456 protected URI writeRevisitDigest(final WARCWriter w,
457 final String timestamp, final String mimetype,
458 final URI baseid, final CrawlURI curi,
459 final ANVLRecord namedFields)
460 throws IOException {
461 namedFields.addLabelValue(
462 HEADER_KEY_PROFILE, PROFILE_REVISIT_IDENTICAL_DIGEST);
463 namedFields.addLabelValue(
464 HEADER_KEY_TRUNCATED, NAMED_FIELD_TRUNCATED_VALUE_LENGTH);
465
466 ReplayInputStream ris = null;
467 long revisedLength = 0;
468
469
470 if (mimetype != null) {
471 ris = curi.getHttpRecorder().getRecordedInput().getReplayInputStream();
472 revisedLength = curi.getHttpRecorder().getRecordedInput().getContentBegin();
473 revisedLength = revisedLength > 0
474 ? revisedLength
475 : curi.getHttpRecorder().getRecordedInput().getSize();
476 }
477
478 try {
479 w.writeRevisitRecord(curi.toString(), timestamp, mimetype, baseid,
480 namedFields, ris, revisedLength);
481 } finally {
482 if (ris != null) {
483 ris.close();
484 }
485 }
486 curi.addAnnotation("warcRevisit:digest");
487 return baseid;
488 }
489
490 protected URI writeRevisitNotModified(final WARCWriter w,
491 final String timestamp,
492 final URI baseid, final CrawlURI curi,
493 final ANVLRecord namedFields)
494 throws IOException {
495 namedFields.addLabelValue(
496 HEADER_KEY_PROFILE, PROFILE_REVISIT_NOT_MODIFIED);
497
498 if(curi.containsKey(A_HTTP_TRANSACTION)) {
499 HttpMethodBase method =
500 (HttpMethodBase) curi.getObject(A_HTTP_TRANSACTION);
501 saveHeader(A_ETAG_HEADER,method,namedFields,HEADER_KEY_ETAG);
502 saveHeader(A_LAST_MODIFIED_HEADER,method,namedFields,
503 HEADER_KEY_LAST_MODIFIED);
504 }
505
506 namedFields.addLabelValue(HEADER_KEY_TRUNCATED,
507 NAMED_FIELD_TRUNCATED_VALUE_LENGTH);
508 ReplayInputStream ris =
509 curi.getHttpRecorder().getRecordedInput().getReplayInputStream();
510 try {
511 w.writeRevisitRecord(curi.toString(), timestamp, null, baseid,
512 namedFields, ris, 0);
513 } finally {
514 if (ris != null) {
515 ris.close();
516 }
517 }
518 curi.addAnnotation("warcRevisit:notModified");
519 return baseid;
520 }
521
522 /***
523 * Save a header from the given HTTP operation into the
524 * provider headers under a new name
525 *
526 * @param origName header name to get if present
527 * @param method http operation containing headers
528 */
529 protected void saveHeader(String origName, HttpMethodBase method,
530 ANVLRecord headers, String newName) {
531 Header header = method.getResponseHeader(origName);
532 if(header!=null) {
533 headers.addLabelValue(newName, header.getValue());
534 }
535 }
536
537 protected URI writeMetadata(final WARCWriter w,
538 final String timestamp,
539 final URI baseid, final CrawlURI curi,
540 final ANVLRecord namedFields)
541 throws IOException {
542 final URI uid = qualifyRecordID(baseid, TYPE, METADATA);
543
544
545
546
547 ANVLRecord r = new ANVLRecord();
548 if (curi.isSeed()) {
549 r.addLabel("seed");
550 } else {
551 if (curi.forceFetch()) {
552 r.addLabel("force-fetch");
553 }
554 r.addLabelValue("via", curi.flattenVia());
555 r.addLabelValue("hopsFromSeed", curi.getPathFromSeed());
556 if (curi.containsKey(A_SOURCE_TAG)) {
557 r.addLabelValue("sourceTag", curi.getString(A_SOURCE_TAG));
558 }
559 }
560 long duration = curi.getFetchDuration();
561 if(duration>-1) {
562 r.addLabelValue("fetchTimeMs", Long.toString(duration));
563 }
564
565 if (curi.containsKey(A_FTP_FETCH_STATUS)) {
566 r.addLabelValue("ftpFetchStatus", curi.getString(A_FTP_FETCH_STATUS));
567 }
568
569
570 Collection<Link> links = curi.getOutLinks();
571 if (links != null && links.size() > 0) {
572 for (Link link: links) {
573 r.addLabelValue("outlink", link.toString());
574 }
575 }
576
577
578
579
580
581
582
583
584
585
586 byte [] b = r.getUTF8Bytes();
587 w.writeMetadataRecord(curi.toString(), timestamp, ANVLRecord.MIMETYPE,
588 uid, namedFields, new ByteArrayInputStream(b), b.length);
589 return uid;
590 }
591
592 protected URI getRecordID() throws IOException {
593 URI result;
594 try {
595 result = GeneratorFactory.getFactory().getRecordID();
596 } catch (URISyntaxException e) {
597 throw new IOException(e.toString());
598 }
599 return result;
600 }
601
602 protected URI qualifyRecordID(final URI base, final String key,
603 final String value)
604 throws IOException {
605 URI result;
606 Map<String, String> qualifiers = new HashMap<String, String>(1);
607 qualifiers.put(key, value);
608 try {
609 result = GeneratorFactory.getFactory().
610 qualifyRecordID(base, qualifiers);
611 } catch (URISyntaxException e) {
612 throw new IOException(e.toString());
613 }
614 return result;
615 }
616
617 @Override
618 protected String getFirstrecordStylesheet() {
619 return "/warcinfobody.xsl";
620 }
621
622 /***
623 * Return relevant values as header-like fields (here ANVLRecord, but
624 * spec-defined "application/warc-fields" type when written). Field
625 * names from from DCMI Terms and the WARC/0.17 specification.
626 *
627 * @see org.archive.crawler.framework.WriterPoolProcessor#getFirstrecordBody(java.io.File)
628 */
629 @Override
630 protected String getFirstrecordBody(File orderFile) {
631 ANVLRecord record = new ANVLRecord(7);
632 record.addLabelValue("software", "Heritrix/" +
633 Heritrix.getVersion() + " http://crawler.archive.org");
634 try {
635 InetAddress host = InetAddress.getLocalHost();
636 record.addLabelValue("ip", host.getHostAddress());
637 record.addLabelValue("hostname", host.getCanonicalHostName());
638 } catch (UnknownHostException e) {
639 logger.log(Level.WARNING,"unable top obtain local crawl engine host",e);
640 }
641
642
643
644
645 record.addLabelValue("format","WARC File Format 1.0");
646 record.addLabelValue("conformsTo","http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf");
647
648
649 try {
650 Document doc = XmlUtils.getDocument(orderFile);
651 addIfNotBlank(record,"operator",
652 XmlUtils.xpathOrNull(doc,"//meta/operator"));
653 addIfNotBlank(record,"publisher",
654 XmlUtils.xpathOrNull(doc,"//meta/organization"));
655 addIfNotBlank(record,"audience",
656 XmlUtils.xpathOrNull(doc,"//meta/audience"));
657 addIfNotBlank(record,"isPartOf",
658 XmlUtils.xpathOrNull(doc,"//meta/name"));
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679 addIfNotBlank(record,"description",
680 XmlUtils.xpathOrNull(doc,"//meta/description"));
681 addIfNotBlank(record,"robots",
682 XmlUtils.xpathOrNull(doc,
683 "//newObject[@name='robots-honoring-policy']/string[@name='type']"));
684 addIfNotBlank(record,"http-header-user-agent",
685 XmlUtils.xpathOrNull(doc,
686 "//map[@name='http-headers']/string[@name='user-agent']"));
687 addIfNotBlank(record,"http-header-from",
688 XmlUtils.xpathOrNull(doc,
689 "//map[@name='http-headers']/string[@name='from']"));
690 } catch (IOException e) {
691 logger.log(Level.WARNING,"obtaining warcinfo",e);
692 }
693
694
695
696 return record.toString();
697 }
698
699
700 protected void addIfNotBlank(ANVLRecord record, String label, String value) {
701 if(StringUtils.isNotBlank(value)) {
702 record.addLabelValue(label, value);
703 }
704 }
705
706 @Override
707 public String report() {
708 logger.info("final stats: " + stats);
709
710 StringBuilder buf = new StringBuilder();
711 buf.append("Processor: " + getClass().getName() + "\n");
712 buf.append(" Function: Writes WARCs\n");
713 buf.append(" Total CrawlURIs: " + urlsWritten + "\n");
714 buf.append(" Revisit records: " + WARCWriter.getStat(stats, WARCWriter.REVISIT, WARCWriter.NUM_RECORDS) + "\n");
715
716 long bytes = WARCWriter.getStat(stats, WARCWriter.RESPONSE, WARCWriter.CONTENT_BYTES)
717 + WARCWriter.getStat(stats, WARCWriter.RESOURCE, WARCWriter.CONTENT_BYTES);
718 buf.append(" Crawled content bytes (including http headers): "
719 + bytes + " (" + ArchiveUtils.formatBytesForDisplay(bytes) + ")\n");
720
721 bytes = WARCWriter.getStat(stats, WARCWriter.TOTALS, WARCWriter.TOTAL_BYTES);
722 buf.append(" Total uncompressed bytes (including all warc records): "
723 + bytes + " (" + ArchiveUtils.formatBytesForDisplay(bytes) + ")\n");
724
725 buf.append(" Total size on disk ("+ (isCompressed() ? "compressed" : "uncompressed") + "): "
726 + getTotalBytesWritten() + " (" + ArchiveUtils.formatBytesForDisplay(getTotalBytesWritten()) + ")\n");
727
728 return buf.toString();
729 }
730 }