View Javadoc

1   /* $Id: ExperimentalWARCWriterProcessor.java 4935 2007-02-23 00:27:24Z gojomo $
2    *
3    * Created on August 1st, 2006.
4    *
5    * Copyright (C) 2006 Internet Archive.
6    *
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    *
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   *
14   * Heritrix is distributed in the hope that it will be useful,
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   *
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.crawler.writer;
24  
25  import java.io.ByteArrayInputStream;
26  import java.io.File;
27  import java.io.IOException;
28  import java.net.InetAddress;
29  import java.net.URI;
30  import java.net.URISyntaxException;
31  import java.net.UnknownHostException;
32  import java.util.Collection;
33  import java.util.HashMap;
34  import java.util.Map;
35  import java.util.concurrent.atomic.AtomicInteger;
36  import java.util.logging.Level;
37  import java.util.logging.Logger;
38  
39  import org.apache.commons.httpclient.Header;
40  import org.apache.commons.httpclient.HttpMethodBase;
41  import org.apache.commons.httpclient.HttpStatus;
42  import org.apache.commons.lang.StringUtils;
43  import org.archive.crawler.Heritrix;
44  import org.archive.crawler.datamodel.CoreAttributeConstants;
45  import org.archive.crawler.datamodel.CrawlURI;
46  import org.archive.crawler.datamodel.FetchStatusCodes;
47  import org.archive.crawler.deciderules.recrawl.IdenticalDigestDecideRule;
48  import org.archive.crawler.event.CrawlStatusListener;
49  import org.archive.crawler.extractor.Link;
50  import org.archive.crawler.framework.WriterPoolProcessor;
51  import org.archive.crawler.settings.SimpleType;
52  import org.archive.crawler.settings.Type;
53  import org.archive.io.ReplayInputStream;
54  import org.archive.io.WriterPoolMember;
55  import org.archive.io.WriterPoolSettings;
56  import org.archive.io.warc.WARCConstants;
57  import org.archive.io.warc.WARCWriter;
58  import org.archive.io.warc.WARCWriterPool;
59  import org.archive.uid.GeneratorFactory;
60  import org.archive.util.ArchiveUtils;
61  import org.archive.util.XmlUtils;
62  import org.archive.util.anvl.ANVLRecord;
63  import org.w3c.dom.Document;
64  
65  /***
66   * WARCWriterProcessor.
67   * Goes against the 0.18 version of the WARC specification (which
68   * is functionally identical to 0.17 except in the protocol 
69   * identifier string). 
70   * See http://archive-access.sourceforge.net/warc/
71   * 
72   * <p>TODO: Remove ANVLRecord. Rename NameValue or use RFC822
73   * (commons-httpclient?) or find something else.
74   * 
75   * @author stack
76   */
77  public class WARCWriterProcessor extends WriterPoolProcessor
78  implements CoreAttributeConstants, CrawlStatusListener,
79  WriterPoolSettings, FetchStatusCodes, WARCConstants {
80      private static final long serialVersionUID = 6182850087635847443L;
81  
82      private final Logger logger = Logger.getLogger(this.getClass().getName());
83  
84      private HashMap<String,Map<String,Long>> stats;
85      private int urlsWritten;
86      
87      public long getDefaultMaxFileSize() {
88            return 1000000000L; // 1 SI giga-byte (109 bytes), per WARC appendix A
89      }
90      
91      /***
92       * Key for whether to write 'request' type records where possible
93       */
94      public static final String ATTR_WRITE_REQUESTS =
95          "write-requests";
96      
97      /***
98       * Key for whether to write 'metadata' type records where possible
99       */
100     public static final String ATTR_WRITE_METADATA =
101         "write-metadata";
102     
103     /***
104      * Key for whether to write 'revisit' type records when
105      * consecutive identical digest
106      */
107     public static final String ATTR_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS =
108         "write-revisit-for-identical-digests";
109     
110     /***
111      * Key for whether to write 'revisit' type records for server
112      * "304 not modified" responses
113      */
114     public static final String ATTR_WRITE_REVISIT_FOR_NOT_MODIFIED =
115         "write-revisit-for-not-modified";
116     
117     /***
118      * Default path list.
119      */
120     private static final String [] DEFAULT_PATH = {"warcs"};
121 
122     protected String [] getDefaultPath() {
123         return DEFAULT_PATH;
124     }
125     
/**
 * Creates the processor and registers its four boolean settings
 * (write requests, write metadata, revisit-on-identical-digest and
 * revisit-on-not-modified). Each defaults to true and is marked both
 * overrideable and expert.
 *
 * @param name Name of this writer.
 */
public WARCWriterProcessor(final String name) {
    super(name, "Experimental WARCWriter processor (Version 0.17)");
    addOverrideableExpertFlag(ATTR_WRITE_REQUESTS,
            "Whether to write 'request' type records. " +
            "Default is true.");
    addOverrideableExpertFlag(ATTR_WRITE_METADATA,
            "Whether to write 'metadata' type records. " +
            "Default is true.");
    addOverrideableExpertFlag(ATTR_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS,
            "Whether to write 'revisit' type records when a URI's " +
            "history indicates the previous fetch had an identical " +
            "content digest. " +
            "Default is true.");
    addOverrideableExpertFlag(ATTR_WRITE_REVISIT_FOR_NOT_MODIFIED,
            "Whether to write 'revisit' type records when a " +
            "304-Not Modified response is received. " +
            "Default is true.");
}

/**
 * Registers one boolean setting that defaults to true and flags it as
 * overrideable and expert.
 *
 * @param key setting name
 * @param description human-readable description of the setting
 */
private void addOverrideableExpertFlag(final String key,
        final String description) {
    // Boolean.TRUE instead of the deprecated, allocating new Boolean(true).
    Type e = addElementToDefinition(
            new SimpleType(key, description, Boolean.TRUE));
    e.setOverrideable(true);
    e.setExpertSetting(true);
}
159 
160     protected void setupPool(final AtomicInteger serialNo) {
161 		setPool(new WARCWriterPool(serialNo, this, getPoolMaximumActive(),
162             getPoolMaximumWait()));
163     }
164     
165     /***
166      * Writes a CrawlURI and its associated data to store file.
167      * 
168      * Currently this method understands the following uri types: dns, http, and
169      * https.
170      * 
171      * @param curi CrawlURI to process.
172      * 
173      */
174     protected void innerProcess(CrawlURI curi) {
175         // If failure, or we haven't fetched the resource yet, return
176         if (curi.getFetchStatus() <= 0) {
177             return;
178         }
179         
180         // If no recorded content at all, don't write record. Except FTP, which
181         // can have empty content, since the "headers" don't count as content.
182         String scheme = curi.getUURI().getScheme().toLowerCase();
183         long recordLength = curi.getContentSize();
184         if (recordLength <= 0 && !scheme.equals("ftp")) {
185             // getContentSize() should be > 0 if any material (even just
186             // HTTP headers with zero-length body) is available. 
187             return;
188         }
189         
190         try {
191             if (shouldWrite(curi)) {
192                 write(scheme, curi);
193             } else {
194                 logger.info("This writer does not write out scheme " +
195                         scheme + " content");
196             }
197         } catch (IOException e) {
198             curi.addLocalizedError(this.getName(), e, "WriteRecord: " +
199                 curi.toString());
200             logger.log(Level.SEVERE, "Failed write of Record: " +
201                 curi.toString(), e);
202         }
203     }
204     
205     protected void write(final String lowerCaseScheme, final CrawlURI curi)
206     throws IOException {
207         logger.finer("writing warc record for " + curi);
208         WriterPoolMember writer = getPool().borrowFile();
209         WARCWriter w = (WARCWriter) writer;
210         w.resetStats();
211         
212         try {
213             // Write a request, response, and metadata all in the one
214             // 'transaction'.
215             final URI baseid = getRecordID();
216             final String timestamp =
217                 ArchiveUtils.getLog14Date(curi.getLong(A_FETCH_BEGAN_TIME));
218             if (lowerCaseScheme.startsWith("http")) {
219                 writeHttpRecords(w, curi, baseid, timestamp); 
220             } else if (lowerCaseScheme.equals("dns")) {
221                 writeDnsRecords(w, curi, baseid, timestamp);
222             } else if (lowerCaseScheme.equals("ftp")) {
223                 writeFtpRecords(w, curi, baseid, timestamp); 
224             } else {
225                 logger.warning("No handler for scheme " + lowerCaseScheme);
226             }
227         } catch (IOException e) {
228             // Invalidate this file (It gets a '.invalid' suffix).
229             getPool().invalidateFile(writer);
230             // Set the writer to null otherwise the pool accounting
231             // of how many active writers gets skewed if we subsequently
232             // do a returnWriter call on this object in the finally block.
233             writer = null;
234             throw e;
235         } finally {
236             if (writer != null) {
237                 if (WARCWriter.getStat(w.getStats(), WARCWriter.TOTALS, WARCWriter.NUM_RECORDS) > 0) {
238                     addStats(w.getStats());
239                     urlsWritten++;
240 
241                     String filename = w.getFile().getName();
242                     if (filename.endsWith(WriterPoolMember.OCCUPIED_SUFFIX)) {
243                         filename = filename.substring(0,
244                             filename.length() - WriterPoolMember.OCCUPIED_SUFFIX.length());
245                     }
246 
247                     curi.getAList().putString(CoreAttributeConstants.A_WRITTEN_TO_WARC, filename);
248                 }
249                 logger.fine("wrote " + WARCWriter.getStat(w.getStats(), WARCWriter.TOTALS, WARCWriter.SIZE_ON_DISK) + " bytes to " + w.getFile().getName() + " for " + curi);
250                 setTotalBytesWritten(getTotalBytesWritten() + WARCWriter.getStat(w.getStats(), WARCWriter.TOTALS, WARCWriter.SIZE_ON_DISK));
251 
252                 getPool().returnFile(writer);
253             }
254         }
255         checkBytesWritten();
256     }
257 
258     protected void addStats(Map<String,Map<String,Long>> statz) {
259         if (stats == null) {
260             stats = new HashMap<String,Map<String,Long>>();
261         }
262 
263         for (String key: statz.keySet()) {
264             if (stats.get(key) == null) {
265                 stats.put(key, new HashMap<String,Long>());
266             }
267             for (String subkey: statz.get(key).keySet()) {
268                 if (stats.get(key).get(subkey) == null) {
269                     stats.get(key).put(subkey, statz.get(key).get(subkey));
270                 } else {
271                     stats.get(key).put(subkey, stats.get(key).get(subkey) + statz.get(key).get(subkey));
272                 }
273             }
274         }
275     }
276 
277     private void writeFtpRecords(WARCWriter w, final CrawlURI curi, final URI baseid,
278             final String timestamp) throws IOException {
279         ANVLRecord headers = new ANVLRecord(3);
280         headers.addLabelValue(HEADER_KEY_IP, getHostAddress(curi));
281         String controlConversation = curi.getString(A_FTP_CONTROL_CONVERSATION);
282         URI rid = writeFtpControlConversation(w, timestamp, baseid, curi, headers, controlConversation);
283         
284         if (curi.getContentDigest() != null) {
285             headers.addLabelValue(HEADER_KEY_PAYLOAD_DIGEST,
286                 curi.getContentDigestSchemeString());
287         }
288 
289         if (curi.getHttpRecorder() != null) {
290             if (IdenticalDigestDecideRule.hasIdenticalDigest(curi) && 
291                     ((Boolean)getUncheckedAttribute(curi, 
292                         ATTR_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS))) {
293                 rid = writeRevisitDigest(w, timestamp, null,
294                         baseid, curi, headers);
295             } else {
296                 headers = new ANVLRecord(3);
297                 if (curi.isTruncatedFetch()) {
298                     String value = curi.isTimeTruncatedFetch()?
299                         NAMED_FIELD_TRUNCATED_VALUE_TIME:
300                         curi.isLengthTruncatedFetch()?
301                         NAMED_FIELD_TRUNCATED_VALUE_LENGTH:
302                         curi.isHeaderTruncatedFetch()?
303                         NAMED_FIELD_TRUNCATED_VALUE_HEAD:
304                         // TODO: Add this to spec.
305                         TRUNCATED_VALUE_UNSPECIFIED;
306                     headers.addLabelValue(HEADER_KEY_TRUNCATED, value);
307                 }
308                 if (curi.getContentDigest() != null) {
309                     headers.addLabelValue(HEADER_KEY_PAYLOAD_DIGEST,
310                             curi.getContentDigestSchemeString());
311                 }
312                 headers.addLabelValue(HEADER_KEY_CONCURRENT_TO, '<' + rid.toString() + '>');
313                 rid = writeResource(w, timestamp, curi.getContentType(), baseid, curi, headers);
314             }
315         }
316         if(((Boolean)getUncheckedAttribute(curi, ATTR_WRITE_METADATA))) {
317             headers = new ANVLRecord(1);
318             headers.addLabelValue(HEADER_KEY_CONCURRENT_TO, '<' + rid.toString() + '>');
319             writeMetadata(w, timestamp, baseid, curi, headers);
320         }
321     }
322 
323     private void writeDnsRecords(WARCWriter w, final CrawlURI curi, final URI baseid,
324             final String timestamp) throws IOException {
325         ANVLRecord headers = null;
326         String ip = curi.getString(A_DNS_SERVER_IP_LABEL);
327         if (ip != null && ip.length() > 0) {
328             headers = new ANVLRecord(1);
329             headers.addLabelValue(HEADER_KEY_IP, ip);
330         }
331         writeResponse(w, timestamp, curi.getContentType(), baseid,
332             curi, headers);
333     }
334 
335     private void writeHttpRecords(WARCWriter w, final CrawlURI curi, final URI baseid,
336             final String timestamp) throws IOException {
337         // Add named fields for ip, checksum, and relate the metadata
338         // and request to the resource field.
339         // TODO: Use other than ANVL (or rename ANVL as NameValue or
340         // use RFC822 (commons-httpclient?).
341         ANVLRecord headers = new ANVLRecord(5);
342         if (curi.getContentDigest() != null) {
343             headers.addLabelValue(HEADER_KEY_PAYLOAD_DIGEST,
344                 curi.getContentDigestSchemeString());
345         }
346         headers.addLabelValue(HEADER_KEY_IP, getHostAddress(curi));
347         URI rid;
348         
349         if (IdenticalDigestDecideRule.hasIdenticalDigest(curi) && 
350                 ((Boolean)getUncheckedAttribute(curi, 
351                         ATTR_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS))) {
352             rid = writeRevisitDigest(w, timestamp, HTTP_RESPONSE_MIMETYPE,
353                     baseid, curi, headers);
354         } else if (curi.getFetchStatus() == HttpStatus.SC_NOT_MODIFIED && 
355                 ((Boolean)getUncheckedAttribute(curi, 
356                         ATTR_WRITE_REVISIT_FOR_NOT_MODIFIED))) {
357             rid = writeRevisitNotModified(w, timestamp,
358                     baseid, curi, headers);
359         } else {
360             if (curi.isTruncatedFetch()) {
361                 String value = curi.isTimeTruncatedFetch()?
362                     NAMED_FIELD_TRUNCATED_VALUE_TIME:
363                     curi.isLengthTruncatedFetch()?
364                         NAMED_FIELD_TRUNCATED_VALUE_LENGTH:
365                         curi.isHeaderTruncatedFetch()?
366                             NAMED_FIELD_TRUNCATED_VALUE_HEAD:
367                     // TODO: Add this to spec.
368                     TRUNCATED_VALUE_UNSPECIFIED;
369                 headers.addLabelValue(HEADER_KEY_TRUNCATED, value);
370             }
371             rid = writeResponse(w, timestamp, HTTP_RESPONSE_MIMETYPE,
372             	baseid, curi, headers);
373         }
374         
375         headers = new ANVLRecord(1);
376         headers.addLabelValue(HEADER_KEY_CONCURRENT_TO,
377             '<' + rid.toString() + '>');
378 
379         if(((Boolean)getUncheckedAttribute(curi, ATTR_WRITE_REQUESTS))) {
380             writeRequest(w, timestamp, HTTP_REQUEST_MIMETYPE,
381                     baseid, curi, headers);
382         }
383         if(((Boolean)getUncheckedAttribute(curi, ATTR_WRITE_METADATA))) {
384             writeMetadata(w, timestamp, baseid, curi, headers);
385         }
386     }
387     
388     protected URI writeFtpControlConversation(WARCWriter w, String timestamp, URI baseid,
389             CrawlURI curi, ANVLRecord headers, String controlConversation) 
390     throws IOException {
391         final URI uid = qualifyRecordID(baseid, TYPE, METADATA);
392         byte[] b = controlConversation.getBytes("UTF-8");
393         w.writeMetadataRecord(curi.toString(), timestamp, FTP_CONTROL_CONVERSATION_MIMETYPE,
394             uid, headers, new ByteArrayInputStream(b), b.length);
395         return uid;
396     }
397 
398     protected URI writeRequest(final WARCWriter w,
399             final String timestamp, final String mimetype,
400             final URI baseid, final CrawlURI curi,
401             final ANVLRecord namedFields) 
402     throws IOException {
403         final URI uid = qualifyRecordID(baseid, TYPE, REQUEST);
404         ReplayInputStream ris =
405             curi.getHttpRecorder().getRecordedOutput().getReplayInputStream();
406         try {
407             w.writeRequestRecord(curi.toString(), timestamp, mimetype, uid,
408                 namedFields, ris,
409                 curi.getHttpRecorder().getRecordedOutput().getSize());
410         } finally {
411             if (ris != null) {
412                 ris.close();
413             }
414         }
415         return uid;
416     }
417     
418     protected URI writeResponse(final WARCWriter w,
419             final String timestamp, final String mimetype,
420             final URI baseid, final CrawlURI curi,
421             final ANVLRecord namedFields) 
422     throws IOException {
423         ReplayInputStream ris =
424             curi.getHttpRecorder().getRecordedInput().getReplayInputStream();
425         try {
426             w.writeResponseRecord(curi.toString(), timestamp, mimetype, baseid,
427                 namedFields, ris,
428                 curi.getHttpRecorder().getRecordedInput().getSize());
429         } finally {
430             if (ris != null) {
431                 ris.close();
432             }
433         }
434         return baseid;
435     }
436     
437     protected URI writeResource(final WARCWriter w,
438             final String timestamp, final String mimetype,
439             final URI baseid, final CrawlURI curi,
440             final ANVLRecord namedFields) 
441     throws IOException {
442         ReplayInputStream ris =
443             curi.getHttpRecorder().getRecordedInput().getReplayInputStream();
444         try {
445             w.writeResourceRecord(curi.toString(), timestamp, mimetype, baseid,
446                 namedFields, ris,
447                 curi.getHttpRecorder().getRecordedInput().getSize());
448         } finally {
449             if (ris != null) {
450                 ris.close();
451             }
452         }
453         return baseid;
454     }
455     
456     protected URI writeRevisitDigest(final WARCWriter w,
457             final String timestamp, final String mimetype,
458             final URI baseid, final CrawlURI curi,
459             final ANVLRecord namedFields) 
460     throws IOException {
461         namedFields.addLabelValue(
462                 HEADER_KEY_PROFILE, PROFILE_REVISIT_IDENTICAL_DIGEST);
463         namedFields.addLabelValue(
464                 HEADER_KEY_TRUNCATED, NAMED_FIELD_TRUNCATED_VALUE_LENGTH);
465         
466         ReplayInputStream ris = null;
467         long revisedLength = 0;
468         
469         // null mimetype implies no payload
470         if (mimetype != null) {
471             ris = curi.getHttpRecorder().getRecordedInput().getReplayInputStream();
472             revisedLength = curi.getHttpRecorder().getRecordedInput().getContentBegin();
473             revisedLength = revisedLength > 0 
474                 ? revisedLength 
475                 : curi.getHttpRecorder().getRecordedInput().getSize();
476         }
477         
478         try {
479             w.writeRevisitRecord(curi.toString(), timestamp, mimetype, baseid,
480                 namedFields, ris, revisedLength);
481         } finally {
482             if (ris != null) {
483                 ris.close();
484             }
485         }
486         curi.addAnnotation("warcRevisit:digest"); 
487         return baseid;
488     }
489     
490     protected URI writeRevisitNotModified(final WARCWriter w,
491             final String timestamp, 
492             final URI baseid, final CrawlURI curi,
493             final ANVLRecord namedFields) 
494     throws IOException {
495         namedFields.addLabelValue(
496         		HEADER_KEY_PROFILE, PROFILE_REVISIT_NOT_MODIFIED);
497         // save just enough context to understand basis of not-modified
498         if(curi.containsKey(A_HTTP_TRANSACTION)) {
499             HttpMethodBase method = 
500                 (HttpMethodBase) curi.getObject(A_HTTP_TRANSACTION);
501             saveHeader(A_ETAG_HEADER,method,namedFields,HEADER_KEY_ETAG);
502             saveHeader(A_LAST_MODIFIED_HEADER,method,namedFields,
503             		HEADER_KEY_LAST_MODIFIED);
504         }
505         // truncate to zero-length (all necessary info is above)
506         namedFields.addLabelValue(HEADER_KEY_TRUNCATED,
507             NAMED_FIELD_TRUNCATED_VALUE_LENGTH);
508         ReplayInputStream ris =
509             curi.getHttpRecorder().getRecordedInput().getReplayInputStream();
510         try {
511             w.writeRevisitRecord(curi.toString(), timestamp, null, baseid,
512                 namedFields, ris, 0);
513         } finally {
514             if (ris !=  null) {
515                 ris.close();
516             }
517         }
518         curi.addAnnotation("warcRevisit:notModified"); 
519         return baseid;
520     }
521     
522     /***
523      * Save a header from the given HTTP operation into the 
524      * provider headers under a new name
525      * 
526      * @param origName header name to get if present
527      * @param method http operation containing headers
528      */
529     protected void saveHeader(String origName, HttpMethodBase method, 
530     		ANVLRecord headers, String newName) {
531         Header header = method.getResponseHeader(origName);
532         if(header!=null) {
533             headers.addLabelValue(newName, header.getValue());
534         }
535     }
536 
537 	protected URI writeMetadata(final WARCWriter w,
538             final String timestamp,
539             final URI baseid, final CrawlURI curi,
540             final ANVLRecord namedFields) 
541     throws IOException {
542         final URI uid = qualifyRecordID(baseid, TYPE, METADATA);
543         // Get some metadata from the curi.
544         // TODO: Get all curi metadata.
545         // TODO: Use other than ANVL (or rename ANVL as NameValue or use
546         // RFC822 (commons-httpclient?).
547         ANVLRecord r = new ANVLRecord();
548         if (curi.isSeed()) {
549             r.addLabel("seed");
550         } else {
551         	if (curi.forceFetch()) {
552         		r.addLabel("force-fetch");
553         	}
554             r.addLabelValue("via", curi.flattenVia());
555             r.addLabelValue("hopsFromSeed", curi.getPathFromSeed());
556             if (curi.containsKey(A_SOURCE_TAG)) {
557                 r.addLabelValue("sourceTag", curi.getString(A_SOURCE_TAG));
558             }
559         }
560         long duration = curi.getFetchDuration();
561         if(duration>-1) {
562             r.addLabelValue("fetchTimeMs", Long.toString(duration));
563         }
564         
565         if (curi.containsKey(A_FTP_FETCH_STATUS)) {
566             r.addLabelValue("ftpFetchStatus", curi.getString(A_FTP_FETCH_STATUS));
567         }
568         
569         // Add outlinks though they are effectively useless without anchor text.
570         Collection<Link> links = curi.getOutLinks();
571         if (links != null && links.size() > 0) {
572             for (Link link: links) {
573                 r.addLabelValue("outlink", link.toString());
574             }
575         }
576         
577         // TODO: Other curi fields to write to metadata.
578         // 
579         // Credentials
580         // 
581         // fetch-began-time: 1154569278774
582         // fetch-completed-time: 1154569281816
583         //
584         // Annotations.
585         
586         byte [] b = r.getUTF8Bytes();
587         w.writeMetadataRecord(curi.toString(), timestamp, ANVLRecord.MIMETYPE,
588             uid, namedFields, new ByteArrayInputStream(b), b.length);
589         return uid;
590     }
591     
592     protected URI getRecordID() throws IOException {
593         URI result;
594         try {
595             result = GeneratorFactory.getFactory().getRecordID();
596         } catch (URISyntaxException e) {
597             throw new IOException(e.toString());
598         }
599         return result;
600     }
601     
602     protected URI qualifyRecordID(final URI base, final String key,
603             final String value)
604     throws IOException {
605         URI result;
606         Map<String, String> qualifiers = new HashMap<String, String>(1);
607         qualifiers.put(key, value);
608         try {
609             result = GeneratorFactory.getFactory().
610                 qualifyRecordID(base, qualifiers);
611         } catch (URISyntaxException e) {
612             throw new IOException(e.toString());
613         }
614         return result;
615     }  
616     
617     @Override
618     protected String getFirstrecordStylesheet() {
619         return "/warcinfobody.xsl";
620     }
621 
622     /***
623      * Return relevant values as header-like fields (here ANVLRecord, but 
624      * spec-defined "application/warc-fields" type when written). Field
625      * names from from DCMI Terms and the WARC/0.17 specification.
626      * 
627      * @see org.archive.crawler.framework.WriterPoolProcessor#getFirstrecordBody(java.io.File)
628      */
629     @Override
630     protected String getFirstrecordBody(File orderFile) {
631         ANVLRecord record = new ANVLRecord(7);
632         record.addLabelValue("software", "Heritrix/" +
633                 Heritrix.getVersion() + " http://crawler.archive.org");
634         try {
635             InetAddress host = InetAddress.getLocalHost();
636             record.addLabelValue("ip", host.getHostAddress());
637             record.addLabelValue("hostname", host.getCanonicalHostName());
638         } catch (UnknownHostException e) {
639             logger.log(Level.WARNING,"unable top obtain local crawl engine host",e);
640         }
641 
642         // conforms to ISO 28500:2009 as of May 2009
643         // as described at http://bibnum.bnf.fr/WARC/ 
644         // latest draft as of November 2008
645         record.addLabelValue("format","WARC File Format 1.0"); 
646         record.addLabelValue("conformsTo","http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf");
647 
648         // Get other values from order.xml 
649         try {
650             Document doc = XmlUtils.getDocument(orderFile);
651             addIfNotBlank(record,"operator",
652                     XmlUtils.xpathOrNull(doc,"//meta/operator"));
653             addIfNotBlank(record,"publisher",
654                     XmlUtils.xpathOrNull(doc,"//meta/organization"));
655             addIfNotBlank(record,"audience",
656                     XmlUtils.xpathOrNull(doc,"//meta/audience"));
657             addIfNotBlank(record,"isPartOf",
658                     XmlUtils.xpathOrNull(doc,"//meta/name"));
659 
660             // disabling "created" field per HER-1634
661             // though it's theoretically useful as a means of distinguishing 
662             // one crawl from another, the current usage/specification is too 
663             // vague... in particular a 'created' field in the 'warcinfo' is 
664             // reasonable to interpret as applying to the WARC-unit, rather 
665             // than the crawl-job-unit so we remove it and see if anyone 
666             // complains or makes a case for restoring it in a less-ambiguous 
667             // manner
668 //            String rawDate = XmlUtils.xpathOrNull(doc,"//meta/date");
669 //            if(StringUtils.isNotBlank(rawDate)) {
670 //            	Date date;
671 //            	try {
672 //            		date = ArchiveUtils.parse14DigitDate(rawDate);
673 //            		addIfNotBlank(record,"created",ArchiveUtils.getLog14Date(date));
674 //            	} catch (ParseException e) {
675 //            		logger.log(Level.WARNING,"obtaining warc created date",e);
676 //            	}
677 //            }
678 
679             addIfNotBlank(record,"description",
680                     XmlUtils.xpathOrNull(doc,"//meta/description"));
681             addIfNotBlank(record,"robots",
682                     XmlUtils.xpathOrNull(doc, 
683                             "//newObject[@name='robots-honoring-policy']/string[@name='type']"));
684             addIfNotBlank(record,"http-header-user-agent",
685                     XmlUtils.xpathOrNull(doc, 
686                             "//map[@name='http-headers']/string[@name='user-agent']"));
687             addIfNotBlank(record,"http-header-from",
688                     XmlUtils.xpathOrNull(doc, 
689                             "//map[@name='http-headers']/string[@name='from']"));
690         } catch (IOException e) {
691             logger.log(Level.WARNING,"obtaining warcinfo",e);
692         } 
693         // really ugly to return as string, when it may just be merged with 
694         // a couple other fields at write time, but changing would require 
695         // larger refactoring
696         return record.toString();
697     }
698 
699 
700     protected void addIfNotBlank(ANVLRecord record, String label, String value) {
701         if(StringUtils.isNotBlank(value)) {
702             record.addLabelValue(label, value);
703         }
704     }
705 
706     @Override
707     public String report() {
708         logger.info("final stats: " + stats);
709         
710         StringBuilder buf = new StringBuilder();
711         buf.append("Processor: " + getClass().getName() + "\n");
712         buf.append("  Function:          Writes WARCs\n");
713         buf.append("  Total CrawlURIs:   " + urlsWritten + "\n");
714         buf.append("  Revisit records:   " + WARCWriter.getStat(stats, WARCWriter.REVISIT, WARCWriter.NUM_RECORDS) + "\n");
715         
716         long bytes = WARCWriter.getStat(stats, WARCWriter.RESPONSE, WARCWriter.CONTENT_BYTES)
717                 + WARCWriter.getStat(stats, WARCWriter.RESOURCE, WARCWriter.CONTENT_BYTES);
718         buf.append("  Crawled content bytes (including http headers): "
719                 + bytes + " (" + ArchiveUtils.formatBytesForDisplay(bytes) + ")\n");
720         
721         bytes = WARCWriter.getStat(stats, WARCWriter.TOTALS, WARCWriter.TOTAL_BYTES);
722         buf.append("  Total uncompressed bytes (including all warc records): "
723                 + bytes + " (" + ArchiveUtils.formatBytesForDisplay(bytes) + ")\n");
724         
725         buf.append("  Total size on disk ("+ (isCompressed() ? "compressed" : "uncompressed") + "): "
726                 + getTotalBytesWritten() + " (" + ArchiveUtils.formatBytesForDisplay(getTotalBytesWritten()) + ")\n");
727         
728         return buf.toString();
729     }
730 }