View Javadoc

1   /* $Id: Arc2Warc.java 6792 2010-03-12 23:47:28Z szznax $
2    *
3    * Created Aug 29, 2006
4    *
5    * Copyright (C) 2006 Internet Archive.
6    *
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    *
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   *
14   * Heritrix is distributed in the hope that it will be useful,
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   *
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.io;
24  
25  import java.io.BufferedOutputStream;
26  import java.io.ByteArrayOutputStream;
27  import java.io.File;
28  import java.io.FileOutputStream;
29  import java.io.IOException;
30  import java.util.ArrayList;
31  import java.util.Iterator;
32  import java.util.List;
33  import java.util.logging.Level;
34  import java.util.logging.Logger;
35  
36  import org.apache.commons.cli.CommandLine;
37  import org.apache.commons.cli.HelpFormatter;
38  import org.apache.commons.cli.Option;
39  import org.apache.commons.cli.Options;
40  import org.apache.commons.cli.ParseException;
41  import org.apache.commons.cli.PosixParser;
42  import org.archive.io.arc.ARCConstants;
43  import org.archive.io.arc.ARCReader;
44  import org.archive.io.arc.ARCReaderFactory;
45  import org.archive.io.arc.ARCRecord;
46  import org.archive.io.warc.WARCConstants;
47  import org.archive.io.warc.WARCWriter;
48  import org.archive.util.FileUtils;
49  import org.archive.util.anvl.ANVLRecord;
50  import org.joda.time.DateTimeZone;
51  import org.joda.time.format.DateTimeFormat;
52  import org.joda.time.format.ISODateTimeFormat;
53  
54  
55  /***
56   * Convert ARCs to (sortof) WARCs.
57   * @author stack
58   * @version $Date: 2010-03-12 23:47:28 +0000 (Fri, 12 Mar 2010) $ $Revision: 6792 $
59   */
60  public class Arc2Warc {
61     private static void usage(HelpFormatter formatter, Options options,
62             int exitCode) {
63         formatter.printHelp("java org.archive.io.arc.Arc2Warc " +
64         		"[--force] ARC_INPUT WARC_OUTPUT", options);
65         System.exit(exitCode);
66     }
67     
68     private static String getRevision() {
69         return Warc2Arc.parseRevision("$Revision: 6792 $");
70     }
71     
72     public void transform(final File arc, final File warc, final boolean force)
73     throws IOException {
74         FileUtils.isReadable(arc);
75         if (warc.exists() && !force) {
76      	   throw new IOException("Target WARC already exists. " +
77      	       "Will not overwrite.");
78         }
79  
80         ARCReader reader = ARCReaderFactory.get(arc, false, 0);
81         transform(reader, warc);
82     }
83     
84     protected void transform(final ARCReader reader, final File warc)
85     throws IOException {
86  	   WARCWriter writer = null;
87  	   // No point digesting. Digest is available after reading of ARC which
88  	   // is too late for inclusion in WARC.
89  	   reader.setDigest(false);
90  	   try {
91  		   BufferedOutputStream bos =
92  			   new BufferedOutputStream(new FileOutputStream(warc));
93  		   // Get the body of the first ARC record as a String so can dump it
94  		   // into first record of WARC.
95  		   final Iterator<ArchiveRecord> i = reader.iterator();
96  		   ARCRecord firstRecord = (ARCRecord)i.next();
97  		   ByteArrayOutputStream baos =
98  			   new ByteArrayOutputStream((int)firstRecord.getHeader().
99  			       getLength());
100 		   firstRecord.dump(baos);
101 	       // Add ARC first record content as an ANVLRecord.
102 	       ANVLRecord ar = new ANVLRecord(1);
103 	       ar.addLabelValue("Filedesc", baos.toString());
104 	       List<String> metadata = new ArrayList<String>(1);
105 	       metadata.add(ar.toString());
106 	       // Now create the writer.  If reader was compressed, lets write
107 	       // a compressed WARC.
108 		   writer = new WARCWriter(null, bos, warc,
109 		       reader.isCompressed(), null, metadata);
110 		   // Write a warcinfo record with description about how this WARC
111 		   // was made.
112 		   writer.writeWarcinfoRecord(warc.getName(),
113 		       "Made from " + reader.getReaderIdentifier() + " by " +
114 	               this.getClass().getName() + "/" + getRevision());
115 		   for (; i.hasNext();) {
116 			   write(writer, (ARCRecord)i.next());
117 		   }
118 	   } finally {
119 		   if (reader != null) {
120 			   reader.close();
121 		   }
122 		   if (writer != null) {
123 			   // I don't want the close being logged -- least, not w/o log of
124 			   // an opening (and that'd be a little silly for simple script
125 			   // like this). Currently, it logs at level INFO so that close
126 			   // of files gets written to log files.  Up the log level just
127 			   // for the close.
128 			   Logger l = Logger.getLogger(writer.getClass().getName());
129 			   Level oldLevel = l.getLevel();
130 			   l.setLevel(Level.WARNING);
131 			   try {
132 				   writer.close();
133 			   } finally {
134 				   l.setLevel(oldLevel);
135 			   }
136 		   }
137 	   }
138    }
139    
140    protected void write(final WARCWriter writer, final ARCRecord r)
141    throws IOException {
142 
143        // convert ARC date to WARC-Date format
144        String arcDateString = r.getHeader().getDate();
145        String warcDateString = DateTimeFormat.forPattern("yyyyMMddHHmmss")
146            .withZone(DateTimeZone.UTC)
147                .parseDateTime(arcDateString)
148                    .toString(ISODateTimeFormat.dateTimeNoMillis());
149 
150        ANVLRecord ar = new ANVLRecord();
151        String ip = (String)r.getHeader()
152            .getHeaderValue((ARCConstants.IP_HEADER_FIELD_KEY));
153        if (ip != null && ip.length() > 0) {
154            ar.addLabelValue(WARCConstants.NAMED_FIELD_IP_LABEL, ip);
155            r.getMetaData();
156        }
157 
158        // enable reconstruction of ARC from transformed WARC
159        // TODO: deferred for further analysis (see HER-1750) 
160        // ar.addLabelValue("ARC-Header-Line", r.getHeaderString());
161 
162        // If contentBody > 0, assume http headers.  Make the mimetype
163        // be application/http.  Otherwise, give it ARC mimetype.
164        String warcMimeTypeString;
165        if (r.getHeader().getContentBegin() > 0) {
166            warcMimeTypeString = WARCConstants.HTTP_RESPONSE_MIMETYPE;
167            writer.writeResponseRecord(r.getHeader().getUrl(), warcDateString,
168                warcMimeTypeString, WARCWriter.getRecordID(), ar, r, 
169                    r.getHeader().getLength());
170        } else {
171            warcMimeTypeString = r.getHeader().getMimetype();
172            writer.writeResourceRecord(r.getHeader().getUrl(), warcDateString,
173                warcMimeTypeString, ar, r, r.getHeader().getLength());
174        }
175 
176    }
177 
178    /***
179     * Command-line interface to Arc2Warc.
180     *
181     * @param args Command-line arguments.
182     * @throws ParseException Failed parse of the command line.
183     * @throws IOException
184     * @throws java.text.ParseException
185     */
186    public static void main(String [] args)
187    throws ParseException, IOException, java.text.ParseException {
188        Options options = new Options();
189        options.addOption(new Option("h","help", false,
190            "Prints this message and exits."));
191        options.addOption(new Option("f","force", false,
192        	   "Force overwrite of target file."));
193        PosixParser parser = new PosixParser();
194        CommandLine cmdline = parser.parse(options, args, false);
195        List<?> cmdlineArgs = cmdline.getArgList();
196        Option [] cmdlineOptions = cmdline.getOptions();
197        HelpFormatter formatter = new HelpFormatter();
198        
199        // If no args, print help.
200        if (cmdlineArgs.size() <= 0) {
201            usage(formatter, options, 0);
202        }
203 
204        // Now look at options passed.
205        boolean force = false;
206        for (int i = 0; i < cmdlineOptions.length; i++) {
207            switch(cmdlineOptions[i].getId()) {
208                case 'h':
209                    usage(formatter, options, 0);
210                    break;
211                    
212                case 'f':
213                    force = true;
214                    break;
215                    
216                default:
217                    throw new RuntimeException("Unexpected option: " +
218                        + cmdlineOptions[i].getId());
219            }
220        }
221        
222        // If no args, print help.
223        if (cmdlineArgs.size() != 2) {
224            usage(formatter, options, 0);
225        }
226        (new Arc2Warc()).transform(new File(cmdlineArgs.get(0).toString()),
227            new File(cmdlineArgs.get(1).toString()), force);
228    }
229 }