View Javadoc

1   /* $Id: Warc2Arc.java 4977 2007-03-09 23:57:28Z stack-sf $
2    *
3    * Created Aug 29, 2006
4    *
5    * Copyright (C) 2006 Internet Archive.
6    *
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    *
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   *
14   * Heritrix is distributed in the hope that it will be useful,
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   *
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.io;
24  
25  
26  import java.io.File;
27  import java.io.IOException;
28  import java.util.ArrayList;
29  import java.util.Arrays;
30  import java.util.Iterator;
31  import java.util.List;
32  import java.util.concurrent.atomic.AtomicInteger;
33  import java.util.logging.Level;
34  import java.util.logging.Logger;
35  
36  import org.apache.commons.cli.CommandLine;
37  import org.apache.commons.cli.HelpFormatter;
38  import org.apache.commons.cli.Option;
39  import org.apache.commons.cli.Options;
40  import org.apache.commons.cli.ParseException;
41  import org.apache.commons.cli.PosixParser;
42  import org.archive.io.arc.ARCWriter;
43  import org.archive.io.warc.WARCConstants;
44  import org.archive.io.warc.WARCReader;
45  import org.archive.io.warc.WARCReaderFactory;
46  import org.archive.io.warc.WARCRecord;
47  import org.archive.util.ArchiveUtils;
48  import org.archive.util.FileUtils;
49  
50  
51  /***
52   * Convert WARCs to (sortof) ARCs.
53   * WARCs can be 1Gig in size, that is, 10x default ARC size.  Script takes
54   * directory as output and will write multiple ARCs for a single large WARC.
55   * Only writes resource records of type <code>text/dns</code> or
56   * <code>application/http; msgtype=response</code>.  All others -- metadata,
57   * request -- are skipped.
58   * @author stack
59   * @version $Date: 2007-03-09 23:57:28 +0000 (Fri, 09 Mar 2007) $ $Revision: 4977 $
60   */
61  public class Warc2Arc {
62     private static void usage(HelpFormatter formatter, Options options,
63             int exitCode) {
64         formatter.printHelp("java org.archive.io.arc.Warc2Arc " +
65         		"[--force] [--prefix=PREFIX] [--suffix=SUFFIX] WARC_INPUT " +
66         		    "OUTPUT_DIR",
67              options);
68         System.exit(exitCode);
69     }
70     
71     static String parseRevision(final String version) {
72         final String ID = "$Revision: ";
73         int index = version.indexOf(ID);
74         return (index < 0)? version:
75             version.substring(index + ID.length(), version.length() - 1).trim();
76     }
77     
78     private static String getRevision() {
79         return parseRevision("$Revision: 4977 $");
80     }
81     
82     public void transform(final File warc, final File dir, final String prefix,
83             final String suffix, final boolean force)
84     throws IOException, java.text.ParseException {
85         FileUtils.isReadable(warc);
86         FileUtils.isReadable(dir);
87         WARCReader reader = WARCReaderFactory.get(warc);
88         List<String> metadata =  new ArrayList<String>();
89         metadata.add("Made from " + reader.getReaderIdentifier() + " by " +
90             this.getClass().getName() + "/" + getRevision());
91         ARCWriter writer = new ARCWriter(new AtomicInteger(),
92              Arrays.asList(new File [] {dir}), prefix, suffix,
93              reader.isCompressed(), -1, metadata);
94         transform(reader, writer);
95     }
96  
97     protected void transform(final WARCReader reader, final ARCWriter writer)
98     throws IOException, java.text.ParseException {
99  	   // No point digesting. Digest is available after reading of ARC which
100 	   // is too late for inclusion in WARC.
101 	   reader.setDigest(false);
102        // I don't want the close being logged -- least, not w/o log of
103        // an opening (and that'd be a little silly for simple script
104        // like this). Currently, it logs at level INFO so that close
105        // of files gets written to log files.  Up the log level just
106        // for the close.
107        Logger l = Logger.getLogger(writer.getClass().getName());
108        Level oldLevel = l.getLevel();
109 	   try {
110            l.setLevel(Level.WARNING);
111 		   for (final Iterator i = reader.iterator(); i.hasNext();) {
112                WARCRecord r = (WARCRecord)i.next();
113                if (!isARCType(r.getHeader().getMimetype())) {
114                    continue;
115                }
116                if (r.getHeader().getContentBegin() <= 0) {
117                    // Otherwise, because length include Header-Line and
118                    // Named Fields, these will end up in the ARC unless there
119                    // is a non-zero content begin.
120                    continue;
121                }
122                String ip = (String)r.getHeader().
123                    getHeaderValue((WARCConstants.HEADER_KEY_IP));
124                long length = r.getHeader().getLength();
125                int offset = r.getHeader().getContentBegin();
126                // This mimetype is not exactly what you'd expect to find in
127                // an ARC though technically its 'correct'.  To get right one,
128                // need to parse the HTTP Headers.  Thats messy.  Not doing for
129                // now.
130                String mimetype = r.getHeader().getMimetype();
131                // Clean out ISO time string '-', 'T', ':', and 'Z' characters.
132                String t = r.getHeader().getDate().replaceAll("[-T:Z]", "");
133                long time = ArchiveUtils.getSecondsSinceEpoch(t).getTime();
134                writer.write(r.getHeader().getUrl(), mimetype, ip, time,
135                    (int)(length - offset), r);
136 		   }
137 	   } finally {
138 		   if (reader != null) {
139 			   reader.close();
140 		   }
141 		   if (writer != null) {
142 			   try {
143 				   writer.close();
144 			   } finally {
145 				   l.setLevel(oldLevel);
146 			   }
147 		   }
148 	   }
149    }
150    
151    protected boolean isARCType(final String mimetype) {
152        // Comparing mimetypes, especially WARC types can be problematic since
153        // they have whitespace.  For now, ignore.
154        if (mimetype == null || mimetype.length() <= 0) {
155            return false;
156        }
157        String cleaned = mimetype.toLowerCase().trim();
158        if (cleaned.equals(WARCConstants.HTTP_RESPONSE_MIMETYPE) ||
159                cleaned.equals("text/dns")) {
160            return true;
161        }
162        return false;
163    }
164 
165    /***
166     * Command-line interface to Arc2Warc.
167     *
168     * @param args Command-line arguments.
169     * @throws ParseException Failed parse of the command line.
170     * @throws IOException
171     * @throws java.text.ParseException
172     */
173    public static void main(String [] args)
174    throws ParseException, IOException, java.text.ParseException {
175        Options options = new Options();
176        options.addOption(new Option("h","help", false,
177            "Prints this message and exits."));
178        options.addOption(new Option("f","force", false,
179        	   "Force overwrite of target file."));
180        options.addOption(new Option("p","prefix", true,
181            "Prefix to use on created ARC files, else uses default."));
182        options.addOption(new Option("s","suffix", true,
183            "Suffix to use on created ARC files, else uses default."));
184        PosixParser parser = new PosixParser();
185        CommandLine cmdline = parser.parse(options, args, false);
186        List cmdlineArgs = cmdline.getArgList();
187        Option [] cmdlineOptions = cmdline.getOptions();
188        HelpFormatter formatter = new HelpFormatter();
189        
190        // If no args, print help.
191        if (cmdlineArgs.size() < 0) {
192            usage(formatter, options, 0);
193        }
194 
195        // Now look at options passed.
196        boolean force = false;
197        String prefix = "WARC2ARC";
198        String suffix = null;
199        for (int i = 0; i < cmdlineOptions.length; i++) {
200            switch(cmdlineOptions[i].getId()) {
201                case 'h':
202                    usage(formatter, options, 0);
203                    break;
204                    
205                case 'f':
206                    force = true;
207                    break;
208                    
209                case 'p':
210                    prefix = cmdlineOptions[i].getValue();
211                    break;
212                    
213                case 's':
214                    suffix = cmdlineOptions[i].getValue();
215                    break;
216                    
217                default:
218                    throw new RuntimeException("Unexpected option: " +
219                        + cmdlineOptions[i].getId());
220            }
221        }
222        
223        // If no args, print help.
224        if (cmdlineArgs.size() != 2) {
225            usage(formatter, options, 0);
226        }
227        (new Warc2Arc()).transform(new File(cmdlineArgs.get(0).toString()),
228            new File(cmdlineArgs.get(1).toString()), prefix, suffix, force);
229    }
230 }