1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.io;
24
25
26 import java.io.File;
27 import java.io.IOException;
28 import java.util.ArrayList;
29 import java.util.Arrays;
30 import java.util.Iterator;
31 import java.util.List;
32 import java.util.concurrent.atomic.AtomicInteger;
33 import java.util.logging.Level;
34 import java.util.logging.Logger;
35
36 import org.apache.commons.cli.CommandLine;
37 import org.apache.commons.cli.HelpFormatter;
38 import org.apache.commons.cli.Option;
39 import org.apache.commons.cli.Options;
40 import org.apache.commons.cli.ParseException;
41 import org.apache.commons.cli.PosixParser;
42 import org.archive.io.arc.ARCWriter;
43 import org.archive.io.warc.WARCConstants;
44 import org.archive.io.warc.WARCReader;
45 import org.archive.io.warc.WARCReaderFactory;
46 import org.archive.io.warc.WARCRecord;
47 import org.archive.util.ArchiveUtils;
48 import org.archive.util.FileUtils;
49
50
51 /***
52 * Convert WARCs to (sortof) ARCs.
53 * WARCs can be 1Gig in size, that is, 10x default ARC size. Script takes
54 * directory as output and will write multiple ARCs for a single large WARC.
55 * Only writes resource records of type <code>text/dns</code> or
56 * <code>application/http; msgtype=response</code>. All others -- metadata,
57 * request -- are skipped.
58 * @author stack
59 * @version $Date: 2007-03-09 23:57:28 +0000 (Fri, 09 Mar 2007) $ $Revision: 4977 $
60 */
61 public class Warc2Arc {
62 private static void usage(HelpFormatter formatter, Options options,
63 int exitCode) {
64 formatter.printHelp("java org.archive.io.arc.Warc2Arc " +
65 "[--force] [--prefix=PREFIX] [--suffix=SUFFIX] WARC_INPUT " +
66 "OUTPUT_DIR",
67 options);
68 System.exit(exitCode);
69 }
70
71 static String parseRevision(final String version) {
72 final String ID = "$Revision: ";
73 int index = version.indexOf(ID);
74 return (index < 0)? version:
75 version.substring(index + ID.length(), version.length() - 1).trim();
76 }
77
78 private static String getRevision() {
79 return parseRevision("$Revision: 4977 $");
80 }
81
82 public void transform(final File warc, final File dir, final String prefix,
83 final String suffix, final boolean force)
84 throws IOException, java.text.ParseException {
85 FileUtils.isReadable(warc);
86 FileUtils.isReadable(dir);
87 WARCReader reader = WARCReaderFactory.get(warc);
88 List<String> metadata = new ArrayList<String>();
89 metadata.add("Made from " + reader.getReaderIdentifier() + " by " +
90 this.getClass().getName() + "/" + getRevision());
91 ARCWriter writer = new ARCWriter(new AtomicInteger(),
92 Arrays.asList(new File [] {dir}), prefix, suffix,
93 reader.isCompressed(), -1, metadata);
94 transform(reader, writer);
95 }
96
97 protected void transform(final WARCReader reader, final ARCWriter writer)
98 throws IOException, java.text.ParseException {
99
100
101 reader.setDigest(false);
102
103
104
105
106
107 Logger l = Logger.getLogger(writer.getClass().getName());
108 Level oldLevel = l.getLevel();
109 try {
110 l.setLevel(Level.WARNING);
111 for (final Iterator i = reader.iterator(); i.hasNext();) {
112 WARCRecord r = (WARCRecord)i.next();
113 if (!isARCType(r.getHeader().getMimetype())) {
114 continue;
115 }
116 if (r.getHeader().getContentBegin() <= 0) {
117
118
119
120 continue;
121 }
122 String ip = (String)r.getHeader().
123 getHeaderValue((WARCConstants.HEADER_KEY_IP));
124 long length = r.getHeader().getLength();
125 int offset = r.getHeader().getContentBegin();
126
127
128
129
130 String mimetype = r.getHeader().getMimetype();
131
132 String t = r.getHeader().getDate().replaceAll("[-T:Z]", "");
133 long time = ArchiveUtils.getSecondsSinceEpoch(t).getTime();
134 writer.write(r.getHeader().getUrl(), mimetype, ip, time,
135 (int)(length - offset), r);
136 }
137 } finally {
138 if (reader != null) {
139 reader.close();
140 }
141 if (writer != null) {
142 try {
143 writer.close();
144 } finally {
145 l.setLevel(oldLevel);
146 }
147 }
148 }
149 }
150
151 protected boolean isARCType(final String mimetype) {
152
153
154 if (mimetype == null || mimetype.length() <= 0) {
155 return false;
156 }
157 String cleaned = mimetype.toLowerCase().trim();
158 if (cleaned.equals(WARCConstants.HTTP_RESPONSE_MIMETYPE) ||
159 cleaned.equals("text/dns")) {
160 return true;
161 }
162 return false;
163 }
164
165 /***
166 * Command-line interface to Arc2Warc.
167 *
168 * @param args Command-line arguments.
169 * @throws ParseException Failed parse of the command line.
170 * @throws IOException
171 * @throws java.text.ParseException
172 */
173 public static void main(String [] args)
174 throws ParseException, IOException, java.text.ParseException {
175 Options options = new Options();
176 options.addOption(new Option("h","help", false,
177 "Prints this message and exits."));
178 options.addOption(new Option("f","force", false,
179 "Force overwrite of target file."));
180 options.addOption(new Option("p","prefix", true,
181 "Prefix to use on created ARC files, else uses default."));
182 options.addOption(new Option("s","suffix", true,
183 "Suffix to use on created ARC files, else uses default."));
184 PosixParser parser = new PosixParser();
185 CommandLine cmdline = parser.parse(options, args, false);
186 List cmdlineArgs = cmdline.getArgList();
187 Option [] cmdlineOptions = cmdline.getOptions();
188 HelpFormatter formatter = new HelpFormatter();
189
190
191 if (cmdlineArgs.size() < 0) {
192 usage(formatter, options, 0);
193 }
194
195
196 boolean force = false;
197 String prefix = "WARC2ARC";
198 String suffix = null;
199 for (int i = 0; i < cmdlineOptions.length; i++) {
200 switch(cmdlineOptions[i].getId()) {
201 case 'h':
202 usage(formatter, options, 0);
203 break;
204
205 case 'f':
206 force = true;
207 break;
208
209 case 'p':
210 prefix = cmdlineOptions[i].getValue();
211 break;
212
213 case 's':
214 suffix = cmdlineOptions[i].getValue();
215 break;
216
217 default:
218 throw new RuntimeException("Unexpected option: " +
219 + cmdlineOptions[i].getId());
220 }
221 }
222
223
224 if (cmdlineArgs.size() != 2) {
225 usage(formatter, options, 0);
226 }
227 (new Warc2Arc()).transform(new File(cmdlineArgs.get(0).toString()),
228 new File(cmdlineArgs.get(1).toString()), prefix, suffix, force);
229 }
230 }