1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.io;
24
25 import java.io.BufferedOutputStream;
26 import java.io.ByteArrayOutputStream;
27 import java.io.File;
28 import java.io.FileOutputStream;
29 import java.io.IOException;
30 import java.util.ArrayList;
31 import java.util.Iterator;
32 import java.util.List;
33 import java.util.logging.Level;
34 import java.util.logging.Logger;
35
36 import org.apache.commons.cli.CommandLine;
37 import org.apache.commons.cli.HelpFormatter;
38 import org.apache.commons.cli.Option;
39 import org.apache.commons.cli.Options;
40 import org.apache.commons.cli.ParseException;
41 import org.apache.commons.cli.PosixParser;
42 import org.archive.io.arc.ARCConstants;
43 import org.archive.io.arc.ARCReader;
44 import org.archive.io.arc.ARCReaderFactory;
45 import org.archive.io.arc.ARCRecord;
46 import org.archive.io.warc.WARCConstants;
47 import org.archive.io.warc.WARCWriter;
48 import org.archive.util.FileUtils;
49 import org.archive.util.anvl.ANVLRecord;
50 import org.joda.time.DateTimeZone;
51 import org.joda.time.format.DateTimeFormat;
52 import org.joda.time.format.ISODateTimeFormat;
53
54
55 /***
56 * Convert ARCs to (sortof) WARCs.
57 * @author stack
58 * @version $Date: 2010-03-12 23:47:28 +0000 (Fri, 12 Mar 2010) $ $Revision: 6792 $
59 */
60 public class Arc2Warc {
61 private static void usage(HelpFormatter formatter, Options options,
62 int exitCode) {
63 formatter.printHelp("java org.archive.io.arc.Arc2Warc " +
64 "[--force] ARC_INPUT WARC_OUTPUT", options);
65 System.exit(exitCode);
66 }
67
68 private static String getRevision() {
69 return Warc2Arc.parseRevision("$Revision: 6792 $");
70 }
71
72 public void transform(final File arc, final File warc, final boolean force)
73 throws IOException {
74 FileUtils.isReadable(arc);
75 if (warc.exists() && !force) {
76 throw new IOException("Target WARC already exists. " +
77 "Will not overwrite.");
78 }
79
80 ARCReader reader = ARCReaderFactory.get(arc, false, 0);
81 transform(reader, warc);
82 }
83
84 protected void transform(final ARCReader reader, final File warc)
85 throws IOException {
86 WARCWriter writer = null;
87
88
89 reader.setDigest(false);
90 try {
91 BufferedOutputStream bos =
92 new BufferedOutputStream(new FileOutputStream(warc));
93
94
95 final Iterator<ArchiveRecord> i = reader.iterator();
96 ARCRecord firstRecord = (ARCRecord)i.next();
97 ByteArrayOutputStream baos =
98 new ByteArrayOutputStream((int)firstRecord.getHeader().
99 getLength());
100 firstRecord.dump(baos);
101
102 ANVLRecord ar = new ANVLRecord(1);
103 ar.addLabelValue("Filedesc", baos.toString());
104 List<String> metadata = new ArrayList<String>(1);
105 metadata.add(ar.toString());
106
107
108 writer = new WARCWriter(null, bos, warc,
109 reader.isCompressed(), null, metadata);
110
111
112 writer.writeWarcinfoRecord(warc.getName(),
113 "Made from " + reader.getReaderIdentifier() + " by " +
114 this.getClass().getName() + "/" + getRevision());
115 for (; i.hasNext();) {
116 write(writer, (ARCRecord)i.next());
117 }
118 } finally {
119 if (reader != null) {
120 reader.close();
121 }
122 if (writer != null) {
123
124
125
126
127
128 Logger l = Logger.getLogger(writer.getClass().getName());
129 Level oldLevel = l.getLevel();
130 l.setLevel(Level.WARNING);
131 try {
132 writer.close();
133 } finally {
134 l.setLevel(oldLevel);
135 }
136 }
137 }
138 }
139
140 protected void write(final WARCWriter writer, final ARCRecord r)
141 throws IOException {
142
143
144 String arcDateString = r.getHeader().getDate();
145 String warcDateString = DateTimeFormat.forPattern("yyyyMMddHHmmss")
146 .withZone(DateTimeZone.UTC)
147 .parseDateTime(arcDateString)
148 .toString(ISODateTimeFormat.dateTimeNoMillis());
149
150 ANVLRecord ar = new ANVLRecord();
151 String ip = (String)r.getHeader()
152 .getHeaderValue((ARCConstants.IP_HEADER_FIELD_KEY));
153 if (ip != null && ip.length() > 0) {
154 ar.addLabelValue(WARCConstants.NAMED_FIELD_IP_LABEL, ip);
155 r.getMetaData();
156 }
157
158
159
160
161
162
163
164 String warcMimeTypeString;
165 if (r.getHeader().getContentBegin() > 0) {
166 warcMimeTypeString = WARCConstants.HTTP_RESPONSE_MIMETYPE;
167 writer.writeResponseRecord(r.getHeader().getUrl(), warcDateString,
168 warcMimeTypeString, WARCWriter.getRecordID(), ar, r,
169 r.getHeader().getLength());
170 } else {
171 warcMimeTypeString = r.getHeader().getMimetype();
172 writer.writeResourceRecord(r.getHeader().getUrl(), warcDateString,
173 warcMimeTypeString, ar, r, r.getHeader().getLength());
174 }
175
176 }
177
178 /***
179 * Command-line interface to Arc2Warc.
180 *
181 * @param args Command-line arguments.
182 * @throws ParseException Failed parse of the command line.
183 * @throws IOException
184 * @throws java.text.ParseException
185 */
186 public static void main(String [] args)
187 throws ParseException, IOException, java.text.ParseException {
188 Options options = new Options();
189 options.addOption(new Option("h","help", false,
190 "Prints this message and exits."));
191 options.addOption(new Option("f","force", false,
192 "Force overwrite of target file."));
193 PosixParser parser = new PosixParser();
194 CommandLine cmdline = parser.parse(options, args, false);
195 List<?> cmdlineArgs = cmdline.getArgList();
196 Option [] cmdlineOptions = cmdline.getOptions();
197 HelpFormatter formatter = new HelpFormatter();
198
199
200 if (cmdlineArgs.size() <= 0) {
201 usage(formatter, options, 0);
202 }
203
204
205 boolean force = false;
206 for (int i = 0; i < cmdlineOptions.length; i++) {
207 switch(cmdlineOptions[i].getId()) {
208 case 'h':
209 usage(formatter, options, 0);
210 break;
211
212 case 'f':
213 force = true;
214 break;
215
216 default:
217 throw new RuntimeException("Unexpected option: " +
218 + cmdlineOptions[i].getId());
219 }
220 }
221
222
223 if (cmdlineArgs.size() != 2) {
224 usage(formatter, options, 0);
225 }
226 (new Arc2Warc()).transform(new File(cmdlineArgs.get(0).toString()),
227 new File(cmdlineArgs.get(1).toString()), force);
228 }
229 }