1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26 package org.archive.io.arc;
27
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.util.Date;
import java.util.Iterator;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.zip.GZIPOutputStream;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HeaderGroup;
import org.apache.commons.httpclient.util.DateParseException;
import org.apache.commons.httpclient.util.DateUtil;
import org.archive.util.ArchiveUtils;
import org.archive.util.SURT;
42
43 /***
44 * Create a 'Wide' CDX from an ARC. Takes one argument, the path to the ARC.
45 * Writes .wcdx.gz in same directory.
46 *
47 * @author gojomo
48 */
49 public class ARC2WCDX {
50 final public static String WCDX_VERSION="0.1";
51
52 public static void main(String[] args) throws IOException {
53 String arcFilename = args[0];
54 createWcdx(arcFilename);
55 }
56
57 public static Object[] createWcdx(String arcFilename) throws IOException {
58 ARCReader reader = ARCReaderFactory.get(arcFilename);
59 Object[] retVal = createWcdx(reader);
60 reader.close();
61 return retVal;
62 }
63
64 public static Object[] createWcdx(ARCReader reader) {
65 reader.setDigest(true);
66
67 String wcdxPath = reader.getReaderIdentifier().replaceAll("//.arc(//.gz)?$",".wcdx.gz");
68 File wcdxFile = new File(wcdxPath+".open");
69 PrintStream writer = null;
70 long count = 0;
71 try {
72 writer = new PrintStream(new GZIPOutputStream(new FileOutputStream(wcdxFile)));
73
74
75 StringBuilder legend = new StringBuilder();
76 appendField(legend,"CDX");
77 appendField(legend,"surt-uri");
78 appendField(legend,"b");
79 appendField(legend,"http-date");
80 appendField(legend,"s");
81 appendField(legend,"m");
82 appendField(legend,"sha1");
83 appendField(legend,"g");
84 appendField(legend,"V");
85 appendField(legend,"end-offset");
86 appendField(legend,"n");
87 appendField(legend,"http-content-length");
88 appendField(legend,"http-last-modified");
89 appendField(legend,"http-expires");
90 appendField(legend,"http-etag");
91 appendField(legend,"http-location");
92 appendField(legend,"e");
93 appendField(legend,"a");
94
95 appendField(legend,WCDX_VERSION+"@"+ArchiveUtils.get14DigitDate());
96 writer.println(legend.toString());
97
98 Iterator iter = reader.iterator();
99 count = 0;
100 while(iter.hasNext()) {
101 ARCRecord record = (ARCRecord) iter.next();
102 record.close();
103 ARCRecordMetaData h = (ARCRecordMetaData) record.getHeader();
104 Header[] httpHeaders = record.getHttpHeaders();
105 if(httpHeaders==null) {
106 httpHeaders = new Header[0];
107 }
108 HeaderGroup hg = new HeaderGroup();
109 hg.setHeaders(httpHeaders);
110 StringBuilder builder = new StringBuilder();
111
112
113 appendField(builder,SURT.fromURI(h.getUrl()));
114
115 appendField(builder,h.getDate());
116
117 appendTimeField(builder,hg.getFirstHeader("Date"));
118
119 appendField(builder,h.getStatusCode());
120
121 appendField(builder,h.getMimetype());
122
123 appendField(builder,record.getDigestStr());
124
125 appendField(builder,reader.getFileName());
126
127 appendField(builder,h.getOffset());
128
129
130
131
132
133
134
135 appendField(builder, "-");
136
137
138 appendField(builder,h.getLength());
139
140 appendField(builder,hg.getFirstHeader("Content-Length"));
141
142
143 appendTimeField(builder,hg.getFirstHeader("Last-Modified"));
144
145 appendTimeField(builder,hg.getFirstHeader("Expires"));
146
147
148 appendField(builder,hg.getFirstHeader("ETag"));
149
150 appendField(builder,hg.getFirstHeader("Location"));
151
152 appendField(builder,h.getIp());
153
154 appendField(builder,h.getUrl());
155
156
157 writer.println(builder.toString());
158 count++;
159 }
160 wcdxFile.renameTo(new File(wcdxPath));
161 } catch (IOException e) {
162
163 if(!wcdxFile.exists()) {
164 try {
165 wcdxFile.createNewFile();
166 } catch (IOException e1) {
167
168 throw new RuntimeException(e1);
169 }
170 }
171 } catch (RuntimeException e) {
172
173 if(!wcdxFile.exists()) {
174 try {
175 wcdxFile.createNewFile();
176 } catch (IOException e1) {
177
178 throw new RuntimeException(e1);
179 }
180 }
181 } finally {
182 if(writer!=null) {
183 writer.close();
184 }
185 }
186
187 return new Object[] {wcdxPath, count};
188 }
189
190 protected static void appendField(StringBuilder builder, Object obj) {
191 if(builder.length()>0) {
192
193 builder.append(' ');
194 }
195 if(obj instanceof Header) {
196 obj = ((Header)obj).getValue().trim();
197 }
198
199 builder.append((obj==null||obj.toString().length()==0)?"-":obj);
200 }
201
202 protected static void appendTimeField(StringBuilder builder, Object obj) {
203 if(builder.length()>0) {
204
205 builder.append(' ');
206 }
207 if(obj==null) {
208 builder.append("-");
209 return;
210 }
211 if(obj instanceof Header) {
212 String s = ((Header)obj).getValue().trim();
213 try {
214 Date date = DateUtil.parseDate(s);
215 String d = ArchiveUtils.get14DigitDate(date);
216 if(d.startsWith("209")) {
217 d = "199"+d.substring(3);
218 }
219 obj = d;
220 } catch (DateParseException e) {
221 builder.append('e');
222 return;
223 }
224
225 }
226 builder.append(obj);
227 }
228 }
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249