View Javadoc

1   /*
2    * ARC2WCDX.java
3    *
4    * $Id: ARC2WCDX.java 4903 2007-02-16 01:45:10Z gojomo $
5    *
6    * Created on Nov 13, 2006
7    *
8    * Copyright (C) 2006 Internet Archive.
9    *
10   * This file is part of the Heritrix web crawler (crawler.archive.org).
11   *
12   * Heritrix is free software; you can redistribute it and/or modify
13   * it under the terms of the GNU Lesser Public License as published by
14   * the Free Software Foundation; either version 2.1 of the License, or
15   * any later version.
16   *
17   * Heritrix is distributed in the hope that it will be useful,
18   * but WITHOUT ANY WARRANTY; without even the implied warranty of
19   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20   * GNU Lesser Public License for more details.
21   *
22   * You should have received a copy of the GNU Lesser Public License
23   * along with Heritrix; if not, write to the Free Software
24   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
25   */
26  package org.archive.io.arc;
27  
28  import java.io.File;
29  import java.io.FileOutputStream;
30  import java.io.IOException;
31  import java.io.PrintStream;
32  import java.util.Date;
33  import java.util.Iterator;
34  import java.util.zip.GZIPOutputStream;
35  
36  import org.apache.commons.httpclient.Header;
37  import org.apache.commons.httpclient.HeaderGroup;
38  import org.apache.commons.httpclient.util.DateParseException;
39  import org.apache.commons.httpclient.util.DateUtil;
40  import org.archive.util.ArchiveUtils;
41  import org.archive.util.SURT;
42  
43  /***
44   * Create a 'Wide' CDX from an ARC. Takes one argument, the path to the ARC.
45   * Writes .wcdx.gz in same directory.
46   *
47   * @author gojomo
48   */
49  public class ARC2WCDX {
50      final public static String WCDX_VERSION="0.1";
51  
52      public static void main(String[] args) throws IOException {
53          String arcFilename = args[0];
54          createWcdx(arcFilename);
55      }
56  
57      public static Object[] createWcdx(String arcFilename) throws IOException {
58          ARCReader reader = ARCReaderFactory.get(arcFilename);
59          Object[] retVal = createWcdx(reader);
60          reader.close();
61          return retVal; 
62      }
63  
64      public static Object[] createWcdx(ARCReader reader) {
65          reader.setDigest(true);
66  
67          String wcdxPath = reader.getReaderIdentifier().replaceAll("//.arc(//.gz)?$",".wcdx.gz");
68          File wcdxFile = new File(wcdxPath+".open");
69          PrintStream writer = null;
70          long count = 0;
71          try {
72              writer = new PrintStream(new GZIPOutputStream(new FileOutputStream(wcdxFile)));
73              
74              // write header: legend + timestamp
75              StringBuilder legend = new StringBuilder();
76              appendField(legend,"CDX");
77              appendField(legend,"surt-uri");
78              appendField(legend,"b"); // ARC timestamp
79              appendField(legend,"http-date");
80              appendField(legend,"s"); // status code
81              appendField(legend,"m"); // media type
82              appendField(legend,"sha1"); // content sha1
83              appendField(legend,"g"); // ARC name
84              appendField(legend,"V"); // start offset
85              appendField(legend,"end-offset"); // TODO: implement
86              appendField(legend,"n"); // ARC record length TODO: verify
87              appendField(legend,"http-content-length");
88              appendField(legend,"http-last-modified");
89              appendField(legend,"http-expires");
90              appendField(legend,"http-etag");
91              appendField(legend,"http-location");
92              appendField(legend,"e"); // IP
93              appendField(legend,"a"); // original URL
94              // WCDX version+creation time: crude version control
95              appendField(legend,WCDX_VERSION+"@"+ArchiveUtils.get14DigitDate());
96              writer.println(legend.toString());
97  
98              Iterator iter = reader.iterator();
99              count = 0; 
100             while(iter.hasNext()) {
101                 ARCRecord record = (ARCRecord) iter.next();
102                 record.close();
103                 ARCRecordMetaData h = (ARCRecordMetaData) record.getHeader();
104                 Header[] httpHeaders = record.getHttpHeaders();
105                 if(httpHeaders==null) {
106                     httpHeaders = new Header[0];
107                 }
108                 HeaderGroup hg = new HeaderGroup();
109                 hg.setHeaders(httpHeaders);
110                 StringBuilder builder = new StringBuilder();
111 
112                 // SURT-form URI
113                 appendField(builder,SURT.fromURI(h.getUrl()));
114                 // record timestamp ('b')
115                 appendField(builder,h.getDate());
116                 // http header date
117                 appendTimeField(builder,hg.getFirstHeader("Date"));
118                 // response code ('s')
119                 appendField(builder,h.getStatusCode());
120                 // media type ('m')
121                 appendField(builder,h.getMimetype());
122                 // content checksum (like 'c', but here Base32 SHA1)
123                 appendField(builder,record.getDigestStr());
124                 // arc name ('g')
125                 appendField(builder,reader.getFileName());
126                 // compressed start offset ('V')
127                 appendField(builder,h.getOffset());
128 
129                 // compressed end offset (?)
130 //            appendField(builder,
131 //                    reader.getInputStream() instanceof RepositionableStream
132 //                    ? ((GzippedInputStream)reader.getInputStream()).vPosition()
133 //                    : "-");
134                 // TODO; leave unavail for now
135                 appendField(builder, "-");
136 
137                 // uncompressed (declared in ARC headerline) record length
138                 appendField(builder,h.getLength());
139                 // http header content-length
140                 appendField(builder,hg.getFirstHeader("Content-Length"));
141 
142                 // http header mod-date
143                 appendTimeField(builder,hg.getFirstHeader("Last-Modified"));
144                 // http header expires
145                 appendTimeField(builder,hg.getFirstHeader("Expires"));
146                 
147                 // http header etag
148                 appendField(builder,hg.getFirstHeader("ETag"));
149                 // http header redirect ('Location' header?)
150                 appendField(builder,hg.getFirstHeader("Location"));
151                 // ip ('e')
152                 appendField(builder,h.getIp());
153                 // original URI
154                 appendField(builder,h.getUrl());
155                 // TODO MAYBE - a title from inside content? 
156 
157                 writer.println(builder.toString());
158                 count++;
159             }
160             wcdxFile.renameTo(new File(wcdxPath));
161         } catch (IOException e) {
162             // soldier on: but leave '.open' wcdx file as indicator of error
163             if(!wcdxFile.exists()) {
164                 try {
165                     wcdxFile.createNewFile();
166                 } catch (IOException e1) {
167                     // TODO Auto-generated catch block
168                     throw new RuntimeException(e1);
169                 }
170             }
171         } catch (RuntimeException e) {
172             // soldier on: but leave '.open' wcdx file as indicator of error
173             if(!wcdxFile.exists()) {
174                 try {
175                     wcdxFile.createNewFile();
176                 } catch (IOException e1) {
177                     // TODO Auto-generated catch block
178                     throw new RuntimeException(e1);
179                 }
180             }
181         } finally {
182             if(writer!=null) {
183                 writer.close();
184             }
185         }
186         
187         return new Object[] {wcdxPath, count};
188     }
189 
190     protected static void appendField(StringBuilder builder, Object obj) {
191         if(builder.length()>0) {
192             // prepend with delimiter
193             builder.append(' ');
194         }
195         if(obj instanceof Header) {
196             obj = ((Header)obj).getValue().trim();
197         }
198 
199         builder.append((obj==null||obj.toString().length()==0)?"-":obj);
200     }
201 
202     protected static void appendTimeField(StringBuilder builder, Object obj) {
203         if(builder.length()>0) {
204             // prepend with delimiter
205             builder.append(' ');
206         }
207         if(obj==null) {
208             builder.append("-");
209             return;
210         }
211         if(obj instanceof Header) {
212             String s = ((Header)obj).getValue().trim();
213             try {
214                 Date date = DateUtil.parseDate(s);
215                 String d = ArchiveUtils.get14DigitDate(date);
216                 if(d.startsWith("209")) {
217                     d = "199"+d.substring(3);
218                 }
219                 obj = d;
220             } catch (DateParseException e) {
221                 builder.append('e');
222                 return;
223             }
224 
225         }
226         builder.append(obj);
227     }
228 }
229 
230 //'wide' CDX
231 //a original url
232 //b timestamp
233 //s resp code
234 //m type
235 //? content md5 (full 'k'? 'c'?
236 //g arc name
237 //V compressed start offset
238 //? compressed length
239 //n? uncompressed length
240 //? mod date
241 //? expires
242 //? server 'date' hdr
243 //? etag
244 //r redirect ('Location'?)
245 //e ip
246 //MAYBE: 
247 //? TITLE from HTML or other format?
248 
249