ARCRecord xref

View Javadoc

1   /* ARCRecord
2    *
3    * $Id: ARCRecord.java 6786 2010-03-10 00:42:08Z szznax $
4    *
5    * Created on Jan 7, 2004
6    *
7    * Copyright (C) 2004 Internet Archive.
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.io.arc;
26  
27  import java.io.ByteArrayInputStream;
28  import java.io.ByteArrayOutputStream;
29  import java.io.IOException;
30  import java.io.InputStream;
31  import java.util.ArrayList;
32  import java.util.List;
33  import org.apache.commons.httpclient.Header;
34  import org.apache.commons.httpclient.HttpParser;
35  import org.apache.commons.httpclient.StatusLine;
36  import org.apache.commons.httpclient.util.EncodingUtil;
37  import org.apache.commons.lang.StringUtils;
38  import org.archive.io.ArchiveRecord;
39  import org.archive.io.ArchiveRecordHeader;
40  import org.archive.io.RecoverableIOException;
41  
42  
43  /***
44   * An ARC file record.
45   * Does not compass the ARCRecord metadata line, just the record content.
46   * @author stack
47   */
48  public class ARCRecord extends ArchiveRecord implements ARCConstants {
49      /***
50       * Http status line object.
51       * 
52       * May be null if record is not http.
53       */
54      private StatusLine httpStatus = null;
55  
56      /***
57       * Http header bytes.
58       * 
59       * If non-null and bytes available, give out its contents before we
60       * go back to the underlying stream.
61       */
62      private InputStream httpHeaderStream = null;
63      
64      /***
65       * Http headers.
66       * 
67       * Only populated after reading of headers.
68       */
69      private Header [] httpHeaders = null;
70  
71      
72      /***
73       * Minimal http header length.
74       * 
75       * I've seen in arcs content length of 1 with no 
76       * header.
77       */
78      private static final long MIN_HTTP_HEADER_LENGTH =
79          "HTTP/1.1 200 OK\r\n".length();
80      
81      /***
82       * verbatim ARC record header string
83       */
84      private String headerString;
85      private void fillHeaderString() {
86          List<String> hl = new ArrayList<String>();
87          for (String key : ARCReader.HEADER_FIELD_NAME_KEYS) 
88              hl.add((String) this.getMetaData().getHeaderValue(key));
89          this.headerString = StringUtils.join(hl," ");
90      }
91      public String getHeaderString() {   
92          return this.headerString;
93      }
94  
95      /***
96       * Constructor.
97       *
98       * @param in Stream cue'd up to be at the start of the record this instance
99       * is to represent.
100      * @param metaData Meta data.
101      * @throws IOException
102      */
103     public ARCRecord(InputStream in, ArchiveRecordHeader metaData)
104     		throws IOException {
105         this(in, metaData, 0, true, false, true);
106     }
107 
108     /***
109      * Constructor.
110      *
111      * @param in Stream cue'd up to be at the start of the record this instance
112      * is to represent.
113      * @param metaData Meta data.
114      * @param bodyOffset Offset into the body.  Usually 0.
115      * @param digest True if we're to calculate digest for this record.  Not
116      * digesting saves about ~15% of cpu during an ARC parse.
117      * @param strict Be strict parsing (Parsing stops if ARC inproperly
118      * formatted).
119      * @param parseHttpHeaders True if we are to parse HTTP headers.  Costs
120      * about ~20% of CPU during an ARC parse.
121      * @throws IOException
122      */
123     public ARCRecord(InputStream in, ArchiveRecordHeader metaData,
124         int bodyOffset, boolean digest, boolean strict,
125         final boolean parseHttpHeaders) 
126     throws IOException {
127     	super(in, metaData, bodyOffset, digest, strict);
128     	this.fillHeaderString();
129         if (parseHttpHeaders) {
130             this.httpHeaderStream = readHttpHeader();
131         }
132     }
133     
134     /***
135      * Skip over the the http header if one present.
136      * 
137      * Subsequent reads will get the body.
138      * 
139      * <p>Calling this method in the midst of reading the header
140      * will make for strange results.  Otherwise, safe to call
141      * at any time though before reading any of the arc record
142      * content is only time that it makes sense.
143      * 
144      * <p>After calling this method, you can call
145      * {@link #getHttpHeaders()} to get the read http header.
146      * 
147      * @throws IOException
148      */
149     public void skipHttpHeader() throws IOException {
150         if (this.httpHeaderStream != null) {
151             // Empty the httpHeaderStream
152             for (int available = this.httpHeaderStream.available();
153             		this.httpHeaderStream != null &&
154             			(available = this.httpHeaderStream.available()) > 0;) {
155                 // We should be in this loop once only we should only do this
156                 // buffer allocation once.
157                 byte [] buffer = new byte[available];
158                 // The read nulls out httpHeaderStream when done with it so
159                 // need check for null in the loop control line.
160                 read(buffer, 0, available);
161             }
162         }
163     }
164     
165     public void dumpHttpHeader() throws IOException {
166 		if (this.httpHeaderStream == null) {
167 			return;
168 		}
169 		// Dump the httpHeaderStream to STDOUT
170 		for (int available = this.httpHeaderStream.available();
171 			this.httpHeaderStream != null
172 				&& (available = this.httpHeaderStream.available()) > 0;) {
173 			// We should be in this loop only once and should do this
174 			// buffer allocation once.
175 			byte[] buffer = new byte[available];
176 			// The read nulls out httpHeaderStream when done with it so
177 			// need check for null in the loop control line.
178 			int read = read(buffer, 0, available);
179 			System.out.write(buffer, 0, read);
180 		}
181 	}
182     
183     /***
184 	 * Read http header if present. Technique borrowed from HttpClient HttpParse
185 	 * class.
186 	 * 
187 	 * @return ByteArrayInputStream with the http header in it or null if no
188 	 *         http header.
189 	 * @throws IOException
190 	 */
191     private InputStream readHttpHeader() throws IOException {
192         // If judged a record that doesn't have an http header, return
193         // immediately.
194         if(!getHeader().getUrl().startsWith("http") ||
195             getHeader().getLength() <= MIN_HTTP_HEADER_LENGTH) {
196             return null;
197         }
198         byte [] statusBytes = HttpParser.readRawLine(getIn());
199         int eolCharCount = getEolCharsCount(statusBytes);
200         if (eolCharCount <= 0) {
201             throw new IOException(
202                 "Failed to read http status where one was expected: " 
203                 + ((statusBytes == null) ? "" : new String(statusBytes)));
204         }
205         String statusLine = EncodingUtil.getString(statusBytes, 0,
206             statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING);
207         if ((statusLine == null) ||
208                 !StatusLine.startsWithHTTP(statusLine)) {
209             if (statusLine.startsWith("DELETED")) {
210                 // Some old ARCs have deleted records like following:
211                 // http://vireo.gatech.edu:80/ebt-bin/nph-dweb/dynaweb/SGI_Developer/SGITCL_PG/@Generic__BookTocView/11108%3Btd%3D2 130.207.168.42 19991010131803 text/html 29202
212                 // DELETED_TIME=20000425001133_DELETER=Kurt_REASON=alexalist
213                 // (follows ~29K spaces)
214                 // For now, throw a RecoverableIOException so if iterating over
215                 // records, we keep going.  TODO: Later make a legitimate
216                 // ARCRecord from the deleted record rather than throw
217                 // exception.
218                 throw new DeletedARCRecordIOException(statusLine);
219             } else {
220                 throw new IOException("Failed parse of http status line.");
221             }
222         }
223         this.httpStatus = new StatusLine(statusLine);
224         
225         // Save off all bytes read.  Keep them as bytes rather than
226         // convert to strings so we don't have to worry about encodings
227         // though this should never be a problem doing http headers since
228         // its all supposed to be ascii.
229         ByteArrayOutputStream baos =
230             new ByteArrayOutputStream(statusBytes.length + 4 * 1024);
231         baos.write(statusBytes);
232         
233         // Now read rest of the header lines looking for the separation
234         // between header and body.
235         for (byte [] lineBytes = null; true;) {
236             lineBytes = HttpParser.readRawLine(getIn());
237             eolCharCount = getEolCharsCount(lineBytes);
238             if (eolCharCount <= 0) {
239                 throw new IOException("Failed reading http headers: " +
240                     ((lineBytes != null)? new String(lineBytes): null));
241             }
242             // Save the bytes read.
243             baos.write(lineBytes);
244             if ((lineBytes.length - eolCharCount) <= 0) {
245                 // We've finished reading the http header.
246                 break;
247             }
248         }
249         
250         byte [] headerBytes = baos.toByteArray();
251         // Save off where body starts.
252         this.getMetaData().setContentBegin(headerBytes.length);
253         ByteArrayInputStream bais =
254             new ByteArrayInputStream(headerBytes);
255         if (!bais.markSupported()) {
256             throw new IOException("ByteArrayInputStream does not support mark");
257         }
258         bais.mark(headerBytes.length);
259         // Read the status line.  Don't let it into the parseHeaders function.
260         // It doesn't know what to do with it.
261         bais.read(statusBytes, 0, statusBytes.length);
262         this.httpHeaders = HttpParser.parseHeaders(bais,
263             ARCConstants.DEFAULT_ENCODING);
264         this.getMetaData().setStatusCode(Integer.toString(getStatusCode()));
265         bais.reset();
266         return bais;
267     }
268     
269     private static class DeletedARCRecordIOException
270     extends RecoverableIOException {
271         public DeletedARCRecordIOException(final String reason) {
272             super(reason);
273         }
274     }
275     
276     /***
277      * Return status code for this record.
278      * 
279      * This method will return -1 until the http header has been read.
280      * @return Status code.
281      */
282     public int getStatusCode() {
283         return (this.httpStatus == null)? -1: this.httpStatus.getStatusCode();
284     }
285     
286     /***
287      * @param bytes Array of bytes to examine for an EOL.
288      * @return Count of end-of-line characters or zero if none.
289      */
290     private int getEolCharsCount(byte [] bytes) {
291         int count = 0;
292         if (bytes != null && bytes.length >=1 &&
293                 bytes[bytes.length - 1] == '\n') {
294             count++;
295             if (bytes.length >=2 && bytes[bytes.length -2] == '\r') {
296                 count++;
297             }
298         }
299         return count;
300     }
301 
302     /***
303      * @return Meta data for this record.
304      */
305     public ARCRecordMetaData getMetaData() {
306         return (ARCRecordMetaData)getHeader();
307     }
308     
309     /***
310      * @return http headers (Only available after header has been read).
311      */
312     public Header [] getHttpHeaders() {
313         return this.httpHeaders;
314     }
315 
316     /***
317      * @return Next character in this ARCRecord's content else -1 if at end of
318      * this record.
319      * @throws IOException
320      */
321     public int read() throws IOException {
322         int c = -1;
323         if (this.httpHeaderStream != null &&
324                 (this.httpHeaderStream.available() > 0)) {
325             // If http header, return bytes from it before we go to underlying
326             // stream.
327             c = this.httpHeaderStream.read();
328             // If done with the header stream, null it out.
329             if (this.httpHeaderStream.available() <= 0) {
330                 this.httpHeaderStream = null;
331             }
332             incrementPosition();
333         } else {
334             c = super.read();
335         }
336         return c;
337     }
338 
339     public int read(byte [] b, int offset, int length) throws IOException {
340         int read = -1;
341         if (this.httpHeaderStream != null &&
342                 (this.httpHeaderStream.available() > 0)) {
343             // If http header, return bytes from it before we go to underlying
344             // stream.
345             read = Math.min(length, this.httpHeaderStream.available());
346             if (read == 0) {
347                 read = -1;
348             } else {
349                 read = this.httpHeaderStream.read(b, offset, read);
350             }
351             // If done with the header stream, null it out.
352             if (this.httpHeaderStream.available() <= 0) {
353                 this.httpHeaderStream = null;
354             }
355             incrementPosition(read);
356         } else {
357             read = super.read(b, offset, length);
358         }
359         return read;
360     }
361 
362     /***
363      * @return Offset at which the body begins (Only known after
364      * header has been read) or -1 if none or if we haven't read
365      * headers yet.  Usually length of HTTP headers (does not include ARC
366      * metadata line length).
367      */
368     public int getBodyOffset() {
369         return this.getMetaData().getContentBegin();
370     }
371     
372     @Override
373     protected String getIp4Cdx(ArchiveRecordHeader h) {
374     	String result = null;
375     	if (h instanceof ARCRecordMetaData) {
376     		result = ((ARCRecordMetaData)h).getIp();
377     	}
378     	return (result != null)? result: super.getIp4Cdx(h);
379     }
380     
381     @Override
382 	protected String getStatusCode4Cdx(ArchiveRecordHeader h) {
383 		String result = null;
384 		if (h instanceof ARCRecordMetaData) {
385 			result = ((ARCRecordMetaData) h).getStatusCode();
386 		}
387 		return (result != null) ? result: super.getStatusCode4Cdx(h);
388 	}
389     
390     @Override
391 	protected String getDigest4Cdx(ArchiveRecordHeader h) {
392 		String result = null;
393 		if (h instanceof ARCRecordMetaData) {
394 			result = ((ARCRecordMetaData) h).getDigest();
395 		}
396 		return (result != null) ? result: super.getDigest4Cdx(h);
397 	}
398 }