View Javadoc

/* $Id: ArchiveRecord.java 6812 2010-04-09 18:59:44Z szznax $
 *
 * Created on August 21st, 2006
 *
 * Copyright (C) 2006 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
23  package org.archive.io;
24  
25  import java.io.IOException;
26  import java.io.InputStream;
27  import java.io.OutputStream;
28  import java.security.MessageDigest;
29  import java.security.NoSuchAlgorithmException;
30  import java.util.logging.Level;
31  
32  import org.archive.util.Base32;
33  
34  /***
35   * Archive file Record.
36   * @author stack
37   * @version $Date: 2010-04-09 18:59:44 +0000 (Fri, 09 Apr 2010) $ $Version$
38   */
39  public abstract class ArchiveRecord extends InputStream {
40      ArchiveRecordHeader header = null;
41  
42      /***
43       * Stream to read this record from.
44       *
45       * Stream can only be read sequentially.  Will only return this records'
46       * content returning a -1 if you try to read beyond the end of the current
47       * record.
48       *
49       * <p>Streams can be markable or not.  If they are, we'll be able to roll
50       * back when we've read too far.  If not markable, assumption is that
51       * the underlying stream is managing our not reading too much (This pertains
52       * to the skipping over the end of the ARCRecord.  See {@link #skip()}.
53       */
54      InputStream in = null;
55  
56      /***
57       * Position w/i the Record content, within <code>in</code>.
58       * This position is relative within this Record.  Its not same as the
59       * Archive file position.
60       */
61      long position = 0;
62  
63      /***
64       * Set flag when we've reached the end-of-record.
65       */
66      boolean eor = false;
67      
68      /***
69       * Compute digest on what we read and add to metadata when done.
70       * 
71       * Currently hardcoded as sha-1. TODO: Remove when archive records
72       * digest or else, add a facility that allows the arc reader to
73       * compare the calculated digest to that which is recorded in
74       * the arc.
75       * 
76       * <p>Protected instead of private so subclasses can update and complete
77       * the digest.
78       */
79      protected MessageDigest digest = null;
80      private String digestStr = null;
81  
82      boolean strict = false;
83      
84      private ArchiveRecord() {
85          super();
86      }
87      
88      /***
89       * Constructor.
90       *
91       * @param in Stream cue'd up to be at the start of the record this instance
92       * is to represent.
93       * @throws IOException
94       */
95      public ArchiveRecord(InputStream in)
96              throws IOException {
97          this(in, null, 0, true, false);
98      }
99      
100     /***
101      * Constructor.
102      *
103      * @param in Stream cue'd up to be at the start of the record this instance
104      * is to represent.
105      * @param header Header data.
106      * @throws IOException
107      */
108     public ArchiveRecord(InputStream in, ArchiveRecordHeader header)
109             throws IOException {
110         this(in, header, 0, true, false);
111     }
112 
113     /***
114      * Constructor.
115      *
116      * @param in Stream cue'd up to be at the start of the record this instance
117      * is to represent.
118      * @param header Header data.
119      * @param bodyOffset Offset into the body.  Usually 0.
120      * @param digest True if we're to calculate digest for this record.  Not
121      * digesting saves about ~15% of cpu during an ARC parse.
122      * @param strict Be strict parsing (Parsing stops if ARC inproperly
123      * formatted).
124      * @throws IOException
125      */
126     public ArchiveRecord(InputStream in, ArchiveRecordHeader header,
127         int bodyOffset, boolean digest, boolean strict) 
128     throws IOException {
129         this.in = in;
130         this.header = header;
131         this.position = bodyOffset;
132         if (digest) {
133             try {
134                 this.digest = MessageDigest.getInstance("SHA1");
135             } catch (NoSuchAlgorithmException e) {
136                 // Convert to IOE because thats more amenable to callers
137                 // -- they are dealing with it anyways.
138                 throw new IOException(e.getMessage());
139             }
140         }
141         this.strict = strict;
142     }
143 
144     public boolean markSupported() {
145         return false;
146     }
147 
148     /***
149      * @return Header data for this record.
150      */
151     public ArchiveRecordHeader getHeader() {
152         return this.header;
153     }
154     
155 	protected void setHeader(ArchiveRecordHeader header) {
156 		this.header = header;
157 	}
158 
159     /***
160      * Calling close on a record skips us past this record to the next record
161      * in the stream.
162      *
163      * It does not actually close the stream.  The underlying steam is probably
164      * being used by the next arc record.
165      *
166      * @throws IOException
167      */
168     public void close() throws IOException {
169         if (this.in != null) {
170             skip();
171             this.in = null;
172             if (this.digest != null) {
173             	this.digestStr = Base32.encode(this.digest.digest());
174             }
175         }
176     }
177 
178     /***
179      * @return Next character in this Record content else -1 if at EOR.
180      * @throws IOException
181      */
182     public int read() throws IOException {
183         int c = -1;
184         if (available() > 0) {
185             c = this.in.read();
186             if (c == -1) {
187                 throw new IOException("Premature EOF before end-of-record.");
188             }
189             if (this.digest != null) {
190                 this.digest.update((byte) c);
191             }
192             incrementPosition();
193         }
194         return c;
195     }
196 
197     public int read(byte[] b, int offset, int length) throws IOException {
198         int read = Math.min(length, available());
199         if (read == -1 || read == 0) {
200             read = -1;
201         } else {
202             read = this.in.read(b, offset, read);
203             if (read == -1) {
204                 String msg = "Premature EOF before end-of-record: "
205                     + getHeader().getHeaderFields();
206                 if (isStrict()) {
207                     throw new IOException(msg);
208                 }
209                 setEor(true);
210                 System.err.println(Level.WARNING.toString() + " " + msg);
211             }
212             if (this.digest != null && read >= 0) {
213                 this.digest.update(b, offset, read);
214             }
215             incrementPosition(read);
216         }
217         return read;
218     }
219 
220     /***
221 	 * This available is not the stream's available. Its an available based on
222 	 * what the stated Archive record length is minus what we've read to date.
223 	 * 
224 	 * @return bytes remaining in record content.
225 	 */
226     public int available() {
227         long amount = getHeader().getLength() - getPosition();
228         return (amount > Integer.MAX_VALUE? Integer.MAX_VALUE: (int)amount);
229     }
230 
231     /***
232      * Skip over this records content.
233      *
234      * @throws IOException
235      */
236     void skip() throws IOException {
237         if (this.eor) {
238             return;
239         }
240         
241         // Read to the end of the body of the record.  Exhaust the stream.
242         // Can't skip direct to end because underlying stream may be compressed
243         // and we're calculating the digest for the record.
244         while (available() > 0 && !this.eor) {
245             skip(available());
246         }
247     }
248     
249     public long skip(long n) throws IOException {
250         final int SKIP_BUFFERSIZE = 1024 * 4;
251         byte[] b = new byte[SKIP_BUFFERSIZE];
252         long total = 0;
253         for (int read = 0; (total < n) && (read != -1);) {
254             read = Math.min(SKIP_BUFFERSIZE, (int) (n - total));
255             // TODO: Interesting is that reading from compressed stream, we only
256             // read about 500 characters at a time though we ask for 4k.
257             // Look at this sometime.
258             read = read(b, 0, read);
259             if (read <= 0) {
260                 read = -1;
261             } else {
262                 total += read;
263             }
264         }
265         return total;
266     }
267 
268     /***
269      * @return Returns the strict.
270      */
271     public boolean isStrict() {
272         return this.strict;
273     }
274 
275     /***
276      * @param strict The strict to set.
277      */
278     public void setStrict(boolean strict) {
279         this.strict = strict;
280     }
281 
282 	protected InputStream getIn() {
283 		return this.in;
284 	}
285 
286 	public String getDigestStr() {
287 		return this.digestStr;
288 	}
289 	
290 	protected void incrementPosition() {
291 		this.position++;
292 	}
293 	
294 	protected void incrementPosition(final long incr) {
295 		this.position += incr;
296 	}
297 	
298 	protected long getPosition() {
299 		return this.position;
300 	}
301 
302 	protected boolean isEor() {
303 		return eor;
304 	}
305 
306 	protected void setEor(boolean eor) {
307 		this.eor = eor;
308 	}
309 	
310 	protected String getStatusCode4Cdx(final ArchiveRecordHeader h) {
311 		return "-";
312 	}
313 	
314 	protected String getIp4Cdx(final ArchiveRecordHeader h) {
315 		return "-";
316 	}
317 	
318 	protected String getDigest4Cdx(final ArchiveRecordHeader h) {
319 		return getDigestStr() == null? "-": getDigestStr();
320 	}
321     
322     protected String getMimetype4Cdx(final ArchiveRecordHeader h) {
323         return h.getMimetype();
324     }
325 
326     protected String outputCdx(final String strippedFileName)
327     throws IOException {
328         // Read the whole record so we get out a hash. Should be safe calling
329     	// close on already closed Record.
330         close();
331         ArchiveRecordHeader h = getHeader();
332         StringBuilder buffer =
333         	new StringBuilder(ArchiveFileConstants.CDX_LINE_BUFFER_SIZE);
334         buffer.append(h.getDate());
335         buffer.append(ArchiveFileConstants.SINGLE_SPACE);
336         buffer.append(getIp4Cdx(h));
337         buffer.append(ArchiveFileConstants.SINGLE_SPACE);
338         buffer.append(h.getUrl());
339         buffer.append(ArchiveFileConstants.SINGLE_SPACE);
340         buffer.append(getMimetype4Cdx(h));
341         buffer.append(ArchiveFileConstants.SINGLE_SPACE);
342         buffer.append(getStatusCode4Cdx(h));
343         buffer.append(ArchiveFileConstants.SINGLE_SPACE);
344         buffer.append(getDigest4Cdx(h));
345         buffer.append(ArchiveFileConstants.SINGLE_SPACE);
346         buffer.append(h.getOffset());
347         buffer.append(ArchiveFileConstants.SINGLE_SPACE);
348         buffer.append(h.getLength());
349         buffer.append(ArchiveFileConstants.SINGLE_SPACE);
350         buffer.append(strippedFileName != null? strippedFileName: '-');
351         return buffer.toString();
352     }
353     
354     /***
355      * Writes output on STDOUT.
356      * @throws IOException
357      */
358     public void dump()
359     throws IOException {
360     	dump(System.out);
361     }
362     
363     /***
364      * Writes output on passed <code>os</code>.
365      * @throws IOException
366      */
367     public void dump(final OutputStream os)
368     throws IOException {
369     	final byte [] outputBuffer = new byte [16*1024];
370         int read = outputBuffer.length;
371         while ((read = read(outputBuffer, 0, outputBuffer.length)) != -1) {
372             os.write(outputBuffer, 0, read);
373         }
374         os.flush();
375     }
376 }