1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.io;
24
25 import java.io.IOException;
26 import java.io.InputStream;
27 import java.io.OutputStream;
28 import java.security.MessageDigest;
29 import java.security.NoSuchAlgorithmException;
30 import java.util.logging.Level;
31
32 import org.archive.util.Base32;
33
34 /***
35 * Archive file Record.
36 * @author stack
37 * @version $Date: 2010-04-09 18:59:44 +0000 (Fri, 09 Apr 2010) $ $Version$
38 */
39 public abstract class ArchiveRecord extends InputStream {
40 ArchiveRecordHeader header = null;
41
42 /***
43 * Stream to read this record from.
44 *
45 * Stream can only be read sequentially. Will only return this records'
46 * content returning a -1 if you try to read beyond the end of the current
47 * record.
48 *
49 * <p>Streams can be markable or not. If they are, we'll be able to roll
50 * back when we've read too far. If not markable, assumption is that
51 * the underlying stream is managing our not reading too much (This pertains
52 * to the skipping over the end of the ARCRecord. See {@link #skip()}.
53 */
54 InputStream in = null;
55
56 /***
57 * Position w/i the Record content, within <code>in</code>.
58 * This position is relative within this Record. Its not same as the
59 * Archive file position.
60 */
61 long position = 0;
62
63 /***
64 * Set flag when we've reached the end-of-record.
65 */
66 boolean eor = false;
67
68 /***
69 * Compute digest on what we read and add to metadata when done.
70 *
71 * Currently hardcoded as sha-1. TODO: Remove when archive records
72 * digest or else, add a facility that allows the arc reader to
73 * compare the calculated digest to that which is recorded in
74 * the arc.
75 *
76 * <p>Protected instead of private so subclasses can update and complete
77 * the digest.
78 */
79 protected MessageDigest digest = null;
80 private String digestStr = null;
81
82 boolean strict = false;
83
84 private ArchiveRecord() {
85 super();
86 }
87
88 /***
89 * Constructor.
90 *
91 * @param in Stream cue'd up to be at the start of the record this instance
92 * is to represent.
93 * @throws IOException
94 */
95 public ArchiveRecord(InputStream in)
96 throws IOException {
97 this(in, null, 0, true, false);
98 }
99
100 /***
101 * Constructor.
102 *
103 * @param in Stream cue'd up to be at the start of the record this instance
104 * is to represent.
105 * @param header Header data.
106 * @throws IOException
107 */
108 public ArchiveRecord(InputStream in, ArchiveRecordHeader header)
109 throws IOException {
110 this(in, header, 0, true, false);
111 }
112
113 /***
114 * Constructor.
115 *
116 * @param in Stream cue'd up to be at the start of the record this instance
117 * is to represent.
118 * @param header Header data.
119 * @param bodyOffset Offset into the body. Usually 0.
120 * @param digest True if we're to calculate digest for this record. Not
121 * digesting saves about ~15% of cpu during an ARC parse.
122 * @param strict Be strict parsing (Parsing stops if ARC inproperly
123 * formatted).
124 * @throws IOException
125 */
126 public ArchiveRecord(InputStream in, ArchiveRecordHeader header,
127 int bodyOffset, boolean digest, boolean strict)
128 throws IOException {
129 this.in = in;
130 this.header = header;
131 this.position = bodyOffset;
132 if (digest) {
133 try {
134 this.digest = MessageDigest.getInstance("SHA1");
135 } catch (NoSuchAlgorithmException e) {
136
137
138 throw new IOException(e.getMessage());
139 }
140 }
141 this.strict = strict;
142 }
143
144 public boolean markSupported() {
145 return false;
146 }
147
148 /***
149 * @return Header data for this record.
150 */
151 public ArchiveRecordHeader getHeader() {
152 return this.header;
153 }
154
155 protected void setHeader(ArchiveRecordHeader header) {
156 this.header = header;
157 }
158
159 /***
160 * Calling close on a record skips us past this record to the next record
161 * in the stream.
162 *
163 * It does not actually close the stream. The underlying steam is probably
164 * being used by the next arc record.
165 *
166 * @throws IOException
167 */
168 public void close() throws IOException {
169 if (this.in != null) {
170 skip();
171 this.in = null;
172 if (this.digest != null) {
173 this.digestStr = Base32.encode(this.digest.digest());
174 }
175 }
176 }
177
178 /***
179 * @return Next character in this Record content else -1 if at EOR.
180 * @throws IOException
181 */
182 public int read() throws IOException {
183 int c = -1;
184 if (available() > 0) {
185 c = this.in.read();
186 if (c == -1) {
187 throw new IOException("Premature EOF before end-of-record.");
188 }
189 if (this.digest != null) {
190 this.digest.update((byte) c);
191 }
192 incrementPosition();
193 }
194 return c;
195 }
196
197 public int read(byte[] b, int offset, int length) throws IOException {
198 int read = Math.min(length, available());
199 if (read == -1 || read == 0) {
200 read = -1;
201 } else {
202 read = this.in.read(b, offset, read);
203 if (read == -1) {
204 String msg = "Premature EOF before end-of-record: "
205 + getHeader().getHeaderFields();
206 if (isStrict()) {
207 throw new IOException(msg);
208 }
209 setEor(true);
210 System.err.println(Level.WARNING.toString() + " " + msg);
211 }
212 if (this.digest != null && read >= 0) {
213 this.digest.update(b, offset, read);
214 }
215 incrementPosition(read);
216 }
217 return read;
218 }
219
220 /***
221 * This available is not the stream's available. Its an available based on
222 * what the stated Archive record length is minus what we've read to date.
223 *
224 * @return bytes remaining in record content.
225 */
226 public int available() {
227 long amount = getHeader().getLength() - getPosition();
228 return (amount > Integer.MAX_VALUE? Integer.MAX_VALUE: (int)amount);
229 }
230
231 /***
232 * Skip over this records content.
233 *
234 * @throws IOException
235 */
236 void skip() throws IOException {
237 if (this.eor) {
238 return;
239 }
240
241
242
243
244 while (available() > 0 && !this.eor) {
245 skip(available());
246 }
247 }
248
249 public long skip(long n) throws IOException {
250 final int SKIP_BUFFERSIZE = 1024 * 4;
251 byte[] b = new byte[SKIP_BUFFERSIZE];
252 long total = 0;
253 for (int read = 0; (total < n) && (read != -1);) {
254 read = Math.min(SKIP_BUFFERSIZE, (int) (n - total));
255
256
257
258 read = read(b, 0, read);
259 if (read <= 0) {
260 read = -1;
261 } else {
262 total += read;
263 }
264 }
265 return total;
266 }
267
268 /***
269 * @return Returns the strict.
270 */
271 public boolean isStrict() {
272 return this.strict;
273 }
274
275 /***
276 * @param strict The strict to set.
277 */
278 public void setStrict(boolean strict) {
279 this.strict = strict;
280 }
281
282 protected InputStream getIn() {
283 return this.in;
284 }
285
286 public String getDigestStr() {
287 return this.digestStr;
288 }
289
290 protected void incrementPosition() {
291 this.position++;
292 }
293
294 protected void incrementPosition(final long incr) {
295 this.position += incr;
296 }
297
298 protected long getPosition() {
299 return this.position;
300 }
301
302 protected boolean isEor() {
303 return eor;
304 }
305
306 protected void setEor(boolean eor) {
307 this.eor = eor;
308 }
309
310 protected String getStatusCode4Cdx(final ArchiveRecordHeader h) {
311 return "-";
312 }
313
314 protected String getIp4Cdx(final ArchiveRecordHeader h) {
315 return "-";
316 }
317
318 protected String getDigest4Cdx(final ArchiveRecordHeader h) {
319 return getDigestStr() == null? "-": getDigestStr();
320 }
321
322 protected String getMimetype4Cdx(final ArchiveRecordHeader h) {
323 return h.getMimetype();
324 }
325
326 protected String outputCdx(final String strippedFileName)
327 throws IOException {
328
329
330 close();
331 ArchiveRecordHeader h = getHeader();
332 StringBuilder buffer =
333 new StringBuilder(ArchiveFileConstants.CDX_LINE_BUFFER_SIZE);
334 buffer.append(h.getDate());
335 buffer.append(ArchiveFileConstants.SINGLE_SPACE);
336 buffer.append(getIp4Cdx(h));
337 buffer.append(ArchiveFileConstants.SINGLE_SPACE);
338 buffer.append(h.getUrl());
339 buffer.append(ArchiveFileConstants.SINGLE_SPACE);
340 buffer.append(getMimetype4Cdx(h));
341 buffer.append(ArchiveFileConstants.SINGLE_SPACE);
342 buffer.append(getStatusCode4Cdx(h));
343 buffer.append(ArchiveFileConstants.SINGLE_SPACE);
344 buffer.append(getDigest4Cdx(h));
345 buffer.append(ArchiveFileConstants.SINGLE_SPACE);
346 buffer.append(h.getOffset());
347 buffer.append(ArchiveFileConstants.SINGLE_SPACE);
348 buffer.append(h.getLength());
349 buffer.append(ArchiveFileConstants.SINGLE_SPACE);
350 buffer.append(strippedFileName != null? strippedFileName: '-');
351 return buffer.toString();
352 }
353
354 /***
355 * Writes output on STDOUT.
356 * @throws IOException
357 */
358 public void dump()
359 throws IOException {
360 dump(System.out);
361 }
362
363 /***
364 * Writes output on passed <code>os</code>.
365 * @throws IOException
366 */
367 public void dump(final OutputStream os)
368 throws IOException {
369 final byte [] outputBuffer = new byte [16*1024];
370 int read = outputBuffer.length;
371 while ((read = read(outputBuffer, 0, outputBuffer.length)) != -1) {
372 os.write(outputBuffer, 0, read);
373 }
374 os.flush();
375 }
376 }