1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.io.arc;
26
27 import java.io.ByteArrayInputStream;
28 import java.io.ByteArrayOutputStream;
29 import java.io.IOException;
30 import java.io.InputStream;
31 import java.util.ArrayList;
32 import java.util.List;
33 import org.apache.commons.httpclient.Header;
34 import org.apache.commons.httpclient.HttpParser;
35 import org.apache.commons.httpclient.StatusLine;
36 import org.apache.commons.httpclient.util.EncodingUtil;
37 import org.apache.commons.lang.StringUtils;
38 import org.archive.io.ArchiveRecord;
39 import org.archive.io.ArchiveRecordHeader;
40 import org.archive.io.RecoverableIOException;
41
42
/**
 * An ARC file record.
 * Does not encompass the ARCRecord metadata line, just the record content.
 * @author stack
 */
48 public class ARCRecord extends ArchiveRecord implements ARCConstants {
49 /***
50 * Http status line object.
51 *
52 * May be null if record is not http.
53 */
54 private StatusLine httpStatus = null;
55
56 /***
57 * Http header bytes.
58 *
59 * If non-null and bytes available, give out its contents before we
60 * go back to the underlying stream.
61 */
62 private InputStream httpHeaderStream = null;
63
64 /***
65 * Http headers.
66 *
67 * Only populated after reading of headers.
68 */
69 private Header [] httpHeaders = null;
70
71
72 /***
73 * Minimal http header length.
74 *
75 * I've seen in arcs content length of 1 with no
76 * header.
77 */
78 private static final long MIN_HTTP_HEADER_LENGTH =
79 "HTTP/1.1 200 OK\r\n".length();
80
81 /***
82 * verbatim ARC record header string
83 */
84 private String headerString;
85 private void fillHeaderString() {
86 List<String> hl = new ArrayList<String>();
87 for (String key : ARCReader.HEADER_FIELD_NAME_KEYS)
88 hl.add((String) this.getMetaData().getHeaderValue(key));
89 this.headerString = StringUtils.join(hl," ");
90 }
91 public String getHeaderString() {
92 return this.headerString;
93 }
94
95 /***
96 * Constructor.
97 *
98 * @param in Stream cue'd up to be at the start of the record this instance
99 * is to represent.
100 * @param metaData Meta data.
101 * @throws IOException
102 */
103 public ARCRecord(InputStream in, ArchiveRecordHeader metaData)
104 throws IOException {
105 this(in, metaData, 0, true, false, true);
106 }
107
108 /***
109 * Constructor.
110 *
111 * @param in Stream cue'd up to be at the start of the record this instance
112 * is to represent.
113 * @param metaData Meta data.
114 * @param bodyOffset Offset into the body. Usually 0.
115 * @param digest True if we're to calculate digest for this record. Not
116 * digesting saves about ~15% of cpu during an ARC parse.
117 * @param strict Be strict parsing (Parsing stops if ARC inproperly
118 * formatted).
119 * @param parseHttpHeaders True if we are to parse HTTP headers. Costs
120 * about ~20% of CPU during an ARC parse.
121 * @throws IOException
122 */
123 public ARCRecord(InputStream in, ArchiveRecordHeader metaData,
124 int bodyOffset, boolean digest, boolean strict,
125 final boolean parseHttpHeaders)
126 throws IOException {
127 super(in, metaData, bodyOffset, digest, strict);
128 this.fillHeaderString();
129 if (parseHttpHeaders) {
130 this.httpHeaderStream = readHttpHeader();
131 }
132 }
133
134 /***
135 * Skip over the the http header if one present.
136 *
137 * Subsequent reads will get the body.
138 *
139 * <p>Calling this method in the midst of reading the header
140 * will make for strange results. Otherwise, safe to call
141 * at any time though before reading any of the arc record
142 * content is only time that it makes sense.
143 *
144 * <p>After calling this method, you can call
145 * {@link #getHttpHeaders()} to get the read http header.
146 *
147 * @throws IOException
148 */
149 public void skipHttpHeader() throws IOException {
150 if (this.httpHeaderStream != null) {
151
152 for (int available = this.httpHeaderStream.available();
153 this.httpHeaderStream != null &&
154 (available = this.httpHeaderStream.available()) > 0;) {
155
156
157 byte [] buffer = new byte[available];
158
159
160 read(buffer, 0, available);
161 }
162 }
163 }
164
165 public void dumpHttpHeader() throws IOException {
166 if (this.httpHeaderStream == null) {
167 return;
168 }
169
170 for (int available = this.httpHeaderStream.available();
171 this.httpHeaderStream != null
172 && (available = this.httpHeaderStream.available()) > 0;) {
173
174
175 byte[] buffer = new byte[available];
176
177
178 int read = read(buffer, 0, available);
179 System.out.write(buffer, 0, read);
180 }
181 }
182
183 /***
184 * Read http header if present. Technique borrowed from HttpClient HttpParse
185 * class.
186 *
187 * @return ByteArrayInputStream with the http header in it or null if no
188 * http header.
189 * @throws IOException
190 */
191 private InputStream readHttpHeader() throws IOException {
192
193
194 if(!getHeader().getUrl().startsWith("http") ||
195 getHeader().getLength() <= MIN_HTTP_HEADER_LENGTH) {
196 return null;
197 }
198 byte [] statusBytes = HttpParser.readRawLine(getIn());
199 int eolCharCount = getEolCharsCount(statusBytes);
200 if (eolCharCount <= 0) {
201 throw new IOException(
202 "Failed to read http status where one was expected: "
203 + ((statusBytes == null) ? "" : new String(statusBytes)));
204 }
205 String statusLine = EncodingUtil.getString(statusBytes, 0,
206 statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING);
207 if ((statusLine == null) ||
208 !StatusLine.startsWithHTTP(statusLine)) {
209 if (statusLine.startsWith("DELETED")) {
210
211
212
213
214
215
216
217
218 throw new DeletedARCRecordIOException(statusLine);
219 } else {
220 throw new IOException("Failed parse of http status line.");
221 }
222 }
223 this.httpStatus = new StatusLine(statusLine);
224
225
226
227
228
229 ByteArrayOutputStream baos =
230 new ByteArrayOutputStream(statusBytes.length + 4 * 1024);
231 baos.write(statusBytes);
232
233
234
235 for (byte [] lineBytes = null; true;) {
236 lineBytes = HttpParser.readRawLine(getIn());
237 eolCharCount = getEolCharsCount(lineBytes);
238 if (eolCharCount <= 0) {
239 throw new IOException("Failed reading http headers: " +
240 ((lineBytes != null)? new String(lineBytes): null));
241 }
242
243 baos.write(lineBytes);
244 if ((lineBytes.length - eolCharCount) <= 0) {
245
246 break;
247 }
248 }
249
250 byte [] headerBytes = baos.toByteArray();
251
252 this.getMetaData().setContentBegin(headerBytes.length);
253 ByteArrayInputStream bais =
254 new ByteArrayInputStream(headerBytes);
255 if (!bais.markSupported()) {
256 throw new IOException("ByteArrayInputStream does not support mark");
257 }
258 bais.mark(headerBytes.length);
259
260
261 bais.read(statusBytes, 0, statusBytes.length);
262 this.httpHeaders = HttpParser.parseHeaders(bais,
263 ARCConstants.DEFAULT_ENCODING);
264 this.getMetaData().setStatusCode(Integer.toString(getStatusCode()));
265 bais.reset();
266 return bais;
267 }
268
269 private static class DeletedARCRecordIOException
270 extends RecoverableIOException {
271 public DeletedARCRecordIOException(final String reason) {
272 super(reason);
273 }
274 }
275
276 /***
277 * Return status code for this record.
278 *
279 * This method will return -1 until the http header has been read.
280 * @return Status code.
281 */
282 public int getStatusCode() {
283 return (this.httpStatus == null)? -1: this.httpStatus.getStatusCode();
284 }
285
286 /***
287 * @param bytes Array of bytes to examine for an EOL.
288 * @return Count of end-of-line characters or zero if none.
289 */
290 private int getEolCharsCount(byte [] bytes) {
291 int count = 0;
292 if (bytes != null && bytes.length >=1 &&
293 bytes[bytes.length - 1] == '\n') {
294 count++;
295 if (bytes.length >=2 && bytes[bytes.length -2] == '\r') {
296 count++;
297 }
298 }
299 return count;
300 }
301
302 /***
303 * @return Meta data for this record.
304 */
305 public ARCRecordMetaData getMetaData() {
306 return (ARCRecordMetaData)getHeader();
307 }
308
309 /***
310 * @return http headers (Only available after header has been read).
311 */
312 public Header [] getHttpHeaders() {
313 return this.httpHeaders;
314 }
315
316 /***
317 * @return Next character in this ARCRecord's content else -1 if at end of
318 * this record.
319 * @throws IOException
320 */
321 public int read() throws IOException {
322 int c = -1;
323 if (this.httpHeaderStream != null &&
324 (this.httpHeaderStream.available() > 0)) {
325
326
327 c = this.httpHeaderStream.read();
328
329 if (this.httpHeaderStream.available() <= 0) {
330 this.httpHeaderStream = null;
331 }
332 incrementPosition();
333 } else {
334 c = super.read();
335 }
336 return c;
337 }
338
339 public int read(byte [] b, int offset, int length) throws IOException {
340 int read = -1;
341 if (this.httpHeaderStream != null &&
342 (this.httpHeaderStream.available() > 0)) {
343
344
345 read = Math.min(length, this.httpHeaderStream.available());
346 if (read == 0) {
347 read = -1;
348 } else {
349 read = this.httpHeaderStream.read(b, offset, read);
350 }
351
352 if (this.httpHeaderStream.available() <= 0) {
353 this.httpHeaderStream = null;
354 }
355 incrementPosition(read);
356 } else {
357 read = super.read(b, offset, length);
358 }
359 return read;
360 }
361
362 /***
363 * @return Offset at which the body begins (Only known after
364 * header has been read) or -1 if none or if we haven't read
365 * headers yet. Usually length of HTTP headers (does not include ARC
366 * metadata line length).
367 */
368 public int getBodyOffset() {
369 return this.getMetaData().getContentBegin();
370 }
371
372 @Override
373 protected String getIp4Cdx(ArchiveRecordHeader h) {
374 String result = null;
375 if (h instanceof ARCRecordMetaData) {
376 result = ((ARCRecordMetaData)h).getIp();
377 }
378 return (result != null)? result: super.getIp4Cdx(h);
379 }
380
381 @Override
382 protected String getStatusCode4Cdx(ArchiveRecordHeader h) {
383 String result = null;
384 if (h instanceof ARCRecordMetaData) {
385 result = ((ARCRecordMetaData) h).getStatusCode();
386 }
387 return (result != null) ? result: super.getStatusCode4Cdx(h);
388 }
389
390 @Override
391 protected String getDigest4Cdx(ArchiveRecordHeader h) {
392 String result = null;
393 if (h instanceof ARCRecordMetaData) {
394 result = ((ARCRecordMetaData) h).getDigest();
395 }
396 return (result != null) ? result: super.getDigest4Cdx(h);
397 }
398 }