View Javadoc

1   /* ARCRecordMetaData
2    *
3    * $Id: ARCRecordMetaData.java 4547 2006-08-28 23:44:20Z stack-sf $
4    *
5    * Created on Jan 7, 2004
6    *
7    * Copyright (C) 2004 Internet Archive.
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.io.arc;
26  
27  import java.io.File;
28  import java.io.IOException;
29  import java.util.Iterator;
30  import java.util.Map;
31  import java.util.Set;
32  
33  import org.archive.io.ArchiveRecordHeader;
34  
35  
36  /***
37   * An immutable class to hold an ARC record meta data.
38   *
39   * @author stack
40   */
41  public class ARCRecordMetaData implements ArchiveRecordHeader, ARCConstants {
42      /***
43       * Map of record header fields.
44       *
45       * We store all in a hashmap.  This way we can hold version 1 or
46       * version 2 record meta data.
47       *
48       * <p>Keys are lowercase.
49       */
50      protected Map headerFields = null;
51      
52      /***
53       * Digest for the record.
54       * 
55       * Only available after the record has been read in totality.
56       */
57      private String digest = null;
58      
59      /***
60       * Status for this request.
61       * 
62       * There may be no status.
63       */
64      private String statusCode = null;
65      
66      /***
67       * The arc this metadata came out.
68       * Descriptive String, either path or URL.
69       */
70      private String arc = null;
71      
72      private int contentBegin = 0;
73      
74      /***
75       * Shut down the default constructor.
76       */
77      protected ARCRecordMetaData() {
78          super();
79      }
80  
81      /***
82       * Constructor.
83       *
84       * @param arc The arc file this metadata came out of.
85       * @param headerFields Hash of meta fields.
86       *
87       * @throws IOException
88       */
89      public ARCRecordMetaData(final String arc, Map headerFields)
90          throws IOException {
91          // Make sure the minimum required fields are present,
92          for (Iterator i = REQUIRED_VERSION_1_HEADER_FIELDS.iterator();
93              i.hasNext(); ) {
94              testRequiredField(headerFields, (String)i.next());
95          }
96          this.headerFields = headerFields;
97          this.arc = arc;
98      }
99  
100     /***
101      * Test required field is present in hash.
102      *
103      * @param fields Map of fields.
104      * @param requiredField Field to test for.
105      *
106      * @exception IOException If required field is not present.
107      */
108     protected void testRequiredField(Map fields, String requiredField)
109         throws IOException {
110         if (!fields.containsKey(requiredField)) {
111             throw new IOException("Required field " + requiredField +
112             " not in meta data.");
113         }
114     }
115 
116     /***
117      * Get the time when the record was harvested.
118      * <p>
119      * Returns the date in Heritrix 14 digit time format (UTC). See the
120      * {@link org.archive.util.ArchiveUtils} class for converting to Java
121      * dates.
122      * 
123      * @return Header date in Heritrix 14 digit format.
124      * @see org.archive.util.ArchiveUtils#parse14DigitDate(String)
125      */
126     public String getDate() {
127         return (String) this.headerFields.get(DATE_FIELD_KEY);
128     }
129 
130     /***
131      * @return Return length of the record.
132      */
133     public long getLength() {
134         return Long.parseLong((String)this.headerFields.
135             get(LENGTH_FIELD_KEY));
136     }
137 
138     /***
139      * @return Header url.
140      */
141     public String getUrl() {
142         return (String)this.headerFields.get(URL_FIELD_KEY);
143     }
144 
145     /***
146      * @return IP.
147      */
148     public String getIp()
149     {
150         return (String)this.headerFields.get(IP_HEADER_FIELD_KEY);
151     }
152 
153     /***
154      * @return mimetype The mimetype that is in the ARC metaline -- NOT the http
155      * content-type content.
156      */
157     public String getMimetype() {
158         return (String)this.headerFields.get(MIMETYPE_FIELD_KEY);
159     }
160 
161     /***
162      * @return Arcfile version.
163      */
164     public String getVersion() {
165         return (String)this.headerFields.get(VERSION_FIELD_KEY);
166     }
167 
168     /***
169      * @return Offset into arcfile at which this record begins.
170      */
171     public long getOffset() {
172         return ((Long)this.headerFields.get(ABSOLUTE_OFFSET_KEY)).longValue();
173     }
174 
175     /***
176      * @param key Key to use looking up field value.
177      * @return value for passed key of null if no such entry.
178      */
179     public Object getHeaderValue(String key) {
180         return this.headerFields.get(key);
181     }
182 
183     /***
184      * @return Header field name keys.
185      */
186     public Set getHeaderFieldKeys()
187     {
188         return this.headerFields.keySet();
189     }
190 
191     /***
192      * @return Map of header fields.
193      */
194     public Map getHeaderFields() {
195         return this.headerFields;
196     }
197     
198     /***
199      * @return Returns identifier for ARC.
200      */
201     public String getArc() {
202         return this.arc;
203     }
204     
205     /***
206      * @return Convenience method that does a
207      * return new File(this.arc) (Be aware this.arc is not always
208      * full path to an ARC file -- may be an URL).  Test
209      * returned file for existence.
210      */
211     public File getArcFile() {
212         return new File(this.arc);
213     }
214     
215     /***
216      * @return Returns the digest.
217      */
218     public String getDigest() {
219         return this.digest;
220     }
221     
222     /***
223      * @param d The digest to set.
224      */
225     public void setDigest(String d) {
226         this.digest = d;
227     }
228     
229     /***
230      * @return Returns the statusCode.  May be null.
231      */
232     public String getStatusCode() {
233         return this.statusCode;
234     }
235     
236     /***
237      * @param statusCode The statusCode to set.
238      */
239     public void setStatusCode(String statusCode) {
240         this.statusCode = statusCode;
241     }
242     
243     public String toString() {
244         return ((this.arc != null)? this.arc: "") +
245            ": " +
246            ((this.headerFields != null)? this.headerFields.toString():  "");
247     }
248 
249 	public String getReaderIdentifier() {
250 		return this.getArc();
251 	}
252 
253 	public String getRecordIdentifier() {
254 	    return getDate() + "/" + getUrl();
255 	}
256 
257     public int getContentBegin() {
258         return this.contentBegin;
259     }
260     
261     void setContentBegin(final int offset) {
262         this.contentBegin = offset;
263     }
264 }