View Javadoc

1   /* ANVLRecord
2   *
3   * $Id: ANVLRecord.java 6539 2009-10-03 01:08:25Z szznax $
4   *
5   * Created on July 26, 2006.
6   *
7   * Copyright (C) 2006 Internet Archive.
8   *
9   * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24  */
25  package org.archive.util.anvl;
26  
27  import java.io.ByteArrayOutputStream;
28  import java.io.IOException;
29  import java.io.InputStream;
30  import java.io.UnsupportedEncodingException;
31  import java.util.ArrayList;
32  import java.util.Collection;
33  import java.util.HashMap;
34  import java.util.Iterator;
35  import java.util.List;
36  import java.util.Map;
37  import java.util.logging.Level;
38  import java.util.logging.Logger;
39  
40  import org.archive.io.UTF8Bytes;
41  
42  /***
43   * An ordered {@link List} with 'data' {@link Element} values.
44   * ANVLRecords end with a blank line.
45   * 
46   * @see <a
47   * href="http://www.cdlib.org/inside/diglib/ark/anvlspec.pdf">A Name-Value
48   * Language (ANVL)</a>
49   * @author stack
50   */
51  public class ANVLRecord extends ArrayList<Element> implements UTF8Bytes {
52  	private static final long serialVersionUID = -4610638888453052958L;
53      private static final Logger logger = 
54          Logger.getLogger(ANVLRecord.class.getName());
55  	
56  	public static final String MIMETYPE = "application/warc-fields";
57  	
58  	public static final ANVLRecord EMPTY_ANVL_RECORD = new ANVLRecord();
59      
60      /***
61       * Arbitrary upper bound on maximum size of ANVL Record.
62       * Will throw an IOException if exceed this size.
63       */
64      public static final long MAXIMUM_SIZE = 1024 * 10;
65  	
66  	/***
67  	 * An ANVL 'newline'.
68  	 * @see <a href="http://en.wikipedia.org/wiki/CRLF">http://en.wikipedia.org/wiki/CRLF</a>
69  	 */
70      static final String CRLF = "\r\n";
71      
72      static final String FOLD_PREFIX = CRLF + ' ';
73      
74      public ANVLRecord() {
75          super();
76      }
77  
78      public ANVLRecord(Collection<? extends Element> c) {
79          super(c);
80      }
81  
82      public ANVLRecord(int initialCapacity) {
83          super(initialCapacity);
84      }
85      
86      public boolean addLabel(final String l) {
87      	return super.add(new Element(new Label(l)));
88      }
89  
90      public boolean addLabelValue(final String l, final String v) {
91      	try {
92      		return super.add(new Element(new Label(l), new Value(v)));
93      	} catch (IllegalArgumentException e) {
94      		logger.log(Level.WARNING, "bad label " + l + " or value " + v, e);
95      		return false;
96      	}
97      }
98      
99      @Override
100     public String toString() {
101         // TODO: What to emit for empty ANVLRecord?
102         StringBuilder sb = new StringBuilder();
103         for (final Iterator<Element> i = iterator(); i.hasNext();) {
104             sb.append(i.next());
105             sb.append(CRLF);
106         }
107         // 'ANVL Records end in a blank line'.
108         sb.append(CRLF);
109         return sb.toString();
110     }
111     
112     public Map<String, String> asMap() {
113         Map<String, String> m = new HashMap<String, String>(size());
114         for (final Iterator<Element> i = iterator(); i.hasNext();) {
115             Element e = i.next();
116             m.put(e.getLabel().toString(),
117                 e.isValue()? e.getValue().toString(): (String)null);
118         }
119         return m;
120     }
121     
122     @Override
123     public ANVLRecord clone() {
124         return new ANVLRecord(this);
125     }
126     
127     /***
128      * @return This ANVLRecord as UTF8 bytes.
129      */
130     public byte [] getUTF8Bytes()
131     throws UnsupportedEncodingException {
132         return toString().getBytes(UTF8);
133     }
134     
135     /***
136      * Parses a single ANVLRecord from passed InputStream.
137      * Read as a single-byte stream until we get to a CRLFCRLF which
138      * signifies End-of-ANVLRecord. Then parse all read as a UTF-8 Stream.
139      * Doing it this way, while requiring a double-scan, it  makes it so do not
140      * need to be passed a RepositionableStream or a Stream that supports
141      * marking.  Also no danger of over-reading which can happen when we
142      * wrap passed Stream with an InputStreamReader for doing UTF-8
143      * character conversion (See the ISR class comment).
144      * @param is InputStream
145      * @return An ANVLRecord instance.
146      * @throws IOException
147      */
148     public static ANVLRecord load(final InputStream is)
149     throws IOException {
150         // It doesn't look like a CRLF sequence is possible in UTF-8 without
151     	// it signifying CRLF: The top bits are set in multibyte characters.
152     	// Was thinking of recording CRLF as I was running through this first
153     	// parse but the offsets would then be incorrect if any multibyte
154     	// characters in the intervening gaps between CRLF.
155         boolean isCRLF = false;
156         boolean recordStart = false;
157         ByteArrayOutputStream baos = new ByteArrayOutputStream(1024);
158         boolean done = false;
159         int read = 0;
160         for (int c  = -1, previousCharacter; !done;) {
161             if (read++ >= MAXIMUM_SIZE) {
162                 throw new IOException("Read " + MAXIMUM_SIZE +
163                     " bytes without finding  //r//n//r//n " +
164                     "End-Of-ANVLRecord");
165             }
166             previousCharacter = c;
167             c = is.read();
168             if (c == -1) {
169                 throw new IOException("End-Of-Stream before //r//n//r//n " +
170                     "End-Of-ANVLRecord:\n" +
171                     new String(baos.toByteArray(), UTF8));
172             }
173             if (isLF((char)c) && isCR((char)previousCharacter)) {
174                 if (isCRLF) {
175                     // If we just had a CRLF, then its two CRLFs and its end of
176                     // record.  We're done.
177                     done = true;
178                 } else {
179                     isCRLF = true;
180                 }
181             } else if (!recordStart && Character.isWhitespace(c)) {
182                 // Skip any whitespace at start of ANVLRecord.
183                 continue;
184             } else {
185                 // Clear isCRLF flag if this character is NOT a '\r'.
186                 if (isCRLF && !isCR((char)c)) {
187                     isCRLF = false;
188                 }
189                 // Not whitespace so start record if we haven't already.
190                 if (!recordStart) {
191                     recordStart = true;
192                 }
193             }
194             baos.write(c);
195         }
196         return load(new String(baos.toByteArray(), UTF8));
197     }
198     
199     /*** 
200      * Parse passed String for an ANVL Record.
201      * Looked at writing javacc grammer but preprocessing is required to
202      * handle folding: See
203      * https://javacc.dev.java.net/servlets/BrowseList?list=users&by=thread&from=56173.
204      * Looked at Terence Parr's ANTLR.  More capable.  Can set lookahead count.
205      * A value of 3 would help with folding.  But its a pain defining UNICODE
206      * grammers -- needed by ANVL -- and support seems incomplete
207      * anyways: http://www.doc.ic.ac.uk/lab/secondyear/Antlr/lexer.html#unicode.
208      * For now, go with the below hand-rolled parser.
209      * @param s String with an ANVLRecord.
210      * @return ANVLRecord parsed from passed String.
211      * @throws IOException 
212      */
213     public static ANVLRecord load(final String s)
214     throws IOException {
215         ANVLRecord record = new ANVLRecord();
216         boolean inValue = false, inLabel = false, inComment = false, 
217             inNewLine = false;
218         String label = null;
219         StringBuilder sb = new StringBuilder(s.length());
220         for (int i = 0;  i < s.length(); i++) {
221             char c = s.charAt(i);
222            
223             // Assert I can do look-ahead.
224             if ((i + 1) > s.length()) {
225                 throw new IOException("Premature End-of-ANVLRecord:\n" +
226                     s.substring(i));
227             }
228             
229             // If at LF of a CRLF, just go around again. Eat up the LF.
230             if (inNewLine && isLF(c)) {
231                 continue;
232             }
233             
234             // If we're at a CRLF and we were just on one, exit. Found Record.
235             if (inNewLine && isCR(c) && isLF(s.charAt(i + 1))) {
236                 break;
237             }
238             
239             // Check if we're on a fold inside a Value. Skip multiple white
240             // space after CRLF. 
241             if (inNewLine && inValue && Character.isWhitespace(c)) {
242                 continue;
243             }
244             
245             // Else set flag if we're at a CRLF.
246             inNewLine = isCR(c) && isLF(s.charAt(i + 1));
247             
248             if (inNewLine) {
249                 if (inComment) {
250                     inComment = false;
251                 } else if (label != null && !inValue) {
252 					// Label only 'data element'.
253 					record.addLabel(label);
254 					label = null;
255 					sb.setLength(0);
256 				} else if (inValue) {
257 					// Assert I can do look-ahead past current CRLF.
258 					if ((i + 3) > s.length()) {
259 						throw new IOException("Premature End-of-ANVLRecord "
260 							+ "(2):\n" + s.substring(i));
261 					}
262 					if (!isCR(s.charAt(i + 2)) && !isLF(s.charAt(i + 3))
263 							&& Character.isWhitespace(s.charAt(i + 2))) {
264 						// Its a fold.  Let it go around. But add in a CRLF and
265 						// space and do it here.  We don't let CRLF fall through
266 						// to the sb.append on the end of this loop.
267 						sb.append(CRLF);
268 						sb.append(' ');
269 					} else {
270 						// Next line is a new SubElement, a new Comment or
271 						// Label.
272 						record.addLabelValue(label, sb.toString());
273 						sb.setLength(0);
274 						label = null;
275 						inValue = false;
276 					}
277 				} else {
278 					// We're whitespace between label and value or whitespace
279 					// before we've figured whether label or comment.
280 				}
281 				// Don't let the '\r' or CRLF through.
282 				continue;
283 			}
284             
285             if (inComment) {
286             	continue;
287             } else if (inLabel) {
288             	if (c == Label.COLON) {
289             		label = sb.toString();
290             		sb.setLength(0);
291             		inLabel = false;
292             		continue;
293             	}
294             } else {
295             	if (!inLabel && !inValue && !inComment) {
296             		// We have no state. Figure one.
297             		if (Character.isWhitespace(c)) {
298             			// If no state, and whitespace, skip. Don't record.
299             			continue;
300             		} else if (label == null && c == '#') {
301             			inComment = true;
302             			// Don't record comments.
303             			continue;
304             		} else if (label == null) {
305             			inLabel = true;
306             		} else {
307             			inValue = true;
308             		}
309             	}
310             }
311 			sb.append(c);
312         }
313         return record;
314     }
315     
316     /***
317      * @return Count of ANVLRecord bytes. Be careful, an empty ANVLRecord is
318      * CRLFCRLF so is of size 4.  Also, expensive, since it makes String of
319      * the record so it can count bytes.
320      */
321     public synchronized int getLength() {
322         int length = -1;
323         try {
324             length = getUTF8Bytes().length;
325         } catch (UnsupportedEncodingException e) {
326             throw new RuntimeException(e);
327         }
328         return length;
329     }
330     
331     public static boolean isCROrLF(final char c) {
332         return isCR(c) || isLF(c);
333     }
334     
335     public static boolean isCR(final char c) {
336         return c == ANVLRecord.CRLF.charAt(0);
337     }
338     
339     public static boolean isLF(final char c) {
340         return c == ANVLRecord.CRLF.charAt(1);
341     }
342 }