1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.util.anvl;
26
27 import java.io.ByteArrayOutputStream;
28 import java.io.IOException;
29 import java.io.InputStream;
30 import java.io.UnsupportedEncodingException;
31 import java.util.ArrayList;
32 import java.util.Collection;
33 import java.util.HashMap;
34 import java.util.Iterator;
35 import java.util.List;
36 import java.util.Map;
37 import java.util.logging.Level;
38 import java.util.logging.Logger;
39
40 import org.archive.io.UTF8Bytes;
41
42 /***
43 * An ordered {@link List} with 'data' {@link Element} values.
44 * ANVLRecords end with a blank line.
45 *
46 * @see <a
47 * href="http://www.cdlib.org/inside/diglib/ark/anvlspec.pdf">A Name-Value
48 * Language (ANVL)</a>
49 * @author stack
50 */
51 public class ANVLRecord extends ArrayList<Element> implements UTF8Bytes {
52 private static final long serialVersionUID = -4610638888453052958L;
53 private static final Logger logger =
54 Logger.getLogger(ANVLRecord.class.getName());
55
56 public static final String MIMETYPE = "application/warc-fields";
57
58 public static final ANVLRecord EMPTY_ANVL_RECORD = new ANVLRecord();
59
60 /***
61 * Arbitrary upper bound on maximum size of ANVL Record.
62 * Will throw an IOException if exceed this size.
63 */
64 public static final long MAXIMUM_SIZE = 1024 * 10;
65
66 /***
67 * An ANVL 'newline'.
68 * @see <a href="http://en.wikipedia.org/wiki/CRLF">http://en.wikipedia.org/wiki/CRLF</a>
69 */
70 static final String CRLF = "\r\n";
71
72 static final String FOLD_PREFIX = CRLF + ' ';
73
74 public ANVLRecord() {
75 super();
76 }
77
78 public ANVLRecord(Collection<? extends Element> c) {
79 super(c);
80 }
81
82 public ANVLRecord(int initialCapacity) {
83 super(initialCapacity);
84 }
85
86 public boolean addLabel(final String l) {
87 return super.add(new Element(new Label(l)));
88 }
89
90 public boolean addLabelValue(final String l, final String v) {
91 try {
92 return super.add(new Element(new Label(l), new Value(v)));
93 } catch (IllegalArgumentException e) {
94 logger.log(Level.WARNING, "bad label " + l + " or value " + v, e);
95 return false;
96 }
97 }
98
99 @Override
100 public String toString() {
101
102 StringBuilder sb = new StringBuilder();
103 for (final Iterator<Element> i = iterator(); i.hasNext();) {
104 sb.append(i.next());
105 sb.append(CRLF);
106 }
107
108 sb.append(CRLF);
109 return sb.toString();
110 }
111
112 public Map<String, String> asMap() {
113 Map<String, String> m = new HashMap<String, String>(size());
114 for (final Iterator<Element> i = iterator(); i.hasNext();) {
115 Element e = i.next();
116 m.put(e.getLabel().toString(),
117 e.isValue()? e.getValue().toString(): (String)null);
118 }
119 return m;
120 }
121
122 @Override
123 public ANVLRecord clone() {
124 return new ANVLRecord(this);
125 }
126
127 /***
128 * @return This ANVLRecord as UTF8 bytes.
129 */
130 public byte [] getUTF8Bytes()
131 throws UnsupportedEncodingException {
132 return toString().getBytes(UTF8);
133 }
134
135 /***
136 * Parses a single ANVLRecord from passed InputStream.
137 * Read as a single-byte stream until we get to a CRLFCRLF which
138 * signifies End-of-ANVLRecord. Then parse all read as a UTF-8 Stream.
139 * Doing it this way, while requiring a double-scan, it makes it so do not
140 * need to be passed a RepositionableStream or a Stream that supports
141 * marking. Also no danger of over-reading which can happen when we
142 * wrap passed Stream with an InputStreamReader for doing UTF-8
143 * character conversion (See the ISR class comment).
144 * @param is InputStream
145 * @return An ANVLRecord instance.
146 * @throws IOException
147 */
148 public static ANVLRecord load(final InputStream is)
149 throws IOException {
150
151
152
153
154
155 boolean isCRLF = false;
156 boolean recordStart = false;
157 ByteArrayOutputStream baos = new ByteArrayOutputStream(1024);
158 boolean done = false;
159 int read = 0;
160 for (int c = -1, previousCharacter; !done;) {
161 if (read++ >= MAXIMUM_SIZE) {
162 throw new IOException("Read " + MAXIMUM_SIZE +
163 " bytes without finding //r//n//r//n " +
164 "End-Of-ANVLRecord");
165 }
166 previousCharacter = c;
167 c = is.read();
168 if (c == -1) {
169 throw new IOException("End-Of-Stream before //r//n//r//n " +
170 "End-Of-ANVLRecord:\n" +
171 new String(baos.toByteArray(), UTF8));
172 }
173 if (isLF((char)c) && isCR((char)previousCharacter)) {
174 if (isCRLF) {
175
176
177 done = true;
178 } else {
179 isCRLF = true;
180 }
181 } else if (!recordStart && Character.isWhitespace(c)) {
182
183 continue;
184 } else {
185
186 if (isCRLF && !isCR((char)c)) {
187 isCRLF = false;
188 }
189
190 if (!recordStart) {
191 recordStart = true;
192 }
193 }
194 baos.write(c);
195 }
196 return load(new String(baos.toByteArray(), UTF8));
197 }
198
199 /***
200 * Parse passed String for an ANVL Record.
201 * Looked at writing javacc grammer but preprocessing is required to
202 * handle folding: See
203 * https://javacc.dev.java.net/servlets/BrowseList?list=users&by=thread&from=56173.
204 * Looked at Terence Parr's ANTLR. More capable. Can set lookahead count.
205 * A value of 3 would help with folding. But its a pain defining UNICODE
206 * grammers -- needed by ANVL -- and support seems incomplete
207 * anyways: http://www.doc.ic.ac.uk/lab/secondyear/Antlr/lexer.html#unicode.
208 * For now, go with the below hand-rolled parser.
209 * @param s String with an ANVLRecord.
210 * @return ANVLRecord parsed from passed String.
211 * @throws IOException
212 */
213 public static ANVLRecord load(final String s)
214 throws IOException {
215 ANVLRecord record = new ANVLRecord();
216 boolean inValue = false, inLabel = false, inComment = false,
217 inNewLine = false;
218 String label = null;
219 StringBuilder sb = new StringBuilder(s.length());
220 for (int i = 0; i < s.length(); i++) {
221 char c = s.charAt(i);
222
223
224 if ((i + 1) > s.length()) {
225 throw new IOException("Premature End-of-ANVLRecord:\n" +
226 s.substring(i));
227 }
228
229
230 if (inNewLine && isLF(c)) {
231 continue;
232 }
233
234
235 if (inNewLine && isCR(c) && isLF(s.charAt(i + 1))) {
236 break;
237 }
238
239
240
241 if (inNewLine && inValue && Character.isWhitespace(c)) {
242 continue;
243 }
244
245
246 inNewLine = isCR(c) && isLF(s.charAt(i + 1));
247
248 if (inNewLine) {
249 if (inComment) {
250 inComment = false;
251 } else if (label != null && !inValue) {
252
253 record.addLabel(label);
254 label = null;
255 sb.setLength(0);
256 } else if (inValue) {
257
258 if ((i + 3) > s.length()) {
259 throw new IOException("Premature End-of-ANVLRecord "
260 + "(2):\n" + s.substring(i));
261 }
262 if (!isCR(s.charAt(i + 2)) && !isLF(s.charAt(i + 3))
263 && Character.isWhitespace(s.charAt(i + 2))) {
264
265
266
267 sb.append(CRLF);
268 sb.append(' ');
269 } else {
270
271
272 record.addLabelValue(label, sb.toString());
273 sb.setLength(0);
274 label = null;
275 inValue = false;
276 }
277 } else {
278
279
280 }
281
282 continue;
283 }
284
285 if (inComment) {
286 continue;
287 } else if (inLabel) {
288 if (c == Label.COLON) {
289 label = sb.toString();
290 sb.setLength(0);
291 inLabel = false;
292 continue;
293 }
294 } else {
295 if (!inLabel && !inValue && !inComment) {
296
297 if (Character.isWhitespace(c)) {
298
299 continue;
300 } else if (label == null && c == '#') {
301 inComment = true;
302
303 continue;
304 } else if (label == null) {
305 inLabel = true;
306 } else {
307 inValue = true;
308 }
309 }
310 }
311 sb.append(c);
312 }
313 return record;
314 }
315
316 /***
317 * @return Count of ANVLRecord bytes. Be careful, an empty ANVLRecord is
318 * CRLFCRLF so is of size 4. Also, expensive, since it makes String of
319 * the record so it can count bytes.
320 */
321 public synchronized int getLength() {
322 int length = -1;
323 try {
324 length = getUTF8Bytes().length;
325 } catch (UnsupportedEncodingException e) {
326 throw new RuntimeException(e);
327 }
328 return length;
329 }
330
331 public static boolean isCROrLF(final char c) {
332 return isCR(c) || isLF(c);
333 }
334
335 public static boolean isCR(final char c) {
336 return c == ANVLRecord.CRLF.charAt(0);
337 }
338
339 public static boolean isLF(final char c) {
340 return c == ANVLRecord.CRLF.charAt(1);
341 }
342 }