1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.io;
24
25 import java.io.BufferedReader;
26 import java.io.File;
27 import java.io.FileInputStream;
28 import java.io.FileOutputStream;
29 import java.io.IOException;
30 import java.io.InputStreamReader;
31 import java.nio.ByteBuffer;
32 import java.nio.CharBuffer;
33 import java.nio.channels.FileChannel;
34 import java.nio.charset.Charset;
35 import java.util.logging.Level;
36 import java.util.logging.Logger;
37
38 import org.apache.commons.io.IOUtils;
39 import org.archive.util.FileUtils;
40
41 /***
42 * Provides a (Replay)CharSequence view on recorded streams (a prefix
43 * buffer and overflow backing file) that can handle streams of multibyte
44 * characters.
45 *
46 * For better performance on ISO-8859-1 text, use
47 * {@link Latin1ByteReplayCharSequence}.
48 *
49 * <p>Call close on this class when done so can clean up resources.
50 *
51 * <p>Implementation currently works by checking to see if content to read
52 * all fits the in-memory buffer. If so, we decode into a CharBuffer and
53 * keep this around for CharSequence operations. This CharBuffer is
54 * discarded on close.
55 *
56 * <p>If content length is greater than in-memory buffer, we decode the
57 * buffer plus backing file into a new file named for the backing file w/
58 * a suffix of the encoding we write the file as. We then run w/ a
59 * memory-mapped CharBuffer against this file to implement CharSequence.
60 * Reasons for this implemenation are that CharSequence wants to return the
61 * length of the CharSequence.
62 *
63 * <p>Obvious optimizations would keep around decodings whether the
64 * in-memory decoded buffer or the file of decodings written to disk but the
65 * general usage pattern processing URIs is that the decoding is used by one
66 * processor only. Also of note, files usually fit into the in-memory
67 * buffer.
68 *
69 * <p>We might also be able to keep up 3 windows that moved across the file
70 * decoding a window at a time trying to keep one of the buffers just in
71 * front of the regex processing returning it a length that would be only
72 * the length of current position to end of current block or else the length
73 * could be got by multipling the backing files length by the decoders'
74 * estimate of average character size. This would save us writing out the
75 * decoded file. We'd have to do the latter for files that are
76 * > Integer.MAX_VALUE.
77 *
78 * @author stack
79 * @version $Revision: 6090 $, $Date: 2008-12-09 23:36:27 +0000 (Tue, 09 Dec 2008) $
80 */
81 public class GenericReplayCharSequence implements ReplayCharSequence {
82
83 protected static Logger logger =
84 Logger.getLogger(GenericReplayCharSequence.class.getName());
85
86 /***
87 * Name of the encoding we use writing out concatenated decoded prefix
88 * buffer and decoded backing file.
89 *
90 * <p>This define is also used as suffix for the file that holds the
91 * decodings. The name of the file that holds the decoding is the name
92 * of the backing file w/ this encoding for a suffix.
93 *
94 * <p>See <a ref="http://java.sun.com/j2se/1.4.2/docs/guide/intl/encoding.doc.html">Encoding</a>.
95 */
96 private static final String WRITE_ENCODING = "UTF-16BE";
97
98 /***
99 * CharBuffer of decoded content.
100 *
101 * Content of this buffer is unicode.
102 */
103 private CharBuffer content = null;
104
105 /***
106 * File that has decoded content.
107 *
108 * Keep it around so we can remove on close.
109 */
110 private File decodedFile = null;
111
112
113 /***
114 * Constructor for all in-memory operation.
115 *
116 * @param buffer In-memory buffer of recordings prefix. We read from
117 * here first and will only go to the backing file if <code>size</code>
118 * requested is greater than <code>buffer.length</code>.
119 * @param size Total size of stream to replay in bytes. Used to find
120 * EOS. This is total length of content including HTTP headers if
121 * present.
122 * @param responseBodyStart Where the response body starts in bytes.
123 * Used to skip over the HTTP headers if present.
124 * @param backingFilename Path to backing file with content in excess of
125 * whats in <code>buffer</code>.
126 * @param encoding Encoding to use reading the passed prefix buffer and
127 * backing file. For now, should be java canonical name for the
128 * encoding. (If null is passed, we will default to
129 * ByteReplayCharSequence).
130 *
131 * @throws IOException
132 */
133 public GenericReplayCharSequence(byte[] buffer, long size,
134 long responseBodyStart, String encoding)
135 throws IOException {
136 super();
137 this.content = decodeInMemory(buffer, size, responseBodyStart,
138 encoding);
139 }
140
141 /***
142 * Constructor for overflow-to-disk-file operation.
143 *
144 * @param contentReplayInputStream inputStream of content
145 * @param backingFilename hint for name of temp file
146 * @param characterEncoding Encoding to use reading the stream.
147 * For now, should be java canonical name for the
148 * encoding.
149 *
150 * @throws IOException
151 */
152 public GenericReplayCharSequence(
153 ReplayInputStream contentReplayInputStream,
154 String backingFilename,
155 String characterEncoding)
156 throws IOException {
157 super();
158 this.content = decodeToFile(contentReplayInputStream,
159 backingFilename, characterEncoding);
160 }
161
162 /***
163 * Decode passed buffer and backing file into a CharBuffer.
164 *
165 * This method writes a new file made of the decoded concatenation of
166 * the in-memory prefix buffer and the backing file. Returns a
167 * charSequence view onto this new file.
168 *
169 * @param buffer In-memory buffer of recordings prefix. We read from
170 * here first and will only go to the backing file if <code>size</code>
171 * requested is greater than <code>buffer.length</code>.
172 * @param size Total size of stream to replay in bytes. Used to find
173 * EOS. This is total length of content including HTTP headers if
174 * present.
175 * @param responseBodyStart Where the response body starts in bytes.
176 * Used to skip over the HTTP headers if present.
177 * @param backingFilename Path to backing file with content in excess of
178 * whats in <code>buffer</code>.
179 * @param encoding Encoding to use reading the passed prefix buffer and
180 * backing file. For now, should be java canonical name for the
181 * encoding. (If null is passed, we will default to
182 * ByteReplayCharSequence).
183 *
184 * @return A CharBuffer view on decodings of the contents of passed
185 * buffer.
186 * @throws IOException
187 */
188 private CharBuffer decodeToFile(ReplayInputStream inStream,
189 String backingFilename, String encoding)
190 throws IOException {
191
192 CharBuffer charBuffer = null;
193
194 BufferedReader reader = new BufferedReader(
195 new InputStreamReader(inStream,encoding));
196
197 File backingFile = new File(backingFilename);
198 this.decodedFile = File.createTempFile(backingFile.getName(), WRITE_ENCODING, backingFile.getParentFile());
199 FileOutputStream fos;
200 fos = new FileOutputStream(this.decodedFile);
201
202 IOUtils.copy(reader, fos, WRITE_ENCODING);
203 fos.close();
204
205 charBuffer = getReadOnlyMemoryMappedBuffer(this.decodedFile).
206 asCharBuffer();
207
208 return charBuffer;
209 }
210
211 /***
212 * Decode passed buffer into a CharBuffer.
213 *
214 * This method decodes a memory buffer returning a memory buffer.
215 *
216 * @param buffer In-memory buffer of recordings prefix. We read from
217 * here first and will only go to the backing file if <code>size</code>
218 * requested is greater than <code>buffer.length</code>.
219 * @param size Total size of stream to replay in bytes. Used to find
220 * EOS. This is total length of content including HTTP headers if
221 * present.
222 * @param responseBodyStart Where the response body starts in bytes.
223 * Used to skip over the HTTP headers if present.
224 * @param encoding Encoding to use reading the passed prefix buffer and
225 * backing file. For now, should be java canonical name for the
226 * encoding. (If null is passed, we will default to
227 * ByteReplayCharSequence).
228 *
229 * @return A CharBuffer view on decodings of the contents of passed
230 * buffer.
231 */
232 private CharBuffer decodeInMemory(byte[] buffer, long size,
233 long responseBodyStart, String encoding)
234 {
235 ByteBuffer bb = ByteBuffer.wrap(buffer);
236
237 bb.position((int)responseBodyStart);
238
239 bb.limit((int)size);
240 return (Charset.forName(encoding)).decode(bb).asReadOnlyBuffer();
241 }
242
243 /***
244 * Create read-only memory-mapped buffer onto passed file.
245 *
246 * @param file File to get memory-mapped buffer on.
247 * @return Read-only memory-mapped ByteBuffer view on to passed file.
248 * @throws IOException
249 */
250 private ByteBuffer getReadOnlyMemoryMappedBuffer(File file)
251 throws IOException {
252
253 ByteBuffer bb = null;
254 FileInputStream in = null;
255 FileChannel c = null;
256 assert file.exists(): "No file " + file.getAbsolutePath();
257
258 try {
259 in = new FileInputStream(file);
260 c = in.getChannel();
261
262 int mapSize = (int)Math.min(c.size(), (long)Integer.MAX_VALUE);
263 if (mapSize < c.size()) {
264 logger.log(Level.WARNING, "only first 2GiB of temp file mapped, thread="
265 + Thread.currentThread().getName() + " file=" + file);
266 }
267
268
269
270 bb = c.map(FileChannel.MapMode.READ_ONLY, 0, mapSize).
271 asReadOnlyBuffer();
272 } finally {
273 if (c != null && c.isOpen()) {
274 c.close();
275 }
276 if (in != null) {
277 in.close();
278 }
279 }
280
281 return bb;
282 }
283
284 private void deleteFile(File fileToDelete) {
285 deleteFile(fileToDelete, null);
286 }
287
288 private void deleteFile(File fileToDelete, final Exception e) {
289 if (e != null) {
290
291
292 logger.severe("Deleting " + fileToDelete + " because of "
293 + e.toString());
294 }
295 if (fileToDelete != null && fileToDelete.exists()) {
296 FileUtils.deleteSoonerOrLater(fileToDelete);
297 }
298 }
299
300 public void close()
301 {
302 this.content = null;
303 deleteFile(this.decodedFile);
304
305
306
307 this.decodedFile = null;
308 }
309
310 protected void finalize() throws Throwable
311 {
312 super.finalize();
313
314 close();
315 }
316
317 public int length()
318 {
319 return this.content.limit();
320 }
321
322 public char charAt(int index)
323 {
324 return this.content.get(index);
325 }
326
327 public CharSequence subSequence(int start, int end) {
328 return new CharSubSequence(this, start, end);
329 }
330
331 public String toString() {
332 StringBuffer sb = new StringBuffer(length());
333
334 for (int i = 0;i<length();i++) {
335 sb.append(charAt(i));
336 }
337 return sb.toString();
338 }
339 }