1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.io;
24
25 import java.io.IOException;
26 import java.io.RandomAccessFile;
27 import java.util.logging.Level;
28 import java.util.logging.Logger;
29
30 import org.archive.util.DevUtils;
31
32 /***
33 * Provides a (Replay)CharSequence view on recorded stream bytes (a prefix
34 * buffer and overflow backing file).
35 *
36 * Assumes the byte stream is ISO-8859-1 text, taking advantage of the fact
37 * that each byte in the stream corresponds to a single unicode character with
38 * the same numerical value as the byte.
39 *
40 * <p>Uses a wraparound rolling buffer of the last windowSize bytes read
41 * from disk in memory; as long as the 'random access' of a CharSequence
42 * user stays within this window, access should remain fairly efficient.
43 * (So design any regexps pointed at these CharSequences to work within
44 * that range!)
45 *
46 * <p>When rereading of a location is necessary, the whole window is
47 * recentered around the location requested. (TODO: More research
48 * into whether this is the best strategy.)
49 *
50 * <p>An implementation of a ReplayCharSequence done with ByteBuffers -- one
51 * to wrap the passed prefix buffer and the second, a memory-mapped
52 * ByteBuffer view into the backing file -- was consistently slower: ~10%.
53 * My tests did the following. Made a buffer filled w/ regular content.
54 * This buffer was used as the prefix buffer. The buffer content was
55 * written MULTIPLER times to a backing file. I then did accesses w/ the
56 * following pattern: Skip forward 32 bytes, then back 16 bytes, and then
57 * read forward from byte 16-32. Repeat. Though I varied the size of the
58 * buffer to the size of the backing file,from 3-10, the difference of 10%
59 * or so seemed to persist. Same if I tried to favor get() over get(index).
60 * I used a profiler, JMP, to study times taken (St.Ack did above comment).
61 *
62 * <p>TODO determine in memory mapped files is better way to do this;
63 * probably not -- they don't offer the level of control over
64 * total memory used that this approach does.
65 *
66 * @author Gordon Mohr
67 * @version $Revision: 6512 $, $Date: 2009-09-23 03:02:29 +0000 (Wed, 23 Sep 2009) $
68 */
69 class Latin1ByteReplayCharSequence implements ReplayCharSequence {
70
71 protected static Logger logger =
72 Logger.getLogger(Latin1ByteReplayCharSequence.class.getName());
73
74 /***
75 * Buffer that holds the first bit of content.
76 *
77 * Once this is exhausted we go to the backing file.
78 */
79 private byte[] prefixBuffer;
80
81 /***
82 * Total length of character stream to replay minus the HTTP headers
83 * if present.
84 *
85 * Used to find EOS.
86 */
87 protected int length;
88
89 /***
90 * Absolute length of the stream.
91 *
92 * Includes HTTP headers. Needed doing calc. in the below figuring
93 * how much to load into buffer.
94 */
95 private int absoluteLength = -1;
96
97 /***
98 * Buffer window on to backing file.
99 */
100 private byte[] wraparoundBuffer;
101
102 /***
103 * Absolute index into underlying bytestream where wrap starts.
104 */
105 private int wrapOrigin;
106
107 /***
108 * Index in wraparoundBuffer that corresponds to wrapOrigin
109 */
110 private int wrapOffset;
111
112 /***
113 * Name of backing file we go to when we've exhausted content from the
114 * prefix buffer.
115 */
116 private String backingFilename;
117
118 /***
119 * Random access to the backing file.
120 */
121 private RandomAccessFile raFile;
122
123 /***
124 * Offset into prefix buffer at which content beings.
125 */
126 private int contentOffset;
127
128
129 /***
130 * Constructor.
131 *
132 * @param buffer In-memory buffer of recordings prefix. We read from
133 * here first and will only go to the backing file if <code>size</code>
134 * requested is greater than <code>buffer.length</code>.
135 * @param size Total size of stream to replay in bytes. Used to find
136 * EOS. This is total length of content including HTTP headers if
137 * present.
138 * @param responseBodyStart Where the response body starts in bytes.
139 * Used to skip over the HTTP headers if present.
140 * @param backingFilename Path to backing file with content in excess of
141 * whats in <code>buffer</code>.
142 *
143 * @throws IOException
144 */
145 public Latin1ByteReplayCharSequence(byte[] buffer, long size,
146 long responseBodyStart, String backingFilename)
147 throws IOException {
148
149 this.length = (int)(size - responseBodyStart);
150 this.absoluteLength = (int)size;
151 this.prefixBuffer = buffer;
152 this.contentOffset = (int)responseBodyStart;
153
154
155
156 if (size > buffer.length) {
157 this.backingFilename = backingFilename;
158 this.raFile = new RandomAccessFile(backingFilename, "r");
159 this.wraparoundBuffer = new byte[this.prefixBuffer.length];
160 this.wrapOrigin = this.prefixBuffer.length;
161 this.wrapOffset = 0;
162 loadBuffer();
163 }
164 }
165
166 /***
167 * @return Length of characters in stream to replay. Starts counting
168 * at the HTTP header/body boundary.
169 */
170 public int length() {
171 return this.length;
172 }
173
174 /***
175 * Get character at passed absolute position.
176 *
177 * Called by {@link #charAt(int)} which has a relative index into the
178 * content, one that doesn't account for HTTP header if present.
179 *
180 * @param index Index into content adjusted to accomodate initial offset
181 * to get us past the HTTP header if present (i.e.
182 * {@link #contentOffset}).
183 *
184 * @return Characater at offset <code>index</code>.
185 */
186 public char charAt(int index) {
187 int c = -1;
188
189
190 index += this.contentOffset;
191 if (index < this.prefixBuffer.length) {
192
193 c = this.prefixBuffer[index];
194 } else if (index >= this.wrapOrigin &&
195 (index - this.wrapOrigin) < this.wraparoundBuffer.length) {
196
197 c = this.wraparoundBuffer[
198 ((index - this.wrapOrigin) + this.wrapOffset) %
199 this.wraparoundBuffer.length];
200 } else {
201
202
203
204 c = faultCharAt(index);
205 }
206
207
208 return (char)(c & 0xff);
209 }
210
211 /***
212 * Get a character that's outside the current buffers.
213 *
214 * will cause the wraparoundBuffer to be changed to
215 * cover a region including the index
216 *
217 * if index is higher than the highest index in the
218 * wraparound buffer, buffer is moved forward such
219 * that requested char is last item in buffer
220 *
221 * if index is lower than lowest index in the
222 * wraparound buffer, buffet is reset centered around
223 * index
224 *
225 * @param index Index of character to fetch.
226 * @return A character that's outside the current buffers
227 */
228 private int faultCharAt(int index) {
229 if(Thread.interrupted()) {
230 throw new RuntimeException("thread interrupted");
231 }
232 if(index >= this.wrapOrigin + this.wraparoundBuffer.length) {
233
234 while (index >= this.wrapOrigin + this.wraparoundBuffer.length)
235 {
236
237 advanceBuffer();
238 }
239 return charAt(index - this.contentOffset);
240 }
241
242 recenterBuffer(index);
243 return charAt(index - this.contentOffset);
244 }
245
246 /***
247 * Move the buffer window on backing file back centering current access
248 * position in middle of window.
249 *
250 * @param index Index of character to access.
251 */
252 private void recenterBuffer(int index) {
253 if (logger.isLoggable(Level.FINE)) {
254 logger.fine("Recentering around " + index + " in " +
255 this.backingFilename);
256 }
257 this.wrapOrigin = index - (this.wraparoundBuffer.length / 2);
258 if(this.wrapOrigin < this.prefixBuffer.length) {
259 this.wrapOrigin = this.prefixBuffer.length;
260 }
261 this.wrapOffset = 0;
262 loadBuffer();
263 }
264
265 /***
266 * Load from backing file into the wrapper buffer.
267 */
268 private void loadBuffer()
269 {
270 long len = -1;
271 try {
272 len = this.raFile.length();
273 this.raFile.seek(this.wrapOrigin - this.prefixBuffer.length);
274 this.raFile.readFully(this.wraparoundBuffer, 0,
275 Math.min(this.wraparoundBuffer.length,
276 this.absoluteLength - this.wrapOrigin));
277 }
278
279 catch (IOException e) {
280
281 DevUtils.logger.log (
282 Level.SEVERE,
283 "raFile.seek(" +
284 (this.wrapOrigin - this.prefixBuffer.length) +
285 ")\n" +
286 "raFile.readFully(wraparoundBuffer,0," +
287 (Math.min(this.wraparoundBuffer.length,
288 this.length - this.wrapOrigin )) +
289 ")\n"+
290 "raFile.length()" + len + "\n" +
291 DevUtils.extraInfo(),
292 e);
293 throw new RuntimeException(e);
294 }
295 }
296
297 /***
298 * Roll the wraparound buffer forward one position
299 */
300 private void advanceBuffer() {
301 try {
302 this.wraparoundBuffer[this.wrapOffset] =
303 (byte)this.raFile.read();
304 this.wrapOffset++;
305 this.wrapOffset %= this.wraparoundBuffer.length;
306 this.wrapOrigin++;
307 } catch (IOException e) {
308 DevUtils.logger.log(Level.SEVERE, "advanceBuffer()" +
309 DevUtils.extraInfo(), e);
310 throw new RuntimeException(e);
311 }
312 }
313
314 public CharSequence subSequence(int start, int end) {
315 return new CharSubSequence(this, start, end);
316 }
317
318 /***
319 * Cleanup resources.
320 *
321 * @exception IOException Failed close of random access file.
322 */
323 public void close() throws IOException
324 {
325 this.prefixBuffer = null;
326 if (this.raFile != null) {
327 this.raFile.close();
328 this.raFile = null;
329 }
330 }
331
332
333
334
335 protected void finalize() throws Throwable
336 {
337 super.finalize();
338 close();
339 }
340
341 /***
342 * Convenience method for getting a substring.
343 * @deprecated please use subSequence() and then toString() directly
344 */
345 public String substring(int offset, int len) {
346 return subSequence(offset, offset+len).toString();
347 }
348
349
350
351
352 public String toString() {
353 StringBuilder sb = new StringBuilder(this.length());
354 sb.append(this);
355 return sb.toString();
356 }
357 }