View Javadoc

1   /* ByteReplayCharSequenceFactory
2    *
3    * (Re)Created on Dec 21, 2006
4    *
5    * Copyright (C) 2006 Internet Archive.
6    *
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    *
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   *
14   * Heritrix is distributed in the hope that it will be useful,
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   *
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.io;
24  
25  import java.io.IOException;
26  import java.io.RandomAccessFile;
27  import java.util.logging.Level;
28  import java.util.logging.Logger;
29  
30  import org.archive.util.DevUtils;
31  
32  /***
33   * Provides a (Replay)CharSequence view on recorded stream bytes (a prefix
34   * buffer and overflow backing file).
35   *
36   * Assumes the byte stream is ISO-8859-1 text, taking advantage of the fact 
37   * that each byte in the stream corresponds to a single unicode character with
38   * the same numerical value as the byte. 
39   *
40   * <p>Uses a wraparound rolling buffer of the last windowSize bytes read
41   * from disk in memory; as long as the 'random access' of a CharSequence
42   * user stays within this window, access should remain fairly efficient.
43   * (So design any regexps pointed at these CharSequences to work within
44   * that range!)
45   *
46   * <p>When rereading of a location is necessary, the whole window is
47   * recentered around the location requested. (TODO: More research
48   * into whether this is the best strategy.)
49   *
50   * <p>An implementation of a ReplayCharSequence done with ByteBuffers -- one
51   * to wrap the passed prefix buffer and the second, a memory-mapped
52   * ByteBuffer view into the backing file -- was consistently slower: ~10%.
53   * My tests did the following. Made a buffer filled w/ regular content.
54   * This buffer was used as the prefix buffer.  The buffer content was
55   * written MULTIPLER times to a backing file.  I then did accesses w/ the
56   * following pattern: Skip forward 32 bytes, then back 16 bytes, and then
57   * read forward from byte 16-32.  Repeat.  Though I varied the size of the
58   * buffer to the size of the backing file,from 3-10, the difference of 10%
59   * or so seemed to persist.  Same if I tried to favor get() over get(index).
60   * I used a profiler, JMP, to study times taken (St.Ack did above comment).
61   *
62   * <p>TODO determine in memory mapped files is better way to do this;
63   * probably not -- they don't offer the level of control over
64   * total memory used that this approach does.
65   *
66   * @author Gordon Mohr
67   * @version $Revision: 6512 $, $Date: 2009-09-23 03:02:29 +0000 (Wed, 23 Sep 2009) $
68   */
69  class Latin1ByteReplayCharSequence implements ReplayCharSequence {
70  
71      protected static Logger logger =
72          Logger.getLogger(Latin1ByteReplayCharSequence.class.getName());
73  
74      /***
75       * Buffer that holds the first bit of content.
76       *
77       * Once this is exhausted we go to the backing file.
78       */
79      private byte[] prefixBuffer;
80  
81      /***
82       * Total length of character stream to replay minus the HTTP headers
83       * if present.
84       *
85       * Used to find EOS.
86       */
87      protected int length;
88  
89      /***
90       * Absolute length of the stream.
91       *
92       * Includes HTTP headers.  Needed doing calc. in the below figuring
93       * how much to load into buffer.
94       */
95      private int absoluteLength = -1;
96  
97      /***
98       * Buffer window on to backing file.
99       */
100     private byte[] wraparoundBuffer;
101 
102     /***
103      * Absolute index into underlying bytestream where wrap starts.
104      */
105     private int wrapOrigin;
106 
107     /***
108      * Index in wraparoundBuffer that corresponds to wrapOrigin
109      */
110     private int wrapOffset;
111 
112     /***
113      * Name of backing file we go to when we've exhausted content from the
114      * prefix buffer.
115      */
116     private String backingFilename;
117 
118     /***
119      * Random access to the backing file.
120      */
121     private RandomAccessFile raFile;
122 
123     /***
124      * Offset into prefix buffer at which content beings.
125      */
126     private int contentOffset;
127 
128 
129     /***
130      * Constructor.
131      *
132      * @param buffer In-memory buffer of recordings prefix.  We read from
133      * here first and will only go to the backing file if <code>size</code>
134      * requested is greater than <code>buffer.length</code>.
135      * @param size Total size of stream to replay in bytes.  Used to find
136      * EOS. This is total length of content including HTTP headers if
137      * present.
138      * @param responseBodyStart Where the response body starts in bytes.
139      * Used to skip over the HTTP headers if present.
140      * @param backingFilename Path to backing file with content in excess of
141      * whats in <code>buffer</code>.
142      *
143      * @throws IOException
144      */
145     public Latin1ByteReplayCharSequence(byte[] buffer, long size,
146             long responseBodyStart, String backingFilename)
147         throws IOException {
148 
149         this.length = (int)(size - responseBodyStart);
150         this.absoluteLength = (int)size;
151         this.prefixBuffer = buffer;
152         this.contentOffset = (int)responseBodyStart;
153 
154         // If amount to read is > than what is in our prefix buffer, then
155         // open the backing file.
156         if (size > buffer.length) {
157             this.backingFilename = backingFilename;
158             this.raFile = new RandomAccessFile(backingFilename, "r");
159             this.wraparoundBuffer = new byte[this.prefixBuffer.length];
160             this.wrapOrigin = this.prefixBuffer.length;
161             this.wrapOffset = 0;
162             loadBuffer();
163         }
164     }
165 
166     /***
167      * @return Length of characters in stream to replay.  Starts counting
168      * at the HTTP header/body boundary.
169      */
170     public int length() {
171         return this.length;
172     }
173 
174     /***
175      * Get character at passed absolute position.
176      *
177      * Called by {@link #charAt(int)} which has a relative index into the
178      * content, one that doesn't account for HTTP header if present.
179      *
180      * @param index Index into content adjusted to accomodate initial offset
181      * to get us past the HTTP header if present (i.e.
182      * {@link #contentOffset}).
183      *
184      * @return Characater at offset <code>index</code>.
185      */
186     public char charAt(int index) {
187         int c = -1;
188         // Add to index start-of-content offset to get us over HTTP header
189         // if present.
190         index += this.contentOffset;
191         if (index < this.prefixBuffer.length) {
192             // If index is into our prefix buffer.
193             c = this.prefixBuffer[index];
194         } else if (index >= this.wrapOrigin &&
195             (index - this.wrapOrigin) < this.wraparoundBuffer.length) {
196             // If index is into our buffer window on underlying backing file.
197             c = this.wraparoundBuffer[
198                     ((index - this.wrapOrigin) + this.wrapOffset) %
199                         this.wraparoundBuffer.length];
200         } else {
201             // Index is outside of both prefix buffer and our buffer window
202             // onto the underlying backing file.  Fix the buffer window
203             // location.
204             c = faultCharAt(index);
205         }
206         // Stream is treated as single byte.  Make sure characters returned
207         // are not negative.
208         return (char)(c & 0xff);
209     }
210 
211     /***
212      * Get a character that's outside the current buffers.
213      *
214      * will cause the wraparoundBuffer to be changed to
215      * cover a region including the index
216      *
217      * if index is higher than the highest index in the
218      * wraparound buffer, buffer is moved forward such
219      * that requested char is last item in buffer
220      *
221      * if index is lower than lowest index in the
222      * wraparound buffer, buffet is reset centered around
223      * index
224      *
225      * @param index Index of character to fetch.
226      * @return A character that's outside the current buffers
227      */
228     private int faultCharAt(int index) {
229         if(Thread.interrupted()) {
230             throw new RuntimeException("thread interrupted");
231         }
232         if(index >= this.wrapOrigin + this.wraparoundBuffer.length) {
233             // Moving forward
234             while (index >= this.wrapOrigin + this.wraparoundBuffer.length)
235             {
236                 // TODO optimize this
237                 advanceBuffer();
238             }
239             return charAt(index - this.contentOffset);
240         }
241         // Moving backward
242         recenterBuffer(index);
243         return charAt(index - this.contentOffset);
244     }
245 
246     /***
247      * Move the buffer window on backing file back centering current access
248      * position in middle of window.
249      *
250      * @param index Index of character to access.
251      */
252     private void recenterBuffer(int index) {
253         if (logger.isLoggable(Level.FINE)) {
254             logger.fine("Recentering around " + index + " in " +
255                 this.backingFilename);
256         }
257         this.wrapOrigin = index - (this.wraparoundBuffer.length / 2);
258         if(this.wrapOrigin < this.prefixBuffer.length) {
259             this.wrapOrigin = this.prefixBuffer.length;
260         }
261         this.wrapOffset = 0;
262         loadBuffer();
263     }
264 
265     /***
266      * Load from backing file into the wrapper buffer.
267      */
268     private void loadBuffer()
269     {
270         long len = -1;
271         try {
272             len = this.raFile.length();
273             this.raFile.seek(this.wrapOrigin - this.prefixBuffer.length);
274             this.raFile.readFully(this.wraparoundBuffer, 0,
275                 Math.min(this.wraparoundBuffer.length,
276                      this.absoluteLength - this.wrapOrigin));
277         }
278 
279         catch (IOException e) {
280             // TODO convert this to a runtime error?
281             DevUtils.logger.log (
282                 Level.SEVERE,
283                 "raFile.seek(" +
284                 (this.wrapOrigin - this.prefixBuffer.length) +
285                 ")\n" +
286                 "raFile.readFully(wraparoundBuffer,0," +
287                 (Math.min(this.wraparoundBuffer.length,
288                     this.length - this.wrapOrigin )) +
289                 ")\n"+
290                 "raFile.length()" + len + "\n" +
291                 DevUtils.extraInfo(),
292                 e);
293             throw new RuntimeException(e);
294         }
295     }
296 
297     /***
298      * Roll the wraparound buffer forward one position
299      */
300     private void advanceBuffer() {
301         try {
302             this.wraparoundBuffer[this.wrapOffset] =
303                 (byte)this.raFile.read();
304             this.wrapOffset++;
305             this.wrapOffset %= this.wraparoundBuffer.length;
306             this.wrapOrigin++;
307         } catch (IOException e) {
308             DevUtils.logger.log(Level.SEVERE, "advanceBuffer()" +
309                 DevUtils.extraInfo(), e);
310             throw new RuntimeException(e);
311         }
312     }
313 
314     public CharSequence subSequence(int start, int end) {
315         return new CharSubSequence(this, start, end);
316     }
317 
318     /***
319      * Cleanup resources.
320      *
321      * @exception IOException Failed close of random access file.
322      */
323     public void close() throws IOException
324     {
325         this.prefixBuffer = null;
326         if (this.raFile != null) {
327             this.raFile.close();
328             this.raFile = null;
329         }
330     }
331 
332     /* (non-Javadoc)
333      * @see java.lang.Object#finalize()
334      */
335     protected void finalize() throws Throwable
336     {
337         super.finalize();
338         close();
339     }
340     
341     /***
342      * Convenience method for getting a substring. 
343      * @deprecated please use subSequence() and then toString() directly 
344      */
345     public String substring(int offset, int len) {
346         return subSequence(offset, offset+len).toString();
347     }
348 
349     /* (non-Javadoc)
350      * @see java.lang.Object#toString()
351      */
352     public String toString() {
353         StringBuilder sb = new StringBuilder(this.length());
354         sb.append(this);
355         return sb.toString();
356     }
357 }