View Javadoc

1   /* GenericReplayCharSequence
2    *
3    * (Re)Created on Dec 21, 2006
4    *
5    * Copyright (C) 2006 Internet Archive.
6    *
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    *
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   *
14   * Heritrix is distributed in the hope that it will be useful,
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   *
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.io;
24  
25  import java.io.BufferedReader;
26  import java.io.File;
27  import java.io.FileInputStream;
28  import java.io.FileOutputStream;
29  import java.io.IOException;
30  import java.io.InputStreamReader;
31  import java.nio.ByteBuffer;
32  import java.nio.CharBuffer;
33  import java.nio.channels.FileChannel;
34  import java.nio.charset.Charset;
35  import java.util.logging.Level;
36  import java.util.logging.Logger;
37  
38  import org.apache.commons.io.IOUtils;
39  import org.archive.util.FileUtils;
40  
41  /***
42   * Provides a (Replay)CharSequence view on recorded streams (a prefix
43   * buffer and overflow backing file) that can handle streams of multibyte
44   * characters.
45   *
46   * For better performance on ISO-8859-1 text, use 
47   * {@link Latin1ByteReplayCharSequence}.
48   *
49   * <p>Call close on this class when done so can clean up resources.
50   *
51   * <p>Implementation currently works by checking to see if content to read
52   * all fits the in-memory buffer.  If so, we decode into a CharBuffer and
53   * keep this around for CharSequence operations.  This CharBuffer is
54   * discarded on close.
55   *
56   * <p>If content length is greater than in-memory buffer, we decode the
57   * buffer plus backing file into a new file named for the backing file w/
58   * a suffix of the encoding we write the file as. We then run w/ a
59   * memory-mapped CharBuffer against this file to implement CharSequence.
60   * Reasons for this implemenation are that CharSequence wants to return the
61   * length of the CharSequence.
62   *
63   * <p>Obvious optimizations would keep around decodings whether the
64   * in-memory decoded buffer or the file of decodings written to disk but the
65   * general usage pattern processing URIs is that the decoding is used by one
66   * processor only.  Also of note, files usually fit into the in-memory
67   * buffer.
68   *
69   * <p>We might also be able to keep up 3 windows that moved across the file
70   * decoding a window at a time trying to keep one of the buffers just in
71   * front of the regex processing returning it a length that would be only
72   * the length of current position to end of current block or else the length
73   * could be got by multipling the backing files length by the decoders'
74   * estimate of average character size.  This would save us writing out the
75   * decoded file.  We'd have to do the latter for files that are
76   * > Integer.MAX_VALUE.
77   *
78   * @author stack
79   * @version $Revision: 6090 $, $Date: 2008-12-09 23:36:27 +0000 (Tue, 09 Dec 2008) $
80   */
81  public class GenericReplayCharSequence implements ReplayCharSequence {
82  
83      protected static Logger logger =
84          Logger.getLogger(GenericReplayCharSequence.class.getName());
85      
86      /***
87       * Name of the encoding we use writing out concatenated decoded prefix
88       * buffer and decoded backing file.
89       *
90       * <p>This define is also used as suffix for the file that holds the
91       * decodings.  The name of the file that holds the decoding is the name
92       * of the backing file w/ this encoding for a suffix.
93       *
94       * <p>See <a ref="http://java.sun.com/j2se/1.4.2/docs/guide/intl/encoding.doc.html">Encoding</a>.
95       */
96      private static final String WRITE_ENCODING = "UTF-16BE";
97  
98      /***
99       * CharBuffer of decoded content.
100      *
101      * Content of this buffer is unicode.
102      */
103     private CharBuffer content = null;
104 
105     /***
106      * File that has decoded content.
107      *
108      * Keep it around so we can remove on close.
109      */
110     private File decodedFile = null;
111 
112 
113     /***
114      * Constructor for all in-memory operation.
115      *
116      * @param buffer In-memory buffer of recordings prefix.  We read from
117      * here first and will only go to the backing file if <code>size</code>
118      * requested is greater than <code>buffer.length</code>.
119      * @param size Total size of stream to replay in bytes.  Used to find
120      * EOS. This is total length of content including HTTP headers if
121      * present.
122      * @param responseBodyStart Where the response body starts in bytes.
123      * Used to skip over the HTTP headers if present.
124      * @param backingFilename Path to backing file with content in excess of
125      * whats in <code>buffer</code>.
126      * @param encoding Encoding to use reading the passed prefix buffer and
127      * backing file.  For now, should be java canonical name for the
128      * encoding. (If null is passed, we will default to
129      * ByteReplayCharSequence).
130      *
131      * @throws IOException
132      */
133     public GenericReplayCharSequence(byte[] buffer, long size,
134             long responseBodyStart, String encoding)
135         throws IOException {
136         super();
137         this.content = decodeInMemory(buffer, size, responseBodyStart, 
138                 encoding);
139      }
140 
141     /***
142      * Constructor for overflow-to-disk-file operation.
143      *
144      * @param contentReplayInputStream inputStream of content
145      * @param backingFilename hint for name of temp file
146      * @param characterEncoding Encoding to use reading the stream.
147      * For now, should be java canonical name for the
148      * encoding. 
149      *
150      * @throws IOException
151      */
152     public GenericReplayCharSequence(
153             ReplayInputStream contentReplayInputStream,
154             String backingFilename,
155             String characterEncoding)
156         throws IOException {
157         super();
158         this.content = decodeToFile(contentReplayInputStream, 
159                 backingFilename, characterEncoding);
160     }
161 
162     /***
163      * Decode passed buffer and backing file into a CharBuffer.
164      *
165      * This method writes a new file made of the decoded concatenation of
166      * the in-memory prefix buffer and the backing file.  Returns a
167      * charSequence view onto this new file.
168      *
169      * @param buffer In-memory buffer of recordings prefix.  We read from
170      * here first and will only go to the backing file if <code>size</code>
171      * requested is greater than <code>buffer.length</code>.
172      * @param size Total size of stream to replay in bytes.  Used to find
173      * EOS. This is total length of content including HTTP headers if
174      * present.
175      * @param responseBodyStart Where the response body starts in bytes.
176      * Used to skip over the HTTP headers if present.
177      * @param backingFilename Path to backing file with content in excess of
178      * whats in <code>buffer</code>.
179      * @param encoding Encoding to use reading the passed prefix buffer and
180      * backing file.  For now, should be java canonical name for the
181      * encoding. (If null is passed, we will default to
182      * ByteReplayCharSequence).
183      *
184      * @return A CharBuffer view on decodings of the contents of passed
185      * buffer.
186      * @throws IOException
187      */
188     private CharBuffer decodeToFile(ReplayInputStream inStream, 
189             String backingFilename, String encoding)
190         throws IOException {
191 
192         CharBuffer charBuffer = null;
193 
194         BufferedReader reader = new BufferedReader(
195                 new InputStreamReader(inStream,encoding));
196         
197         File backingFile = new File(backingFilename);
198         this.decodedFile = File.createTempFile(backingFile.getName(), WRITE_ENCODING, backingFile.getParentFile());
199         FileOutputStream fos;
200         fos = new FileOutputStream(this.decodedFile);
201 
202         IOUtils.copy(reader, fos, WRITE_ENCODING);
203         fos.close();
204         
205         charBuffer = getReadOnlyMemoryMappedBuffer(this.decodedFile).
206             asCharBuffer();
207 
208         return charBuffer;
209     }
210 
211     /***
212      * Decode passed buffer into a CharBuffer.
213      *
214      * This method decodes a memory buffer returning a memory buffer.
215      *
216      * @param buffer In-memory buffer of recordings prefix.  We read from
217      * here first and will only go to the backing file if <code>size</code>
218      * requested is greater than <code>buffer.length</code>.
219      * @param size Total size of stream to replay in bytes.  Used to find
220      * EOS. This is total length of content including HTTP headers if
221      * present.
222      * @param responseBodyStart Where the response body starts in bytes.
223      * Used to skip over the HTTP headers if present.
224      * @param encoding Encoding to use reading the passed prefix buffer and
225      * backing file.  For now, should be java canonical name for the
226      * encoding. (If null is passed, we will default to
227      * ByteReplayCharSequence).
228      *
229      * @return A CharBuffer view on decodings of the contents of passed
230      * buffer.
231      */
232     private CharBuffer decodeInMemory(byte[] buffer, long size,
233             long responseBodyStart, String encoding)
234     {
235         ByteBuffer bb = ByteBuffer.wrap(buffer);
236         // Move past the HTTP header if present.
237         bb.position((int)responseBodyStart);
238         // Set the end-of-buffer to be end-of-content.
239         bb.limit((int)size);
240         return (Charset.forName(encoding)).decode(bb).asReadOnlyBuffer();
241     }
242 
243     /***
244      * Create read-only memory-mapped buffer onto passed file.
245      *
246      * @param file File to get memory-mapped buffer on.
247      * @return Read-only memory-mapped ByteBuffer view on to passed file.
248      * @throws IOException
249      */
250     private ByteBuffer getReadOnlyMemoryMappedBuffer(File file)
251         throws IOException {
252 
253         ByteBuffer bb = null;
254         FileInputStream in = null;
255         FileChannel c = null;
256         assert file.exists(): "No file " + file.getAbsolutePath();
257 
258         try {
259             in = new FileInputStream(file);
260             c = in.getChannel();
261 
262             int mapSize = (int)Math.min(c.size(), (long)Integer.MAX_VALUE);
263             if (mapSize < c.size()) {
264                 logger.log(Level.WARNING, "only first 2GiB of temp file mapped, thread=" 
265                         + Thread.currentThread().getName() + " file=" + file);
266             }
267 
268             // TODO: Confirm the READ_ONLY works.  I recall it not working.
269             // The buffers seem to always say that the buffer is writeable.
270             bb = c.map(FileChannel.MapMode.READ_ONLY, 0, mapSize).
271                 asReadOnlyBuffer();
272         } finally {
273             if (c != null && c.isOpen()) {
274                 c.close();
275             }
276             if (in != null) {
277                 in.close();
278             }
279         }
280 
281         return bb;
282     }
283 
284     private void deleteFile(File fileToDelete) {
285         deleteFile(fileToDelete, null);        
286     }
287 
288     private void deleteFile(File fileToDelete, final Exception e) {
289         if (e != null) {
290             // Log why the delete to help with debug of java.io.FileNotFoundException:
291             // ....tt53http.ris.UTF-16BE.
292             logger.severe("Deleting " + fileToDelete + " because of "
293                 + e.toString());
294         }
295         if (fileToDelete != null && fileToDelete.exists()) {
296             FileUtils.deleteSoonerOrLater(fileToDelete); 
297         }
298     }
299 
300     public void close()
301     {
302         this.content = null;
303         deleteFile(this.decodedFile);
304         // clear decodedFile -- so that double-close (as in 
305         // finalize()) won't delete a later instance with same name
306         // see bug [ 1218961 ] "failed get of replay" in ExtractorHTML... usu: UTF-16BE
307         this.decodedFile = null;
308     }
309 
310     protected void finalize() throws Throwable
311     {
312         super.finalize();
313         // Maybe TODO: eliminate close here, requiring explicit close instead
314         close();
315     }
316 
317     public int length()
318     {
319         return this.content.limit();
320     }
321 
322     public char charAt(int index)
323     {
324         return this.content.get(index);
325     }
326 
327     public CharSequence subSequence(int start, int end) {
328         return new CharSubSequence(this, start, end);
329     }
330     
331     public String toString() {
332         StringBuffer sb = new StringBuffer(length());
333         // could use StringBuffer.append(CharSequence) if willing to do 1.5 & up
334         for (int i = 0;i<length();i++) {
335             sb.append(charAt(i)); 
336         }
337         return sb.toString();
338     }
339 }