View Javadoc

1   /* PieceTable
2   *
3   * Created on September 12, 2006
4   *
5   * Copyright (C) 2006 Internet Archive.
6   *
7   * This file is part of the Heritrix web crawler (crawler.archive.org).
8   *
9   * Heritrix is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU Lesser Public License as published by
11  * the Free Software Foundation; either version 2.1 of the License, or
12  * any later version.
13  *
14  * Heritrix is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU Lesser Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser Public License
20  * along with Heritrix; if not, write to the Free Software
21  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22  */
23  package org.archive.util.ms;
24  
25  import java.io.IOException;
26  import java.util.logging.Level;
27  import java.util.logging.Logger;
28  
29  import org.archive.io.BufferedSeekInputStream;
30  import org.archive.io.Endian;
31  import org.archive.io.OriginSeekInputStream;
32  import org.archive.io.SafeSeekInputStream;
33  import org.archive.io.SeekInputStream;
34  
35  
36  /***
37   * The piece table of a .doc file.  
38   * 
39   * <p>The piece table maps logical character positions of a document's text
40   * stream to actual file stream positions.  The piece table is stored as two
41   * parallel arrays.  The first array contains 32-bit integers representing
42   * the logical character positions.  The second array contains 64-bit data
43   * structures that are mostly mysterious to me, except that they contain a
44   * 32-bit subfile offset.  The second array is stored immediately after the
45   * first array.  I call the first array the <i>charPos</i> array and the 
46   * second array the <i>filePos</i> array.
47   * 
48   * <p>The arrays are preceded by a special tag byte (2), followed by the
49   * combined size of both arrays in bytes.  The number of piece table entries 
50   * must be deduced from this byte size.  
51   * 
52   * <p>Because of this bizarre structure, caching piece table entries is 
53   * something of a challenge.  A single piece table entry is actually located
54   * in two different file locations.  If there are many piece table entries,
55   * then the charPos and filePos information may be separated by many bytes,
56   * potentially crossing block boundaries.  The approach I took was to use
57   * two different buffered streams.  Up to n charPos offsets and n filePos
58   * structures can be buffered in the two streams, preventing any file seeking
59   * from occurring when looking up piece information.  (File seeking must 
60   * still occur to jump from one piece to the next.)
61   * 
62   * <p>Note that the vast majority of .doc files in the world will have exactly
63   * 1 piece table entry, representing the complete text of the document.  Only
64   * those documents that were "fast-saved" should have multiple pieces.
65   * 
66   * <p>Finally, the text contained in a .doc file can either contain 16-bit
67   * unicode characters (charset UTF-16LE) or 8-bit CP1252 characters.  One
68   * .doc file can contain both kinds of pieces.  Whether or not a piece is
69   * Cp1252 is stored as a flag in the filePos value, bizarrely enough.  If
70   * the flag is set, then the actual file position is the filePos with the
71   * flag cleared, then divided by 2.
72   * 
73   * @author pjack
74   */
75  class PieceTable {
76  
77      final static Logger LOGGER
78       = Logger.getLogger(PieceTable.class.getName());
79  
80      /*** The bit that indicates if a piece uses Cp1252 or unicode. */
81      final static int CP1252_INDICATOR = 1 << 30;
82      
83      /*** The mask to use to clear the Cp1252 flag bit. */
84      final static int CP1252_MASK = ~(3 << 30);
85  
86      /*** The total number of pieces in the table. */
87      private int count;
88      
89      /*** The total number of characters in the text stream. */
90      private int maxCharPos;
91  
92      /*** The index of the current piece. */
93      private int current;
94      
95      /*** The most recently returned piece from this table. */
96      private Piece currentPiece;
97  
98  
99      /*** The buffered stream that provides character position information. */
100     private SeekInputStream charPos;
101     
102     /*** The buffered stream that provides file pointer information. */
103     private SeekInputStream filePos;
104 
105 
106     /***
107      * Constructor.
108      * 
109      * @param tableStream   the stream containing the piece table
110      * @param offset        the starting offset of the piece table
111      * @param maxCharPos     the total number of characters in the document
112      * @param cachedRecords  the number of piece table entries to cache
113      * @throws IOException   if an IO error occurs
114      */
115     public PieceTable(SeekInputStream tableStream, int offset, 
116             int maxCharPos, int cachedRecords) throws IOException {
117         tableStream.position(offset);
118         skipProperties(tableStream);
119         int sizeInBytes = Endian.littleInt(tableStream);
120         this.count = (sizeInBytes - 4) / 12;
121         cachedRecords = Math.min(cachedRecords, count);
122         long tp = tableStream.position() + 4;
123         long charPosStart = tp;
124         long filePosStart = tp + count * 4 + 2;
125         
126         this.filePos = wrap(tableStream, filePosStart, cachedRecords * 8);
127         this.charPos = wrap(tableStream, charPosStart, cachedRecords * 4);
128         this.maxCharPos = maxCharPos;
129         
130         if (LOGGER.isLoggable(Level.FINEST)) {
131             LOGGER.finest("Size in bytes: " + sizeInBytes);
132             LOGGER.finest("Piece table count: " + count);
133             for (Piece piece = next(); piece != null; piece = next()) {
134                 LOGGER.finest("#" + current + ": " + piece.toString());
135             }
136             current = 0;
137         }
138     }
139     
140     
141     /***
142      * Wraps the raw table stream.  This is used to create the charPos and
143      * filePos streams.  The streams that this method returns are "safe",
144      * meaning that the charPos and filePos position() fields never clobber
145      * each other.  They are buffered, meaning that up to <i>n</i> elements
146      * can be read before the disk is accessed again.  And they are "origined",
147      * meaning result.position(0) actually positions the stream at the 
148      * beginning of the piece table array, not the beginning of the file.
149      * 
150      * @param input   the stream to wrap
151      * @param pos     the origin for the returned stream
152      * @param cache   the number of bytes for the returned stream to buffer
153      * @return   the wrapped stream
154      * @throws IOException  if an IO error occurs
155      */
156     private SeekInputStream wrap(SeekInputStream input, long pos, int cache) 
157     throws IOException {
158         input.position(pos);
159         SeekInputStream r = new SafeSeekInputStream(input);
160         r = new OriginSeekInputStream(r, pos);
161         r = new BufferedSeekInputStream(r, cache);
162         return r;
163     }
164     
165     
166     /***
167      * Skips over any property information that may precede a piece table.
168      * These property structures contain stylesheet information that applies
169      * to the piece table.  Since we're only interested in the text itself,
170      * we just ignore this property stuff.  (I suppose a third buffered
171      * stream could be used to add style information to {@link Piece}, but
172      * we don't need it.)
173      * 
174      * @param input  the input stream containing the piece table
175      * @throws IOException  if an IO error occurs
176      */
177     private static void skipProperties(SeekInputStream input) throws IOException {
178         int tag = input.read();
179         while (tag == 1) {
180             int size = Endian.littleChar(input);
181             while (size > 0) {
182                 size -= input.skip(size);
183             }
184             tag = input.read();
185         }
186         if (tag != 2) {
187             throw new IllegalStateException();
188         }
189     }
190 
191 
192     /***
193      * Returns the maximum character position.  Put another way, returns the
194      * total number of characters in the document.
195      * 
196      * @return  the maximum character position
197      */
198     public int getMaxCharPos() {
199         return maxCharPos;
200     }
201 
202 
203     /***
204      * Returns the next piece in the piece table.
205      * 
206      * @return  the next piece in the piece table, or null if there is no 
207      *   next piece
208      * @throws IOException  if an IO error occurs
209      */
210     public Piece next() throws IOException {
211         if (current >= count) {
212             currentPiece = null;
213             return null;
214         }
215                 
216         int cp;
217         if (current == count - 1) {
218             cp = maxCharPos;
219         } else {
220             charPos.position(current * 4);
221             cp = Endian.littleInt(charPos);
222         }
223         filePos.position(current * 8);
224         int encoded = Endian.littleInt(filePos);
225 
226         if (LOGGER.isLoggable(Level.FINEST)) {
227             StringBuffer sb = new StringBuffer(Integer.toBinaryString(encoded));
228             while (sb.length() < 32) {
229                 sb.insert(0, '0');
230             }
231             LOGGER.finest("Encoded offset: " + sb.toString());
232         }
233         
234         current++;
235 
236         int start;
237         if (currentPiece == null) {
238             start = 0;
239         } else {
240             start = currentPiece.getCharPosLimit();
241         }
242         if ((encoded & CP1252_INDICATOR) == 0) {
243             Piece piece = new Piece(encoded, start, cp, true);
244             currentPiece = piece;
245             return piece;
246         } else {
247             int filePos = (encoded & CP1252_MASK) / 2;
248             Piece piece = new Piece(filePos, start, cp, false);
249             currentPiece = piece;
250             return piece;
251         }
252     }
253 
254     
255     /***
256      * Returns the piece containing the given character position.
257      * 
258      * @param charPos   the character position whose piece to return
259      * @return   that piece, or null if no such piece exists (if charPos 
260      *   is greater than getMaxCharPos())
261      * @throws IOException   if an IO error occurs
262      */
263     public Piece pieceFor(int charPos) throws IOException {
264         if (currentPiece.contains(charPos)) {
265             return currentPiece;
266         }
267      
268         // FIXME: Use binary search to find piece index
269         
270         current = 0;
271         currentPiece = null;
272         next();
273         
274         while (currentPiece != null) {
275             if (currentPiece.contains(charPos)) {
276                 return currentPiece;
277             }
278             next();
279         }
280         
281         return null;
282     }
283 
284 }