1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.util.ms;
24
25 import java.io.IOException;
26 import java.util.logging.Level;
27 import java.util.logging.Logger;
28
29 import org.archive.io.BufferedSeekInputStream;
30 import org.archive.io.Endian;
31 import org.archive.io.OriginSeekInputStream;
32 import org.archive.io.SafeSeekInputStream;
33 import org.archive.io.SeekInputStream;
34
35
36 /***
37 * The piece table of a .doc file.
38 *
39 * <p>The piece table maps logical character positions of a document's text
40 * stream to actual file stream positions. The piece table is stored as two
41 * parallel arrays. The first array contains 32-bit integers representing
42 * the logical character positions. The second array contains 64-bit data
43 * structures that are mostly mysterious to me, except that they contain a
44 * 32-bit subfile offset. The second array is stored immediately after the
45 * first array. I call the first array the <i>charPos</i> array and the
46 * second array the <i>filePos</i> array.
47 *
48 * <p>The arrays are preceded by a special tag byte (2), followed by the
49 * combined size of both arrays in bytes. The number of piece table entries
50 * must be deduced from this byte size.
51 *
52 * <p>Because of this bizarre structure, caching piece table entries is
53 * something of a challenge. A single piece table entry is actually located
54 * in two different file locations. If there are many piece table entries,
55 * then the charPos and filePos information may be separated by many bytes,
56 * potentially crossing block boundaries. The approach I took was to use
57 * two different buffered streams. Up to n charPos offsets and n filePos
58 * structures can be buffered in the two streams, preventing any file seeking
59 * from occurring when looking up piece information. (File seeking must
60 * still occur to jump from one piece to the next.)
61 *
62 * <p>Note that the vast majority of .doc files in the world will have exactly
63 * 1 piece table entry, representing the complete text of the document. Only
64 * those documents that were "fast-saved" should have multiple pieces.
65 *
66 * <p>Finally, the text contained in a .doc file can either contain 16-bit
67 * unicode characters (charset UTF-16LE) or 8-bit CP1252 characters. One
68 * .doc file can contain both kinds of pieces. Whether or not a piece is
69 * Cp1252 is stored as a flag in the filePos value, bizarrely enough. If
70 * the flag is set, then the actual file position is the filePos with the
71 * flag cleared, then divided by 2.
72 *
73 * @author pjack
74 */
75 class PieceTable {
76
77 final static Logger LOGGER
78 = Logger.getLogger(PieceTable.class.getName());
79
80 /*** The bit that indicates if a piece uses Cp1252 or unicode. */
81 final static int CP1252_INDICATOR = 1 << 30;
82
83 /*** The mask to use to clear the Cp1252 flag bit. */
84 final static int CP1252_MASK = ~(3 << 30);
85
86 /*** The total number of pieces in the table. */
87 private int count;
88
89 /*** The total number of characters in the text stream. */
90 private int maxCharPos;
91
92 /*** The index of the current piece. */
93 private int current;
94
95 /*** The most recently returned piece from this table. */
96 private Piece currentPiece;
97
98
99 /*** The buffered stream that provides character position information. */
100 private SeekInputStream charPos;
101
102 /*** The buffered stream that provides file pointer information. */
103 private SeekInputStream filePos;
104
105
106 /***
107 * Constructor.
108 *
109 * @param tableStream the stream containing the piece table
110 * @param offset the starting offset of the piece table
111 * @param maxCharPos the total number of characters in the document
112 * @param cachedRecords the number of piece table entries to cache
113 * @throws IOException if an IO error occurs
114 */
115 public PieceTable(SeekInputStream tableStream, int offset,
116 int maxCharPos, int cachedRecords) throws IOException {
117 tableStream.position(offset);
118 skipProperties(tableStream);
119 int sizeInBytes = Endian.littleInt(tableStream);
120 this.count = (sizeInBytes - 4) / 12;
121 cachedRecords = Math.min(cachedRecords, count);
122 long tp = tableStream.position() + 4;
123 long charPosStart = tp;
124 long filePosStart = tp + count * 4 + 2;
125
126 this.filePos = wrap(tableStream, filePosStart, cachedRecords * 8);
127 this.charPos = wrap(tableStream, charPosStart, cachedRecords * 4);
128 this.maxCharPos = maxCharPos;
129
130 if (LOGGER.isLoggable(Level.FINEST)) {
131 LOGGER.finest("Size in bytes: " + sizeInBytes);
132 LOGGER.finest("Piece table count: " + count);
133 for (Piece piece = next(); piece != null; piece = next()) {
134 LOGGER.finest("#" + current + ": " + piece.toString());
135 }
136 current = 0;
137 }
138 }
139
140
141 /***
142 * Wraps the raw table stream. This is used to create the charPos and
143 * filePos streams. The streams that this method returns are "safe",
144 * meaning that the charPos and filePos position() fields never clobber
145 * each other. They are buffered, meaning that up to <i>n</i> elements
146 * can be read before the disk is accessed again. And they are "origined",
147 * meaning result.position(0) actually positions the stream at the
148 * beginning of the piece table array, not the beginning of the file.
149 *
150 * @param input the stream to wrap
151 * @param pos the origin for the returned stream
152 * @param cache the number of bytes for the returned stream to buffer
153 * @return the wrapped stream
154 * @throws IOException if an IO error occurs
155 */
156 private SeekInputStream wrap(SeekInputStream input, long pos, int cache)
157 throws IOException {
158 input.position(pos);
159 SeekInputStream r = new SafeSeekInputStream(input);
160 r = new OriginSeekInputStream(r, pos);
161 r = new BufferedSeekInputStream(r, cache);
162 return r;
163 }
164
165
166 /***
167 * Skips over any property information that may precede a piece table.
168 * These property structures contain stylesheet information that applies
169 * to the piece table. Since we're only interested in the text itself,
170 * we just ignore this property stuff. (I suppose a third buffered
171 * stream could be used to add style information to {@link Piece}, but
172 * we don't need it.)
173 *
174 * @param input the input stream containing the piece table
175 * @throws IOException if an IO error occurs
176 */
177 private static void skipProperties(SeekInputStream input) throws IOException {
178 int tag = input.read();
179 while (tag == 1) {
180 int size = Endian.littleChar(input);
181 while (size > 0) {
182 size -= input.skip(size);
183 }
184 tag = input.read();
185 }
186 if (tag != 2) {
187 throw new IllegalStateException();
188 }
189 }
190
191
192 /***
193 * Returns the maximum character position. Put another way, returns the
194 * total number of characters in the document.
195 *
196 * @return the maximum character position
197 */
198 public int getMaxCharPos() {
199 return maxCharPos;
200 }
201
202
203 /***
204 * Returns the next piece in the piece table.
205 *
206 * @return the next piece in the piece table, or null if there is no
207 * next piece
208 * @throws IOException if an IO error occurs
209 */
210 public Piece next() throws IOException {
211 if (current >= count) {
212 currentPiece = null;
213 return null;
214 }
215
216 int cp;
217 if (current == count - 1) {
218 cp = maxCharPos;
219 } else {
220 charPos.position(current * 4);
221 cp = Endian.littleInt(charPos);
222 }
223 filePos.position(current * 8);
224 int encoded = Endian.littleInt(filePos);
225
226 if (LOGGER.isLoggable(Level.FINEST)) {
227 StringBuffer sb = new StringBuffer(Integer.toBinaryString(encoded));
228 while (sb.length() < 32) {
229 sb.insert(0, '0');
230 }
231 LOGGER.finest("Encoded offset: " + sb.toString());
232 }
233
234 current++;
235
236 int start;
237 if (currentPiece == null) {
238 start = 0;
239 } else {
240 start = currentPiece.getCharPosLimit();
241 }
242 if ((encoded & CP1252_INDICATOR) == 0) {
243 Piece piece = new Piece(encoded, start, cp, true);
244 currentPiece = piece;
245 return piece;
246 } else {
247 int filePos = (encoded & CP1252_MASK) / 2;
248 Piece piece = new Piece(filePos, start, cp, false);
249 currentPiece = piece;
250 return piece;
251 }
252 }
253
254
255 /***
256 * Returns the piece containing the given character position.
257 *
258 * @param charPos the character position whose piece to return
259 * @return that piece, or null if no such piece exists (if charPos
260 * is greater than getMaxCharPos())
261 * @throws IOException if an IO error occurs
262 */
263 public Piece pieceFor(int charPos) throws IOException {
264 if (currentPiece.contains(charPos)) {
265 return currentPiece;
266 }
267
268
269
270 current = 0;
271 currentPiece = null;
272 next();
273
274 while (currentPiece != null) {
275 if (currentPiece.contains(charPos)) {
276 return currentPiece;
277 }
278 next();
279 }
280
281 return null;
282 }
283
284 }