1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.util.ms;
24
25 import java.io.IOException;
26 import java.nio.ByteBuffer;
27 import java.nio.ByteOrder;
28 import java.util.Map;
29
30 import org.archive.io.SeekInputStream;
31 import org.archive.util.IoUtils;
32 import org.archive.util.LRU;
33
34
35 /***
36 * Default implementation of the Block File System.
37 *
38 * <p>The overall structure of a BlockFileSystem file (such as a .doc file) is
39 * as follows. The file is divided into blocks, which are of uniform length
40 * (512 bytes). The first block (at file pointer 0) is called the header
41 * block. It's used to look up other blocks in the file.
42 *
43 * <p>Subfiles contained within the .doc file are organized using a Block
44 * Allocation Table, or BAT. The BAT is basically a linked list; given a
45 * block number, the BAT will tell you the next block number. Note that
46 * the header block has no number; block #0 is the first block after the
47 * header. Thus, to convert a block number to a file pointer:
48 * <code>int filePointer = (blockNumber + 1) * BLOCK_SIZE</code>.
49 *
50 * <p>The BAT itself is discontinuous, however. To find the blocks that
51 * comprise the BAT, you have to look in the header block. The header block
52 * contains an array of 109 pointers to the blocks that comprise the BAT.
53 * If more than 109 BAT blocks are required (in other words, if the .doc
54 * file is larger than ~6 megabytes), then something called the
55 * XBAT comes into play.
56 *
57 * <p>XBAT blocks contain pointers to the 110th BAT block and beyond.
58 * The first XBAT block is stored at a file pointer listed in the header.
59 * The other XBAT blocks are always stored in order after the first; the
60 * XBAT table is continuous. One is inclined to wonder why the BAT itself
61 * is not so stored, but oh well.
62 *
63 * <p>The BAT only tells you the next block for a given block. To find the
64 * first block for a subfile, you have to look up that subfile's directory
65 * entry. Each directory entry is a 128 byte structure in the file, so four
66 * of them fit in a block. The number of the first block of the entry list
67 * is stored in the header. To find subsequent entry blocks, the BAT must
68 * be used.
69 *
70 * <p>I'm telling you all this so that you understand the caching that this
71 * class provides.
72 *
73 * <p>First, directory entries are not cached. It's assumed that they will
74 * be looked up at the beginning of a lengthy operation, and then forgotten
75 * about. This is certainly the case for {@link Doc#getText(BlockFileSystem)}.
76 * If you need to remember directory entries, you can manually store the Entry
77 * objects in a map or something, as they don't grow stale.
78 *
79 * <p>This class keeps all 512 bytes of the header block in memory at all
80 * times. This prevents a potentially expensive file pointer repositioning
81 * every time you're trying to figure out what comes next.
82 *
83 * <p>BAT and XBAT blocks are stored in a least-recently used cache. The
84 * <i>n</i> most recent BAT and XBAT blocks are remembered, where <i>n</i>
85 * is set at construction time. The minimum value of <i>n</i> is 1. For small
86 * files, this can prevent file pointer repositioning for BAT look ups.
87 *
88 * <p>The BAT/XBAT cache only takes up memory as needed. If the specified
89 * cache size is 100 blocks, but the file only has 4 BAT blocks, then only
90 * 2048 bytes will be used by the cache.
91 *
92 * <p>Note this class only caches BAT and XBAT blocks. It does not cache the
93 * blocks that actually make up a subfile's contents. It is assumed that those
94 * blocks will only be accessed once per operation (again, this is what
95 * {Doc.getText(BlockFileSystem)} typically requires.)
96 *
97 * @author pjack
98 * @see http://jakarta.apache.org/poi/poifs/fileformat.html
99 */
100 public class DefaultBlockFileSystem implements BlockFileSystem {
101
102
103 /***
104 * Pointers per BAT block.
105 */
106 final private static int POINTERS_PER_BAT = 128;
107
108
109 /***
110 * Size of a BAT pointer in bytes. (In other words, 4).
111 */
112 final private static int BAT_POINTER_SIZE = BLOCK_SIZE / POINTERS_PER_BAT;
113
114
115 /***
116 * The number of BAT pointers in the header block. After this many
117 * BAT blocks, the XBAT blocks must be consulted.
118 */
119 final private static int HEADER_BAT_LIMIT = 109;
120
121
122 /***
123 * The size of an entry record in bytes.
124 */
125 final private static int ENTRY_SIZE = 128;
126
127
128 /***
129 * The number of entries that can fit in a block.
130 */
131 final private static int ENTRIES_PER_BLOCK = BLOCK_SIZE / ENTRY_SIZE;
132
133
134 /***
135 * The .doc file as a stream.
136 */
137 private SeekInputStream input;
138
139
140 /***
141 * The header block.
142 */
143 private HeaderBlock header;
144
145
146 /***
147 * Cache of BAT and XBAT blocks.
148 */
149 private Map<Integer,ByteBuffer> cache;
150
151
152 /***
153 * Constructor.
154 *
155 * @param input the file to read from
156 * @param batCacheSize number of BAT and XBAT blocks to cache
157 * @throws IOException if an IO error occurs
158 */
159 public DefaultBlockFileSystem(SeekInputStream input, int batCacheSize)
160 throws IOException {
161 this.input = input;
162 byte[] temp = new byte[BLOCK_SIZE];
163 IoUtils.readFully(input, temp);
164 this.header = new HeaderBlock(ByteBuffer.wrap(temp));
165 this.cache = new LRU<Integer,ByteBuffer>(batCacheSize);
166 }
167
168
169 public Entry getRoot() throws IOException {
170
171 int block = header.getEntriesStart();
172 input.position((block + 1) * BLOCK_SIZE);
173
174
175 return new DefaultEntry(this, input, 0);
176 }
177
178
179 /***
180 * Returns the entry with the given number.
181 *
182 * @param entryNumber the number of the entry to return
183 * @return that entry, or null if no such entry exists
184 * @throws IOException if an IO error occurs
185 */
186 Entry getEntry(int entryNumber) throws IOException {
187
188 if (entryNumber < 0) {
189 return null;
190 }
191
192
193
194
195
196 int blockCount = entryNumber / ENTRIES_PER_BLOCK;
197 int remainder = entryNumber % ENTRIES_PER_BLOCK;
198 int block = header.getEntriesStart();
199 for (int i = 0; i < blockCount; i++) {
200 block = getNextBlock(block);
201 }
202
203 if (block < 0) {
204
205 return null;
206 }
207
208 int filePos = (block + 1) * BLOCK_SIZE + remainder * ENTRY_SIZE;
209 input.position(filePos);
210
211 return new DefaultEntry(this, input, entryNumber);
212 }
213
214
215 public int getNextBlock(int block) throws IOException {
216 if (block < 0) {
217 return block;
218 }
219
220
221 int headerIndex = block / POINTERS_PER_BAT;
222
223
224 int batBlockIndex = block % POINTERS_PER_BAT;
225
226 int batBlockNumber = batLookup(headerIndex);
227 ByteBuffer batBlock = getBATBlock(batBlockNumber);
228 return batBlock.getInt(batBlockIndex * BAT_POINTER_SIZE);
229 }
230
231
232 /***
233 * Looks up the block number of a BAT block.
234 *
235 * @param headerIndex
236 * @return
237 * @throws IOException
238 */
239 private int batLookup(int headerIndex) throws IOException {
240 if (headerIndex < HEADER_BAT_LIMIT + 1) {
241 return header.getBATBlockNumber(headerIndex);
242 }
243
244
245 headerIndex -= HEADER_BAT_LIMIT;
246 int xbatBlockNumber = headerIndex / POINTERS_PER_BAT;
247 xbatBlockNumber += header.getExtendedBATStart();
248 ByteBuffer xbat = getBATBlock(xbatBlockNumber);
249
250
251 int xbatBlockIndex = headerIndex % POINTERS_PER_BAT;
252 return xbat.getInt(xbatBlockIndex * BAT_POINTER_SIZE);
253 }
254
255
256 /***
257 * Returns the BAT block with the given block number.
258 * If the BAT block were previously cached, then the cached version
259 * is returned. Otherwise, the file pointer is repoisitioned to
260 * the start of the given block, and the 512 bytes are read and
261 * stored in the cache.
262 *
263 * @param block the block number of the BAT block to return
264 * @return the BAT block
265 * @throws IOException
266 */
267 private ByteBuffer getBATBlock(int block) throws IOException {
268 ByteBuffer r = cache.get(block);
269 if (r != null) {
270 return r;
271 }
272
273 byte[] buf = new byte[BLOCK_SIZE];
274 input.position((block + 1) * BLOCK_SIZE);
275 IoUtils.readFully(input, buf);
276
277 r = ByteBuffer.wrap(buf);
278 r.order(ByteOrder.LITTLE_ENDIAN);
279 cache.put(block, r);
280 return r;
281 }
282
283
284 public SeekInputStream getRawInput() {
285 return input;
286 }
287 }