View Javadoc

1   /* DefaultBlockFileSystem
2   *
3   * Created on September 12, 2006
4   *
5   * Copyright (C) 2006 Internet Archive.
6   *
7   * This file is part of the Heritrix web crawler (crawler.archive.org).
8   *
9   * Heritrix is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU Lesser Public License as published by
11  * the Free Software Foundation; either version 2.1 of the License, or
12  * any later version.
13  *
14  * Heritrix is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU Lesser Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser Public License
20  * along with Heritrix; if not, write to the Free Software
21  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22  */
23  package org.archive.util.ms;
24  
25  import java.io.IOException;
26  import java.nio.ByteBuffer;
27  import java.nio.ByteOrder;
28  import java.util.Map;
29  
30  import org.archive.io.SeekInputStream;
31  import org.archive.util.IoUtils;
32  import org.archive.util.LRU;
33  
34  
35  /***
36   * Default implementation of the Block File System.
37   * 
38   * <p>The overall structure of a BlockFileSystem file (such as a .doc file) is
39   * as follows.  The file is divided into blocks, which are of uniform length
40   * (512 bytes).  The first block (at file pointer 0) is called the header
41   * block.  It's used to look up other blocks in the file.
42   * 
43   * <p>Subfiles contained within the .doc file are organized using a Block
44   * Allocation Table, or BAT.  The BAT is basically a linked list; given a 
45   * block number, the BAT will tell you the next block number.  Note that
46   * the header block has no number; block #0 is the first block after the
47   * header.  Thus, to convert a block number to a file pointer:
48   * <code>int filePointer = (blockNumber + 1) * BLOCK_SIZE</code>.
49   * 
50   * <p>The BAT itself is discontinuous, however.  To find the blocks that 
51   * comprise the BAT, you have to look in the header block.  The header block
52   * contains an array of 109 pointers to the blocks that comprise the BAT.
53   * If more than 109 BAT blocks are required (in other words, if the .doc
54   * file is larger than ~6 megabytes), then something called the 
55   * XBAT comes into play.
56   * 
57   * <p>XBAT blocks contain pointers to the 110th BAT block and beyond.
58   * The first XBAT block is stored at a file pointer listed in the header.
59   * The other XBAT blocks are always stored in order after the first; the 
60   * XBAT table is continuous.  One is inclined to wonder why the BAT itself
61   * is not so stored, but oh well.
62   * 
63   * <p>The BAT only tells you the next block for a given block.  To find the 
64   * first block for a subfile, you have to look up that subfile's directory
65   * entry.  Each directory entry is a 128 byte structure in the file, so four
66   * of them fit in a block.  The number of the first block of the entry list
67   * is stored in the header.  To find subsequent entry blocks, the BAT must
68   * be used.
69   * 
70   * <p>I'm telling you all this so that you understand the caching that this
71   * class provides.
72   * 
73   * <p>First, directory entries are not cached.  It's assumed that they will
74   * be looked up at the beginning of a lengthy operation, and then forgotten
75   * about.  This is certainly the case for {@link Doc#getText(BlockFileSystem)}. 
76   * If you need to remember directory entries, you can manually store the Entry 
77   * objects in a map or something, as they don't grow stale.
78   * 
79   * <p>This class keeps all 512 bytes of the header block in memory at all 
80   * times.  This prevents a potentially expensive file pointer repositioning
81   * every time you're trying to figure out what comes next.
82   * 
83   * <p>BAT and XBAT blocks are stored in a least-recently used cache.  The 
84   * <i>n</i> most recent BAT and XBAT blocks are remembered, where <i>n</i>
85   * is set at construction time.  The minimum value of <i>n</i> is 1.  For small
86   * files, this can prevent file pointer repositioning for BAT look ups.
87   * 
88   * <p>The BAT/XBAT cache only takes up memory as needed.  If the specified
89   * cache size is 100 blocks, but the file only has 4 BAT blocks, then only 
90   * 2048 bytes will be used by the cache.
91   * 
92   * <p>Note this class only caches BAT and XBAT blocks.  It does not cache the
93   * blocks that actually make up a subfile's contents.  It is assumed that those
94   * blocks will only be accessed once per operation (again, this is what
95   * {Doc.getText(BlockFileSystem)} typically requires.)
96   * 
97   * @author pjack
98   * @see http://jakarta.apache.org/poi/poifs/fileformat.html
99   */
100 public class DefaultBlockFileSystem implements BlockFileSystem {
101 
102 
103     /***
104      * Pointers per BAT block.
105      */
106     final private static int POINTERS_PER_BAT = 128;
107 
108 
109     /***
110      * Size of a BAT pointer in bytes.  (In other words, 4).
111      */
112     final private static int BAT_POINTER_SIZE = BLOCK_SIZE / POINTERS_PER_BAT;
113 
114     
115     /***
116      * The number of BAT pointers in the header block.  After this many 
117      * BAT blocks, the XBAT blocks must be consulted.
118      */
119     final private static int HEADER_BAT_LIMIT = 109;
120     
121     
122     /***
123      * The size of an entry record in bytes.
124      */
125     final private static int ENTRY_SIZE = 128;
126     
127     
128     /***
129      * The number of entries that can fit in a block.
130      */
131     final private static int ENTRIES_PER_BLOCK = BLOCK_SIZE / ENTRY_SIZE;
132 
133 
134     /***
135      * The .doc file as a stream.
136      */
137     private SeekInputStream input;
138     
139     
140     /***
141      * The header block.
142      */
143     private HeaderBlock header;
144 
145 
146     /***
147      * Cache of BAT and XBAT blocks.
148      */
149     private Map<Integer,ByteBuffer> cache;
150     
151     
152     /***
153      * Constructor.
154      * 
155      * @param input   the file to read from
156      * @param batCacheSize  number of BAT and XBAT blocks to cache
157      * @throws IOException  if an IO error occurs
158      */
159     public DefaultBlockFileSystem(SeekInputStream input, int batCacheSize)
160     throws IOException {
161         this.input = input;
162         byte[] temp = new byte[BLOCK_SIZE];
163         IoUtils.readFully(input, temp);
164         this.header = new HeaderBlock(ByteBuffer.wrap(temp));
165         this.cache = new LRU<Integer,ByteBuffer>(batCacheSize);
166     }
167 
168 
169     public Entry getRoot() throws IOException {
170         // Position to the first block of the entry list.
171         int block = header.getEntriesStart();
172         input.position((block + 1) * BLOCK_SIZE);
173         
174         // The root entry is always entry #0.
175         return new DefaultEntry(this, input, 0);
176     }
177 
178 
179     /***
180      * Returns the entry with the given number.
181      * 
182      * @param entryNumber   the number of the entry to return
183      * @return   that entry, or null if no such entry exists
184      * @throws IOException  if an IO error occurs
185      */
186     Entry getEntry(int entryNumber) throws IOException {
187         // Entry numbers < 0 typically indicate an end-of-stream.
188         if (entryNumber < 0) {
189             return null;
190         }
191         
192         // It's impossible to check against the upper bound, because the
193         // upper bound is not recorded anywhere.
194         
195         // Advance to the block containing the desired entry.
196         int blockCount = entryNumber / ENTRIES_PER_BLOCK;
197         int remainder = entryNumber % ENTRIES_PER_BLOCK;        
198         int block = header.getEntriesStart();
199         for (int i = 0; i < blockCount; i++) {
200             block = getNextBlock(block);
201         }
202         
203         if (block < 0) {
204             // Given entry number exceeded the number of available entries.
205             return null;
206         }
207 
208         int filePos = (block + 1) * BLOCK_SIZE + remainder * ENTRY_SIZE;
209         input.position(filePos);
210         
211         return new DefaultEntry(this, input, entryNumber);
212     }
213 
214 
215     public int getNextBlock(int block) throws IOException {
216         if (block < 0) {
217             return block;
218         }
219         
220         // Index into the header array of BAT blocks.
221         int headerIndex = block / POINTERS_PER_BAT;
222         
223         // Index within that BAT block of the block we're interested in.
224         int batBlockIndex = block % POINTERS_PER_BAT;
225 
226         int batBlockNumber = batLookup(headerIndex);
227         ByteBuffer batBlock = getBATBlock(batBlockNumber);
228         return batBlock.getInt(batBlockIndex * BAT_POINTER_SIZE);
229     }
230 
231     
232     /***
233      * Looks up the block number of a BAT block.
234      * 
235      * @param headerIndex  
236      * @return
237      * @throws IOException
238      */
239     private int batLookup(int headerIndex) throws IOException {
240         if (headerIndex < HEADER_BAT_LIMIT + 1) {
241             return header.getBATBlockNumber(headerIndex);
242         }
243         
244         // Find the XBAT block of interest
245         headerIndex -= HEADER_BAT_LIMIT;
246         int xbatBlockNumber = headerIndex / POINTERS_PER_BAT;
247         xbatBlockNumber += header.getExtendedBATStart();
248         ByteBuffer xbat = getBATBlock(xbatBlockNumber);
249 
250         // Find the bat Block number inside the XBAT block
251         int xbatBlockIndex = headerIndex % POINTERS_PER_BAT;
252         return xbat.getInt(xbatBlockIndex * BAT_POINTER_SIZE);
253     }
254 
255     
256     /***
257      * Returns the BAT block with the given block number.
258      * If the BAT block were previously cached, then the cached version
259      * is returned.  Otherwise, the file pointer is repoisitioned to 
260      * the start of the given block, and the 512 bytes are read and
261      * stored in the cache.
262      * 
263      * @param block   the block number of the BAT block to return
264      * @return   the BAT block
265      * @throws IOException
266      */
267     private ByteBuffer getBATBlock(int block) throws IOException {
268         ByteBuffer r = cache.get(block);
269         if (r != null) {
270             return r;
271         }
272 
273         byte[] buf = new byte[BLOCK_SIZE];
274         input.position((block + 1) * BLOCK_SIZE);
275         IoUtils.readFully(input, buf);
276 
277         r = ByteBuffer.wrap(buf);
278         r.order(ByteOrder.LITTLE_ENDIAN);
279         cache.put(block, r);
280         return r;
281     }
282 
283 
284     public SeekInputStream getRawInput() {
285         return input;
286     }
287 }