View Javadoc

1   /* Doc
2   *
3   * Created on September 12, 2006
4   *
5   * Copyright (C) 2006 Internet Archive.
6   *
7   * This file is part of the Heritrix web crawler (crawler.archive.org).
8   *
9   * Heritrix is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU Lesser Public License as published by
11  * the Free Software Foundation; either version 2.1 of the License, or
12  * any later version.
13  *
14  * Heritrix is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU Lesser Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser Public License
20  * along with Heritrix; if not, write to the Free Software
21  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22  */
23  package org.archive.util.ms;
24  
25  
26  import java.io.File;
27  import java.io.IOException;
28  import java.io.RandomAccessFile;
29  import java.util.List;
30  import java.util.logging.Level;
31  import java.util.logging.Logger;
32  
33  import org.archive.io.Endian;
34  import org.archive.io.RandomAccessInputStream;
35  import org.archive.io.SeekInputStream;
36  import org.archive.io.SeekReader;
37  
38  
39  /***
40   * Reads .doc files.
41   * 
42   * @author pjack
43   */
44  public class Doc {
45      
46      
47      final private static Logger LOGGER = Logger.getLogger(Doc.class.getName());
48      
49  
50      /***
51       * Static utility library, do not instantiate.
52       */
53      private Doc() {
54      }
55  
56  
57      /***
58       * Returns the text of the .doc file with the given file name.
59       * 
60       * @param docFilename   the name of the file whose text to return
61       * @return  the text of that file
62       * @throws IOException  if an IO error occurs
63       */
64      public static SeekReader getText(String docFilename) throws IOException {
65          return getText(new File(docFilename));
66      }
67  
68  
69      /***
70       * Returns the text of the given .doc file.
71       * 
72       * @param doc   the .doc file whose text to return
73       * @return   the text of that file
74       * @throws IOException   if an IO error occurs
75       */
76      public static SeekReader getText(File doc) throws IOException {
77          RandomAccessFile raf = new RandomAccessFile(doc, "r");
78          RandomAccessInputStream rais = new RandomAccessInputStream(raf);
79          return getText(rais);
80      }
81  
82      
83      /***
84       * Returns the text of the given .doc file.
85       * 
86       * @param doc   the .doc file whose text to return
87       * @return   the text of that file
88       * @throws IOException   if an IO error occurs
89       */
90      public static SeekReader getText(SeekInputStream doc) throws IOException {
91          BlockFileSystem bfs = new DefaultBlockFileSystem(doc, 16);
92          return getText(bfs, 20);
93      }
94  
95      
96      /***
97       * Returns the text for the given .doc file.  The given cacheSize refers
98       * to the number of the .doc file's piece table entries to cache.  Most
99       * .doc files only have 1 piece table entry; however, a "fast-saved"
100      * .doc file might have several.  A cacheSize of 20 should be ample for
101      * most .doc files in the world.  Since piece table entries are small --
102      * only 12 bytes each -- caching them prevents many otherwise necessary
103      * file pointer repositionings.
104      * 
105      * @param wordDoc   the .doc file as a BlockFileSystem
106      * @param cacheSize  the number of piece table entries to cache
107      * @return   a reader that will return the text in the file
108      * @throws IOException   if an IO error occurs
109      */
110     public static SeekReader getText(BlockFileSystem wordDoc, int cacheSize) 
111     throws IOException {
112         List<Entry> entries = wordDoc.getRoot().list();
113         Entry main = find(entries, "WordDocument");
114         SeekInputStream mainStream = main.open();
115         
116         mainStream.position(10);
117         int flags = Endian.littleChar(mainStream);
118         boolean complex = (flags & 0x0004) == 0x0004;
119         boolean tableOne = (flags & 0x0200) == 0x0200;
120         String tableName = tableOne ? "1Table" : "0Table";
121         Entry table = find(entries, tableName);
122         if (LOGGER.isLoggable(Level.FINEST)) {
123             LOGGER.finest("Main entry:  " + main);
124             LOGGER.finest("Table entry: " + table);
125         }
126         SeekInputStream tableStream = table.open();
127         
128         mainStream.position(24);
129         int fcMin = Endian.littleInt(mainStream);
130         int fcMax = Endian.littleInt(mainStream);
131         
132         mainStream.position(76);
133         int cppText = Endian.littleInt(mainStream);
134         
135         mainStream.position(418);
136         int fcClx = Endian.littleInt(mainStream);
137         int fcSz = Endian.littleInt(mainStream);
138         
139         if (LOGGER.isLoggable(Level.FINE)) {
140             LOGGER.fine("fcMin: " + fcMin);
141             LOGGER.fine("fcMax: " + fcMax);
142             LOGGER.fine("FcClx: " + fcClx);
143             LOGGER.fine("szClx: " + fcSz);
144             LOGGER.fine("complex: " + complex);
145             LOGGER.fine("cppText: " + cppText);
146         }
147         PieceTable pt = new PieceTable(tableStream, fcClx, fcMax - fcMin, cacheSize);
148         return new PieceReader(pt, mainStream);
149     }
150 
151 
152     private static Entry find(List<Entry> entries, String name) {
153         for (Entry e: entries) {
154             if (e.getName().equals(name)) {
155                 return e;
156             }
157         }
158         return null;
159     }
160 
161 }