1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.util.ms;
24
25
26 import java.io.File;
27 import java.io.IOException;
28 import java.io.RandomAccessFile;
29 import java.util.List;
30 import java.util.logging.Level;
31 import java.util.logging.Logger;
32
33 import org.archive.io.Endian;
34 import org.archive.io.RandomAccessInputStream;
35 import org.archive.io.SeekInputStream;
36 import org.archive.io.SeekReader;
37
38
39 /***
40 * Reads .doc files.
41 *
42 * @author pjack
43 */
44 public class Doc {
45
46
47 final private static Logger LOGGER = Logger.getLogger(Doc.class.getName());
48
49
50 /***
51 * Static utility library, do not instantiate.
52 */
53 private Doc() {
54 }
55
56
57 /***
58 * Returns the text of the .doc file with the given file name.
59 *
60 * @param docFilename the name of the file whose text to return
61 * @return the text of that file
62 * @throws IOException if an IO error occurs
63 */
64 public static SeekReader getText(String docFilename) throws IOException {
65 return getText(new File(docFilename));
66 }
67
68
69 /***
70 * Returns the text of the given .doc file.
71 *
72 * @param doc the .doc file whose text to return
73 * @return the text of that file
74 * @throws IOException if an IO error occurs
75 */
76 public static SeekReader getText(File doc) throws IOException {
77 RandomAccessFile raf = new RandomAccessFile(doc, "r");
78 RandomAccessInputStream rais = new RandomAccessInputStream(raf);
79 return getText(rais);
80 }
81
82
83 /***
84 * Returns the text of the given .doc file.
85 *
86 * @param doc the .doc file whose text to return
87 * @return the text of that file
88 * @throws IOException if an IO error occurs
89 */
90 public static SeekReader getText(SeekInputStream doc) throws IOException {
91 BlockFileSystem bfs = new DefaultBlockFileSystem(doc, 16);
92 return getText(bfs, 20);
93 }
94
95
96 /***
97 * Returns the text for the given .doc file. The given cacheSize refers
98 * to the number of the .doc file's piece table entries to cache. Most
99 * .doc files only have 1 piece table entry; however, a "fast-saved"
100 * .doc file might have several. A cacheSize of 20 should be ample for
101 * most .doc files in the world. Since piece table entries are small --
102 * only 12 bytes each -- caching them prevents many otherwise necessary
103 * file pointer repositionings.
104 *
105 * @param wordDoc the .doc file as a BlockFileSystem
106 * @param cacheSize the number of piece table entries to cache
107 * @return a reader that will return the text in the file
108 * @throws IOException if an IO error occurs
109 */
110 public static SeekReader getText(BlockFileSystem wordDoc, int cacheSize)
111 throws IOException {
112 List<Entry> entries = wordDoc.getRoot().list();
113 Entry main = find(entries, "WordDocument");
114 SeekInputStream mainStream = main.open();
115
116 mainStream.position(10);
117 int flags = Endian.littleChar(mainStream);
118 boolean complex = (flags & 0x0004) == 0x0004;
119 boolean tableOne = (flags & 0x0200) == 0x0200;
120 String tableName = tableOne ? "1Table" : "0Table";
121 Entry table = find(entries, tableName);
122 if (LOGGER.isLoggable(Level.FINEST)) {
123 LOGGER.finest("Main entry: " + main);
124 LOGGER.finest("Table entry: " + table);
125 }
126 SeekInputStream tableStream = table.open();
127
128 mainStream.position(24);
129 int fcMin = Endian.littleInt(mainStream);
130 int fcMax = Endian.littleInt(mainStream);
131
132 mainStream.position(76);
133 int cppText = Endian.littleInt(mainStream);
134
135 mainStream.position(418);
136 int fcClx = Endian.littleInt(mainStream);
137 int fcSz = Endian.littleInt(mainStream);
138
139 if (LOGGER.isLoggable(Level.FINE)) {
140 LOGGER.fine("fcMin: " + fcMin);
141 LOGGER.fine("fcMax: " + fcMax);
142 LOGGER.fine("FcClx: " + fcClx);
143 LOGGER.fine("szClx: " + fcSz);
144 LOGGER.fine("complex: " + complex);
145 LOGGER.fine("cppText: " + cppText);
146 }
147 PieceTable pt = new PieceTable(tableStream, fcClx, fcMax - fcMin, cacheSize);
148 return new PieceReader(pt, mainStream);
149 }
150
151
152 private static Entry find(List<Entry> entries, String name) {
153 for (Entry e: entries) {
154 if (e.getName().equals(name)) {
155 return e;
156 }
157 }
158 return null;
159 }
160
161 }