1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.util.ms;
24
25 import java.io.Closeable;
26 import java.io.File;
27 import java.io.FileInputStream;
28 import java.io.FileOutputStream;
29 import java.io.IOException;
30 import java.io.InputStreamReader;
31 import java.io.OutputStreamWriter;
32 import java.io.Reader;
33 import java.io.Writer;
34
35 import org.apache.poi.hdf.extractor.WordDocument;
36
37 import junit.framework.TestCase;
38
39
40 public class DocTest extends TestCase {
41
42
43 final private static File TEST_DIR = new File("testdata/ms");
44
45
46
47 public void testAgainstPOI() throws IOException {
48 int errors = 0;
49 long start = System.currentTimeMillis();
50 for (File f: TEST_DIR.listFiles()) try {
51 start = System.currentTimeMillis();
52 if (f.getName().endsWith(".doc")) {
53 errors += runDoc(f);
54 }
55 } finally {
56 long duration = System.currentTimeMillis() - start;
57 System.out.println("Duration in milliseconds: " + duration);
58 }
59 if (errors > 0) {
60 throw new IOException(errors + " errors, see stdout.");
61 }
62 }
63
64
65 private int runDoc(File doc) throws IOException {
66 System.out.println("===== Now processing " + doc.getName());
67 String name = doc.getName();
68 int p = name.lastIndexOf('.');
69 String expectedName = name.substring(0, p) + ".txt";
70 File expectedFile = new File(TEST_DIR, expectedName);
71 if (!expectedFile.exists()) {
72 createExpectedOutput(doc, expectedFile);
73 }
74 return runFiles(doc, expectedFile);
75 }
76
77
78 private void createExpectedOutput(File doc, File output)
79 throws IOException {
80 FileInputStream finp = new FileInputStream(doc);
81 FileOutputStream fout = new FileOutputStream(output);
82
83 try {
84 WordDocument wd = new WordDocument(finp);
85 Writer writer = new OutputStreamWriter(fout, "UTF-16BE");
86 wd.writeAllText(writer);
87 } finally {
88 close(finp);
89 close(fout);
90 }
91 }
92
93
94 private static void close(Closeable c) {
95 try {
96 c.close();
97 } catch (IOException e) {
98 e.printStackTrace();
99 }
100 }
101
102
103 private int runFiles(File doc, File expected)
104 throws IOException {
105 FileInputStream expectedIn = new FileInputStream(expected);
106 Reader expectedReader = new InputStreamReader(expectedIn, "UTF-16BE");
107 Reader docReader = Doc.getText(doc);
108 try {
109 return runReaders(docReader, expectedReader);
110 } finally {
111 close(docReader);
112 close(expectedReader);
113 }
114 }
115
116
117 private int runReaders(Reader doc, Reader expected)
118 throws IOException {
119 int count = 0;
120 int errors = 0;
121 boolean go = true;
122 while (go) {
123 int ch = doc.read();
124 int expectedCh = correctPOI(expected.read());
125 if ((ch < 0) || (expectedCh < 0)) {
126 go = false;
127 if ((ch >= 0) || (expectedCh >= 0)) {
128 errors++;
129 System.out.println("File lengths differ.");
130 }
131 }
132 if (ch != expectedCh) {
133 errors += 1;
134 report(count, expectedCh, ch);
135 }
136 count++;
137 }
138 return errors;
139 }
140
141
142 private void report(int count, int expected, int actual) {
143 StringBuilder msg = new StringBuilder("#").append(count);
144 msg.append(": Expected ");
145 msg.append(expected).append(" (").append(toChar(expected));
146 msg.append(") but got ").append(actual).append(" (");
147 msg.append(toChar(actual)).append(").");
148 System.out.println(msg);
149 }
150
151
152 private static String toChar(int ch) {
153 if (ch < 0) {
154 return "EOF";
155 } else {
156 return Character.toString((char)ch);
157 }
158 }
159
160 /***
161 * Corrects POI's Cp1252 output. There's a bug somewhere in POI that
162 * makes it produce incorrect characters. Not sure where and don't have
163 * time to track it down. But I have visually checked the input
164 * documents to verify that Doc is producing the right character, and
165 * that POI is not.
166 *
167 * @param ch the POI-produced character to check
168 * @return the corrected character
169 */
170 private static int correctPOI(int ch) {
171 switch (ch) {
172 case 8734:
173
174
175 return 176;
176 case 214:
177
178 return 8230;
179 case 237:
180
181 return 8217;
182 case 236:
183
184 return 8220;
185 case 238:
186
187 return 8221;
188 default:
189 return ch;
190 }
191 }
192
193
194 }