View Javadoc

1   /* DocTest
2   *
3   * Created on September 12, 2006
4   *
5   * Copyright (C) 2006 Internet Archive.
6   *
7   * This file is part of the Heritrix web crawler (crawler.archive.org).
8   *
9   * Heritrix is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU Lesser Public License as published by
11  * the Free Software Foundation; either version 2.1 of the License, or
12  * any later version.
13  *
14  * Heritrix is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU Lesser Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser Public License
20  * along with Heritrix; if not, write to the Free Software
21  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22  */
23  package org.archive.util.ms;
24  
25  import java.io.Closeable;
26  import java.io.File;
27  import java.io.FileInputStream;
28  import java.io.FileOutputStream;
29  import java.io.IOException;
30  import java.io.InputStreamReader;
31  import java.io.OutputStreamWriter;
32  import java.io.Reader;
33  import java.io.Writer;
34  
35  import org.apache.poi.hdf.extractor.WordDocument;
36  
37  import junit.framework.TestCase;
38  
39  
40  public class DocTest extends TestCase {
41  
42      
43      final private static File TEST_DIR = new File("testdata/ms");
44  
45      
46      // Rename to testAgainstPOI to actually run the test.
47      public void testAgainstPOI() throws IOException {
48          int errors = 0;
49          long start = System.currentTimeMillis();
50          for (File f: TEST_DIR.listFiles()) try {
51              start = System.currentTimeMillis();
52              if (f.getName().endsWith(".doc")) {
53                  errors += runDoc(f);
54              }
55          } finally {
56              long duration = System.currentTimeMillis() - start;
57              System.out.println("Duration in milliseconds: " + duration);
58          }
59          if (errors > 0) {
60              throw new IOException(errors + " errors, see stdout.");
61          }
62      }
63  
64      
65      private int runDoc(File doc) throws IOException {
66          System.out.println("===== Now processing " + doc.getName());
67          String name = doc.getName();
68          int p = name.lastIndexOf('.');
69          String expectedName = name.substring(0, p) + ".txt";
70          File expectedFile = new File(TEST_DIR, expectedName);
71          if (!expectedFile.exists()) {
72              createExpectedOutput(doc, expectedFile);
73          }
74          return runFiles(doc, expectedFile);
75      }
76      
77      
78      private void createExpectedOutput(File doc, File output) 
79      throws IOException {
80          FileInputStream finp = new FileInputStream(doc);
81          FileOutputStream fout = new FileOutputStream(output);
82  
83          try {
84              WordDocument wd = new WordDocument(finp);        
85              Writer writer = new OutputStreamWriter(fout, "UTF-16BE");
86              wd.writeAllText(writer);
87          } finally {
88              close(finp);
89              close(fout);
90          }
91      }
92      
93      
94      private static void close(Closeable c) {
95          try {
96              c.close();
97          } catch (IOException e) {
98              e.printStackTrace();
99          }
100     }
101 
102     
103     private int runFiles(File doc, File expected) 
104     throws IOException {
105         FileInputStream expectedIn = new FileInputStream(expected);
106         Reader expectedReader = new InputStreamReader(expectedIn, "UTF-16BE");
107         Reader docReader = Doc.getText(doc);
108         try {
109             return runReaders(docReader, expectedReader);
110         } finally {
111             close(docReader);
112             close(expectedReader);
113         }
114     }
115     
116     
117     private int runReaders(Reader doc, Reader expected) 
118     throws IOException {
119         int count = 0;
120         int errors = 0;
121         boolean go = true;
122         while (go) {
123             int ch = doc.read();
124             int expectedCh = correctPOI(expected.read());
125             if ((ch < 0) || (expectedCh < 0)) {
126                 go = false;
127                 if ((ch >= 0) || (expectedCh >= 0)) {
128                     errors++;
129                     System.out.println("File lengths differ.");
130                 }
131             }
132             if (ch != expectedCh) {
133                 errors += 1;
134                 report(count, expectedCh, ch);
135             }
136             count++;
137         }
138         return errors;
139     }
140 
141     
142     private void report(int count, int expected, int actual) {
143         StringBuilder msg = new StringBuilder("#").append(count);
144         msg.append(": Expected ");
145         msg.append(expected).append(" (").append(toChar(expected));
146         msg.append(") but got ").append(actual).append(" (");
147         msg.append(toChar(actual)).append(").");
148         System.out.println(msg);
149     }
150 
151 
152     private static String toChar(int ch) {
153         if (ch < 0) {
154             return "EOF";
155         } else {
156             return Character.toString((char)ch);
157         }
158     }
159     
160     /***
161      * Corrects POI's Cp1252 output.  There's a bug somewhere in POI that
162      * makes it produce incorrect characters.  Not sure where and don't have
163      * time to track it down.  But I have visually checked the input 
164      * documents to verify that Doc is producing the right character, and
165      * that POI is not.
166      * 
167      * @param ch  the POI-produced character to check
168      * @return    the corrected character
169      */
170     private static int correctPOI(int ch) {
171         switch (ch) {
172             case 8734:
173                 // POI produced the infinity sign when it should have 
174                 // produced the degrees sign.
175                 return 176;
176             case 214:
177                 // POI produced an umat O instead of an ellipses mark.
178                 return 8230;
179             case 237:
180                 // POI produced an acute i instead of a fancy single quote
181                 return 8217;
182             case 236:
183                 // POI produced a reverse acute i instead of fancy double quote
184                 return 8220;
185             case 238:
186                 // POI produced a caret i instead of fancy double quote
187                 return 8221;
188             default:
189                 return ch;
190         }
191     }
192 
193     
194 }