View Javadoc

1   /* Cp1252
2   *
3   * Created on September 12, 2006
4   *
5   * Copyright (C) 2006 Internet Archive.
6   *
7   * This file is part of the Heritrix web crawler (crawler.archive.org).
8   *
9   * Heritrix is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU Lesser Public License as published by
11  * the Free Software Foundation; either version 2.1 of the License, or
12  * any later version.
13  *
14  * Heritrix is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU Lesser Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser Public License
20  * along with Heritrix; if not, write to the Free Software
21  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22  */
23  package org.archive.util.ms;
24  
25  
26  import java.io.UnsupportedEncodingException;
27  
28  
/**
 * Lookup-table based decoder for Windows code page 1252.  It turns single
 * bytes from the text streams of non-unicode .doc files into Java
 * characters.
 *
 * <p>Every byte maps to exactly one character, so a precomputed 256-entry
 * table is used instead of the heavier Java Charset machinery.
 *
 * @author pjack
 */
public class Cp1252 {


    /**
     * Byte-to-character lookup table, built once at class initialization.
     * Indexing it with an unsigned byte value (0 to 255) yields the
     * character that the Java "Cp1252" decoder produces for that byte.
     */
    private static final char[] XLAT = createTable();


    /**
     * Not instantiable; this class only offers static helpers.
     */
    private Cp1252() {
    }


    /**
     * Builds the lookup table by decoding each of the 256 possible byte
     * values, one at a time, through the Java String API using the
     * "Cp1252" encoding name.
     *
     * @return  a 256-element array where element i is the decoded form
     *          of byte value i
     * @throws RuntimeException  wrapping the UnsupportedEncodingException
     *          if the runtime does not know the "Cp1252" encoding
     */
    private static char[] createTable() {
        final char[] table = new char[256];
        // One-element scratch buffer, reused for every byte value.
        final byte[] single = new byte[1];
        try {
            for (int code = 0; code < table.length; code++) {
                single[0] = (byte) code;
                final String decoded = new String(single, "Cp1252");
                table[code] = decoded.charAt(0);
            }
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException(e);
        }
        return table;
    }


    /**
     * Looks up the character for a single Cp1252 byte.
     *
     * @param b   the byte value, as an unsigned quantity from 0 to 255
     * @return  the character that byte decodes to
     * @throws ArrayIndexOutOfBoundsException  if b is outside 0 to 255
     */
    public static char decode(int b) {
        return XLAT[b];
    }


}