View Javadoc

1   /* PieceReader
2   *
3   * Created on September 12, 2006
4   *
5   * Copyright (C) 2006 Internet Archive.
6   *
7   * This file is part of the Heritrix web crawler (crawler.archive.org).
8   *
9   * Heritrix is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU Lesser Public License as published by
11  * the Free Software Foundation; either version 2.1 of the License, or
12  * any later version.
13  *
14  * Heritrix is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU Lesser Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser Public License
20  * along with Heritrix; if not, write to the Free Software
21  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22  */
23  package org.archive.util.ms;
24  
25  import java.io.IOException;
26  
27  import org.archive.io.Endian;
28  import org.archive.io.SeekInputStream;
29  import org.archive.io.SeekReader;
30  
31  
32  class PieceReader extends SeekReader {
33  
34  
35      private PieceTable table;
36      private SeekInputStream doc;
37      
38      private boolean unicode;
39      private int charPos;
40      private int limit;
41  
42  
43      public PieceReader(PieceTable table, SeekInputStream doc)
44      throws IOException {
45          this.table = table;
46          this.doc = doc;
47          charPos = 0;
48          limit = -1;
49      }
50  
51  
52      private void seekIfNecessary() throws IOException {
53          if (doc == null) {
54              throw new IOException("Stream closed.");
55          }
56          if (charPos >= table.getMaxCharPos()) {
57              return;
58          }
59          if (charPos < limit) {
60              return;
61          }
62          Piece piece = table.next();
63          unicode = piece.isUnicode();
64          limit = piece.getCharPosLimit();
65          doc.position(piece.getFilePos());
66      }
67  
68  
69      public int read() throws IOException {
70          seekIfNecessary();
71          if (doc == null) {
72              throw new IOException("Stream closed.");
73          }
74          if (charPos >= table.getMaxCharPos()) {
75              return -1;
76          }
77  
78          int ch;
79          if (unicode) {
80              ch = Endian.littleChar(doc);
81          } else {
82              ch = Cp1252.decode(doc.read());
83          }
84          charPos++;
85          return ch;
86      }
87  
88  
89      public int read(char[] buf, int ofs, int len) throws IOException {
90          // FIXME: Think of a faster implementation that will work with
91          // both unicode and non-unicode.
92          seekIfNecessary();
93          if (doc == null) {
94              throw new IOException("Stream closed.");
95          }
96          if (charPos >= table.getMaxCharPos()) {
97              return 0;
98          }
99          for (int i = 0; i < len; i++) {
100             int ch = read();
101             if (ch < 0) {
102                 return i;
103             }
104             buf[ofs + i] = (char)ch;
105         }
106         return len;
107     }
108     
109     
110     public void close() throws IOException {
111         doc.close();
112         table = null;
113     }
114     
115     
116     public long position() throws IOException {
117         return charPos;
118     }
119     
120     
121     public void position(long p) throws IOException {
122         if (p > Integer.MAX_VALUE) {
123             throw new IOException("File too large.");
124         }
125         int charPos = (int)p;
126         Piece piece = table.pieceFor(charPos);
127         if (piece == null) {
128             throw new IOException("Illegal position: " + p);
129         }
130         unicode = piece.isUnicode();
131         limit = piece.getCharPosLimit();
132         
133         int ofs = charPos - piece.getCharPosStart();
134         this.charPos = charPos;
135         doc.position(piece.getFilePos() + ofs);
136     }
137 }