View Javadoc

1   /* GzipHeader
2   *
3   * $Id: GzipHeader.java 4064 2005-12-20 18:11:33Z stack-sf $
4   *
5   * Created on July 5, 2004
6   *
7   * Copyright (C) 2004 Internet Archive.
8   *
9   * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24  */
25  package org.archive.io;
26  
27  import java.io.EOFException;
28  import java.io.IOException;
29  import java.io.InputStream;
30  import java.util.zip.CRC32;
31  import java.util.zip.Deflater;
32  import java.util.zip.GZIPInputStream;
33  
34  /***
35   * Read in the GZIP header.
36   * 
37   * See RFC1952 for specification on what the header looks like.
38   * Assumption is that stream is cued-up with the gzip header as the
39   * next thing to be read.
40   * 
41   * <p>Of <a href="http://jguru.com/faq/view.jsp?EID=13647">Java
42   * and unsigned bytes</a>. That is, its always a signed int in
43   * java no matter what the qualifier whether byte, char, etc.
44   * 
45   * <p>Add accessors for optional filename, comment and MTIME.
46   * 
47   * @author stack
48   */
49  public class GzipHeader {
50      /***
51       * Length of minimal GZIP header.
52       *
53       * See RFC1952 for explaination of value of 10.
54       */
55      public static final int MINIMAL_GZIP_HEADER_LENGTH = 10;
56      
57      /***
58       * Total length of the gzip header.
59       */
60      protected int length = 0;
61  
62      /***
63       * The GZIP header FLG byte.
64       */
65      protected int flg;
66      
67      /***
68       * GZIP header XFL byte.
69       */
70      private int xfl;
71      
72      /***
73       * GZIP header OS byte.
74       */
75      private int os;
76      
77      /***
78       * Extra header field content.
79       */
80      private byte [] fextra = null;
81      
82      /***
83       * GZIP header MTIME field.
84       */
85      private int mtime;
86      
87      
88      /***
89       * Shutdown constructor.
90       * 
91       * Must pass an input stream.
92       */
93      public GzipHeader() {
94          super();
95      }
96      
97      /***
98       * Constructor.
99       * 
100      * This constructor advances the stream past any gzip header found.
101      * 
102      * @param in InputStream to read from.
103      * @throws IOException
104      */
105     public GzipHeader(InputStream in) throws IOException {
106         super();
107         readHeader(in);
108     }
109     
110     /***
111      * Read in gzip header.
112      * 
113      * Advances the stream past the gzip header.
114      * @param in InputStream.
115      * 
116      * @throws IOException Throws if does not start with GZIP Header.
117      */
118     public void readHeader(InputStream in) throws IOException {
119         CRC32 crc = new CRC32();
120         crc.reset();
121         if (!testGzipMagic(in, crc)) {
122             throw new NoGzipMagicException();
123         }
124         this.length += 2;
125         if (readByte(in, crc) != Deflater.DEFLATED) {
126             throw new IOException("Unknown compression");
127         }
128         this.length++;
129        
130         // Get gzip header flag.
131         this.flg = readByte(in, crc);
132         this.length++;
133         
134         // Get MTIME.
135         this.mtime = readInt(in, crc);
136         this.length += 4;
137         
138         // Read XFL and OS.
139         this.xfl = readByte(in, crc);
140         this.length++;
141         this.os = readByte(in, crc);
142         this.length++;
143         
144         // Skip optional extra field -- stuff w/ alexa stuff in it.
145         final int FLG_FEXTRA = 4;
146         if ((this.flg & FLG_FEXTRA) == FLG_FEXTRA) {
147             int count = readShort(in, crc);
148             this.length +=2;
149             this.fextra = new byte[count];
150             readByte(in, crc, this.fextra, 0, count);
151             this.length += count;
152         }   
153         
154         // Skip file name.  It ends in null.
155         final int FLG_FNAME  = 8;
156         if ((this.flg & FLG_FNAME) == FLG_FNAME) {
157             while (readByte(in, crc) != 0) {
158                 this.length++;
159             }
160         }   
161         
162         // Skip file comment.  It ends in null.
163         final int FLG_FCOMMENT = 16;   // File comment
164         if ((this.flg & FLG_FCOMMENT) == FLG_FCOMMENT) {
165             while (readByte(in, crc) != 0) {
166                 this.length++;
167             }
168         }
169         
170         // Check optional CRC.
171         final int FLG_FHCRC  = 2;
172         if ((this.flg & FLG_FHCRC) == FLG_FHCRC) {
173             int calcCrc = (int)(crc.getValue() & 0xffff);
174             if (readShort(in, crc) != calcCrc) {
175                 throw new IOException("Bad header CRC");
176             }
177             this.length += 2;
178         }
179     }
180     
181     /***
182      * Test gzip magic is next in the stream.
183      * Reads two bytes.  Caller needs to manage resetting stream.
184      * @param in InputStream to read.
185      * @return true if found gzip magic.  False otherwise
186      * or an IOException (including EOFException).
187      * @throws IOException
188      */
189     public boolean testGzipMagic(InputStream in) throws IOException {
190         return testGzipMagic(in, null);
191     }
192     
193     /***
194      * Test gzip magic is next in the stream.
195      * Reads two bytes.  Caller needs to manage resetting stream.
196      * @param in InputStream to read.
197      * @param crc CRC to update.
198      * @return true if found gzip magic.  False otherwise
199      * or an IOException (including EOFException).
200      * @throws IOException
201      */
202     public boolean testGzipMagic(InputStream in, CRC32 crc)
203             throws IOException {
204         return readShort(in, crc) == GZIPInputStream.GZIP_MAGIC;
205     }
206     
207     /***
208      * Read an int. 
209      * 
210      * We do not expect to get a -1 reading.  If we do, we throw exception.
211      * Update the crc as we go.
212      * 
213      * @param in InputStream to read.
214      * @param crc CRC to update.
215      * @return int read.
216      * 
217      * @throws IOException
218      */
219     private int readInt(InputStream in, CRC32 crc) throws IOException {
220         int s = readShort(in, crc);
221         return ((readShort(in, crc) << 16) & 0xffff0000) | s;
222     }
223     
224     /***
225      * Read a short. 
226      * 
227      * We do not expect to get a -1 reading.  If we do, we throw exception.
228      * Update the crc as we go.
229      * 
230      * @param in InputStream to read.
231      * @param crc CRC to update.
232      * @return Short read.
233      * 
234      * @throws IOException
235      */
236     private int readShort(InputStream in, CRC32 crc) throws IOException {
237         int b = readByte(in, crc);
238         return ((readByte(in, crc) << 8) & 0x00ff00) | b;
239     }
240     
241     /***
242      * Read a byte. 
243      * 
244      * We do not expect to get a -1 reading.  If we do, we throw exception.
245      * Update the crc as we go.
246      * 
247      * @param in InputStream to read.
248      * @return Byte read.
249      * 
250      * @throws IOException
251      */
252     protected int readByte(InputStream in) throws IOException {
253             return readByte(in, null);
254     }
255     
256     /***
257      * Read a byte. 
258      * 
259      * We do not expect to get a -1 reading.  If we do, we throw exception.
260      * Update the crc as we go.
261      * 
262      * @param in InputStream to read.
263      * @param crc CRC to update.
264      * @return Byte read.
265      * 
266      * @throws IOException
267      */
268     protected int readByte(InputStream in, CRC32 crc) throws IOException {
269         int b = in.read();
270         if (b == -1) {
271             throw new EOFException();
272         }
273         if (crc != null) {
274             crc.update(b);
275         }
276         return b & 0xff;
277     }
278     
279     /***
280      * Read a byte. 
281      * 
282      * We do not expect to get a -1 reading.  If we do, we throw exception.
283      * Update the crc as we go.
284      * 
285      * @param in InputStream to read.
286      * @param crc CRC to update.
287      * @param buffer Buffer to read into.
288      * @param offset Offset to start filling buffer at.
289      * @param length How much to read.
290      * @return Bytes read.
291      * 
292      * @throws IOException
293      */
294     protected int readByte(InputStream in, CRC32 crc, byte [] buffer,
295                 int offset, int length)
296             throws IOException {
297         for (int i = offset; i < length; i++) {
298             buffer[offset + i] = (byte)readByte(in, crc);   
299         }
300         return length;
301     }
302     
303     /***
304      * @return Returns the fextra.
305      */
306     public byte[] getFextra() {
307         return this.fextra;
308     }
309     
310     /***
311      * @return Returns the flg.
312      */
313     public int getFlg() {
314         return this.flg;
315     }
316     
317     /***
318      * @return Returns the os.
319      */
320     public int getOs() {
321         return this.os;
322     }
323     
324     /***
325      * @return Returns the xfl.
326      */
327     public int getXfl() {
328         return this.xfl;
329     }
330     
331     /***
332      * @return Returns the mtime.
333      */
334     public int getMtime() {
335         return this.mtime;
336     }
337     
338     /***
339      * @return Returns the length.
340      */
341     public int getLength() {
342         return length;
343     }
344 }