View Javadoc

1   /* ARCUtils
2    *
3    * Created on Aug 10, 2005
4    *
5    * Copyright (C) 2005 Internet Archive.
6    *
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    *
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   *
14   * Heritrix is distributed in the hope that it will be useful,
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   *
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.io.arc;
24  
25  import it.unimi.dsi.fastutil.io.RepositionableStream;
26  
27  import java.io.File;
28  import java.io.FileInputStream;
29  import java.io.FileNotFoundException;
30  import java.io.IOException;
31  import java.io.InputStream;
32  import java.net.URI;
33  import java.net.URISyntaxException;
34  
35  import org.archive.io.GzipHeader;
36  import org.archive.io.NoGzipMagicException;
37  import org.archive.net.UURI;
38  
39  public class ARCUtils implements ARCConstants {
40      /***
41       * @param pathOrUri Path or URI to extract arc filename from.
42       * @return Extracted arc file name.
43       * @throws URISyntaxException 
44       */
45      public static String parseArcFilename(final String pathOrUri)
46      throws URISyntaxException {
47          String path = pathOrUri;
48          if (UURI.hasScheme(pathOrUri)) {
49              URI url = new URI(pathOrUri);
50              path = url.getPath();
51          }
52          return (new File(path)).getName();
53      }
54      
55      /***
56       * @param arcFile File to test.
57       * @return True if <code>arcFile</code> is compressed ARC.
58       * @throws IOException
59       */
60      public static boolean isCompressed(File arcFile) throws IOException {
61          return testCompressedARCFile(arcFile);
62      }
63      
64      /***
65       * Check file is compressed and in ARC GZIP format.
66       *
67       * @param arcFile File to test if its Internet Archive ARC file
68       * GZIP compressed.
69       *
70       * @return True if this is an Internet Archive GZIP'd ARC file (It begins
71       * w/ the Internet Archive GZIP header and has the
72       * COMPRESSED_ARC_FILE_EXTENSION suffix).
73       *
74       * @exception IOException If file does not exist or is not unreadable.
75       */
76      public static boolean testCompressedARCFile(File arcFile)
77      throws IOException {
78          return testCompressedARCFile(arcFile, false);
79      }
80  
81      /***
82       * Check file is compressed and in ARC GZIP format.
83       *
84       * @param arcFile File to test if its Internet Archive ARC file
85       * GZIP compressed.
86       * @param skipSuffixCheck Set to true if we're not to test on the
87       * '.arc.gz' suffix.
88       *
89       * @return True if this is an Internet Archive GZIP'd ARC file (It begins
90       * w/ the Internet Archive GZIP header).
91       *
92       * @exception IOException If file does not exist or is not unreadable.
93       */
94      public static boolean testCompressedARCFile(File arcFile,
95              boolean skipSuffixCheck)
96      throws IOException {
97          boolean compressedARCFile = false;
98          isReadable(arcFile);
99          if(!skipSuffixCheck && !arcFile.getName().toLowerCase()
100                 .endsWith(COMPRESSED_ARC_FILE_EXTENSION)) {
101             return compressedARCFile;
102         }
103         
104         final InputStream is = new FileInputStream(arcFile);
105         try {
106             compressedARCFile = testCompressedARCStream(is);
107         } finally {
108             is.close();
109         }
110         return compressedARCFile;
111     }
112     
113     /***
114      * Tests passed stream is gzip stream by reading in the HEAD.
115      * Does not reposition the stream.  That is left up to the caller.
116      * @param is An InputStream.
117      * @return True if compressed stream.
118      * @throws IOException
119      */
120     public static boolean testCompressedARCStream(final InputStream is)
121             throws IOException {
122         boolean compressedARCFile = false;
123         GzipHeader gh = null;
124         try {
125             gh = new GzipHeader(is);
126         } catch (NoGzipMagicException e ) {
127             return compressedARCFile;
128         }
129         
130         byte[] fextra = gh.getFextra();
131         // Now make sure following bytes are IA GZIP comment.
132         // First check length. ARC_GZIP_EXTRA_FIELD includes length
133         // so subtract two and start compare to ARC_GZIP_EXTRA_FIELD
134         // at +2.
135         if (fextra != null &&
136         		ARC_GZIP_EXTRA_FIELD.length - 2 == fextra.length) {
137             compressedARCFile = true;
138             for (int i = 0; i < fextra.length; i++) {
139                 if (fextra[i] != ARC_GZIP_EXTRA_FIELD[i + 2]) {
140                     compressedARCFile = false;
141                     break;
142                 }
143             }
144         }
145         return compressedARCFile;
146     }
147     
148     /***
149      * Tests passed stream is gzip stream by reading in the HEAD.
150      * Does reposition of stream when done.
151      * @param rs An InputStream that is Repositionable.
152      * @return True if compressed stream.
153      * @throws IOException
154      */
155     public static boolean testCompressedRepositionalStream(
156             final RepositionableStream rs)
157     throws IOException {
158         boolean compressedARCFile = false;
159         long p = rs.position();
160         try {
161             compressedARCFile = testCompressedStream((InputStream)rs);
162         } finally {
163             rs.position(p);
164         }
165         return compressedARCFile; 
166     }
167     
168     /***
169      * Tests passed stream is gzip stream by reading in the HEAD.
170      * Does reposition of stream when done.
171      * @param is An InputStream.
172      * @return True if compressed stream.
173      * @throws IOException
174      */
175     public static boolean testCompressedStream(final InputStream is)
176     throws IOException {
177         boolean compressedARCFile = false;
178         try {
179             new GzipHeader(is);
180             compressedARCFile = true;
181         } catch (NoGzipMagicException e) {
182             return compressedARCFile;
183         }
184         return compressedARCFile;
185     }
186     
187     /***
188      * Check file is uncompressed ARC file.
189      * 
190      * @param arcFile
191      *            File to test if its Internet Archive ARC file uncompressed.
192      * 
193      * @return True if this is an Internet Archive ARC file.
194      * 
195      * @exception IOException
196      *                If file does not exist or is not unreadable.
197      */
198     public static boolean testUncompressedARCFile(File arcFile)
199     throws IOException {
200         boolean uncompressedARCFile = false;
201         isReadable(arcFile);
202         if(arcFile.getName().toLowerCase().endsWith(ARC_FILE_EXTENSION)) {
203             FileInputStream fis = new FileInputStream(arcFile);
204             try {
205                 byte [] b = new byte[ARC_MAGIC_NUMBER.length()];
206                 int read = fis.read(b, 0, ARC_MAGIC_NUMBER.length());
207                 fis.close();
208                 if (read == ARC_MAGIC_NUMBER.length()) {
209                     StringBuffer beginStr
210                         = new StringBuffer(ARC_MAGIC_NUMBER.length());
211                     for (int i = 0; i < ARC_MAGIC_NUMBER.length(); i++) {
212                         beginStr.append((char)b[i]);
213                     }
214                     
215                     if (beginStr.toString().
216                             equalsIgnoreCase(ARC_MAGIC_NUMBER)) {
217                         uncompressedARCFile = true;
218                     }
219                 }
220             } finally {
221                 fis.close();
222             }
223         }
224 
225         return uncompressedARCFile;
226     }
227     
228 
229     /***
230      * @param arcFile File to test.
231      * @exception IOException If file does not exist or is not unreadable.
232      */
233     private static void isReadable(File arcFile) throws IOException {
234         if (!arcFile.exists()) {
235             throw new FileNotFoundException(arcFile.getAbsolutePath() +
236                 " does not exist.");
237         }
238 
239         if (!arcFile.canRead()) {
240             throw new FileNotFoundException(arcFile.getAbsolutePath() +
241                 " is not readable.");
242         }
243     }
244 }