1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.io.arc;
24
25 import it.unimi.dsi.fastutil.io.RepositionableStream;
26
27 import java.io.File;
28 import java.io.FileInputStream;
29 import java.io.FileNotFoundException;
30 import java.io.IOException;
31 import java.io.InputStream;
32 import java.net.URI;
33 import java.net.URISyntaxException;
34
35 import org.archive.io.GzipHeader;
36 import org.archive.io.NoGzipMagicException;
37 import org.archive.net.UURI;
38
39 public class ARCUtils implements ARCConstants {
40 /***
41 * @param pathOrUri Path or URI to extract arc filename from.
42 * @return Extracted arc file name.
43 * @throws URISyntaxException
44 */
45 public static String parseArcFilename(final String pathOrUri)
46 throws URISyntaxException {
47 String path = pathOrUri;
48 if (UURI.hasScheme(pathOrUri)) {
49 URI url = new URI(pathOrUri);
50 path = url.getPath();
51 }
52 return (new File(path)).getName();
53 }
54
55 /***
56 * @param arcFile File to test.
57 * @return True if <code>arcFile</code> is compressed ARC.
58 * @throws IOException
59 */
60 public static boolean isCompressed(File arcFile) throws IOException {
61 return testCompressedARCFile(arcFile);
62 }
63
64 /***
65 * Check file is compressed and in ARC GZIP format.
66 *
67 * @param arcFile File to test if its Internet Archive ARC file
68 * GZIP compressed.
69 *
70 * @return True if this is an Internet Archive GZIP'd ARC file (It begins
71 * w/ the Internet Archive GZIP header and has the
72 * COMPRESSED_ARC_FILE_EXTENSION suffix).
73 *
74 * @exception IOException If file does not exist or is not unreadable.
75 */
76 public static boolean testCompressedARCFile(File arcFile)
77 throws IOException {
78 return testCompressedARCFile(arcFile, false);
79 }
80
81 /***
82 * Check file is compressed and in ARC GZIP format.
83 *
84 * @param arcFile File to test if its Internet Archive ARC file
85 * GZIP compressed.
86 * @param skipSuffixCheck Set to true if we're not to test on the
87 * '.arc.gz' suffix.
88 *
89 * @return True if this is an Internet Archive GZIP'd ARC file (It begins
90 * w/ the Internet Archive GZIP header).
91 *
92 * @exception IOException If file does not exist or is not unreadable.
93 */
94 public static boolean testCompressedARCFile(File arcFile,
95 boolean skipSuffixCheck)
96 throws IOException {
97 boolean compressedARCFile = false;
98 isReadable(arcFile);
99 if(!skipSuffixCheck && !arcFile.getName().toLowerCase()
100 .endsWith(COMPRESSED_ARC_FILE_EXTENSION)) {
101 return compressedARCFile;
102 }
103
104 final InputStream is = new FileInputStream(arcFile);
105 try {
106 compressedARCFile = testCompressedARCStream(is);
107 } finally {
108 is.close();
109 }
110 return compressedARCFile;
111 }
112
113 /***
114 * Tests passed stream is gzip stream by reading in the HEAD.
115 * Does not reposition the stream. That is left up to the caller.
116 * @param is An InputStream.
117 * @return True if compressed stream.
118 * @throws IOException
119 */
120 public static boolean testCompressedARCStream(final InputStream is)
121 throws IOException {
122 boolean compressedARCFile = false;
123 GzipHeader gh = null;
124 try {
125 gh = new GzipHeader(is);
126 } catch (NoGzipMagicException e ) {
127 return compressedARCFile;
128 }
129
130 byte[] fextra = gh.getFextra();
131
132
133
134
135 if (fextra != null &&
136 ARC_GZIP_EXTRA_FIELD.length - 2 == fextra.length) {
137 compressedARCFile = true;
138 for (int i = 0; i < fextra.length; i++) {
139 if (fextra[i] != ARC_GZIP_EXTRA_FIELD[i + 2]) {
140 compressedARCFile = false;
141 break;
142 }
143 }
144 }
145 return compressedARCFile;
146 }
147
148 /***
149 * Tests passed stream is gzip stream by reading in the HEAD.
150 * Does reposition of stream when done.
151 * @param rs An InputStream that is Repositionable.
152 * @return True if compressed stream.
153 * @throws IOException
154 */
155 public static boolean testCompressedRepositionalStream(
156 final RepositionableStream rs)
157 throws IOException {
158 boolean compressedARCFile = false;
159 long p = rs.position();
160 try {
161 compressedARCFile = testCompressedStream((InputStream)rs);
162 } finally {
163 rs.position(p);
164 }
165 return compressedARCFile;
166 }
167
168 /***
169 * Tests passed stream is gzip stream by reading in the HEAD.
170 * Does reposition of stream when done.
171 * @param is An InputStream.
172 * @return True if compressed stream.
173 * @throws IOException
174 */
175 public static boolean testCompressedStream(final InputStream is)
176 throws IOException {
177 boolean compressedARCFile = false;
178 try {
179 new GzipHeader(is);
180 compressedARCFile = true;
181 } catch (NoGzipMagicException e) {
182 return compressedARCFile;
183 }
184 return compressedARCFile;
185 }
186
187 /***
188 * Check file is uncompressed ARC file.
189 *
190 * @param arcFile
191 * File to test if its Internet Archive ARC file uncompressed.
192 *
193 * @return True if this is an Internet Archive ARC file.
194 *
195 * @exception IOException
196 * If file does not exist or is not unreadable.
197 */
198 public static boolean testUncompressedARCFile(File arcFile)
199 throws IOException {
200 boolean uncompressedARCFile = false;
201 isReadable(arcFile);
202 if(arcFile.getName().toLowerCase().endsWith(ARC_FILE_EXTENSION)) {
203 FileInputStream fis = new FileInputStream(arcFile);
204 try {
205 byte [] b = new byte[ARC_MAGIC_NUMBER.length()];
206 int read = fis.read(b, 0, ARC_MAGIC_NUMBER.length());
207 fis.close();
208 if (read == ARC_MAGIC_NUMBER.length()) {
209 StringBuffer beginStr
210 = new StringBuffer(ARC_MAGIC_NUMBER.length());
211 for (int i = 0; i < ARC_MAGIC_NUMBER.length(); i++) {
212 beginStr.append((char)b[i]);
213 }
214
215 if (beginStr.toString().
216 equalsIgnoreCase(ARC_MAGIC_NUMBER)) {
217 uncompressedARCFile = true;
218 }
219 }
220 } finally {
221 fis.close();
222 }
223 }
224
225 return uncompressedARCFile;
226 }
227
228
229 /***
230 * @param arcFile File to test.
231 * @exception IOException If file does not exist or is not unreadable.
232 */
233 private static void isReadable(File arcFile) throws IOException {
234 if (!arcFile.exists()) {
235 throw new FileNotFoundException(arcFile.getAbsolutePath() +
236 " does not exist.");
237 }
238
239 if (!arcFile.canRead()) {
240 throw new FileNotFoundException(arcFile.getAbsolutePath() +
241 " is not readable.");
242 }
243 }
244 }