1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler.writer;
26
27 import java.io.File;
28 import java.io.FileOutputStream;
29 import java.io.FilenameFilter;
30 import java.io.IOException;
31 import java.text.NumberFormat;
32 import java.util.Collections;
33 import java.util.HashMap;
34 import java.util.HashSet;
35 import java.util.Iterator;
36 import java.util.Map;
37 import java.util.Set;
38 import java.util.TreeMap;
39 import java.util.logging.Level;
40 import java.util.logging.Logger;
41
42 import javax.management.AttributeNotFoundException;
43
44 import org.archive.crawler.datamodel.CoreAttributeConstants;
45 import org.archive.crawler.datamodel.CrawlURI;
46 import org.archive.crawler.framework.Processor;
47 import org.archive.crawler.settings.ListType;
48 import org.archive.crawler.settings.RegularExpressionConstraint;
49 import org.archive.crawler.settings.SimpleType;
50 import org.archive.crawler.settings.StringList;
51 import org.archive.crawler.settings.Type;
52 import org.archive.io.RecordingInputStream;
53 import org.archive.io.ReplayInputStream;
54 import org.archive.net.UURI;
55 import org.archive.util.IoUtils;
56
57 /***
58 Processor module that writes the results of successful fetches to
59 files on disk.
60
61 Writes contents of one URI to one file on disk. The files are
62 arranged in a directory hierarchy based on the URI paths. In that sense
63 they mirror the file hierarchy that might exist on the servers.
64 <p>
65 There are a number of issues involved:
66 <ul>
67 <li>
68 URIs can have arbitrary length, but file systems have length constraints.
69 </li>
70 <li>
71 URIs can contain characters that file systems prohibit.
72 </li>
73 <li>
74 URI paths are case-sensitive, but some file systems are case-insensitive.
75 </li>
76 </ul>
77 This class tries very hard to map each URI into a file system path that
78 obeys all file system constraints and yet reasonably represents
79 the original URI.
80 <p>
81 There would normally be a single instance of this class per Heritrix
82 instance. This class is thread-safe; any number of threads can be in its
83 innerProcess method at once. However, conflicts can still arise in the file
84 system. For example, if several threads try to create the same directory at
85 the same time, only one can win. Therefore, there should be at most one
86 access to a server at a given time.
87
88 @author Howard Lee Gayle
89 */
90 public class MirrorWriterProcessor
91 extends Processor implements CoreAttributeConstants {
92
93 private static final long serialVersionUID = 301407556928389168L;
94
95 /***
96 * Key to use asking settings for case sensitive option.
97 */
98 public static final String ATTR_CASE_SENSITIVE = "case-sensitive";
99
100 /***
101 * Key to use asking settings for character map.
102 */
103 public static final String ATTR_CHAR_MAP = "character-map";
104
105 /***
106 * Key to use asking settings for content type map.
107 */
108 public static final String ATTR_CONTENT_TYPE_MAP = "content-type-map";
109
110 /***
111 * Key to use asking settings for dot begin replacement.
112 */
113 public static final String ATTR_DOT_BEGIN = "dot-begin";
114
115 /***
116 * Key to use asking settings for dot end replacement.
117 */
118 public static final String ATTR_DOT_END = "dot-end";
119
120 /***
121 * Key to use asking settings for directory file.
122 */
123 public static final String ATTR_DIRECTORY_FILE = "directory-file";
124
125 /***
126 * Key to use asking settings for host directory option.
127 */
128 public static final String ATTR_HOST_DIRECTORY = "host-directory";
129
130 /***
131 * Key to use asking settings for host map.
132 */
133 public static final String ATTR_HOST_MAP = "host-map";
134
135 /***
136 * Key to use asking settings for maximum file system path length.
137 */
138 public static final String ATTR_MAX_PATH_LEN = "max-path-length";
139
140 /***
141 * Key to use asking settings for maximum file system path segment length.
142 */
143 public static final String ATTR_MAX_SEG_LEN = "max-segment-length";
144
145 /***
146 * Key to use asking settings for base directory path value.
147 */
148 public static final String ATTR_PATH = "path";
149
150 /***
151 * Key to use asking settings for port directory option.
152 */
153 public static final String ATTR_PORT_DIRECTORY = "port-directory";
154
155 /***
156 * Key to use asking settings for suffix at end option.
157 */
158 public static final String ATTR_SUFFIX_AT_END = "suffix-at-end";
159
160 /***
161 * Key to use asking settings for too-long directory.
162 */
163 public static final String ATTR_TOO_LONG_DIRECTORY = "too-long-directory";
164
165 /***
166 * Key to use asking settings for underscore set.
167 */
168 public static final String ATTR_UNDERSCORE_SET = "underscore-set";
169
170 /*** Default value for ATTR_DOT_BEGIN.*/
171 private static final String DEFAULT_DOT_BEGIN = "%2E";
172
173 /*** Default maximum file system path length.*/
174 private static final int DEFAULT_MAX_PATH_LEN = 1023;
175
176 /*** Default maximum file system path segment length.*/
177 private static final int DEFAULT_MAX_SEG_LEN = 255;
178
179 /*** Default value for ATTR_TOO_LONG_DIRECTORY.*/
180 private static final String DEFAULT_TOO_LONG_DIRECTORY = "LONG";
181
182 /*** An empty Map.*/
183 private static final Map<String,String> EMPTY_MAP
184 = Collections.unmodifiableMap(new TreeMap<String,String>());
185
186 /***
187 Regular expression matching a file system path segment.
188 The intent is one or more non-file-separator characters.
189 The backslash is to quote File.separator if it's also backslash.
190 */
191 private static final String PATH_SEGMENT_RE =
192 "[^//" + File.separator + "]+";
193
194 /***
195 Regular expression constraint on ATTR_DIRECTORY_FILE.
196 The intent is one non-file-separator character,
197 followed by zero or more characters.
198 The backslash is to quote File.separator if it's also backslash.
199 */
200 private static final String TOO_LONG_DIRECTORY_RE =
201 "[^//" + File.separator + "].*";
202
203 /***
204 * Logger.
205 */
206 private static final Logger logger =
207 Logger.getLogger(MirrorWriterProcessor.class.getName());
208
209 /***
210 * @param name Name of this processor.
211 */
212 public MirrorWriterProcessor(String name) {
213 super(name, "MirrorWriter processor. " +
214 "A writer that writes each URL to a file on disk named for " +
215 "a derivative of the URL.");
216 Type e;
217 addElementToDefinition(new SimpleType(ATTR_CASE_SENSITIVE,
218 "True if the file system is case-sensitive, like UNIX. "
219 + "False if the file system is case-insensitive, "
220 + "like Macintosh HFS+ and Windows.",
221 Boolean.TRUE));
222 addElementToDefinition(new StringList(ATTR_CHAR_MAP,
223 "This list is grouped in pairs. "
224 + "The first string in each pair must have a length of one. "
225 + "If it occurs in a URI path, "
226 + "it is replaced by the second string in the pair. "
227 + "For UNIX, no character mapping is normally needed. "
228 + "For Macintosh, the recommended value is [: %%3A]. "
229 + "For Windows, the recommended value is "
230 + "[' ' %%20 " %%22 * %%2A : %%3A < %%3C "
231 + "//> %%3E ? %%3F //// %%5C ^ %%5E | %%7C]."));
232 addElementToDefinition(new StringList(ATTR_CONTENT_TYPE_MAP,
233 "This list is grouped in pairs. "
234 + "If the content type of a resource begins (case-insensitive) "
235 + "with the first string in a pair, the suffix is set to "
236 + "the second string in the pair, replacing any suffix that may "
237 + "have been in the URI. For example, to force all HTML files "
238 + "to have the same suffix, use [text/html html]."));
239 e = addElementToDefinition(new SimpleType(ATTR_DIRECTORY_FILE,
240 "Implicitly append this to a URI ending with '/'.",
241 "index.html"));
242 e.addConstraint(new RegularExpressionConstraint(PATH_SEGMENT_RE,
243 Level.SEVERE, "This must be a simple file name."));
244 e = addElementToDefinition(new SimpleType(ATTR_DOT_BEGIN,
245 "If a segment starts with '.', the '.' is replaced by this.",
246 DEFAULT_DOT_BEGIN));
247 e.addConstraint(new RegularExpressionConstraint(PATH_SEGMENT_RE,
248 Level.SEVERE,
249 "This must not be empty, and must not contain " + File.separator));
250 addElementToDefinition(new SimpleType(ATTR_DOT_END,
251 "If a directory name ends with '.' it is replaced by this. "
252 + "For all file systems except Windows, '.' is recommended. "
253 + "For Windows, %%2E is recommended.",
254 "."));
255 addElementToDefinition(new StringList(ATTR_HOST_MAP,
256 "This list is grouped in pairs. "
257 + "If a host name matches (case-insensitive) the first string "
258 + "in a pair, it is replaced by the second string in the pair. "
259 + "This can be used for consistency when several names are used "
260 + "for one host, for example "
261 + "[12.34.56.78 www42.foo.com]."));
262 addElementToDefinition(new SimpleType(ATTR_HOST_DIRECTORY,
263 "Create a subdirectory named for the host in the URI.",
264 Boolean.TRUE));
265 addElementToDefinition(new SimpleType(ATTR_PATH,
266 "Top-level directory for mirror files.", "mirror"));
267
268
269
270 addElementToDefinition(new SimpleType(ATTR_MAX_PATH_LEN,
271 "Maximum file system path length.",
272 new Integer(DEFAULT_MAX_PATH_LEN)));
273 addElementToDefinition(new SimpleType(ATTR_MAX_SEG_LEN,
274 "Maximum file system path segment length.",
275 new Integer(DEFAULT_MAX_SEG_LEN)));
276 addElementToDefinition(new SimpleType(ATTR_PORT_DIRECTORY,
277 "Create a subdirectory named for the port in the URI.",
278 Boolean.FALSE));
279 addElementToDefinition(new SimpleType(ATTR_SUFFIX_AT_END,
280 "If true, the suffix is placed at the end of the path, "
281 + "after the query (if any). If false, the suffix is placed "
282 + "before the query.",
283 Boolean.TRUE));
284 e = addElementToDefinition(new SimpleType(ATTR_TOO_LONG_DIRECTORY,
285 "If all the directories in the URI would exceed, "
286 + "or come close to exceeding, the file system maximum "
287 + "path length, then they are all replaced by this.",
288 DEFAULT_TOO_LONG_DIRECTORY));
289 e.addConstraint(new RegularExpressionConstraint(TOO_LONG_DIRECTORY_RE,
290 Level.SEVERE, "This must be relative and not empty."));
291 addElementToDefinition(new StringList(ATTR_UNDERSCORE_SET,
292 "If a directory name appears (case-insensitive) in this list "
293 + "then an underscore is placed before it. "
294 + "For all file systems except Windows, this is not needed. "
295 + "For Windows, the following is recommended: "
296 + "[com1 com2 com3 com4 com5 com6 com7 com8 com9 "
297 + "lpt1 lpt2 lpt3 lpt4 lpt5 lpt6 lpt7 lpt8 lpt9 "
298 + "con nul prn]."));
299 }
300
301 protected void innerProcess(CrawlURI curi) {
302 if (!curi.isSuccess()) {
303 return;
304 }
305 UURI uuri = curi.getUURI();
306
307
308 String scheme = uuri.getScheme();
309 if (!"http".equalsIgnoreCase(scheme)
310 && !"https".equalsIgnoreCase(scheme)) {
311 return;
312 }
313 RecordingInputStream recis = curi.getHttpRecorder().getRecordedInput();
314 if (0L == recis.getResponseContentLength()) {
315 return;
316 }
317
318 String baseDir = null;
319 String baseSeg = null;
320 try {
321 baseSeg = (String) getAttribute(ATTR_PATH, curi);
322 } catch (AttributeNotFoundException e) {
323 logger.warning(e.getLocalizedMessage());
324 return;
325 }
326
327
328 while ((baseSeg.length() > 1) && baseSeg.endsWith(File.separator)) {
329 baseSeg = baseSeg.substring(0, baseSeg.length() - 1);
330 }
331 if (0 == baseSeg.length()) {
332 baseDir = getController().getDisk().getPath();
333 } else if ((new File(baseSeg)).isAbsolute()) {
334 baseDir = baseSeg;
335 } else {
336 baseDir = getController().getDisk().getPath() + File.separator
337 + baseSeg;
338 }
339
340
341 boolean reCrawl = curi.containsKey(A_MIRROR_PATH);
342
343
344
345
346
347
348
349
350
351
352
353
354 String mps = null;
355 File destFile = null;
356 try {
357 if (reCrawl) {
358 mps = curi.getString(A_MIRROR_PATH);
359 destFile = new File(baseDir + File.separator + mps);
360 File parent = destFile.getParentFile();
361 if (null != parent) {
362 IoUtils.ensureWriteableDirectory(parent);
363 }
364 } else {
365 URIToFileReturn r = null;
366 try {
367 r = uriToFile(baseDir, curi);
368 } catch (AttributeNotFoundException e) {
369 logger.warning(e.getLocalizedMessage());
370 return;
371 }
372 destFile = r.getFile();
373 mps = r.getRelativePath();
374 }
375 logger.info(uuri.toString() + " -> " + destFile.getPath());
376 writeToPath(recis, destFile);
377 if (!reCrawl) {
378 curi.putString(A_MIRROR_PATH, mps);
379 }
380 } catch (IOException e) {
381 curi.addLocalizedError(this.getName(), e, "Mirror");
382 }
383 }
384
385 /***
386 Gets the directory in which the file will reside.
387 Any directories needed are created.
388 @param baseDir the path to the starting directory
389 @param host the host part of the URI, or null if the host name
390 should not be part of the returned path
391 @param port the port part of the URI, or -1 if the port
392 should not be part of the returned path
393 @param segs all the segments in the URI
394 @param maxLen the maximum path length allowed to the directory;
395 this must leave some room for the file itself
396 @return the directory, or null if maxLen would be exceeded
397 @throws IOException
398 if a needed directory could not be created
399 @throws IOException
400 if a needed directory is not writeable
401 @throws IOException
402 if a non-directory file exists with the same path as a needed directory
403 */
404 private URIToFileReturn dirPath(String baseDir, String host, int port,
405 PathSegment[] segs, int maxLen)
406 throws IOException {
407
408
409 URIToFileReturn r = new URIToFileReturn(baseDir, host, port);
410 r.mkdirs();
411 for (int i = 0; (segs.length - 1) != i; ++i) {
412 segs[i].addToPath(r);
413 if (r.longerThan(maxLen)) {
414 return null;
415 }
416 }
417 return r;
418 }
419
420 /***
421 Ensures that a list contains an even number of elements.
422 If not, the last element is removed.
423 @param list the list
424 */
425 private void ensurePairs(ListType list) {
426 if (1 == (list.size() % 2)) {
427 list.remove(list.size() - 1);
428 }
429 }
430
431 /***
432 Makes a path in which a resource can be stored.
433 @param baseDir the path to the starting directory
434 @param curi the URI
435 @return a path to the file in which to store the resource
436 @throws AttributeNotFoundException
437 if a needed setting is missing
438 @throws IOException
439 if a needed directory could not be created
440 @throws IOException
441 if a needed directory is not writeable
442 @throws IOException
443 if a non-directory file exists with the same path as a needed directory
444 */
445 private URIToFileReturn uriToFile(String baseDir, CrawlURI curi)
446 throws AttributeNotFoundException, IOException {
447 UURI uuri = curi.getUURI();
448 String host = null;
449 Boolean hd = (Boolean) getAttribute(ATTR_HOST_DIRECTORY, curi);
450 if (hd.booleanValue()) {
451 host = uuri.getHost();
452 StringList hostMap = (StringList) getAttribute(ATTR_HOST_MAP, curi);
453 if ((null != hostMap) && (hostMap.size() > 1)) {
454 ensurePairs(hostMap);
455 Iterator<String> i = hostMap.typesafe().iterator();
456 for (boolean more = true; more && i.hasNext();) {
457 String h1 = i.next();
458 String h2 = i.next();
459 if (host.equalsIgnoreCase(h1)) {
460 more = false;
461 if ((null != h2) && (0 != h2.length())) {
462 host = h2;
463 }
464 }
465 }
466 }
467 }
468
469 int port =
470 ((Boolean) getAttribute(ATTR_PORT_DIRECTORY, curi)).booleanValue()
471 ? uuri.getPort()
472 : -1;
473
474 String suffix = null;
475 StringList ctm = (StringList) getAttribute(ATTR_CONTENT_TYPE_MAP, curi);
476 if ((null != ctm) && (ctm.size() > 1)) {
477 ensurePairs(ctm);
478 String contentType = curi.getContentType().toLowerCase();
479 Iterator i = ctm.iterator();
480 for (boolean more = true; more && i.hasNext();) {
481 String ct = (String) i.next();
482 String suf = (String) i.next();
483 if ((null != ct) && contentType.startsWith(ct.toLowerCase())) {
484 more = false;
485 if ((null != suf) && (0 != suf.length())) {
486 suffix = suf;
487 }
488 }
489 }
490 }
491
492 int maxSegLen =
493 ((Integer) getAttribute(ATTR_MAX_SEG_LEN, curi)).intValue();
494 if (maxSegLen < 2) {
495 maxSegLen = DEFAULT_MAX_SEG_LEN;
496 }
497
498 int maxPathLen =
499 ((Integer) getAttribute(ATTR_MAX_PATH_LEN, curi)).intValue();
500 if (maxPathLen < 2) {
501 maxPathLen = DEFAULT_MAX_PATH_LEN;
502 }
503
504 Map<String,String> characterMap = EMPTY_MAP;
505 StringList cm = (StringList) getAttribute(ATTR_CHAR_MAP, curi);
506 if ((null != cm) && (cm.size() > 1)) {
507 ensurePairs(cm);
508 characterMap = new HashMap<String,String>(cm.size());
509
510 for (Iterator i = cm.iterator(); i.hasNext();) {
511 String s1 = (String) i.next();
512 String s2 = (String) i.next();
513 if ((null != s1) && (1 == s1.length()) && (null != s2)
514 && (0 != s2.length())) {
515 characterMap.put(s1, s2);
516 }
517 }
518 }
519
520 String dotBegin = (String) getAttribute(ATTR_DOT_BEGIN, curi);
521 if (".".equals(dotBegin)) {
522 dotBegin = null;
523 }
524
525 String dotEnd = (String) getAttribute(ATTR_DOT_END, curi);
526 if (".".equals(dotEnd)) {
527 dotEnd = null;
528 }
529
530 String tld = (String) getAttribute(ATTR_TOO_LONG_DIRECTORY, curi);
531 if ((null == tld) || (0 == tld.length())
532 || (-1 != tld.indexOf(File.separatorChar))) {
533 tld = DEFAULT_TOO_LONG_DIRECTORY;
534 }
535
536 Set<String> underscoreSet = null;
537 StringList us = (StringList) getAttribute(ATTR_UNDERSCORE_SET, curi);
538 if ((null != us) && (0 != us.size())) {
539 underscoreSet = new HashSet<String>(us.size(), 0.5F);
540 for (String s: us.typesafe()) {
541 if ((null != s) && (0 != s.length())) {
542 underscoreSet.add(s.toLowerCase());
543 }
544 }
545 }
546
547 return uriToFile(curi, host, port, uuri.getPath(), uuri.getQuery(),
548 suffix, baseDir, maxSegLen, maxPathLen,
549 ((Boolean) getAttribute(ATTR_CASE_SENSITIVE, curi)).booleanValue(),
550 (String) getAttribute(ATTR_DIRECTORY_FILE, curi),
551 characterMap, dotBegin, dotEnd, tld,
552 ((Boolean) getAttribute(ATTR_SUFFIX_AT_END, curi)).booleanValue(),
553 underscoreSet);
554 }
555
556 /***
557 Makes a path in which a resource can be stored.
558 @param curi the URI
559 @param host the host part of the URI, or null if the host name
560 should not be part of the returned path
561 @param port the port part of the URI, or -1 if the port
562 should not be part of the returned path
563 @param uriPath the path part of the URI (must be absolute)
564 @param query the query part of the URI, or null if none
565 @param suffix if non-null, use this as the suffix in preference to
566 any suffix that uriPath might have
567 @param baseDir the path to the starting directory
568 @param maxSegLen the maximum number of characters allowed in one
569 file system path segment (component)
570 @param maxPathLen the maximum number of characters allowed in a
571 file system path
572 @param caseSensitive if true, the file system is assumed to be
573 case-sensitive; otherwise the file system is assumed to be
574 case-insensitive but case-preserving
575 @param dirFile the simple file name to append to a URI path
576 ending in '/'
577 @param characterMap a map from characters (as length-1 String values) in
578 the URI path and query to replacement String values
579 @param dotBegin if non-null, this replaces a '.' at
580 the beginning of a segment
581 @param dotEnd if non-null, this replaces a '.' that appears at the end
582 of a directory name
583 @param tooLongDir if the path length would exceed or be close to
584 exceeding maxPathLen then this simple name is used as a directory
585 under baseDir instead
586 @param suffixAtEnd if true, the suffix is placed at the end of the
587 path, after the query (if any); otherwise, the suffix is placed
588 before the query
589 @param underscoreSet if non-null and a segment, after conversion
590 to lower case, is in this set, then prepend an underscore
591 to the segment
592 @return a path to the file in which to store the resource
593 @throws IOException
594 if a needed directory could not be created
595 @throws IOException
596 if a needed directory is not writeable
597 @throws IOException
598 if a non-directory file exists with the same path as a needed directory
599 */
600 private URIToFileReturn uriToFile(CrawlURI curi, String host, int port,
601 String uriPath, String query, String suffix, String baseDir,
602 int maxSegLen, int maxPathLen, boolean caseSensitive,
603 String dirFile, Map characterMap, String dotBegin, String dotEnd,
604 String tooLongDir, boolean suffixAtEnd, Set underscoreSet)
605 throws IOException {
606 assert (null == host) || (0 != host.length());
607 assert 0 != uriPath.length();
608 assert '/' == uriPath.charAt(0) : "uriPath: " + uriPath;
609 assert -1 == uriPath.indexOf("//") : "uriPath: " + uriPath;
610 assert -1 == uriPath.indexOf("/./") : "uriPath: " + uriPath;
611 assert !uriPath.endsWith("/.") : "uriPath: " + uriPath;
612 assert (null == query) || (-1 == query.indexOf('/'))
613 : "query: " + query;
614 assert (null == suffix)
615 || ((0 != suffix.length()) && (-1 == suffix.indexOf('/')))
616 : "suffix: " + suffix;
617 assert 0 != baseDir.length();
618 assert maxSegLen > 2 : "maxSegLen: " + maxSegLen;
619 assert maxPathLen > 1;
620 assert maxPathLen >= maxSegLen
621 : "maxSegLen: " + maxSegLen + " maxPathLen: " + maxPathLen;
622 assert 0 != dirFile.length();
623 assert -1 == dirFile.indexOf("/") : "dirFile: " + dirFile;
624 assert null != characterMap;
625 assert (null == dotBegin) || (0 != dotBegin.length());
626 assert (null == dotEnd) || !dotEnd.endsWith(".") : "dotEnd: " + dotEnd;
627 assert 0 != tooLongDir.length();
628 assert '/' != tooLongDir.charAt(0) : "tooLongDir: " + tooLongDir;
629
630 int nSegs = 0;
631 for (int i = 0; uriPath.length() != i; ++i) {
632 if ('/' == uriPath.charAt(i)) {
633 ++nSegs;
634 }
635 }
636 assert nSegs > 0 : "uriPath: " + uriPath;
637 PathSegment[] segs = new PathSegment[nSegs];
638 int slashIndex = 0;
639 for (int i = 0; (segs.length - 1) != i; ++i) {
640 int nsi = uriPath.indexOf('/', slashIndex + 1);
641 assert nsi > slashIndex : "uriPath: " + uriPath;
642 segs[i] = new DirSegment(uriPath, slashIndex + 1, nsi,
643 maxSegLen, caseSensitive, curi,
644 characterMap, dotBegin, dotEnd,
645 underscoreSet);
646 slashIndex = nsi;
647 }
648 if (slashIndex < (uriPath.length() - 1)) {
649
650
651 segs[segs.length - 1] = new EndSegment(uriPath, slashIndex + 1,
652 uriPath.length(), maxSegLen, caseSensitive, curi,
653 characterMap, dotBegin, query, suffix, maxPathLen,
654 suffixAtEnd);
655 } else {
656
657
658 segs[segs.length - 1] = new EndSegment(dirFile, 0, dirFile.length(),
659 maxSegLen, caseSensitive, curi, characterMap, null,
660 query, suffix, maxPathLen, suffixAtEnd);
661 }
662 URIToFileReturn r = dirPath(baseDir, host, port, segs,
663 maxPathLen - maxSegLen);
664 if (null == r) {
665
666
667
668 PathSegment endSegment = segs[segs.length - 1];
669 segs = new PathSegment[2];
670 segs[0] = new DirSegment(tooLongDir, 0, tooLongDir.length(),
671 maxSegLen, caseSensitive, curi, EMPTY_MAP,
672 null, null, null);
673 segs[1] = endSegment;
674 r = dirPath(baseDir, host, port, segs, maxPathLen - maxSegLen);
675 }
676 segs[segs.length - 1].addToPath(r);
677 return r;
678 }
679
680 /***
681 Copies a resource into a file.
682 A temporary file is created and then atomically renamed to
683 the destination file.
684 This prevents leaving a partial file in case of a crash.
685 @param recis the RecordingInputStream that recorded the contents
686 of the resource
687 @param dest the destination file
688 @throws IOException on I/O error
689 @throws IOException if
690 the file rename fails
691 */
692 private void writeToPath(RecordingInputStream recis, File dest)
693 throws IOException {
694 ReplayInputStream replayis = recis.getContentReplayInputStream();
695 File tf = new File (dest.getPath() + "N");
696 FileOutputStream fos = new FileOutputStream(tf);
697 try {
698 replayis.readFullyTo(fos);
699 } finally {
700 fos.close();
701 replayis.close();
702 }
703 if (!tf.renameTo(dest)) {
704 throw new IOException("Can not rename " + tf.getAbsolutePath()
705 + " to " + dest.getAbsolutePath());
706 }
707
708 }
709
710 /***
711 This class represents one segment (component) of a URI path.
712 A segment between '/' characters is a directory segment.
713 The segment after the last '/' is the end segment.
714 */
715 abstract class PathSegment {
716 /***
717 existsMaybeCaseSensitive return code
718 for a file that does not exist.
719 */
720 protected static final int EXISTS_NOT = 1;
721
722 /***
723 existsMaybeCaseSensitive return code
724 for a file that exists.
725 Furthermore, the comparison is case-sensitive.
726 */
727 protected static final int EXISTS_EXACT_MATCH = 2;
728
729 /***
730 existsMaybeCaseSensitive return code
731 for a file that exists, using a case-insensitive comparison.
732 Furthermore, the file would not exist if the comparison
733 were case-sensitive.
734 */
735 protected static final int EXISTS_CASE_INSENSITIVE_MATCH = 3;
736
737 /*** The URI, for logging and error reporting.*/
738 protected CrawlURI curi;
739
740 /***
741 The main part of this segment.
742 For a directory segment, that's all there is.
743 For an end segment, it's the part of the URI after the last '/'
744 up to but not including the '.' before the suffix (if any).
745 */
746 protected LumpyString mainPart = null;
747
748 /***
749 The maximum number of characters allowed
750 in one file system path segment.
751 A URI segment can potentially be much longer,
752 but we'll trim it to this.
753 */
754 protected int maxSegLen;
755
756 /*** If true, the file system is assumed to be
757 case-sensitive; otherwise the file system is assumed to be
758 case-insensitive.
759 */
760 private boolean caseSensitive;
761
762 /***
763 Creates a new PathSegment.
764 @param maxSegLen the maximum number of characters
765 allowed in one path segment
766 @param caseSensitive if true, the file system is assumed to be
767 case-sensitive; otherwise the file system is assumed to be
768 case-insensitive
769 @param curi the URI
770 @throws IllegalArgumentException if
771 maxSegLen is too small
772 */
773 PathSegment(int maxSegLen, boolean caseSensitive, CrawlURI curi) {
774 if (maxSegLen < 2) {
775 throw new IllegalArgumentException("maxSegLen: " + maxSegLen);
776 }
777 this.maxSegLen = maxSegLen;
778 this.caseSensitive = caseSensitive;
779 this.curi = curi;
780 }
781
782 /***
783 Adds this segment to a file path.
784 This is the key method of this class.
785 It extends the given path by one segment,
786 named to obey all constraints.
787 A new directory is created if necessary.
788 @param currentPath the current path, to which this segment is added
789 @throws IOException
790 if a needed directory could not be created
791 @throws IOException
792 if a needed directory is not writeable
793 */
794 abstract void addToPath(URIToFileReturn currentPath) throws IOException;
795
796 /***
797 Checks if a file (including directories) exists.
798 @param fsf the directory containing the file to be checked
799 @param segStr the simple file or directory name
800 @param check the file or directory for which to check
801 @return EXISTS_NOT if check does not exist,
802 EXISTS_EXACT_MATCH if check exists with a name that matches
803 (case-sensitive) segStr, and
804 EXISTS_CASE_INSENSITIVE_MATCH if check exists
805 with a name that matches
806 segStr using a case-insensitive match but not using a
807 case-sensitive match
808 */
809 protected int existsMaybeCaseSensitive(File fsf, String segStr,
810 File check) {
811 if (caseSensitive) {
812 return check.exists() ? EXISTS_EXACT_MATCH : EXISTS_NOT;
813 }
814 if (!check.exists()) {
815 return EXISTS_NOT;
816 }
817
818
819
820
821
822
823
824
825 String[] fna = fsf.list(new CaseInsensitiveFilenameFilter(segStr));
826 for (int i = 0; fna.length != i; ++i) {
827 if (segStr.equals(fna[i])) {
828 return EXISTS_EXACT_MATCH;
829 }
830 }
831 return EXISTS_CASE_INSENSITIVE_MATCH;
832 }
833
834 /***
835 This class implements a FilenameFilter that matches
836 by name, ignoring case.
837 */
838 class CaseInsensitiveFilenameFilter implements FilenameFilter {
839 /*** The file name we're looking for. */
840 private String target;
841
842 /***
843 Creates a CaseInsensitiveFilenameFilter.
844 @param target the target file name
845 @throws IllegalArgumentException if
846 target is null or empty.
847 */
848 CaseInsensitiveFilenameFilter(String target) {
849 if (null == target) {
850 throw new IllegalArgumentException("target null");
851 }
852 if (0 == target.length()) {
853 throw new IllegalArgumentException("target empty");
854 }
855 this.target = target;
856 }
857
858 public boolean accept(File dir, String name) {
859 return target.equalsIgnoreCase(name);
860 }
861 }
862 }
863
864 /***
865 This class represents one directory segment (component) of a URI path.
866 */
867 class DirSegment extends PathSegment {
868 /*** If a segment name is in this set, prepend an underscore.*/
869 private Set underscoreSet;
870
871 /***
872 Creates a DirSegment.
873 @param uriPath the path part of the URI
874 @param beginIndex the beginning index, inclusive, of the substring
875 of uriPath to be used
876 @param endIndex the ending index, exclusive, of the substring
877 of uriPath to be used
878 @param maxSegLen the maximum number of characters allowed in one
879 file system path segment (component)
880 @param caseSensitive if true, the file system is assumed to be
881 case-sensitive; otherwise the file system is assumed to be
882 case-insensitive but case-preserving
883 @param curi the URI
884 @param characterMap a map from characters
885 (as length-1 String values) in
886 the URI path and query to replacement String values
887 @param dotBegin if non-null, this replaces a '.' at
888 the beginning of the directory name
889 @param dotEnd if non-null, this replaces a '.'
890 that appears at the end of a directory name
891 @param underscoreSet if non-null and a segment, after conversion
892 to lower case, is in this set, then prepend an underscore
893 to the segment
894 @throws IllegalArgumentException if
895 beginIndex is negative.
896 @throws IllegalArgumentException if
897 endIndex is less than beginIndex.
898 @throws IllegalArgumentException if
899 maxSegLen is too small.
900 */
901 DirSegment(String uriPath, int beginIndex, int endIndex, int maxSegLen,
902 boolean caseSensitive, CrawlURI curi, Map characterMap,
903 String dotBegin, String dotEnd, Set underscoreSet) {
904 super(maxSegLen, caseSensitive, curi);
905 mainPart = new LumpyString(uriPath, beginIndex, endIndex,
906 (null == dotEnd) ? 0 : dotEnd.length(),
907 this.maxSegLen, characterMap, dotBegin);
908 if (null != dotEnd) {
909
910
911
912
913 int dl = dotEnd.length();
914 while (mainPart.endsWith('.')) {
915
916
917 mainPart.trimToMax(mainPart.length() - 1);
918 if ((mainPart.length() + dl) <= this.maxSegLen) {
919 mainPart.append(dotEnd);
920 }
921 }
922 }
923 this.underscoreSet = underscoreSet;
924 }
925
926 void addToPath(URIToFileReturn currentPath) throws IOException {
927 NumberFormat nf = null;
928 int startLen = mainPart.length();
929 for (int i = 0; ; ++i) {
930 if (0 != i) {
931
932
933
934 if (null == nf) {
935 nf = NumberFormat.getIntegerInstance();
936 }
937 String ending = nf.format(i);
938 mainPart.trimToMax(Math.min(startLen,
939 maxSegLen - ending.length()));
940 mainPart.append(ending);
941 }
942 String segStr = mainPart.toString();
943 if ((null != underscoreSet)
944 && underscoreSet.contains(segStr.toLowerCase())) {
945 mainPart.prepend('_');
946 ++startLen;
947 mainPart.trimToMax(maxSegLen);
948 segStr = mainPart.toString();
949 }
950 File fsf = currentPath.getFile();
951 File f = new File(fsf, segStr);
952 int er = existsMaybeCaseSensitive(fsf, segStr, f);
953 switch (er) {
954 case EXISTS_NOT:
955 if (!f.mkdir()) {
956 throw new IOException("Can not mkdir "
957 + f.getAbsolutePath());
958 }
959 currentPath.append(f, segStr);
960 return;
961
962 case EXISTS_EXACT_MATCH:
963 if (f.isDirectory()) {
964 if (!f.canWrite()) {
965 throw new IOException("Directory "
966 + f.getAbsolutePath()
967 + " not writeable.");
968 }
969
970
971
972
973
974
975
976
977
978
979
980 currentPath.append(f, segStr);
981 return;
982 }
983
984
985
986
987
988
989
990
991
992
993
994 break;
995
996 case EXISTS_CASE_INSENSITIVE_MATCH:
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009 break;
1010
1011 default:
1012 throw new IllegalStateException("Code: " + er);
1013 }
1014 }
1015 }
1016 }
1017
1018 /***
1019 This class represents the last segment (component) of a URI path.
1020 */
1021 class EndSegment extends PathSegment {
1022 /***
1023 The number of characters in the path up to this EndSegment,
1024 including the final File.separatorChar.
1025 */
1026 private int dirPathLen;
1027
1028 /***
1029 The maximum number of characters allowed in a file path, minus 1.
1030 The extra 1 is reserved for temporarily appending
1031 a character so an existing file can be replaced atomically,
1032 for example, by writing
1033 <code>foo.htmlN</code>
1034 and then renaming it to
1035 <code>foo.html</code>.
1036 */
1037 private int maxPathLen;
1038
1039 /*** The query part of the URI, or null if none.*/
1040 private LumpyString query = null;
1041
1042 /***
1043 The suffix, or null if none.
1044 This isn't a LumpyString because we'd only trim a suffix
1045 if space were very, very tight.
1046 */
1047 private String suffix = null;
1048
1049 /***
1050 True if the suffix goes at the end, after the query.
1051 False if the suffix goes before the query.
1052 */
1053 private boolean suffixAtEnd;
1054
1055 /*** Appended to mainPart if necessary to create a unique file name.*/
1056 private String uniquePart = null;
1057
1058 /***
1059 Creates an EndSegment.
1060 @param uriPath the path part of the URI
1061 @param beginIndex the beginning index, inclusive, of the substring
1062 of uriPath to be used
1063 @param endIndex the ending index, exclusive, of the substring
1064 of uriPath to be used
1065 @param maxSegLen the maximum number of characters allowed in one
1066 file system path segment (component)
1067 @param caseSensitive if true, the file system is assumed to be
1068 case-sensitive; otherwise the file system is assumed to be
1069 case-insensitive but case-preserving
1070 @param curi the URI
1071 @param characterMap maps characters (as length-1 String values) in
1072 the URI path and query to replacement String values
1073 @param dotBegin if non-null, this replaces a '.' at
1074 the beginning of the segment
1075 @param query the query part of the URI, or null if none
1076 @param suffix if non-null, use this as the suffix in preference to
1077 any suffix that uriPath might have
1078 @param maxPathLen the maximum number of characters allowed in a
1079 file system path
1080 @param suffixAtEnd if true, the suffix is placed at the end of the
1081 path, after the query (if any); otherwise, the suffix is placed
1082 before the query
1083 @throws IllegalArgumentException if
1084 beginIndex is negative.
1085 @throws IllegalArgumentException if
1086 endIndex is less than beginIndex.
1087 @throws IllegalArgumentException if
1088 maxSegLen is too small.
1089 */
1090 EndSegment(String uriPath, int beginIndex, int endIndex, int maxSegLen,
1091 boolean caseSensitive, CrawlURI curi, Map characterMap,
1092 String dotBegin, String query, String suffix,
1093 int maxPathLen, boolean suffixAtEnd) {
1094 super(maxSegLen - 1, caseSensitive, curi);
1095 int mpe = endIndex;
1096 int ldi = uriPath.lastIndexOf('.');
1097 if ((ldi > 0) && (ldi < (endIndex - 1)) && (ldi > beginIndex)) {
1098 mpe = ldi;
1099 }
1100 this.suffix = suffix;
1101 if ((null == this.suffix) && (mpe < (endIndex - 1))) {
1102
1103
1104
1105 LumpyString ls = new LumpyString(uriPath, mpe + 1, endIndex, 0,
1106 this.maxSegLen, characterMap,
1107 null);
1108 this.suffix = ls.toString();
1109 }
1110 int pad = ((null == this.suffix) ? 0 : (1 + this.suffix.length()))
1111 + ((null == query) ? 0 : query.length());
1112 mainPart = new LumpyString(uriPath, beginIndex, mpe, pad,
1113 this.maxSegLen, characterMap, dotBegin);
1114 this.maxPathLen = maxPathLen - 1;
1115 if (null != query) {
1116 this.query = new LumpyString(query, 0, query.length(), 0,
1117 this.maxSegLen, characterMap,
1118 null);
1119 }
1120 this.suffixAtEnd = suffixAtEnd;
1121 }
1122
1123 void addToPath(URIToFileReturn currentPath) {
1124 File fsf = currentPath.getFile();
1125 NumberFormat nf = null;
1126 dirPathLen = 1 + fsf.getPath().length();
1127 for (int i = 0; ; ++i) {
1128 if (0 != i) {
1129 if (null == nf) {
1130 nf = NumberFormat.getIntegerInstance();
1131 }
1132 uniquePart = nf.format(i);
1133 }
1134 trimWithPadding((null == uniquePart) ? 0 : uniquePart.length());
1135 String segStr = joinParts();
1136 File f = new File(fsf, segStr);
1137
1138
1139 int er = existsMaybeCaseSensitive(fsf, segStr, f);
1140 switch (er) {
1141 case EXISTS_NOT:
1142 currentPath.append(f, segStr);
1143 return;
1144
1145 case EXISTS_EXACT_MATCH:
1146 if (f.isFile()) {
1147 currentPath.append(f, segStr);
1148 return;
1149 }
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159 break;
1160
1161 case EXISTS_CASE_INSENSITIVE_MATCH:
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174 break;
1175
1176 default:
1177 throw new IllegalStateException("Code: " + er);
1178 }
1179 }
1180 }
1181
1182 /***
1183 Creates a simple file name from the parts of this EndSegment.
1184 @return a simple file name constructed from the main part,
1185 unique part, query, and suffix
1186 */
1187 private String joinParts() {
1188 StringBuffer sb = new StringBuffer(length());
1189 sb.append(mainPart.asStringBuffer());
1190 if (null != uniquePart) {
1191 sb.append(uniquePart);
1192 }
1193 if (suffixAtEnd) {
1194 if (null != query) {
1195 sb.append(query);
1196 }
1197 if (null != suffix) {
1198 sb.append('.');
1199 sb.append(suffix);
1200 }
1201 } else {
1202 if (null != suffix) {
1203 sb.append('.');
1204 sb.append(suffix);
1205 }
1206 if (null != query) {
1207 sb.append(query);
1208 }
1209 }
1210 return sb.toString();
1211 }
1212
1213 /***
1214 Gets the number of available character positions.
1215 If this EndSegment were converted to a path,
1216 it would have a path length and a segment length.
1217 There are two constraints: maxSegLen and maxPathLen.
1218 The number of character positions available before bumping
1219 into the lower constraint is computed.
1220 @return the number of available positions, which may be negative
1221 */
1222 private int lenAvail() {
1223 int len = length();
1224 return Math.min(maxSegLen - len, maxPathLen - dirPathLen - len);
1225 }
1226
1227 /***
1228 Gets the length of the simple file name that would be
1229 created for this EndSegment.
1230 @return the length
1231 */
1232 private int length() {
1233 int r = mainPart.length();
1234 if (null != uniquePart) {
1235 r += uniquePart.length();
1236 }
1237 if (null != query) {
1238 r += query.length();
1239 }
1240 if (null != suffix) {
1241 r += 1 + suffix.length();
1242 }
1243 return r;
1244 }
1245
1246 /***
1247 Trims this EndSegment so a given number of characters are available.
1248 After trimming, there will be room for at least
1249 padding more characters before one of the constraints is
1250 encountered.
1251 The choices for trimming, in priority order, are:
1252 <ol>
1253 <li>Shorten the query.</li>
1254 <li>Remove the query.</li>
1255 <li>Shorten the main part.</li>
1256 <li>Shorten the suffix.</li>
1257 </ol>
1258 @param padding the number of character positions that need to be
1259 available
1260 @throws IllegalStateException
1261 if it's impossible to trim enough
1262 */
1263 private void trimWithPadding(int padding) {
1264 assert padding >= 0 : "padding: " + padding;
1265 int la = lenAvail();
1266 if (la >= padding) {
1267 return;
1268 }
1269
1270
1271
1272 if (null != query) {
1273 query.trimToMax(Math.max(0, query.length() - (padding - la)));
1274 if (0 == query.length()) {
1275 query = null;
1276 }
1277 la = lenAvail();
1278 if (la >= padding) {
1279 return;
1280 }
1281 }
1282 mainPart.trimToMax(Math.max(1, mainPart.length() - (padding - la)));
1283 la = lenAvail();
1284 if (la >= padding) {
1285 return;
1286 }
1287 if (null != suffix) {
1288 suffix = suffix.substring(0, Math.max(1, suffix.length()
1289 - (padding - la)));
1290 la = lenAvail();
1291 if (la >= padding) {
1292 return;
1293 }
1294 }
1295 throw new IllegalStateException("Can not trim " + curi.toString());
1296 }
1297 }
1298
/**
 * This class represents a dynamically growable string
 * consisting of substrings ("lumps") that
 * are treated atomically. If the string is shortened, then an entire
 * lump is removed. The intent is to treat each %XX escape as a lump.
 * This class also allows single characters in a source string to be
 * re-mapped to a different string, possibly containing more than
 * one character.
 * Each re-mapped character is also treated as a lump.
 * <p>
 * For example, suppose part of a URI, between two slashes, is
 * <code>/VeryLongString...%3A/</code>.
 * We want to create a corresponding file system directory, but the string
 * is a little longer than the allowed maximum.
 * It's better to trim the entire
 * <code>%3A</code>
 * off the end than part of it.
 * This is especially true if, later, we need to append some digits
 * to create a unique directory name.
 * So we treat the entire
 * <code>%3A</code>
 * as one lump.
 */
class LumpyString {
    /**
     * Lumps are indicated by an auxiliary array aux[],
     * indexed the same as the string. The LUMP_BEGIN bit is set
     * for a position in the string at which a lump begins.
     */
    private static final byte LUMP_BEGIN = 0x1;

    /** Bit set for the end of a lump. */
    private static final byte LUMP_END = 0x2;

    /**
     * Bit set for all characters in a lump of length greater than 1,
     * except the beginning and ending characters.
     */
    private static final byte LUMP_MID = 0x4;

    /** The auxiliary array. */
    private byte[] aux;

    /** Holds the string. */
    private StringBuffer string;

    /**
     * Creates a LumpyString.
     * Characters of the source are converted to lumps one at a time and
     * appended; conversion stops silently at the first lump that would
     * make the string longer than maxLen.
     * @param str the source string
     * @param beginIndex the beginning index, inclusive, of the substring
     *     of str to be used
     * @param endIndex the ending index, exclusive, of the substring
     *     of str to be used
     * @param padding reserve this many additional character positions
     *     before dynamic growth is needed
     * @param maxLen the maximum string length, regardless of the
     *     values of beginIndex, endIndex, and padding
     * @param characterMap maps from characters in the source string
     *     (represented as length-one String values) to replacement String
     *     values (length at least 1).
     *     Each replacement string is treated as one lump.
     *     This is intended to cope with characters that a file system
     *     does not allow.
     * @param dotBegin if non-null, this replaces a '.' at
     *     <code>str[beginIndex]</code>
     * @throws IllegalArgumentException if
     *     beginIndex is negative.
     * @throws IllegalArgumentException if
     *     endIndex is less than beginIndex.
     * @throws IllegalArgumentException if
     *     padding is negative.
     * @throws IllegalArgumentException if
     *     maxLen is less than one.
     * @throws IllegalArgumentException if
     *     characterMap is null.
     * @throws IllegalArgumentException if
     *     dotBegin is non-null but empty.
     */
    LumpyString(String str, int beginIndex, int endIndex, int padding,
            int maxLen, Map characterMap, String dotBegin) {
        if (beginIndex < 0) {
            throw new IllegalArgumentException("beginIndex < 0: "
                                               + beginIndex);
        }
        if (endIndex < beginIndex) {
            // Separators added so the two values are readable in the
            // message (previously rendered as "...beginIndex: 2endIndex: 1").
            throw new IllegalArgumentException("endIndex < beginIndex; "
                + "beginIndex: " + beginIndex + ", endIndex: " + endIndex);
        }
        if (padding < 0) {
            throw new IllegalArgumentException("padding < 0: " + padding);
        }
        if (maxLen < 1) {
            throw new IllegalArgumentException("maxLen < 1: " + maxLen);
        }
        if (null == characterMap) {
            throw new IllegalArgumentException("characterMap null");
        }
        if ((null != dotBegin) && (0 == dotBegin.length())) {
            throw new IllegalArgumentException("dotBegin empty");
        }

        // The initial capacity is only an estimate (never more than
        // maxLen, never less than 1); append() grows both buffers
        // on demand.
        int cap = Math.min(2 * (endIndex - beginIndex) + padding + 1,
                           maxLen);
        string = new StringBuffer(cap);
        aux = new byte[cap];
        for (int i = beginIndex; i != endIndex; ++i) {
            String s = str.substring(i, i + 1);
            String lump;
            if (".".equals(s) && (i == beginIndex) && (null != dotBegin)) {
                lump = dotBegin;
            } else {
                lump = (String) characterMap.get(s);
            }
            if (null == lump) {
                if ("%".equals(s) && ((endIndex - i) > 2)
                        && (-1 != Character.digit(str.charAt(i + 1), 16))
                        && (-1 != Character.digit(str.charAt(i + 2), 16))) {
                    // A complete %XX escape inside the substring:
                    // keep all three characters together as one lump.
                    lump = str.substring(i, i + 3);
                    i += 2;
                } else {
                    lump = s;
                }
            }
            if ((string.length() + lump.length()) > maxLen) {
                // The next lump does not fit: the rest is dropped.
                assert checkInvariants();
                return;
            }
            append(lump);
        }
        assert checkInvariants();
    }

    /**
     * Converts this LumpyString to a String.
     * @return the current string contents
     */
    public String toString() {
        assert checkInvariants();
        return string.toString();
    }

    /**
     * Appends one lump to the end of this string.
     * The whole argument becomes a single lump, whatever its length.
     * @param lump the lump (substring) to append
     * @throws IllegalArgumentException if
     *     lump is null or empty.
     */
    void append(String lump) {
        if (null == lump) {
            throw new IllegalArgumentException("lump null");
        }
        int lumpLen = lump.length();
        if (0 == lumpLen) {
            throw new IllegalArgumentException("lump empty");
        }
        int pos = string.length();
        ensureCapacity(pos + lumpLen);
        if (1 == lumpLen) {
            // A one-character lump is both its own beginning and end.
            aux[pos] = LUMP_BEGIN | LUMP_END;
        } else {
            assert lumpLen > 1;
            aux[pos] = LUMP_BEGIN;
            ++pos;
            for (int i = lumpLen - 2; 0 != i; --i) {
                aux[pos] = LUMP_MID;
                ++pos;
            }
            aux[pos] = LUMP_END;
        }
        string.append(lump);
        assert checkInvariants();
    }

    /**
     * Returns the string as a StringBuffer.
     * This is the internal buffer itself, not a copy.
     * The caller should <em>not</em> modify the return value.
     * @return the string
     */
    StringBuffer asStringBuffer() {
        return string;
    }

    /**
     * Tests if this string ends with a character.
     * @param ch the character to test for
     * @return true if and only if this string ends with ch
     */
    boolean endsWith(char ch) {
        assert checkInvariants();
        int len = string.length();
        return (0 != len) && (string.charAt(len - 1) == ch);
    }

    /**
     * Prepends one character, as a lump, to this string.
     * @param ch the character to prepend
     */
    void prepend(char ch) {
        assert checkInvariants();
        int oldLen = string.length();
        ensureCapacity(1 + oldLen);
        string.insert(0, ch);
        // Shift the lump markers right by one to stay aligned with
        // the characters.
        System.arraycopy(aux, 0, aux, 1, oldLen);
        aux[0] = LUMP_BEGIN | LUMP_END;
        assert checkInvariants();
    }

    /**
     * Gets the length of this string.
     * @return the number of characters in this string
     */
    int length() {
        assert checkInvariants();
        return string.length();
    }

    /**
     * If necessary, trims this string to a maximum length.
     * Any trimming is done by removing one or more complete
     * lumps from the end of this string.
     * @param maxLen the new maximum length.
     *     After trimming, the actual length of this string will be
     *     at most maxLen.
     * @throws IllegalArgumentException if
     *     maxLen is negative.
     */
    void trimToMax(int maxLen) {
        if (maxLen < 0) {
            throw new IllegalArgumentException("maxLen < 0: " + maxLen);
        }
        assert checkInvariants();
        int cl = string.length();
        if (cl > maxLen) {
            // Back up from maxLen to the nearest position that directly
            // follows the end of a lump (or to 0 if there is none).
            int nl = maxLen;
            while ((0 != nl) && (LUMP_END != (aux[nl - 1] & LUMP_END))) {
                --nl;
            }
            // Clear the markers of the removed characters.
            for (int i = nl; i != cl; ++i) {
                aux[i] = 0;
            }
            string.setLength(nl);
        }
        assert checkInvariants();
    }

    /**
     * Checks some assertions on the instance variables.
     * The intended usage is
     * <code>assert checkInvariants();</code>
     * so that if assertions are off, no call is made.
     * @return true
     */
    private boolean checkInvariants() {
        // There is a marker slot for every character.
        assert aux.length >= string.length()
            : "aux.length: " + aux.length
            + " string.length(): " + string.length();

        // A non-empty string starts at the beginning of a lump.
        assert (0 == string.length())
            || (LUMP_BEGIN == (aux[0] & LUMP_BEGIN))
            : "aux[0]: " + aux[0];

        // A non-empty string stops at the end of a lump.
        assert (0 == string.length())
            || (LUMP_END == (aux[string.length() - 1] & LUMP_END))
            : "aux[end]: " + aux[string.length() - 1];
        return true;
    }

    /**
     * Ensures that the capacity is at least equal to the specified minimum.
     * The auxiliary array is grown by repeated doubling; existing markers
     * are preserved.
     * @param minCapacity the minimum desired capacity
     */
    private void ensureCapacity(int minCapacity) {
        assert checkInvariants();
        if (minCapacity > aux.length) {
            int nc = 2 * aux.length;
            while (nc < minCapacity) {
                nc *= 2;
            }
            byte[] oldAux = aux;
            aux = new byte[nc];
            System.arraycopy(oldAux, 0, aux, 0, string.length());
        }
        string.ensureCapacity(minCapacity);
        assert checkInvariants();
    }
}
1593
/**
 * This class is returned by uriToFile.
 * It represents a file system path, both as a File and as
 * a path relative to the base directory.
 */
class URIToFileReturn {
    /** The file system path as a File. */
    private File filePath;

    /** The relative path from baseDir. */
    private StringBuffer relativePath = new StringBuffer(255);

    /**
     * Creates a URIToFileReturn.
     * The initial path is baseDir, followed by a host directory
     * (if host is non-null) and a port directory (if port is positive).
     * The relative path never starts with a separator.
     * @param baseDir the path to the starting directory
     * @param host the host part of the URI, or null if the host name
     *     should not be part of the path
     * @param port the port part of the URI, or -1 if the port
     *     should not be part of the path
     */
    URIToFileReturn(String baseDir, String host, int port) {
        StringBuffer startPath = new StringBuffer(baseDir.length() + 32);
        startPath.append(baseDir);
        // Drop a trailing separator so that the separators appended
        // below are not doubled.
        if (baseDir.endsWith(File.separator)) {
            assert 1 != baseDir.length();
            startPath.deleteCharAt(startPath.length() - 1);
        }
        if (null != host) {
            startPath.append(File.separatorChar);
            startPath.append(host);
            relativePath.append(host);
        }
        if (port > 0) {
            startPath.append(File.separatorChar);
            startPath.append(port);
            // Only separate from a preceding host directory. Without a
            // host the port is the first relative segment and must not
            // be preceded by a separator (same rule as in append()).
            if (0 != relativePath.length()) {
                relativePath.append(File.separatorChar);
            }
            relativePath.append(port);
        }
        filePath = new File(startPath.toString());
    }

    /**
     * Appends one more segment to this path.
     * @param f a File representing the path with the next segment added
     * @param nextSegment the next segment
     */
    void append(File f, String nextSegment) {
        filePath = f;
        if (0 != relativePath.length()) {
            relativePath.append(File.separatorChar);
        }
        relativePath.append(nextSegment);
    }

    /**
     * Gets this path as a File.
     * @return this path
     */
    File getFile() {
        return filePath;
    }

    /**
     * Gets this path as a relative path from the base directory.
     * @return the relative path
     */
    String getRelativePath() {
        return relativePath.toString();
    }

    /**
     * Tests if this path is longer than a given value.
     * The length measured is that of the File's path string.
     * @param maxLen the value to test
     * @return true if and only if this path is longer than maxLen
     */
    boolean longerThan(int maxLen) {
        return filePath.getPath().length() > maxLen;
    }

    /**
     * Creates all directories in this path as needed.
     * @throws IOException
     *     if a needed directory could not be created
     * @throws IOException
     *     if a needed directory is not writeable
     * @throws IOException
     *     if a non-directory file exists
     *     with the same path as a needed directory
     */
    void mkdirs() throws IOException {
        if (!filePath.exists()) {
            if (!filePath.mkdirs()) {
                throw new IOException("Can not mkdir "
                                      + filePath.getAbsolutePath());
            }
        } else if (!filePath.isDirectory()) {
            // Checked before writeability so that a read-only regular
            // file is not misreported as a non-writeable directory.
            throw new IOException("File " + filePath.getAbsolutePath()
                                  + " is not a directory.");
        } else if (!filePath.canWrite()) {
            throw new IOException("Directory " + filePath.getAbsolutePath()
                                  + " not writeable.");
        }
    }
}
1700 }