View Javadoc

1   /* MirrorWriter
2    *
3    * $Id: MirrorWriterProcessor.java 4654 2006-09-25 20:19:54Z paul_jack $
4    *
5    * Created on 2004 October 26
6    *
7    * Copyright (C) 2004 Internet Archive.
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.crawler.writer;
26  
27  import java.io.File;
28  import java.io.FileOutputStream;
29  import java.io.FilenameFilter;
30  import java.io.IOException;
31  import java.text.NumberFormat;
32  import java.util.Collections;
33  import java.util.HashMap;
34  import java.util.HashSet;
35  import java.util.Iterator;
36  import java.util.Map;
37  import java.util.Set;
38  import java.util.TreeMap;
39  import java.util.logging.Level;
40  import java.util.logging.Logger;
41  
42  import javax.management.AttributeNotFoundException;
43  
44  import org.archive.crawler.datamodel.CoreAttributeConstants;
45  import org.archive.crawler.datamodel.CrawlURI;
46  import org.archive.crawler.framework.Processor;
47  import org.archive.crawler.settings.ListType;
48  import org.archive.crawler.settings.RegularExpressionConstraint;
49  import org.archive.crawler.settings.SimpleType;
50  import org.archive.crawler.settings.StringList;
51  import org.archive.crawler.settings.Type;
52  import org.archive.io.RecordingInputStream;
53  import org.archive.io.ReplayInputStream;
54  import org.archive.net.UURI;
55  import org.archive.util.IoUtils;
56  
57  /***
58     Processor module that writes the results of successful fetches to
59     files on disk.
60     
61     Writes contents of one URI to one file on disk.  The files are
62     arranged in a directory hierarchy based on the URI paths.  In that sense
63     they mirror the file hierarchy that might exist on the servers.
64     <p>
65     There are a number of issues involved:
66     <ul>
67     <li>
68     URIs can have arbitrary length, but file systems have length constraints.
69     </li>
70     <li>
71     URIs can contain characters that file systems prohibit.
72     </li>
73     <li>
74     URI paths are case-sensitive, but some file systems are case-insensitive.
75     </li>
76     </ul>
77     This class tries very hard to map each URI into a file system path that
78     obeys all file system constraints and yet reasonably represents
79     the original URI.
80     <p>
81     There would normally be a single instance of this class per Heritrix
82     instance. This class is thread-safe; any number of threads can be in its
83     innerProcess method at once. However, conflicts can still arise in the file
84     system. For example, if several threads try to create the same directory at
85     the same time, only one can win. Therefore, there should be at most one
86     access to a server at a given time.
87     
88     @author Howard Lee Gayle
89  */
90  public class MirrorWriterProcessor
91  extends Processor implements CoreAttributeConstants {
92  
93      private static final long serialVersionUID = 301407556928389168L;
94  
95      /***
96       * Key to use asking settings for case sensitive option.
97       */
98      public static final String ATTR_CASE_SENSITIVE = "case-sensitive";
99  
100     /***
101      * Key to use asking settings for character map.
102      */
103     public static final String ATTR_CHAR_MAP = "character-map";
104 
105     /***
106      * Key to use asking settings for content type map.
107      */
108     public static final String ATTR_CONTENT_TYPE_MAP = "content-type-map";
109 
110     /***
111      * Key to use asking settings for dot begin replacement.
112      */
113     public static final String ATTR_DOT_BEGIN = "dot-begin";
114 
115     /***
116      * Key to use asking settings for dot end replacement.
117      */
118     public static final String ATTR_DOT_END = "dot-end";
119 
120     /***
121      * Key to use asking settings for directory file.
122      */
123     public static final String ATTR_DIRECTORY_FILE = "directory-file";
124 
125     /***
126      * Key to use asking settings for host directory option.
127      */
128     public static final String ATTR_HOST_DIRECTORY = "host-directory";
129 
130     /***
131      * Key to use asking settings for host map.
132      */
133     public static final String ATTR_HOST_MAP = "host-map";
134 
135     /***
136      * Key to use asking settings for maximum file system path length.
137      */
138     public static final String ATTR_MAX_PATH_LEN = "max-path-length";
139 
140     /***
141      * Key to use asking settings for maximum file system path segment length.
142      */
143     public static final String ATTR_MAX_SEG_LEN = "max-segment-length";
144 
145     /***
146      * Key to use asking settings for base directory path value.
147      */
148     public static final String ATTR_PATH = "path";
149 
150     /***
151      * Key to use asking settings for port directory option.
152      */
153     public static final String ATTR_PORT_DIRECTORY = "port-directory";
154 
155     /***
156      * Key to use asking settings for suffix at end option.
157      */
158     public static final String ATTR_SUFFIX_AT_END = "suffix-at-end";
159 
160     /***
161      * Key to use asking settings for too-long directory.
162      */
163     public static final String ATTR_TOO_LONG_DIRECTORY = "too-long-directory";
164 
165     /***
166      * Key to use asking settings for underscore set.
167      */
168     public static final String ATTR_UNDERSCORE_SET = "underscore-set";
169 
170     /*** Default value for ATTR_DOT_BEGIN.*/
171     private static final String DEFAULT_DOT_BEGIN = "%2E";
172 
173     /*** Default maximum file system path length.*/
174     private static final int DEFAULT_MAX_PATH_LEN = 1023;
175 
176     /*** Default maximum file system path segment length.*/
177     private static final int DEFAULT_MAX_SEG_LEN = 255;
178 
179     /*** Default value for ATTR_TOO_LONG_DIRECTORY.*/
180     private static final String DEFAULT_TOO_LONG_DIRECTORY = "LONG";
181 
182     /*** An empty Map.*/
183     private static final Map<String,String> EMPTY_MAP
184      = Collections.unmodifiableMap(new TreeMap<String,String>());
185 
186     /***
187        Regular expression matching a file system path segment.
188        The intent is one or more non-file-separator characters.
189        The backslash is to quote File.separator if it's also backslash.
190     */
191     private static final String PATH_SEGMENT_RE =
192         "[^//" + File.separator + "]+";
193 
194     /***
195        Regular expression constraint on ATTR_DIRECTORY_FILE.
196        The intent is one non-file-separator character,
197        followed by zero or more characters.
198        The backslash is to quote File.separator if it's also backslash.
199     */
200     private static final String TOO_LONG_DIRECTORY_RE =
201         "[^//" + File.separator + "].*";
202 
203     /***
204      * Logger.
205      */
206     private static final Logger logger =
207         Logger.getLogger(MirrorWriterProcessor.class.getName());
208 
209     /***
210      * @param name Name of this processor.
211      */
212     public MirrorWriterProcessor(String name) {
213         super(name, "MirrorWriter processor. " +
214             "A writer that writes each URL to a file on disk named for " +
215             "a derivative of the URL.");
216         Type e; // Current element.
217         addElementToDefinition(new SimpleType(ATTR_CASE_SENSITIVE,
218             "True if the file system is case-sensitive, like UNIX. "
219             + "False if the file system is case-insensitive, "
220             + "like Macintosh HFS+ and Windows.",
221             Boolean.TRUE));
222         addElementToDefinition(new StringList(ATTR_CHAR_MAP,
223             "This list is grouped in pairs. "
224             + "The first string in each pair must have a length of one. "
225             + "If it occurs in a URI path, "
226             + "it is replaced by the second string in the pair. "
227             + "For UNIX, no character mapping is normally needed. "
228             + "For Macintosh, the recommended value is [: %%3A]. "
229             + "For Windows, the recommended value is "
230             + "[' ' %%20  &quot; %%22  * %%2A  : %%3A  < %%3C "
231             + "//> %%3E ? %%3F  //// %%5C  ^ %%5E  | %%7C]."));
232         addElementToDefinition(new StringList(ATTR_CONTENT_TYPE_MAP,
233             "This list is grouped in pairs. "
234             + "If the content type of a resource begins (case-insensitive) "
235             + "with the first string in a pair, the suffix is set to "
236             + "the second string in the pair, replacing any suffix that may "
237             + "have been in the URI.  For example, to force all HTML files "
238             + "to have the same suffix, use [text/html html]."));
239         e = addElementToDefinition(new SimpleType(ATTR_DIRECTORY_FILE,
240             "Implicitly append this to a URI ending with '/'.",
241             "index.html"));
242         e.addConstraint(new RegularExpressionConstraint(PATH_SEGMENT_RE,
243             Level.SEVERE, "This must be a simple file name."));
244         e = addElementToDefinition(new SimpleType(ATTR_DOT_BEGIN,
245             "If a segment starts with '.', the '.' is replaced by this.",
246             DEFAULT_DOT_BEGIN));
247         e.addConstraint(new RegularExpressionConstraint(PATH_SEGMENT_RE,
248             Level.SEVERE,
249             "This must not be empty, and must not contain " + File.separator));
250         addElementToDefinition(new SimpleType(ATTR_DOT_END,
251             "If a directory name ends with '.' it is replaced by this.  "
252             + "For all file systems except Windows, '.' is recommended.  "
253             + "For Windows, %%2E is recommended.",
254             "."));
255         addElementToDefinition(new StringList(ATTR_HOST_MAP,
256             "This list is grouped in pairs. "
257             + "If a host name matches (case-insensitive) the first string "
258             + "in a pair, it is replaced by the second string in the pair.  "
259             + "This can be used for consistency when several names are used "
260             + "for one host, for example "
261             + "[12.34.56.78 www42.foo.com]."));
262         addElementToDefinition(new SimpleType(ATTR_HOST_DIRECTORY,
263             "Create a subdirectory named for the host in the URI.",
264             Boolean.TRUE));
265         addElementToDefinition(new SimpleType(ATTR_PATH,
266             "Top-level directory for mirror files.", "mirror"));
267 
268         // TODO: Add a new Constraint subclass so ATTR_MAX_PATH_LEN and
269         // ATTR_MAX_SEG_LEN can be constained to reasonable values.
270         addElementToDefinition(new SimpleType(ATTR_MAX_PATH_LEN,
271             "Maximum file system path length.",
272             new Integer(DEFAULT_MAX_PATH_LEN)));
273         addElementToDefinition(new SimpleType(ATTR_MAX_SEG_LEN,
274             "Maximum file system path segment length.",
275             new Integer(DEFAULT_MAX_SEG_LEN)));
276         addElementToDefinition(new SimpleType(ATTR_PORT_DIRECTORY,
277             "Create a subdirectory named for the port in the URI.",
278             Boolean.FALSE));
279         addElementToDefinition(new SimpleType(ATTR_SUFFIX_AT_END,
280             "If true, the suffix is placed at the end of the path, "
281             + "after the query (if any).  If false, the suffix is placed "
282             + "before the query.",
283             Boolean.TRUE));
284         e = addElementToDefinition(new SimpleType(ATTR_TOO_LONG_DIRECTORY,
285             "If all the directories in the URI would exceed, "
286             + "or come close to exceeding, the file system maximum "
287             + "path length, then they are all replaced by this.",
288             DEFAULT_TOO_LONG_DIRECTORY));
289         e.addConstraint(new RegularExpressionConstraint(TOO_LONG_DIRECTORY_RE,
290             Level.SEVERE, "This must be relative and not empty."));
291         addElementToDefinition(new StringList(ATTR_UNDERSCORE_SET,
292             "If a directory name appears (case-insensitive) in this list "
293             + "then an underscore is placed before it.  "
294             + "For all file systems except Windows, this is not needed.  "
295             + "For Windows, the following is recommended: "
296             + "[com1 com2 com3 com4 com5 com6 com7 com8 com9 "
297             + "lpt1 lpt2 lpt3 lpt4 lpt5 lpt6 lpt7 lpt8 lpt9 "
298             + "con nul prn]."));
299     }
300 
301     protected void innerProcess(CrawlURI curi) {
302         if (!curi.isSuccess()) {
303             return;
304         }
305         UURI uuri = curi.getUURI(); // Current URI.
306 
307         // Only http and https schemes are supported.
308         String scheme = uuri.getScheme();
309         if (!"http".equalsIgnoreCase(scheme)
310                 && !"https".equalsIgnoreCase(scheme)) {
311             return;
312         }
313         RecordingInputStream recis = curi.getHttpRecorder().getRecordedInput();
314         if (0L == recis.getResponseContentLength()) {
315             return;
316         }
317 
318         String baseDir = null; // Base directory.
319         String baseSeg = null; // ATTR_PATH value.
320         try {
321             baseSeg = (String) getAttribute(ATTR_PATH, curi);
322         } catch (AttributeNotFoundException e) {
323             logger.warning(e.getLocalizedMessage());
324             return;
325         }
326 
327         // Trim any trailing File.separatorChar characters from baseSeg.
328         while ((baseSeg.length() > 1) && baseSeg.endsWith(File.separator)) {
329             baseSeg = baseSeg.substring(0, baseSeg.length() - 1);
330         }
331         if (0 == baseSeg.length()) {
332             baseDir = getController().getDisk().getPath();
333         } else if ((new File(baseSeg)).isAbsolute()) {
334             baseDir = baseSeg;
335         } else {
336             baseDir = getController().getDisk().getPath() + File.separator
337                 + baseSeg;
338         }
339 
340         // Already have a path for this URI.
341         boolean reCrawl = curi.containsKey(A_MIRROR_PATH);
342 
343         /*
344           The file system path, relative to the value of ATTR_PATH, where
345           this resource should be written.  The intent is to
346           add later a persistent mapping from URI to path.
347           This will allow a URI to be re-crawled and updated
348           if it has changed.  If the resource has already been fetched
349           and written to a file before, the path to that file
350           has already been obtained from the persistent mapping
351           and placed on the AList by some other module,
352           such as the frontier.
353         */
354         String mps = null;
355         File destFile = null; // Write resource contents to this file.
356         try {
357             if (reCrawl) {
358                 mps = curi.getString(A_MIRROR_PATH);
359                 destFile = new File(baseDir + File.separator + mps);
360                 File parent = destFile.getParentFile();
361                 if (null != parent) {
362                     IoUtils.ensureWriteableDirectory(parent);
363                 }
364             } else {
365                 URIToFileReturn r = null; // Return from uriToFile().
366                 try {
367                      r = uriToFile(baseDir, curi);
368                 } catch (AttributeNotFoundException e) {
369                     logger.warning(e.getLocalizedMessage());
370                     return;
371                 }
372                 destFile = r.getFile();
373                 mps = r.getRelativePath();
374             }
375             logger.info(uuri.toString() + " -> " + destFile.getPath());
376             writeToPath(recis, destFile);
377             if (!reCrawl) {
378                 curi.putString(A_MIRROR_PATH, mps);
379             }
380         } catch (IOException e) {
381             curi.addLocalizedError(this.getName(), e, "Mirror");
382         }
383     }
384 
385     /***
386        Gets the directory in which the file will reside.
387        Any directories needed are created.
388        @param baseDir the path to the starting directory
389        @param host the host part of the URI, or null if the host name
390        should not be part of the returned path
391        @param port the port part of the URI, or -1 if the port
392        should not be part of the returned path
393        @param segs all the segments in the URI
394        @param maxLen the maximum path length allowed to the directory;
395        this must leave some room for the file itself
396        @return the directory, or null if maxLen would be exceeded
397        @throws IOException
398        if a needed directory could not be created
399        @throws IOException
400        if a needed directory is not writeable
401        @throws IOException
402        if a non-directory file exists with the same path as a needed directory
403     */
404     private URIToFileReturn dirPath(String baseDir, String host, int port,
405                                     PathSegment[] segs, int maxLen)
406         throws IOException {
407 
408         // Return value.
409         URIToFileReturn r = new URIToFileReturn(baseDir, host, port);
410         r.mkdirs();
411         for (int i = 0; (segs.length - 1) != i; ++i) {
412             segs[i].addToPath(r);
413             if (r.longerThan(maxLen)) {
414                 return null;
415             }
416         }
417         return r;
418     }
419 
420     /***
421        Ensures that a list contains an even number of elements.
422        If not, the last element is removed.
423        @param list the list
424     */
425     private void ensurePairs(ListType list) {
426         if (1 == (list.size() % 2)) {
427             list.remove(list.size() - 1);
428         }
429     }
430 
431     /***
432        Makes a path in which a resource can be stored.
433        @param baseDir the path to the starting directory
434        @param curi the URI
435        @return a path to the file in which to store the resource
436        @throws AttributeNotFoundException
437        if a needed setting is missing
438        @throws IOException
439        if a needed directory could not be created
440        @throws IOException
441        if a needed directory is not writeable
442        @throws IOException
443        if a non-directory file exists with the same path as a needed directory
444     */
445     private URIToFileReturn uriToFile(String baseDir, CrawlURI curi)
446         throws AttributeNotFoundException, IOException {
447         UURI uuri = curi.getUURI(); // Current URI.
448         String host = null;
449         Boolean hd = (Boolean) getAttribute(ATTR_HOST_DIRECTORY, curi);
450         if (hd.booleanValue()) {
451             host = uuri.getHost();
452             StringList hostMap = (StringList) getAttribute(ATTR_HOST_MAP, curi);
453             if ((null != hostMap) && (hostMap.size() > 1)) {
454                 ensurePairs(hostMap);
455                 Iterator<String> i = hostMap.typesafe().iterator();
456                 for (boolean more = true; more && i.hasNext();) {
457                     String h1 = i.next();
458                     String h2 = i.next();
459                     if (host.equalsIgnoreCase(h1)) {
460                         more = false;
461                         if ((null != h2) && (0 != h2.length())) {
462                             host = h2;
463                         }
464                     }
465                 }
466             }
467         }
468 
469         int port =
470             ((Boolean) getAttribute(ATTR_PORT_DIRECTORY, curi)).booleanValue()
471             ? uuri.getPort()
472             : -1;
473 
474         String suffix = null; // Replacement suffix.
475         StringList ctm = (StringList) getAttribute(ATTR_CONTENT_TYPE_MAP, curi);
476         if ((null != ctm) && (ctm.size() > 1)) {
477             ensurePairs(ctm);
478             String contentType = curi.getContentType().toLowerCase();
479             Iterator i = ctm.iterator();
480             for (boolean more = true; more && i.hasNext();) {
481                 String ct = (String) i.next();
482                 String suf = (String) i.next();
483                 if ((null != ct) && contentType.startsWith(ct.toLowerCase())) {
484                     more = false;
485                     if ((null != suf) && (0 != suf.length())) {
486                         suffix = suf;
487                     }
488                 }
489             }
490         }
491 
492         int maxSegLen =
493             ((Integer) getAttribute(ATTR_MAX_SEG_LEN, curi)).intValue();
494         if (maxSegLen < 2) {
495             maxSegLen = DEFAULT_MAX_SEG_LEN;
496         }
497 
498         int maxPathLen =
499             ((Integer) getAttribute(ATTR_MAX_PATH_LEN, curi)).intValue();
500         if (maxPathLen < 2) {
501             maxPathLen = DEFAULT_MAX_PATH_LEN;
502         }
503 
504         Map<String,String> characterMap = EMPTY_MAP;
505         StringList cm = (StringList) getAttribute(ATTR_CHAR_MAP, curi);
506         if ((null != cm) && (cm.size() > 1)) {
507             ensurePairs(cm);
508             characterMap = new HashMap<String,String>(cm.size()); 
509             // Above will be half full.
510             for (Iterator i = cm.iterator(); i.hasNext();) {
511                 String s1 = (String) i.next();
512                 String s2 = (String) i.next();
513                 if ((null != s1) && (1 == s1.length()) && (null != s2)
514                         && (0 != s2.length())) {
515                     characterMap.put(s1, s2);
516                 }
517             }
518         }
519 
520         String dotBegin = (String) getAttribute(ATTR_DOT_BEGIN, curi);
521         if (".".equals(dotBegin)) {
522             dotBegin = null;
523         }
524 
525         String dotEnd = (String) getAttribute(ATTR_DOT_END, curi);
526         if (".".equals(dotEnd)) {
527             dotEnd = null;
528         }
529 
530         String tld = (String) getAttribute(ATTR_TOO_LONG_DIRECTORY, curi);
531         if ((null == tld) || (0 == tld.length())
532                 || (-1 != tld.indexOf(File.separatorChar))) {
533             tld = DEFAULT_TOO_LONG_DIRECTORY;
534         }
535 
536         Set<String> underscoreSet = null;
537         StringList us = (StringList) getAttribute(ATTR_UNDERSCORE_SET, curi);
538         if ((null != us) && (0 != us.size())) {
539             underscoreSet = new HashSet<String>(us.size(), 0.5F);
540             for (String s: us.typesafe()) {
541                 if ((null != s) && (0 != s.length())) {
542                     underscoreSet.add(s.toLowerCase());
543                 }
544             }
545         }
546 
547         return uriToFile(curi, host, port, uuri.getPath(), uuri.getQuery(),
548             suffix, baseDir, maxSegLen, maxPathLen,
549             ((Boolean) getAttribute(ATTR_CASE_SENSITIVE, curi)).booleanValue(),
550             (String) getAttribute(ATTR_DIRECTORY_FILE, curi),
551             characterMap, dotBegin, dotEnd, tld,
552             ((Boolean) getAttribute(ATTR_SUFFIX_AT_END, curi)).booleanValue(),
553             underscoreSet);
554     }
555 
556     /***
557        Makes a path in which a resource can be stored.
558        @param curi the URI
559        @param host the host part of the URI, or null if the host name
560        should not be part of the returned path
561        @param port the port part of the URI, or -1 if the port
562        should not be part of the returned path
563        @param uriPath the path part of the URI (must be absolute)
564        @param query the query part of the URI, or null if none
565        @param suffix if non-null, use this as the suffix in preference to
566        any suffix that uriPath might have
567        @param baseDir the path to the starting directory
568        @param maxSegLen the maximum number of characters allowed in one
569        file system path segment (component)
570        @param maxPathLen the maximum number of characters allowed in a
571        file system path
572        @param caseSensitive if true, the file system is assumed to be
573        case-sensitive; otherwise the file system is assumed to be
574        case-insensitive but case-preserving
575        @param dirFile the simple file name to append to a URI path
576        ending in '/'
577        @param characterMap a map from characters (as length-1 String values) in
578        the URI path and query to replacement String values
579        @param dotBegin if non-null, this replaces a '.' at
580        the beginning of a segment
581        @param dotEnd if non-null, this replaces a '.' that appears at the end
582        of a directory name
583        @param tooLongDir if the path length would exceed or be close to
584        exceeding maxPathLen then this simple name is used as a directory
585        under baseDir instead
586        @param suffixAtEnd if true, the suffix is placed at the end of the
587        path, after the query (if any); otherwise, the suffix is placed
588        before the query
589        @param underscoreSet if non-null and a segment, after conversion
590        to lower case, is in this set, then prepend an underscore
591        to the segment
592        @return a path to the file in which to store the resource
593        @throws IOException
594        if a needed directory could not be created
595        @throws IOException
596        if a needed directory is not writeable
597        @throws IOException
598        if a non-directory file exists with the same path as a needed directory
599     */
600     private URIToFileReturn uriToFile(CrawlURI curi, String host, int port,
601             String uriPath, String query, String suffix, String baseDir,
602             int maxSegLen, int maxPathLen, boolean caseSensitive,
603             String dirFile, Map characterMap, String dotBegin, String dotEnd,
604             String tooLongDir, boolean suffixAtEnd, Set underscoreSet)
605             throws IOException {
606         assert (null == host) || (0 != host.length());
607         assert 0 != uriPath.length();
608         assert '/' == uriPath.charAt(0) : "uriPath: " + uriPath;
609         assert -1 == uriPath.indexOf("//") : "uriPath: " + uriPath;
610         assert -1 == uriPath.indexOf("/./") : "uriPath: " + uriPath;
611         assert !uriPath.endsWith("/.") : "uriPath: " + uriPath;
612         assert (null == query) || (-1 == query.indexOf('/'))
613             : "query: " + query;
614         assert (null == suffix)
615             || ((0 != suffix.length()) && (-1 == suffix.indexOf('/')))
616             : "suffix: " + suffix;
617         assert 0 != baseDir.length();
618         assert maxSegLen > 2 : "maxSegLen: " + maxSegLen;
619         assert maxPathLen > 1;
620         assert maxPathLen >= maxSegLen
621             : "maxSegLen: " + maxSegLen + " maxPathLen: " + maxPathLen;
622         assert 0 != dirFile.length();
623         assert -1 == dirFile.indexOf("/") : "dirFile: " + dirFile;
624         assert null != characterMap;
625         assert (null == dotBegin) || (0 != dotBegin.length());
626         assert (null == dotEnd) || !dotEnd.endsWith(".") : "dotEnd: " + dotEnd;
627         assert 0 != tooLongDir.length();
628         assert '/' != tooLongDir.charAt(0) : "tooLongDir: " + tooLongDir;
629 
630         int nSegs = 0; // Number of segments in the URI path.
631         for (int i = 0; uriPath.length() != i; ++i) {
632             if ('/' == uriPath.charAt(i)) {
633                 ++nSegs; // Just count slashes.
634             }
635         }
636         assert nSegs > 0 : "uriPath: " + uriPath;
637         PathSegment[] segs = new PathSegment[nSegs]; // The segments.
638         int slashIndex = 0; // Index in uriPath of current /.
639         for (int i = 0; (segs.length - 1) != i; ++i) {
640             int nsi = uriPath.indexOf('/', slashIndex + 1); // Next index.
641             assert nsi > slashIndex : "uriPath: " + uriPath;
642             segs[i] = new DirSegment(uriPath, slashIndex + 1, nsi,
643                                      maxSegLen, caseSensitive, curi,
644                                      characterMap, dotBegin, dotEnd,
645                                      underscoreSet);
646             slashIndex = nsi;
647         }
648         if (slashIndex < (uriPath.length() - 1)) {
649 
650             // There's something after the last /.
651             segs[segs.length - 1] = new EndSegment(uriPath, slashIndex + 1,
652                     uriPath.length(), maxSegLen, caseSensitive, curi,
653                     characterMap, dotBegin, query, suffix, maxPathLen,
654                     suffixAtEnd);
655         } else {
656 
657             // The URI ends with a /.
658             segs[segs.length - 1] = new EndSegment(dirFile, 0, dirFile.length(),
659                     maxSegLen, caseSensitive, curi, characterMap, null,
660                     query, suffix, maxPathLen, suffixAtEnd);
661         }
662         URIToFileReturn r = dirPath(baseDir, host, port, segs,
663                                     maxPathLen - maxSegLen);
664         if (null == r) {
665 
666             // The path is too long.
667             // Replace all the segment directories by tooLongDir.
668             PathSegment endSegment = segs[segs.length - 1];
669             segs = new PathSegment[2];
670             segs[0] = new DirSegment(tooLongDir, 0, tooLongDir.length(),
671                                      maxSegLen, caseSensitive, curi, EMPTY_MAP,
672                                      null, null, null);
673             segs[1] = endSegment;
674             r = dirPath(baseDir, host, port, segs, maxPathLen - maxSegLen);
675         }
676         segs[segs.length - 1].addToPath(r);
677         return r;
678     }
679 
680     /***
681        Copies a resource into a file.
682        A temporary file is created and then atomically renamed to
683        the destination file.
684        This prevents leaving a partial file in case of a crash.
685        @param recis the RecordingInputStream that recorded the contents
686        of the resource
687        @param dest the destination file
688        @throws IOException on I/O error
689        @throws IOException if
690        the file rename fails
691     */
692     private void writeToPath(RecordingInputStream recis, File dest)
693         throws IOException {
694         ReplayInputStream replayis = recis.getContentReplayInputStream();
695         File tf = new File (dest.getPath() + "N");
696         FileOutputStream fos = new FileOutputStream(tf);
697         try {
698             replayis.readFullyTo(fos);
699         } finally {
700             fos.close();
701             replayis.close();
702         }
703         if (!tf.renameTo(dest)) {
704             throw new IOException("Can not rename " + tf.getAbsolutePath()
705                                   + " to " + dest.getAbsolutePath());
706         }
707 
708     }
709 
710     /***
711        This class represents one segment (component) of a URI path.
712        A segment between '/' characters is a directory segment.
713        The segment after the last '/' is the end segment.
714     */
715     abstract class PathSegment {
716         /***
717            existsMaybeCaseSensitive return code
718            for a file that does not exist.
719         */
720         protected static final int EXISTS_NOT = 1;
721 
722         /***
723            existsMaybeCaseSensitive return code
724            for a file that exists.
725            Furthermore, the comparison is case-sensitive.
726         */
727         protected static final int EXISTS_EXACT_MATCH = 2;
728 
729         /***
730            existsMaybeCaseSensitive return code
731            for a file that exists, using a case-insensitive comparison.
732            Furthermore, the file would not exist if the comparison
733            were case-sensitive.
734         */
735         protected static final int EXISTS_CASE_INSENSITIVE_MATCH = 3;
736 
737         /*** The URI, for logging and error reporting.*/
738         protected CrawlURI curi;
739 
740         /***
741            The main part of this segment.
742            For a directory segment, that's all there is.
743            For an end segment, it's the part of the URI after the last '/'
744            up to but not including the '.' before the suffix (if any).
745         */
746         protected LumpyString mainPart = null;
747 
748         /***
749            The maximum number of characters allowed
750            in one file system path segment.
751            A URI segment can potentially be much longer,
752            but we'll trim it to this.
753         */
754         protected int maxSegLen;
755 
756         /*** If true, the file system is assumed to be
757             case-sensitive; otherwise the file system is assumed to be
758             case-insensitive.
759         */
760         private boolean caseSensitive;
761 
762         /***
763            Creates a new PathSegment.
764            @param maxSegLen the maximum number of characters
765            allowed in one path segment
766            @param caseSensitive if true, the file system is assumed to be
767            case-sensitive; otherwise the file system is assumed to be
768            case-insensitive
769            @param curi the URI
770            @throws IllegalArgumentException if
771            maxSegLen is too small
772         */
773         PathSegment(int maxSegLen, boolean caseSensitive, CrawlURI curi) {
774             if (maxSegLen < 2) {
775                 throw new IllegalArgumentException("maxSegLen: " + maxSegLen);
776             }
777             this.maxSegLen = maxSegLen;
778             this.caseSensitive = caseSensitive;
779             this.curi = curi;
780         }
781 
782         /***
783            Adds this segment to a file path.
784            This is the key method of this class.
785            It extends the given path by one segment,
786            named to obey all constraints.
787            A new directory is created if necessary.
788            @param currentPath the current path, to which this segment is added
789            @throws IOException
790            if a needed directory could not be created
791            @throws IOException
792            if a needed directory is not writeable
793         */
794         abstract void addToPath(URIToFileReturn currentPath) throws IOException;
795 
796         /***
797            Checks if a file (including directories) exists.
798            @param fsf the directory containing the file to be checked
799            @param segStr the simple file or directory name
800            @param check the file or directory for which to check
801            @return EXISTS_NOT if check does not exist,
802            EXISTS_EXACT_MATCH if check exists with a name that matches
803            (case-sensitive) segStr, and
804            EXISTS_CASE_INSENSITIVE_MATCH if check exists
805            with a name that matches
806            segStr using a case-insensitive match but not using a
807            case-sensitive match
808         */
809         protected int existsMaybeCaseSensitive(File fsf, String segStr,
810                                                File check) {
811             if (caseSensitive) {
812                 return check.exists() ? EXISTS_EXACT_MATCH : EXISTS_NOT;
813             }
814             if (!check.exists()) {
815                 return EXISTS_NOT;
816             }
817 
818             /*
819               The JVM says the file exists, but the file system is assumed to be
820               case-insensitive, so do we have an exact match or just a
821               case-insensitive match?  We get an array of all the
822               file names that match (case-insensitive) the one we're
823               checking, then we can look for a case-sensitive match.
824             */
825             String[] fna = fsf.list(new CaseInsensitiveFilenameFilter(segStr));
826             for (int i = 0; fna.length != i; ++i) {
827                 if (segStr.equals(fna[i])) {
828                   return EXISTS_EXACT_MATCH;
829                 }
830             }
831             return EXISTS_CASE_INSENSITIVE_MATCH;
832         }
833 
834         /***
835            This class implements a FilenameFilter that matches
836            by name, ignoring case.
837         */
838         class CaseInsensitiveFilenameFilter implements FilenameFilter {
839             /*** The file name we're looking for. */
840             private String target;
841 
842             /***
843                Creates a CaseInsensitiveFilenameFilter.
844                @param target the target file name
845                @throws IllegalArgumentException if
846                target is null or empty.
847             */
848             CaseInsensitiveFilenameFilter(String target) {
849                 if (null == target) {
850                     throw new IllegalArgumentException("target null");
851                 }
852                 if (0 == target.length()) {
853                     throw new IllegalArgumentException("target empty");
854                 }
855                 this.target = target;
856             }
857 
858             public boolean accept(File dir, String name) {
859                 return target.equalsIgnoreCase(name);
860             }
861         }
862     }
863 
864     /***
865        This class represents one directory segment (component) of a URI path.
866     */
867     class DirSegment extends PathSegment {
868         /*** If a segment name is in this set, prepend an underscore.*/
869         private Set underscoreSet;
870 
871         /***
872            Creates a DirSegment.
873            @param uriPath the path part of the URI
874            @param beginIndex the beginning index, inclusive, of the substring
875            of uriPath to be used
876            @param endIndex the ending index, exclusive, of the substring
877            of uriPath to be used
878            @param maxSegLen the maximum number of characters allowed in one
879            file system path segment (component)
880            @param caseSensitive if true, the file system is assumed to be
881            case-sensitive; otherwise the file system is assumed to be
882            case-insensitive but case-preserving
883            @param curi the URI
884            @param characterMap a map from characters
885            (as length-1 String values) in
886            the URI path and query to replacement String values
887            @param dotBegin if non-null, this replaces a '.' at
888            the beginning of the directory name
889            @param dotEnd if non-null, this replaces a '.'
890            that appears at the end of a directory name
891            @param underscoreSet if non-null and a segment, after conversion
892            to lower case, is in this set, then prepend an underscore
893            to the segment
894            @throws IllegalArgumentException if
895            beginIndex is negative.
896            @throws IllegalArgumentException if
897            endIndex is less than beginIndex.
898            @throws IllegalArgumentException if
899            maxSegLen is too small.
900         */
901         DirSegment(String uriPath, int beginIndex, int endIndex, int maxSegLen,
902                    boolean caseSensitive, CrawlURI curi, Map characterMap,
903                    String dotBegin, String dotEnd, Set underscoreSet) {
904             super(maxSegLen, caseSensitive, curi);
905             mainPart = new LumpyString(uriPath, beginIndex, endIndex,
906                                        (null == dotEnd) ? 0 : dotEnd.length(),
907                                        this.maxSegLen, characterMap, dotBegin);
908             if (null != dotEnd) {
909 
910                 // We might get a segment like /VeryLong............../
911                 // so we have to loop to guarantee the segment doesn't
912                 // end with a dot.
913                 int dl = dotEnd.length();
914                 while (mainPart.endsWith('.')) {
915 
916                     // Chop off the dot at the end.
917                     mainPart.trimToMax(mainPart.length() - 1);
918                     if ((mainPart.length() + dl) <= this.maxSegLen) {
919                         mainPart.append(dotEnd);
920                     }
921                 }
922             }
923             this.underscoreSet = underscoreSet;
924         }
925 
926         void addToPath(URIToFileReturn currentPath) throws IOException {
927             NumberFormat nf = null;
928             int startLen = mainPart.length(); // Starting length.
929             for (int i = 0; ; ++i) {
930                 if (0 != i) {
931 
932                     // Try to create a unique file name by appending a
933                     // number.
934                     if (null == nf) {
935                         nf = NumberFormat.getIntegerInstance();
936                     }
937                     String ending = nf.format(i);
938                     mainPart.trimToMax(Math.min(startLen,
939                                                 maxSegLen - ending.length()));
940                     mainPart.append(ending);
941                 }
942                 String segStr = mainPart.toString();
943                 if ((null != underscoreSet)
944                         && underscoreSet.contains(segStr.toLowerCase())) {
945                     mainPart.prepend('_');
946                     ++startLen;
947                     mainPart.trimToMax(maxSegLen);
948                     segStr = mainPart.toString();
949                 }
950                 File fsf = currentPath.getFile();
951                 File f = new File(fsf, segStr);
952                 int er = existsMaybeCaseSensitive(fsf, segStr, f);
953                 switch (er) {
954                 case EXISTS_NOT:
955                     if (!f.mkdir()) {
956                         throw new IOException("Can not mkdir "
957                                               + f.getAbsolutePath());
958                     }
959                     currentPath.append(f, segStr);
960                     return; // Created new directory.
961 
962                 case EXISTS_EXACT_MATCH:
963                     if (f.isDirectory()) {
964                         if (!f.canWrite()) {
965                             throw new IOException("Directory "
966                                                   + f.getAbsolutePath()
967                                                   + " not writeable.");
968                         }
969 
970                         /*
971                           A writeable directory already exists.
972                           Assume it's the one we want.
973                           This assumption fails for cases like
974                           http://foo.com/a*256/b.html
975                           followed by
976                           http://foo.com/a*256z/b.html
977                           where a*256 means a sequence of the maximum allowed
978                           number of "a"s.
979                         */
980                         currentPath.append(f, segStr);
981                         return;
982                     }
983 
984                     /*
985                       A segment already exists but isn't a directory.
986                       This could arise from, for example,
987                       http://foo.com/a*256
988                       followed by
989                       http://foo.com/a*256b/b.html
990                       We need to find a directory we created before in this
991                       situation, or make a new directory with a unique name.
992                       Going around the loop should eventually do that.
993                     */
994                     break;
995 
996                 case EXISTS_CASE_INSENSITIVE_MATCH:
997                     /*
998                       A segment already exists that's a case-insensitive match
999                       but not an exact match.  It may or may not be a directory.
1000                       This could arise, on a case-insensitive, case-preserving
1001                       file system (such as Macintosh HFS+).  For example,
1002                       http://foo.com/bar/z.html
1003                       followed by
1004                       http://foo.com/BAR/z.html
1005                       would do it.  We want bar and BAR to turn into different
1006                       directories.
1007                       Going around the loop should eventually do that.
1008                     */
1009                     break;
1010 
1011                 default:
1012                     throw new IllegalStateException("Code: " + er);
1013                 }
1014             }
1015         }
1016     }
1017 
1018     /***
1019        This class represents the last segment (component) of a URI path.
1020     */
1021     class EndSegment extends PathSegment {
1022         /***
1023            The number of characters in the path up to this EndSegment,
1024            including the final File.separatorChar.
1025         */
1026         private int dirPathLen;
1027 
1028         /***
1029            The maximum number of characters allowed in a file path, minus 1.
1030            The extra 1 is reserved for temporarily appending
1031            a character so an existing file can be replaced atomically,
1032            for example, by writing
1033            <code>foo.htmlN</code>
1034            and then renaming it to
1035            <code>foo.html</code>.
1036         */
1037         private int maxPathLen;
1038 
1039         /*** The query part of the URI, or null if none.*/
1040         private LumpyString query = null;
1041 
1042         /***
1043            The suffix, or null if none.
1044            This isn't a LumpyString because we'd only trim a suffix
1045            if space were very, very tight.
1046         */
1047         private String suffix = null;
1048 
1049         /***
1050            True if the suffix goes at the end, after the query.
1051            False if the suffix goes before the query.
1052         */
1053         private boolean suffixAtEnd;
1054 
1055         /*** Appended to mainPart if necessary to create a unique file name.*/
1056         private String uniquePart = null;
1057 
1058         /***
1059            Creates an EndSegment.
1060            @param uriPath the path part of the URI
1061            @param beginIndex the beginning index, inclusive, of the substring
1062            of uriPath to be used
1063            @param endIndex the ending index, exclusive, of the substring
1064            of uriPath to be used
1065            @param maxSegLen the maximum number of characters allowed in one
1066            file system path segment (component)
1067            @param caseSensitive if true, the file system is assumed to be
1068            case-sensitive; otherwise the file system is assumed to be
1069            case-insensitive but case-preserving
1070            @param curi the URI
1071            @param characterMap maps characters (as length-1 String values) in
1072            the URI path and query to replacement String values
1073            @param dotBegin if non-null, this replaces a '.' at
1074            the beginning of the segment
1075            @param query the query part of the URI, or null if none
1076            @param suffix if non-null, use this as the suffix in preference to
1077            any suffix that uriPath might have
1078            @param maxPathLen the maximum number of characters allowed in a
1079            file system path
1080            @param suffixAtEnd if true, the suffix is placed at the end of the
1081            path, after the query (if any); otherwise, the suffix is placed
1082            before the query
1083            @throws IllegalArgumentException if
1084            beginIndex is negative.
1085            @throws IllegalArgumentException if
1086            endIndex is less than beginIndex.
1087            @throws IllegalArgumentException if
1088            maxSegLen is too small.
1089         */
1090         EndSegment(String uriPath, int beginIndex, int endIndex, int maxSegLen,
1091                    boolean caseSensitive, CrawlURI curi, Map characterMap,
1092                    String dotBegin, String query, String suffix,
1093                    int maxPathLen, boolean suffixAtEnd) {
1094             super(maxSegLen - 1, caseSensitive, curi);
1095             int mpe = endIndex; // endIndex for the main part (no suffix).
1096             int ldi = uriPath.lastIndexOf('.'); // Index of last dot.
1097             if ((ldi > 0) && (ldi < (endIndex - 1)) && (ldi > beginIndex)) {
1098                 mpe = ldi; // uriPath has a suffix.
1099             }
1100             this.suffix = suffix;
1101             if ((null == this.suffix) && (mpe < (endIndex - 1))) {
1102 
1103                 // There's no replacement suffix and uriPath has a suffix.
1104                 // Run it through a LumpyString to do the character mapping.
1105                 LumpyString ls = new LumpyString(uriPath, mpe + 1, endIndex, 0,
1106                                                  this.maxSegLen, characterMap,
1107                                                  null);
1108                 this.suffix = ls.toString();
1109             }
1110             int pad = ((null == this.suffix) ? 0 : (1 + this.suffix.length()))
1111                 + ((null == query) ? 0 : query.length());
1112             mainPart = new LumpyString(uriPath, beginIndex, mpe, pad,
1113                                        this.maxSegLen, characterMap, dotBegin);
1114             this.maxPathLen = maxPathLen - 1;
1115             if (null != query) {
1116                 this.query = new LumpyString(query, 0, query.length(), 0,
1117                                              this.maxSegLen, characterMap,
1118                                              null);
1119             }
1120             this.suffixAtEnd = suffixAtEnd;
1121         }
1122 
1123         void addToPath(URIToFileReturn currentPath) {
1124             File fsf = currentPath.getFile();
1125             NumberFormat nf = null;
1126             dirPathLen = 1 + fsf.getPath().length();
1127             for (int i = 0; ; ++i) {
1128                 if (0 != i) {
1129                     if (null == nf) {
1130                         nf = NumberFormat.getIntegerInstance();
1131                     }
1132                     uniquePart = nf.format(i);
1133                 }
1134                 trimWithPadding((null == uniquePart) ? 0 : uniquePart.length());
1135                 String segStr = joinParts(); // This EndSegment as a String.
1136                 File f = new File(fsf, segStr);
1137 
1138                 // Code for whether file exists.
1139                 int er = existsMaybeCaseSensitive(fsf, segStr, f);
1140                 switch (er) {
1141                 case EXISTS_NOT:
1142                     currentPath.append(f, segStr);
1143                     return;
1144 
1145                 case EXISTS_EXACT_MATCH:
1146                     if (f.isFile()) {
1147                         currentPath.append(f, segStr);
1148                         return;
1149                     }
1150 
1151                     /*
1152                       A file already exists but isn't an ordinary file.
1153                       It might be a directory, special file, named pipe,
1154                       whatever.
1155                       We need to find an unused file name,
1156                       or an ordinary file.
1157                       Going around the loop should eventually do that.
1158                     */
1159                     break;
1160 
1161                 case EXISTS_CASE_INSENSITIVE_MATCH:
1162                     /*
1163                       A file already exists that's a case-insensitive match
1164                       but not an exact match.
1165                       This could arise, on a case-insensitive, case-preserving
1166                       file system (such as Macintosh HFS+).  For example,
1167                       http://foo.com/files.zip
1168                       followed by
1169                       http://foo.com/FILES.ZIP
1170                       would do it.  We want files.zip and FILES.ZIP to turn into
1171                       different files. Going around the loop should eventually
1172                       do that.
1173                     */
1174                     break;
1175 
1176                 default:
1177                     throw new IllegalStateException("Code: " + er);
1178                 }
1179             }
1180         }
1181 
1182         /***
1183            Creates a simple file name from the parts of this EndSegment.
1184            @return a simple file name constructed from the main part,
1185            unique part, query, and suffix
1186         */
1187         private String joinParts() {
1188             StringBuffer sb = new StringBuffer(length());
1189             sb.append(mainPart.asStringBuffer());
1190             if (null != uniquePart) {
1191                 sb.append(uniquePart);
1192             }
1193             if (suffixAtEnd) {
1194                 if (null != query) {
1195                     sb.append(query);
1196                 }
1197                 if (null != suffix) {
1198                     sb.append('.');
1199                     sb.append(suffix);
1200                 }
1201             } else {
1202                 if (null != suffix) {
1203                     sb.append('.');
1204                     sb.append(suffix);
1205                 }
1206                 if (null != query) {
1207                     sb.append(query);
1208                 }
1209             }
1210             return sb.toString();
1211         }
1212 
1213         /***
1214            Gets the number of available character positions.
1215            If this EndSegment were converted to a path,
1216            it would have a path length and a segment length.
1217            There are two constraints: maxSegLen and maxPathLen.
1218            The number of character positions available before bumping
1219            into the lower constraint is computed.
1220            @return the number of available positions, which may be negative
1221         */
1222         private int lenAvail() {
1223             int len = length();
1224             return Math.min(maxSegLen - len, maxPathLen - dirPathLen - len);
1225         }
1226 
1227         /***
1228            Gets the length of the simple file name that would be
1229            created for this EndSegment.
1230            @return the length
1231         */
1232         private int length() {
1233             int r = mainPart.length(); // Return value.
1234             if (null != uniquePart) {
1235                 r += uniquePart.length();
1236             }
1237             if (null != query) {
1238                 r += query.length();
1239             }
1240             if (null != suffix) {
1241                 r += 1 + suffix.length(); // 1 for the '.'
1242             }
1243             return r;
1244         }
1245 
1246         /***
1247            Trims this EndSegment so a given number of characters are available.
1248            After trimming, there will be room for at least
1249            padding more characters before one of the constraints is
1250            encountered.
1251            The choices for trimming, in priority order, are:
1252            <ol>
1253            <li>Shorten the query.</li>
1254            <li>Remove the query.</li>
1255            <li>Shorten the main part.</li>
1256            <li>Shorten the suffix.</li>
1257            </ol>
1258            @param padding the number of character positions that need to be
1259            available
1260            @throws IllegalStateException
1261            if it's impossible to trim enough
1262         */
1263         private void trimWithPadding(int padding) {
1264             assert padding >= 0 : "padding: " + padding;
1265             int la = lenAvail();
1266             if (la >= padding) {
1267                 return;
1268             }
1269 
1270             // We need space for (padding - la) characters.
1271             // la might be negative.
1272             if (null != query) {
1273                 query.trimToMax(Math.max(0, query.length() - (padding - la)));
1274                 if (0 == query.length()) {
1275                     query = null;
1276                 }
1277                 la = lenAvail();
1278                 if (la >= padding) {
1279                     return;
1280                 }
1281             }
1282             mainPart.trimToMax(Math.max(1, mainPart.length() - (padding - la)));
1283             la = lenAvail();
1284             if (la >= padding) {
1285                 return;
1286             }
1287             if (null != suffix) {
1288                 suffix = suffix.substring(0, Math.max(1, suffix.length()
1289                                                       - (padding - la)));
1290                 la = lenAvail();
1291                 if (la >= padding) {
1292                     return;
1293                 }
1294             }
1295             throw new IllegalStateException("Can not trim " + curi.toString());
1296         }
1297     }
1298 
1299     /***
1300        This class represents a dynamically growable string
1301        consisting of substrings ("lumps") that
1302        are treated atomically.  If the string is shortened, then an entire
1303        lump is removed.  The intent is to treat each %XX escape as a lump.
1304        This class also allows single characters in a source string to be
1305        re-mapped to a different string, possibly containing more than
1306        one character.
1307        Each re-mapped character is also treated as a lump.
1308        <p>
1309        For example, suppose part of a URI, between two slashes, is
1310        <code>/VeryLongString...%3A/</code>.
1311        We want to create a corresponding file system directory, but the string
1312        is a little longer than the allowed maximum.
1313        It's better to trim the entire
1314        <code>%3A</code>
1315        off the end than part of it.
1316        This is especially true if, later, we need to append some digits
1317        to create a unique directory name.
1318        So we treat the entire
1319        <code>%3A</code>
1320        as one lump.
1321     */
1322     class LumpyString {
1323         /***
1324            Lumps are indicated by an auxiliary array aux[],
1325            indexed the same as the string.  The LUMP_BEGIN bit is set
1326            for a position in the string at which a lump begins.
1327         */
1328         private static final byte LUMP_BEGIN = 0x1;
1329 
1330         /*** Bit set for the end of a lump. */
1331         private static final byte LUMP_END = 0x2;
1332 
1333         /***
1334            Bit set for all characters in a lump of length greater than 1,
1335            except the beginning and ending characters.
1336         */
1337         private static final byte LUMP_MID = 0x4;
1338 
1339         /*** The auxiliary array. */
1340         private byte[] aux;
1341 
1342         /*** Holds the string. */
1343         private StringBuffer string;
1344 
1345         /***
1346            Creates a LumpyString.
1347            @param str the source string
1348            @param beginIndex the beginning index, inclusive, of the substring
1349            of str to be used
1350            @param endIndex the ending index, exclusive, of the substring
1351            of str to be used
1352            @param padding reserve this many additional character positions
1353            before dynamic growth is needed
1354            @param maxLen the maximum string length, regardless of the
1355            values of beginIndex, endIndex, and padding
1356            @param characterMap maps from characters in the source string
1357            (represented as length-one String values) to replacement String
1358            values (length at least 1).
1359            Each replacement string is treated as one lump.
1360            This is intended to cope with characters that a file system
1361            does not allow.
1362            @param dotBegin if non-null, this replaces a '.' at
1363            <code>str[beginIndex]</code>
1364            @throws IllegalArgumentException if
1365            beginIndex is negative.
1366            @throws IllegalArgumentException if
1367            endIndex is less than beginIndex.
1368            @throws IllegalArgumentException if
1369            padding is negative.
1370            @throws IllegalArgumentException if
1371            maxLen is less than one.
1372            @throws IllegalArgumentException if
1373            characterMap is null.
1374            @throws IllegalArgumentException if
1375            dotBegin is non-null but empty.
1376         */
1377         LumpyString(String str, int beginIndex, int endIndex, int padding,
1378                     int maxLen, Map characterMap, String dotBegin) {
1379             if (beginIndex < 0) {
1380                 throw new IllegalArgumentException("beginIndex < 0: "
1381                                                    + beginIndex);
1382             }
1383             if (endIndex < beginIndex) {
1384                 throw new IllegalArgumentException("endIndex < beginIndex "
1385                     + "beginIndex: " + beginIndex + "endIndex: " + endIndex);
1386             }
1387             if (padding < 0) {
1388                 throw new IllegalArgumentException("padding < 0: " + padding);
1389             }
1390             if (maxLen < 1) {
1391                 throw new IllegalArgumentException("maxLen < 1: " + maxLen);
1392             }
1393             if (null == characterMap) {
1394                 throw new IllegalArgumentException("characterMap null");
1395             }
1396             if ((null != dotBegin) && (0 == dotBegin.length())) {
1397                 throw new IllegalArgumentException("dotBegin empty");
1398             }
1399 
1400             // Initial capacity.  Leave some room for %XX lumps.
1401             // Guaranteed positive.
1402             int cap = Math.min(2 * (endIndex - beginIndex) + padding + 1,
1403                                maxLen);
1404             string = new StringBuffer(cap);
1405             aux = new byte[cap];
1406             for (int i = beginIndex; i != endIndex; ++i) {
1407                 String s = str.substring(i, i + 1);
1408                 String lump; // Next lump.
1409                 if (".".equals(s) && (i == beginIndex) && (null != dotBegin)) {
1410                     lump = dotBegin;
1411                 } else {
1412                     lump = (String) characterMap.get(s);
1413                 }
1414                 if (null == lump) {
1415                     if ("%".equals(s) && ((endIndex - i) > 2)
1416                             && (-1 != Character.digit(str.charAt(i + 1), 16))
1417                             && (-1 != Character.digit(str.charAt(i + 2), 16))) {
1418 
1419                         // %XX escape; treat as one lump.
1420                         lump = str.substring(i, i + 3);
1421                         i += 2;
1422                     } else {
1423                         lump = s;
1424                     }
1425                 }
1426                 if ((string.length() + lump.length()) > maxLen) {
1427                     assert checkInvariants();
1428                     return;
1429                 }
1430                 append(lump);
1431             }
1432             assert checkInvariants();
1433         }
1434 
1435         /***
1436            Converts this LumpyString to a String.
1437            @return the current string contents
1438         */
1439         public String toString() {
1440             assert checkInvariants();
1441             return string.toString();
1442         }
1443 
1444         /***
1445            Appends one lump to the end of this string.
1446            @param lump the lump (substring) to append
1447            @throws IllegalArgumentException if
1448            lump is null or empty.
1449         */
1450         void append(String lump) {
1451             if (null == lump) {
1452                 throw new IllegalArgumentException("lump null");
1453             }
1454             int lumpLen = lump.length();
1455             if (0 == lumpLen) {
1456                 throw new IllegalArgumentException("lump empty");
1457             }
1458             int pos = string.length(); // Current end of string.
1459             ensureCapacity(pos + lumpLen);
1460             if (1 == lumpLen) {
1461                 aux[pos] = LUMP_BEGIN | LUMP_END;
1462             } else {
1463                 assert lumpLen > 1;
1464                 aux[pos] = LUMP_BEGIN;
1465                 ++pos;
1466                 for (int i = lumpLen - 2; 0 != i; --i) {
1467                     aux[pos] = LUMP_MID;
1468                     ++pos;
1469                 }
1470                 aux[pos] = LUMP_END;
1471             }
1472             string.append(lump);
1473             assert checkInvariants();
1474         }
1475 
1476         /***
1477            Returns the string as a StringBuffer.
1478            The caller should <em>not</em> modify the return value.
1479            @return the string
1480         */
1481         StringBuffer asStringBuffer() {
1482             return string;
1483         }
1484 
1485         /***
1486            Tests if this string ends with a character.
1487            @param ch the character to test for
1488            @return true if and only if this string ends with ch
1489         */
1490         boolean endsWith(char ch) {
1491             assert checkInvariants();
1492             int len = string.length();
1493             return (0 != len) && (string.charAt(len - 1) == ch);
1494         }
1495 
1496         /***
1497            Prepends one character, as a lump, to this string.
1498            @param ch the character to prepend
1499         */
1500         void prepend(char ch) {
1501             assert checkInvariants();
1502             int oldLen = string.length();
1503             ensureCapacity(1 + oldLen);
1504             string.insert(0, ch);
1505             System.arraycopy(aux, 0, aux, 1, oldLen);
1506             aux[0] = LUMP_BEGIN | LUMP_END;
1507             assert checkInvariants();
1508         }
1509 
1510         /***
1511            Gets the length of this string.
1512            @return the number of characters in this string
1513         */
1514         int length() {
1515             assert checkInvariants();
1516             return string.length();
1517         }
1518 
1519         /***
1520            If necessary, trims this string to a maximum length.
1521            Any trimming is done by removing one or more complete
1522            lumps from the end of this string.
1523            @param maxLen the new maximum length.
1524            After trimming, the actual length of this string will be
1525            at most maxLen.
1526            @throws IllegalArgumentException if
1527            maxLen is negative.
1528         */
1529         void trimToMax(int maxLen) {
1530             if (maxLen < 0) {
1531                 throw new IllegalArgumentException("maxLen < 0: " + maxLen);
1532             }
1533             assert checkInvariants();
1534             int cl = string.length(); // Current length.
1535             if (cl > maxLen) {
1536                 int nl = maxLen; // New length.
1537                 while ((0 != nl) && (LUMP_END != (aux[nl - 1] & LUMP_END))) {
1538                     --nl;
1539                 }
1540                 for (int i = nl; i != cl; ++i) {
1541                     aux[i] = 0;
1542                 }
1543                 string.setLength(nl);
1544             }
1545             assert checkInvariants();
1546         }
1547 
1548         /***
1549            Checks some assertions on the instance variables.
1550            The intended usage is
1551            <code>assert checkInvariants();</code>
1552            so that if assertions are off, no call is made.
1553            @return true
1554         */
1555         private boolean checkInvariants() {
1556 
1557             // There's an aux[] element for every character in the StringBuffer.
1558             assert aux.length >= string.length()
1559                 : "aux.length: " + aux.length
1560                 + " string.length(): " + string.length();
1561 
1562             // The first character starts a lump.
1563             assert (0 == string.length())
1564                 || (LUMP_BEGIN == (aux[0] & LUMP_BEGIN))
1565                 : "aux[0]: " + aux[0];
1566 
1567             // The last character ends a lump.
1568             assert (0 == string.length())
1569                 || (LUMP_END == (aux[string.length() - 1] & LUMP_END))
1570                 : "aux[end]: " + aux[string.length() - 1];
1571             return true;
1572         }
1573 
1574         /***
1575            Ensures that the capacity is at least equal to the specified minimum.
1576            @param minCapacity the minimum desired capacity
1577         */
1578         private void ensureCapacity(int minCapacity) {
1579             assert checkInvariants();
1580             if (minCapacity > aux.length) {
1581                 int nc = 2 * aux.length; // New capacity.
1582                 while (nc < minCapacity) {
1583                     nc *= 2;
1584                 }
1585                 byte[] oldAux = aux;
1586                 aux = new byte[nc];
1587                 System.arraycopy(oldAux, 0, aux, 0, string.length());
1588             }
1589             string.ensureCapacity(minCapacity);
1590             assert checkInvariants();
1591         }
1592     }
1593 
1594     /***
1595        This class is returned by uriToFile.
1596        It represents a file system path, both as a File and as
1597        a path relative to the base directory.
1598     */
1599     class URIToFileReturn {
1600         /*** The file system path as a File.*/
1601         private File filePath;
1602 
1603         /*** The relative path from baseDir.*/
1604         private StringBuffer relativePath = new StringBuffer(255);
1605 
1606         /***
1607            Creates a URIToFileReturn.
1608            @param baseDir the path to the starting directory
1609            @param host the host part of the URI, or null if the host name
1610            should not be part of the path
1611            @param port the port part of the URI, or -1 if the port
1612            should not be part of the path
1613         */
1614         URIToFileReturn(String baseDir, String host, int port) {
1615 
1616             // The initial path.
1617             StringBuffer startPath = new StringBuffer(baseDir.length() + 32);
1618             startPath.append(baseDir);
1619             if (baseDir.endsWith(File.separator)) {
1620                 assert 1 != baseDir.length();
1621                 startPath.deleteCharAt(startPath.length() - 1);
1622             }
1623             if (null != host) {
1624                 startPath.append(File.separatorChar);
1625                 startPath.append(host);
1626                 relativePath.append(host);
1627             }
1628             if (port > 0) {
1629                 startPath.append(File.separatorChar);
1630                 startPath.append(port);
1631                 relativePath.append(File.separatorChar);
1632                 relativePath.append(port);
1633             }
1634             filePath = new File(startPath.toString());
1635         }
1636 
1637         /***
1638            Appends one more segment to this path.
1639            @param f a File representing the path with the next segment added
1640            @param nextSegment the next segment
1641         */
1642         void append(File f, String nextSegment) {
1643             filePath = f;
1644             if (0 != relativePath.length()) {
1645                 relativePath.append(File.separatorChar);
1646             }
1647             relativePath.append(nextSegment);
1648         }
1649 
1650         /***
1651            Gets this path as a File.
1652            @return this path
1653         */
1654         File getFile() {
1655             return filePath;
1656         }
1657 
1658         /***
1659            Gets this path as a relative path from the base directory.
1660            @return the relative path
1661         */
1662         String getRelativePath() {
1663             return relativePath.toString();
1664         }
1665 
1666         /***
1667            Tests if this path is longer than a given value.
1668            @param maxLen the value to test
1669            @return true if and only if this path is longer than maxLen
1670         */
1671         boolean longerThan(int maxLen) {
1672             return filePath.getPath().length() > maxLen;
1673         }
1674 
1675         /***
1676            Creates all directories in this path as needed.
1677            @throws IOException
1678            if a needed directory could not be created
1679            @throws IOException
1680            if a needed directory is not writeable
1681            @throws IOException
1682            if a non-directory file exists
1683            with the same path as a needed directory
1684         */
1685         void mkdirs() throws IOException {
1686             if (!filePath.exists()) {
1687                 if (!filePath.mkdirs()) {
1688                     throw new IOException("Can not mkdir "
1689                                           + filePath.getAbsolutePath());
1690                 }
1691             } else if (!filePath.canWrite()) {
1692                 throw new IOException("Directory " + filePath.getAbsolutePath()
1693                                       + " not writeable.");
1694             } else if (!filePath.isDirectory()) {
1695                 throw new IOException("File " + filePath.getAbsolutePath()
1696                                       + " is not a directory.");
1697             }
1698         }
1699     }
1700 }