View Javadoc

1   /*
2    * ArchiveUtils
3    *
4    * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/java/org/archive/util/ArchiveUtils.java,v 1.38 2007/01/23 00:29:48 gojomo Exp $
5    *
6    * Created on Jul 7, 2003
7    *
8    * Copyright (C) 2003 Internet Archive.
9    *
10   * This file is part of the Heritrix web crawler (crawler.archive.org).
11   *
12   * Heritrix is free software; you can redistribute it and/or modify
13   * it under the terms of the GNU Lesser Public License as published by
14   * the Free Software Foundation; either version 2.1 of the License, or
15   * any later version.
16   *
17   * Heritrix is distributed in the hope that it will be useful,
18   * but WITHOUT ANY WARRANTY; without even the implied warranty of
19   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20   * GNU Lesser Public License for more details.
21   *
22   * You should have received a copy of the GNU Lesser Public License
23   * along with Heritrix; if not, write to the Free Software
24   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
25   *
26   */
27  package org.archive.util;
28  
29  import java.io.BufferedReader;
30  import java.io.IOException;
31  import java.io.InputStream;
32  import java.io.InputStreamReader;
33  import java.io.PrintWriter;
34  import java.io.StringWriter;
35  import java.text.NumberFormat;
36  import java.text.ParseException;
37  import java.text.SimpleDateFormat;
38  import java.util.Calendar;
39  import java.util.Date;
40  import java.util.GregorianCalendar;
41  import java.util.HashSet;
42  import java.util.Locale;
43  import java.util.Set;
44  import java.util.TimeZone;
45  import java.util.logging.Level;
46  import java.util.logging.Logger;
47  
48  import org.apache.commons.io.IOUtils;
49  
50  /***
51   * Miscellaneous useful methods.
52   *
53   * @contributor gojomo & others
54   */
55  public class ArchiveUtils {
56      private static final Logger LOGGER = Logger.getLogger(ArchiveUtils.class.getName());
57  
58      /***
59       * Arc-style date stamp in the format yyyyMMddHHmm and UTC time zone.
60       */
61      private static final ThreadLocal<SimpleDateFormat> 
62          TIMESTAMP12 = threadLocalDateFormat("yyyyMMddHHmm");;
63      
64      /***
65       * Arc-style date stamp in the format yyyyMMddHHmmss and UTC time zone.
66       */
67      private static final ThreadLocal<SimpleDateFormat> 
68         TIMESTAMP14 = threadLocalDateFormat("yyyyMMddHHmmss");
69      /***
70       * Arc-style date stamp in the format yyyyMMddHHmmssSSS and UTC time zone.
71       */
72      private static final ThreadLocal<SimpleDateFormat> 
73          TIMESTAMP17 = threadLocalDateFormat("yyyyMMddHHmmssSSS");
74  
75      /***
76       * Log-style date stamp in the format yyyy-MM-dd'T'HH:mm:ss.SSS'Z'
77       * UTC time zone is assumed.
78       */
79      private static final ThreadLocal<SimpleDateFormat> 
80          TIMESTAMP17ISO8601Z = threadLocalDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'");
81      
82      /***
83       * Log-style date stamp in the format yyyy-MM-dd'T'HH:mm:ss'Z'
84       * UTC time zone is assumed.
85       */
86      private static final ThreadLocal<SimpleDateFormat>
87          TIMESTAMP14ISO8601Z = threadLocalDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
88      
89      /***
90       * Default character to use padding strings.
91       */
92      private static final char DEFAULT_PAD_CHAR = ' ';
93  
94      /*** milliseconds in an hour */ 
95      private static final int HOUR_IN_MS = 60 * 60 * 1000;
96      /*** milliseconds in a day */
97      private static final int DAY_IN_MS = 24 * HOUR_IN_MS;
98      
99      private static ThreadLocal<SimpleDateFormat> threadLocalDateFormat(final String pattern) {
100         ThreadLocal<SimpleDateFormat> tl = new ThreadLocal<SimpleDateFormat>() {
101             protected SimpleDateFormat initialValue() {
102                 SimpleDateFormat df = new SimpleDateFormat(pattern);
103                 df.setTimeZone(TimeZone.getTimeZone("GMT"));
104                 return df;
105             }
106         };
107         return tl;
108     }
109     
110     public static int MAX_INT_CHAR_WIDTH =
111         Integer.toString(Integer.MAX_VALUE).length();
112     
113     /***
114      * Utility function for creating arc-style date stamps
115      * in the format yyyMMddHHmmssSSS.
116      * Date stamps are in the UTC time zone
117      * @return the date stamp
118      */
119     public static String get17DigitDate(){
120         return TIMESTAMP17.get().format(new Date());
121     }
122 
123     /***
124      * Utility function for creating arc-style date stamps
125      * in the format yyyMMddHHmmss.
126      * Date stamps are in the UTC time zone
127      * @return the date stamp
128      */
129     public static String get14DigitDate(){
130         return TIMESTAMP14.get().format(new Date());
131     }
132 
133     /***
134      * Utility function for creating arc-style date stamps
135      * in the format yyyMMddHHmm.
136      * Date stamps are in the UTC time zone
137      * @return the date stamp
138      */
139     public static String get12DigitDate(){
140         return TIMESTAMP12.get().format(new Date());
141     }
142 
143     /***
144      * Utility function for creating log timestamps, in
145      * W3C/ISO8601 format, assuming UTC. Use current time. 
146      * 
147      * Format is yyyy-MM-dd'T'HH:mm:ss.SSS'Z'
148      * 
149      * @return the date stamp
150      */
151     public static String getLog17Date(){
152         return TIMESTAMP17ISO8601Z.get().format(new Date());
153     }
154     
155     /***
156      * Utility function for creating log timestamps, in
157      * W3C/ISO8601 format, assuming UTC. 
158      * 
159      * Format is yyyy-MM-dd'T'HH:mm:ss.SSS'Z'
160      * @param date Date to format.
161      * 
162      * @return the date stamp
163      */
164     public static String getLog17Date(long date){
165         return TIMESTAMP17ISO8601Z.get().format(new Date(date));
166     }
167     
168     /***
169      * Utility function for creating log timestamps, in
170      * W3C/ISO8601 format, assuming UTC. Use current time. 
171      * 
172      * Format is yyyy-MM-dd'T'HH:mm:ss'Z'
173      * 
174      * @return the date stamp
175      */
176     public static String getLog14Date(){
177         return TIMESTAMP14ISO8601Z.get().format(new Date());
178     }
179     
180     /***
181      * Utility function for creating log timestamps, in
182      * W3C/ISO8601 format, assuming UTC. 
183      * 
184      * Format is yyyy-MM-dd'T'HH:mm:ss'Z'
185      * @param date long timestamp to format.
186      * 
187      * @return the date stamp
188      */
189     public static String getLog14Date(long date){
190         return TIMESTAMP14ISO8601Z.get().format(new Date(date));
191     }
192     
193     /***
194      * Utility function for creating log timestamps, in
195      * W3C/ISO8601 format, assuming UTC. 
196      * 
197      * Format is yyyy-MM-dd'T'HH:mm:ss'Z'
198      * @param date Date to format.
199      * 
200      * @return the date stamp
201      */
202     public static String getLog14Date(Date date){
203         return TIMESTAMP14ISO8601Z.get().format(date);
204     }
205     
206     /***
207      * Utility function for creating arc-style date stamps
208      * in the format yyyyMMddHHmmssSSS.
209      * Date stamps are in the UTC time zone
210      *
211      * @param date milliseconds since epoc
212      * @return the date stamp
213      */
214     public static String get17DigitDate(long date){
215         return TIMESTAMP17.get().format(new Date(date));
216     }
217     
218     public static String get17DigitDate(Date date){
219         return TIMESTAMP17.get().format(date);
220     }
221 
222     /***
223      * Utility function for creating arc-style date stamps
224      * in the format yyyyMMddHHmmss.
225      * Date stamps are in the UTC time zone
226      *
227      * @param date milliseconds since epoc
228      * @return the date stamp
229      */
230     public static String get14DigitDate(long date){
231         return TIMESTAMP14.get().format(new Date(date));
232     }
233 
234     public static String get14DigitDate(Date d) {
235         return TIMESTAMP14.get().format(d);
236     }
237 
238     /***
239      * Utility function for creating arc-style date stamps
240      * in the format yyyyMMddHHmm.
241      * Date stamps are in the UTC time zone
242      *
243      * @param date milliseconds since epoc
244      * @return the date stamp
245      */
246     public static String get12DigitDate(long date){
247         return TIMESTAMP12.get().format(new Date(date));
248     }
249     
250     public static String get12DigitDate(Date d) {
251         return TIMESTAMP12.get().format(d);
252     }
253     
254     /***
255      * Parses an ARC-style date.  If passed String is < 12 characters in length,
256      * we pad.  At a minimum, String should contain a year (>=4 characters).
257      * Parse will also fail if day or month are incompletely specified.  Depends
258      * on the above getXXDigitDate methods.
259      * @param A 4-17 digit date in ARC style (<code>yyyy</code> to
260      * <code>yyyyMMddHHmmssSSS</code>) formatting.  
261      * @return A Date object representing the passed String. 
262      * @throws ParseException
263      */
264     public static Date getDate(String d) throws ParseException {
265         Date date = null;
266         if (d == null) {
267             throw new IllegalArgumentException("Passed date is null");
268         }
269         switch (d.length()) {
270         case 14:
271             date = ArchiveUtils.parse14DigitDate(d);
272             break;
273 
274         case 17:
275             date = ArchiveUtils.parse17DigitDate(d);
276             break;
277 
278         case 12:
279             date = ArchiveUtils.parse12DigitDate(d);
280             break;
281            
282         case 0:
283         case 1:
284         case 2:
285         case 3:
286             throw new ParseException("Date string must at least contain a" +
287                 "year: " + d, d.length());
288             
289         default:
290             if (!(d.startsWith("19") || d.startsWith("20"))) {
291                 throw new ParseException("Unrecognized century: " + d, 0);
292             }
293             if (d.length() < 8 && (d.length() % 2) != 0) {
294                 throw new ParseException("Incomplete month/date: " + d,
295                     d.length());
296             }
297             StringBuilder sb = new StringBuilder(d);
298             if (sb.length() < 8) {
299                 for (int i = sb.length(); sb.length() < 8; i += 2) {
300                     sb.append("01");
301                 }
302             }
303             if (sb.length() < 12) {
304                 for (int i = sb.length(); sb.length() < 12; i++) {
305                     sb.append("0");
306                 }
307             }
308             date = ArchiveUtils.parse12DigitDate(sb.toString());
309         }
310 
311         return date;
312     }
313 
314     /***
315      * Utility function for parsing arc-style date stamps
316      * in the format yyyMMddHHmmssSSS.
317      * Date stamps are in the UTC time zone.  The whole string will not be
318      * parsed, only the first 17 digits.
319      *
320      * @param date an arc-style formatted date stamp
321      * @return the Date corresponding to the date stamp string
322      * @throws ParseException if the inputstring was malformed
323      */
324     public static Date parse17DigitDate(String date) throws ParseException {
325         return TIMESTAMP17.get().parse(date);
326     }
327 
328     /***
329      * Utility function for parsing arc-style date stamps
330      * in the format yyyMMddHHmmss.
331      * Date stamps are in the UTC time zone.  The whole string will not be
332      * parsed, only the first 14 digits.
333      *
334      * @param date an arc-style formatted date stamp
335      * @return the Date corresponding to the date stamp string
336      * @throws ParseException if the inputstring was malformed
337      */
338     public static Date parse14DigitDate(String date) throws ParseException{
339         return TIMESTAMP14.get().parse(date);
340     }
341 
342     /***
343      * Utility function for parsing arc-style date stamps
344      * in the format yyyMMddHHmm.
345      * Date stamps are in the UTC time zone.  The whole string will not be
346      * parsed, only the first 12 digits.
347      *
348      * @param date an arc-style formatted date stamp
349      * @return the Date corresponding to the date stamp string
350      * @throws ParseException if the inputstring was malformed
351      */
352     public static Date parse12DigitDate(String date) throws ParseException{
353         return TIMESTAMP12.get().parse(date);
354     }
355     
356     /***
357      * Convert 17-digit date format timestamps (as found in crawl.log, for
358      * example) into a GregorianCalendar object. + * Useful so you can convert
359      * into milliseconds-since-epoch. Note: it is possible to compute
360      * milliseconds-since-epoch + * using {@link #parse17DigitDate}.UTC(), but
361      * that method is deprecated in favor of using Calendar.getTimeInMillis(). + *
362      * <p/>I probably should have dug into all the utility methods in
363      * DateFormat.java to parse the timestamp, but this was + * easier. If
364      * someone wants to fix this to use those methods, please have at it! <p/>
365      * Mike Schwartz, schwartz at CodeOnTheRoad dot com.
366      * 
367      * @param timestamp17String
368      * @return Calendar set to <code>timestamp17String</code>.
369      */
370     public static Calendar timestamp17ToCalendar(String timestamp17String) {
371         GregorianCalendar calendar = new GregorianCalendar();
372         int year = Integer.parseInt(timestamp17String.substring(0, 4));
373         int dayOfMonth = Integer.parseInt(timestamp17String.substring(6, 8));
374         // Month is 0-based
375         int month = Integer.parseInt(timestamp17String.substring(4, 6)) - 1;
376         int hourOfDay = Integer.parseInt(timestamp17String.substring(8, 10));
377         int minute = Integer.parseInt(timestamp17String.substring(10, 12));
378         int second = Integer.parseInt(timestamp17String.substring(12, 14));
379         int milliseconds = Integer
380                 .parseInt(timestamp17String.substring(14, 17));
381         calendar.set(Calendar.YEAR, year);
382         calendar.set(Calendar.MONTH, month);
383         calendar.set(Calendar.DAY_OF_MONTH, dayOfMonth);
384         calendar.set(Calendar.HOUR_OF_DAY, hourOfDay);
385         calendar.set(Calendar.MINUTE, minute);
386         calendar.set(Calendar.SECOND, second);
387         calendar.set(Calendar.MILLISECOND, milliseconds);
388         return calendar;
389     }
390     
391     /***
392      * @param timestamp A 14-digit timestamp or the suffix for a 14-digit
393      * timestamp: E.g. '20010909014640' or '20010101' or '1970'.
394      * @return Seconds since the epoch as a string zero-pre-padded so always
395      * Integer.MAX_VALUE wide (Makes it so sorting of resultant string works
396      * properly).
397      * @throws ParseException 
398      */
399     public static String secondsSinceEpoch(String timestamp)
400     throws ParseException {
401         return zeroPadInteger((int)
402             (getSecondsSinceEpoch(timestamp).getTime()/1000));
403     }
404     
405     /***
406      * @param timestamp A 14-digit timestamp or the suffix for a 14-digit
407      * timestamp: E.g. '20010909014640' or '20010101' or '1970'.
408      * @return A date.
409      * @see #secondsSinceEpoch(String)
410      * @throws ParseException 
411      */
412     public static Date getSecondsSinceEpoch(String timestamp)
413     throws ParseException {
414         if (timestamp.length() < 14) {
415             if (timestamp.length() < 10 && (timestamp.length() % 2) == 1) {
416                 throw new IllegalArgumentException("Must have year, " +
417                     "month, date, hour or second granularity: " + timestamp);
418             }
419             if (timestamp.length() == 4) {
420                 // Add first month and first date.
421                 timestamp = timestamp + "01010000";
422             }
423             if (timestamp.length() == 6) {
424                 // Add a date of the first.
425                 timestamp = timestamp + "010000";
426             }
427             if (timestamp.length() < 14) {
428                 timestamp = timestamp +
429                     ArchiveUtils.padTo("", 14 - timestamp.length(), '0');
430             }
431         }
432         return ArchiveUtils.parse14DigitDate(timestamp);
433     }
434     
435     /***
436      * @param i Integer to add prefix of zeros too.  If passed
437      * 2005, will return the String <code>0000002005</code>. String
438      * width is the width of Integer.MAX_VALUE as a string (10
439      * digits).
440      * @return Padded String version of <code>i</code>.
441      */
442     public static String zeroPadInteger(int i) {
443         return ArchiveUtils.padTo(Integer.toString(i),
444                 MAX_INT_CHAR_WIDTH, '0');
445     }
446 
447     /*** 
448      * Convert an <code>int</code> to a <code>String</code>, and pad it to
449      * <code>pad</code> spaces.
450      * @param i the int
451      * @param pad the width to pad to.
452      * @return String w/ padding.
453      */
454     public static String padTo(final int i, final int pad) {
455         String n = Integer.toString(i);
456         return padTo(n, pad);
457     }
458     
459     /*** 
460      * Pad the given <code>String</code> to <code>pad</code> characters wide
461      * by pre-pending spaces.  <code>s</code> should not be <code>null</code>.
462      * If <code>s</code> is already wider than <code>pad</code> no change is
463      * done.
464      *
465      * @param s the String to pad
466      * @param pad the width to pad to.
467      * @return String w/ padding.
468      */
469     public static String padTo(final String s, final int pad) {
470         return padTo(s, pad, DEFAULT_PAD_CHAR);
471     }
472 
473     /*** 
474      * Pad the given <code>String</code> to <code>pad</code> characters wide
475      * by pre-pending <code>padChar</code>.
476      * 
477      * <code>s</code> should not be <code>null</code>. If <code>s</code> is
478      * already wider than <code>pad</code> no change is done.
479      *
480      * @param s the String to pad
481      * @param pad the width to pad to.
482      * @param padChar The pad character to use.
483      * @return String w/ padding.
484      */
485     public static String padTo(final String s, final int pad,
486             final char padChar) {
487         String result = s;
488         int l = s.length();
489         if (l < pad) {
490             StringBuffer sb = new StringBuffer(pad);
491             while(l < pad) {
492                 sb.append(padChar);
493                 l++;
494             }
495             sb.append(s);
496             result = sb.toString();
497         }
498         return result;
499     }
500 
501     /*** check that two byte arrays are equal.  They may be <code>null</code>.
502      *
503      * @param lhs a byte array
504      * @param rhs another byte array.
505      * @return <code>true</code> if they are both equal (or both
506      * <code>null</code>)
507      */
508     public static boolean byteArrayEquals(final byte[] lhs, final byte[] rhs) {
509         if (lhs == null && rhs != null || lhs != null && rhs == null) {
510             return false;
511         }
512         if (lhs==rhs) {
513             return true;
514         }
515         if (lhs.length != rhs.length) {
516             return false;
517         }
518         for(int i = 0; i<lhs.length; i++) {
519             if (lhs[i]!=rhs[i]) {
520                 return false;
521             }
522         }
523         return true;
524     }
525 
526     /***
527      * Converts a double to a string.
528      * @param val The double to convert
529      * @param precision How many characters to include after '.'
530      * @return the double as a string.
531      */
532     public static String doubleToString(double val, int maxFractionDigits){
533         return doubleToString(val, maxFractionDigits, 0);
534     }
535 
536     private static String doubleToString(double val, int maxFractionDigits, int minFractionDigits) {
537         NumberFormat f = NumberFormat.getNumberInstance(Locale.US); 
538         f.setMaximumFractionDigits(maxFractionDigits);
539         f.setMinimumFractionDigits(minFractionDigits);
540         return f.format(val); 
541     }
542 
543     /***
544      * Takes a byte size and formats it for display with 'friendly' units. 
545      * <p>
546      * This involves converting it to the largest unit 
547      * (of B, KB, MB, GB, TB) for which the amount will be > 1.
548      * <p>
549      * Additionally, at least 2 significant digits are always displayed. 
550      * <p>
551      * Displays as bytes (B): 0-1023
552      * Displays as kilobytes (KB): 1024 - 2097151 (~2Mb)
553      * Displays as megabytes (MB): 2097152 - 4294967295 (~4Gb)
554      * Displays as gigabytes (GB): 4294967296 - infinity
555      * <p>
556      * Negative numbers will be returned as '0 B'.
557      *
558      * @param amount the amount of bytes
559      * @return A string containing the amount, properly formated.
560      */
561     public static String formatBytesForDisplay(long amount) {
562         double displayAmount = (double) amount;
563         int unitPowerOf1024 = 0; 
564 
565         if(amount <= 0){
566             return "0 B";
567         }
568         
569         while(displayAmount>=1024 && unitPowerOf1024 < 4) {
570             displayAmount = displayAmount / 1024;
571             unitPowerOf1024++;
572         }
573         
574         // TODO: get didactic, make these KiB, MiB, GiB, TiB
575         final String[] units = { " B", " KB", " MB", " GB", " TB" };
576         
577         // ensure at least 2 significant digits (#.#) for small displayValues
578         int fractionDigits = (displayAmount < 10) ? 1 : 0; 
579         return doubleToString(displayAmount, fractionDigits, fractionDigits) 
580                    + units[unitPowerOf1024];
581     }
582 
583     /***
584      * Convert milliseconds value to a human-readable duration
585      * @param time
586      * @return Human readable string version of passed <code>time</code>
587      */
588     public static String formatMillisecondsToConventional(long time) {
589         return formatMillisecondsToConventional(time,true);
590     }
591     
592     /***
593      * Convert milliseconds value to a human-readable duration
594      * @param time
595      * @param toMs whether to print to the ms
596      * @return Human readable string version of passed <code>time</code>
597      */
598     public static String formatMillisecondsToConventional(long time, boolean toMs) {
599         StringBuffer sb = new StringBuffer();
600         if(time<0) {
601             sb.append("-");
602         }
603         long absTime = Math.abs(time);
604         if(!toMs && absTime < 1000) {
605             return "0s";
606         }
607         if(absTime > DAY_IN_MS) {
608             // days
609             sb.append(absTime / DAY_IN_MS + "d");
610             absTime = absTime % DAY_IN_MS;
611         }
612         if (absTime > HOUR_IN_MS) {
613             //got hours.
614             sb.append(absTime / HOUR_IN_MS + "h");
615             absTime = absTime % HOUR_IN_MS;
616         }
617         if (absTime > 60000) {
618             sb.append(absTime / 60000 + "m");
619             absTime = absTime % 60000;
620         }
621         if (absTime > 1000) {
622             sb.append(absTime / 1000 + "s");
623             absTime = absTime % 1000;
624         }
625         if(toMs) {
626             sb.append(absTime + "ms");
627         }
628         return sb.toString();
629     }
630 
631 
632     /***
633      * Generate a long UID based on the given class and version number.
634      * Using this instead of the default will assume serialization
635      * compatibility across class changes unless version number is
636      * intentionally bumped.
637      *
638      * @param class1
639      * @param version
640      * @return UID based off class and version number.
641      */
642     public static long classnameBasedUID(Class<?> class1, int version) {
643         String callingClassname = class1.getName();
644         return (long)callingClassname.hashCode() << 32 + version;
645     }
646     
647     /***
648      * Copy the raw bytes of a long into a byte array, starting at
649      * the specified offset.
650      * 
651      * @param l
652      * @param array
653      * @param offset
654      */
655     public static void longIntoByteArray(long l, byte[] array, int offset) {
656         int i, shift;
657                   
658         for(i = 0, shift = 56; i < 8; i++, shift -= 8)
659         array[offset+i] = (byte)(0xFF & (l >> shift));
660     }
661     
662     public static long byteArrayIntoLong(byte [] bytearray) {
663         return byteArrayIntoLong(bytearray, 0);
664     }
665     
666     /***
667      * Byte array into long.
668      * @param bytearray Array to convert to a long.
669      * @param offset Offset into array at which we start decoding the long.
670      * @return Long made of the bytes of <code>array</code> beginning at
671      * offset <code>offset</code>.
672      * @see #longIntoByteArray(long, byte[], int)
673      */
674     public static long byteArrayIntoLong(byte [] bytearray,
675             int offset) {
676         long result = 0;
677         for (int i = offset; i < 8 /*Bytes in long*/; i++) {
678             result = (result << 8 /*Bits in byte*/) |
679                 (0xff & (byte)(bytearray[i] & 0xff));
680         }
681         return result;
682     }
683 
684     /***
685      * Given a string that may be a plain host or host/path (without
686      * URI scheme), add an implied http:// if necessary. 
687      * 
688      * @param u string to evaluate
689      * @return string with http:// added if no scheme already present
690      */
691     public static String addImpliedHttpIfNecessary(String u) {
692         if(u.indexOf(':') == -1 || u.indexOf('.') < u.indexOf(':')) {
693             // No scheme present; prepend "http://"
694             u = "http://" + u;
695         }
696         return u;
697     }
698 
699     /***
700      * Verify that the array begins with the prefix. 
701      * 
702      * @param array
703      * @param prefix
704      * @return true if array is identical to prefix for the first prefix.length
705      * positions 
706      */
707     public static boolean startsWith(byte[] array, byte[] prefix) {
708         if(prefix.length>array.length) {
709             return false;
710         }
711         for(int i = 0; i < prefix.length; i++) {
712             if(array[i]!=prefix[i]) {
713                 return false; 
714             }
715         }
716         return true; 
717     }
718 
719     /***
720      * Utility method to get a String singleLineReport from Reporter
721      * @param rep  Reporter to get singleLineReport from
722      * @return String of report
723      */
724     public static String singleLineReport(Reporter rep) {
725         StringWriter sw = new StringWriter();
726         PrintWriter pw = new PrintWriter(sw);
727         try {
728             rep.singleLineReportTo(pw);
729         } catch (IOException e) {
730             // not really possible
731             e.printStackTrace();
732         }
733         pw.flush();
734         return sw.toString();
735     }
736 
737     /***
738      * Compose the requested report into a String. DANGEROUS IF REPORT
739      * CAN BE LARGE.
740      * 
741      * @param rep Reported
742      * @param name String name of report to compose
743      * @return String of report
744      */
745     public static String writeReportToString(Reporter rep, String name) {
746         StringWriter sw = new StringWriter();
747         PrintWriter pw = new PrintWriter(sw);
748         rep.reportTo(name,pw);
749         pw.flush();
750         return sw.toString();
751     }
752     
753     public static Set<String> TLDS;
754     
755     static {
756         TLDS = new HashSet<String>();
757         InputStream is = ArchiveUtils.class.getResourceAsStream("tlds-alpha-by-domain.txt");
758         try {
759             BufferedReader reader = new BufferedReader(new InputStreamReader(is));
760             String line; 
761             while((line = reader.readLine())!=null) {
762                 if (line.startsWith("#")) {
763                     continue;
764                 }
765                 TLDS.add(line.trim().toLowerCase()); 
766             }
767         } catch (Exception e) { 
768             LOGGER.log(Level.SEVERE,"TLD list unavailable",e);
769         } finally {
770             IOUtils.closeQuietly(is); 
771         }
772     }
773     /***
774      * Return whether the given string represents a known 
775      * top-level-domain (like "com", "org", etc.) per IANA
776      * as of 20100419
777      * 
778      * @param dom candidate string
779      * @return boolean true if recognized as TLD
780      */
781     public static boolean isTld(String dom) {
782         return TLDS.contains(dom.toLowerCase());
783     }
784 }
785