1   /* StatisticsSummary 
2    * 
3    * $Id: StatisticsSummary.java 6428 2009-08-04 01:22:52Z gojomo $$ 
4    * 
5    * Created on July 27, 2006
6    * 
7    * Copyright (C) 2009 Internet Archive.
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.crawler.admin;
26  
27  import java.io.File;
28  import java.io.FileReader;
29  import java.io.BufferedReader;
30  import java.io.IOException;
31  import java.util.Comparator;
32  import java.util.Hashtable;
33  import java.util.Iterator;
34  import java.util.Map;
35  import java.util.SortedMap;
36  import java.util.TreeMap;
37  import java.util.TreeSet;
38  import java.util.concurrent.atomic.AtomicLong;
39  import java.util.logging.Level;
40  import java.util.logging.Logger;
41  
42  
43  
44  /***
45   * This class provides descriptive statistics of a finished crawl job by
46   * using the crawl report files generated by StatisticsTracker.  Any formatting
47   * changes to the way StatisticsTracker writes to the summary crawl reports will
48   * require changes to this class.
49   * <p>
50   * The following statistics are accessible from this class:
51   * <ul>
52   *   <li> Successfully downloaded documents per fetch status code
53   *   <li> Successfully downloaded documents per document mime type
54   *   <li> Amount of data per mime type
55   *   <li> Successfully downloaded documents per host
56   *   <li> Amount of data per host
57   *   <li> Successfully downloaded documents per top-level domain name (TLD)
58   *   <li> Disposition of all seeds 
59   *   <li> Successfully downloaded documents per host per source
60   * </ul>
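 *
 * <p>A minimal usage sketch (not from the original source; <code>job</code>
 * is assumed to be a completed {@link CrawlJob} whose report files are
 * present in its directory):
 * <pre>
 *   StatisticsSummary summary = new StatisticsSummary(job);
 *   if (summary.isStats()) {
 *       long httpDocs = summary.getTotalStatusCodeDocuments();
 *       String duration = summary.getDurationTime();
 *   }
 * </pre>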
61   *
62   * <p>TODO: Make it so summarizing is not done all in RAM so we avoid
63   * OOME.
64   *
65   * @author Frank McCown
66   *
67   * @see org.archive.crawler.admin.StatisticsTracker
68   */
69  public class StatisticsSummary {
    /***
     * Logger for messages from the StatisticsSummary.
     */
73      private final static Logger logger =
74          Logger.getLogger(StatisticsSummary.class.getName());
75      
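    /*** True if at least one of the crawl report files was found; see {@link #isStats()}. */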
76      private boolean stats = true;
77      
78      /*** Crawl job whose summary we want to view */
79      private CrawlJob cjob;
80          
81      protected long totalDnsStatusCodeDocuments = 0;
82      protected long totalStatusCodeDocuments = 0;
83      protected long totalFileTypeDocuments = 0;
84      protected long totalMimeTypeDocuments = 0;
85      protected long totalDnsMimeTypeDocuments = 0;
86      protected long totalDnsHostDocuments = 0;
87      protected long totalHostDocuments = 0;
88      protected long totalMimeSize = 0;
89      protected long totalDnsMimeSize = 0;
90      protected long totalHostSize = 0;
91      protected long totalDnsHostSize = 0;
92      protected long totalTldDocuments = 0;
93      protected long totalTldSize = 0;
94      protected long totalHosts = 0;
95      
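    /*** Summary values read as raw strings from crawl-report.txt */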
96      protected String durationTime;
97      protected String processedDocsPerSec;
98      protected String bandwidthKbytesPerSec;
99      protected String totalDataWritten;
100     
101     /*** Keep track of the file types we see (mime type -> count) */
102     protected Hashtable<String,AtomicLong> mimeTypeDistribution = new Hashtable<String,AtomicLong>();
103     protected Hashtable<String,AtomicLong> mimeTypeBytes = new Hashtable<String,AtomicLong>();
104     protected Hashtable<String,AtomicLong> mimeTypeDnsDistribution = new Hashtable<String,AtomicLong>();
105     protected Hashtable<String,AtomicLong> mimeTypeDnsBytes = new Hashtable<String,AtomicLong>();
106     
107     /*** Keep track of status codes */
108     protected Hashtable<String,AtomicLong> statusCodeDistribution = new Hashtable<String,AtomicLong>();
109     protected Hashtable<String,AtomicLong> dnsStatusCodeDistribution
110      = new Hashtable<String,AtomicLong>();
111     
112     /*** Keep track of hosts */
113     protected Hashtable<String,AtomicLong> hostsDistribution = new Hashtable<String,AtomicLong>(); 
114     protected Hashtable<String,AtomicLong> hostsBytes = new Hashtable<String,AtomicLong>(); 
115     protected Hashtable<String,AtomicLong> hostsDnsDistribution = new Hashtable<String,AtomicLong>();
116     protected Hashtable<String,AtomicLong> hostsDnsBytes = new Hashtable<String,AtomicLong>(); 
117 
118     /*** Keep track of TLDs */
119     protected Hashtable<String,AtomicLong> tldDistribution = new Hashtable<String,AtomicLong>();
120     protected Hashtable<String,AtomicLong> tldBytes = new Hashtable<String,AtomicLong>();
121     protected Hashtable<String,AtomicLong> tldHostDistribution = new Hashtable<String,AtomicLong>();
122 
123     /*** Keep track of processed seeds */
124     protected transient Map<String,SeedRecord> processedSeedsRecords 
125      = new Hashtable<String,SeedRecord>();
126 
127     /***
128      * Constructor
129      * 
130      * @param cjob
131      * 				Completed crawl job
132      */
133     public StatisticsSummary(CrawlJob cjob) {
134     	this.cjob = cjob;
135     	
136     	// Read all stats for this crawl job
137     	this.stats = calculateStatusCodeDistribution();
138     	if (calculateMimeTypeDistribution()) {
139     		this.stats = true;
140     	}
141     	if (calculateHostsDistribution()) {
142     		this.stats = true;
143     	}
144     	if (readCrawlReport()) {
145     		this.stats = true;
146     	}
147     	if (readSeedReport()) {
148     		this.stats = true;
149     	}
150     }
151     
152     
    /***
     * Increment a counter for a key in a given map. Used for various
     * aggregate data.
     *
     * @param map The map of counters
     * @param key The key whose counter should be incremented; if the key does
     *            not exist it is added with a count of 1, and if it is null
     *            the counter "unknown" is incremented.
     */
162     protected static void incrementMapCount(Map<String,AtomicLong> map, 
163             String key) {
164     	incrementMapCount(map,key,1);
165     }
166 
    /***
     * Increment a counter for a key in a given map by an arbitrary amount.
     * Used for various aggregate data. The increment amount can be negative.
     *
     * @param map
     *            The map of counters
     * @param key
     *            The key whose counter should be incremented; if the key does
     *            not exist it is added with a value equal to
     *            <code>increment</code>, and if it is null the counter
     *            "unknown" is incremented.
     * @param increment
     *            The amount by which to increment the counter for
     *            <code>key</code>.
     */
182     protected static void incrementMapCount(Map<String,AtomicLong> map, 
183             String key, long increment) {
184         if (key == null) {
185             key = "unknown";
186         }
        // FIXME: for true safety this may need to use ConcurrentMap methods
188         AtomicLong lw = map.get(key);
189         if(lw == null) {
190             map.put(key, new AtomicLong(increment));
191         } else {
192             lw.addAndGet(increment);
193         }
194     }
195   
    /*** Returns a Hashtable containing the distribution of
     *  encountered mime types.  Key/value pairs represent
     *  mime type -> count.
     * <p>
     * <b>Note:</b> All the values are wrapped with a
     * {@link AtomicLong AtomicLong}
     * @return mimeTypeDistribution
     */
204     public Hashtable getMimeDistribution() {
205         return mimeTypeDistribution;
206     }
207     
208     public long getTotalMimeTypeDocuments() {
209        	return totalMimeTypeDocuments;
210     }
211     
212     public long getTotalDnsMimeTypeDocuments() {
213        	return totalDnsMimeTypeDocuments;
214     }
215     
216     public long getTotalMimeSize() {
217     	return totalMimeSize;
218     }
219     
220     public long getTotalDnsMimeSize() {
221     	return totalDnsMimeSize;
222     }
223    
    /***
     * Return a Hashtable representing the distribution of HTTP status codes
     * for successfully fetched curis, where key -&gt; value represents
     * (string)code -&gt; (integer)count.
     * 
     * <b>Note: </b> All the values are wrapped with a
     * {@link AtomicLong AtomicLong}
     * 
     * @return statusCodeDistribution
     */
234     public Hashtable getStatusCodeDistribution() {    	
235         return statusCodeDistribution;
236     }
237    
    /***
     * Return a Hashtable representing the distribution of DNS status codes
     * for successfully fetched curis, where key -&gt; value represents
     * (string)code -&gt; (integer)count.
     * 
     * <b>Note: </b> All the values are wrapped with a
     * {@link AtomicLong AtomicLong}
     * 
     * @return dnsStatusCodeDistribution
     */
248     public Hashtable getDnsStatusCodeDistribution() {
249     	return dnsStatusCodeDistribution;
250     }
251     
252     public Hashtable getDnsMimeDistribution() {
253         return mimeTypeDnsDistribution;
254     }
255 
256     public long getTotalDnsStatusCodeDocuments() {
257     	return totalDnsStatusCodeDocuments;
258     }
259     
260     public long getTotalStatusCodeDocuments() {
261     	return totalStatusCodeDocuments;
262     }  
263     
264     public long getTotalHostDocuments() {
265        	return totalHostDocuments;
266     }
267     
268     public long getTotalDnsHostDocuments() {
269        	return totalDnsHostDocuments;
270     }
271     
272     public Hashtable getHostsDnsDistribution() {
273     	return hostsDnsDistribution;
274     }
275     
276     public long getTotalHostDnsDocuments() {
277     	return totalDnsHostDocuments;
278     }
279     
280     public long getTotalHostSize() {
281     	return totalHostSize;
282     }
283     
284     public long getTotalDnsHostSize() {
285     	return totalDnsHostSize;
286     }
287     
288     public Hashtable getTldDistribution() {
289     	return tldDistribution;
290     }
291     
292     public Hashtable getTldBytes() {
293     	return tldBytes;
294     }
295     
296     public long getTotalTldDocuments() {
297     	return totalTldDocuments;
298     }
299     
300     public long getTotalTldSize() {
301     	return totalTldSize;
302     }
303     
304     public Hashtable getTldHostDistribution() {
305     	return tldHostDistribution;
306     }
307     
308     public long getTotalHosts() {
309     	return totalHosts;
310     }
311     
312     public String getDurationTime() {
313     	return durationTime;
314     }
315     
316     public String getProcessedDocsPerSec() {
317     	return processedDocsPerSec;
318     }
319     
320     public String getBandwidthKbytesPerSec() {
321     	return bandwidthKbytesPerSec;
322     }
323     
324     public String getTotalDataWritten() {
325     	return totalDataWritten;
326     }
327 
328     /***
     * Sort the entries of the given map in descending order by their
     * values, which must be <code>AtomicLong</code>s.
331      * <p>
332      * Elements are sorted by value from largest to smallest. Equal values are
333      * sorted in an arbitrary, but consistent manner by their keys. Only items
334      * with identical value and key are considered equal.
335      *
336      * If the passed-in map requires access to be synchronized, the caller
337      * should ensure this synchronization. 
338      * 
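     * <p>A small illustrative sketch (hypothetical values, not from the
     * original source):
     * <pre>
     *   Map&lt;String,AtomicLong&gt; counts = new Hashtable&lt;String,AtomicLong&gt;();
     *   counts.put("text/html", new AtomicLong(40));
     *   counts.put("image/png", new AtomicLong(12));
     *   counts.put("text/css", new AtomicLong(12));
     *   // Resulting key order: text/html (40), image/png (12), text/css (12)
     *   TreeMap&lt;String,AtomicLong&gt; sorted = getReverseSortedCopy(counts);
     * </pre>
     *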
339      * @param mapOfAtomicLongValues
340      *            Assumes values are AtomicLongs.
     * @return a sorted map (TreeMap) containing the same entries as the given map.
342      */
343     public TreeMap<String,AtomicLong> getReverseSortedCopy(
344             final Map<String,AtomicLong> mapOfAtomicLongValues) {
345         TreeMap<String,AtomicLong> sortedMap = new TreeMap<String,AtomicLong>(
346           new Comparator<String>() {
347             public int compare(String e1, String e2) {
348                 long firstVal = mapOfAtomicLongValues.get(e1).get();
349                 long secondVal = mapOfAtomicLongValues.get(e2).get();
350                 if (firstVal < secondVal) {
351                     return 1;
352                 }
353                 if (secondVal < firstVal) {
354                     return -1;
355                 }
356                 // If the values are the same, sort by keys.
357                 return e1.compareTo(e2);
358             }
359         });
360         try {
361             sortedMap.putAll(mapOfAtomicLongValues);
362         } catch (UnsupportedOperationException e) {
363             for (String key: mapOfAtomicLongValues.keySet()) {
364                 sortedMap.put(key, mapOfAtomicLongValues.get(key));
365             }
366         }
367         return sortedMap;
368     }
369      
370     /***
371      * Get the number of hosts with a particular TLD.
372      * @param tld
373      * 				top-level domain name
374      * @return		Total crawled hosts
375      */
376     public long getHostsPerTld(String tld) {
377     	AtomicLong lw = (AtomicLong)tldHostDistribution.get(tld);
378     	return (lw == null ? 0 : lw.get());
379     }
380     
381     /***
382      * Read status code distribution from responsecode-report.txt.
383      * DNS and HTTP status codes are separated when read.
384      * @return True if we found some stats.
385      */
386     private boolean calculateStatusCodeDistribution() {
387     	// Read from responsecode-report.txt
388     	File f = new File(cjob.getDirectory(), "responsecode-report.txt");
389     	if (!f.exists()) {
390     		return false;
391     	}
392     	BufferedReader br = null;
393     	try {
394 	    	FileReader reader = new FileReader(f);
395 	    	br = new BufferedReader(reader);
396 	    	String line = br.readLine();  // Ignore heading
397 	    	line = br.readLine();
398 	    	while (line != null) {  	  
          // Get status code and # urls which are separated by a space
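          // Example (format assumed from the parsing below): 404 1290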
400 	    	  
401 	    	  String[] items = line.split(" ");
402 	    	  if (items.length < 2) {
403 	    		  logger.log(Level.WARNING,
404                           "Unexpected formatting on line [" + line + "]");
405 	    	  }
406 	    	  else {
407 	    		  // See if DNS or HTTP status code
408 	    		  if (items[0].length() < 3) {
409 	    			  // DNS status code
410 	    			  long total = Long.parseLong(items[1]);
411 	    			  dnsStatusCodeDistribution.put(items[0], 
412 	    					  new AtomicLong(total));
413 	    			  totalDnsStatusCodeDocuments += total;
414 	    		  }
415 	    		  else {
416 	    			  // HTTP status code
417 	    			  long total = Long.parseLong(items[1]);
418 	    			  statusCodeDistribution.put(items[0], 
419 	    					  new AtomicLong(total));
420 	    			  totalStatusCodeDocuments += total;
421 	    		  }
422 	    	  }
423 	    	  line = br.readLine();
424 	    	}
425     	} catch (IOException e) {
426     		logger.log(Level.SEVERE, "Unable to read " + f.getAbsolutePath(),
427     			e);
428     	} finally {
429     		if (br != null) {
430     			try {
431 					br.close();
432 				} catch (IOException e) {
433 					logger.log(Level.SEVERE,
434 						"Closing " + f.getAbsolutePath(), e);
435 				}
436     		}
437     	}
438     	return true;
439     }
440     
441     /***
442      * Read MIME type data from mimetype-report.txt.
443      * MIME type of text/dns is separated from other MIME types.
444      * @return True if we found some stats.
445      */
446     private boolean calculateMimeTypeDistribution() {    	
447     	File f = new File(cjob.getDirectory(), "mimetype-report.txt");
448     	if (!f.exists()) {
449     		return false;
450     	}
451     	BufferedReader br = null;
452     	try {
453 	    	FileReader reader = new FileReader(f);
454 	    	br = new BufferedReader(reader);
455 	    	String line = br.readLine();  // Ignore heading
456 	    	line = br.readLine();
457 	    	while (line != null) {	    			    	  
            // Get num urls, num bytes, and MIME type (separated by a space)
459 	    		// Example: 12 134279 text/html
460   
461 	    		String[] items = line.split(" ");
462 	    		if (items.length < 3) {
463 	    			logger.log(Level.WARNING,
464                             "Unexpected formatting on line [" + line + "]");
465 	    		}
466 	    		else {
467 	    			long total = Long.parseLong(items[0]);
468 	    			long bytes = Long.parseLong(items[1]);
469 	    			String mime = items[2];
470 
                // Separate DNS records from HTTP
472 	    			if (mime.equalsIgnoreCase("text/dns")) {
473 	    				mimeTypeDnsDistribution.put(mime,
474                                 new AtomicLong(total));
475 	    				mimeTypeDnsBytes.put(mime, new AtomicLong(bytes));
476 	    				totalDnsMimeTypeDocuments += total;
477 	    				totalDnsMimeSize += bytes;
478 	    			}
479 	    			else {
480 	    				mimeTypeDistribution.put(mime, new AtomicLong(total));
481 	    				mimeTypeBytes.put(mime, new AtomicLong(bytes));
482 	    				totalMimeTypeDocuments += total;
483 	    				totalMimeSize += bytes;
484 	    			}
485 	    		}
486 	    		line = br.readLine();
487 	    	}
488     	} catch (IOException e) {
489     		logger.log(Level.SEVERE, "Reading " + f.getAbsolutePath(), e);
490     	} finally {
491     		if (br != null) {
492     			try {
493     				br.close();
494     			} catch (IOException e) {
495     				logger.log(Level.SEVERE,
496     					"Closing " + f.getAbsolutePath(), e);
497     			}
498     		}
499     	}
500     	return true;
501     }
502     
503     /***
504      * Read number of URLs and total bytes for each host name from
505      * hosts-report.txt.
506      * Host name of "dns:" is separated from others.
507      * @return true if stats found.
508      */
509     private boolean calculateHostsDistribution() {
510     	File f = new File(cjob.getDirectory(), "hosts-report.txt");
511     	if (!f.exists()) {
512     		return false;
513     	}
514     	BufferedReader br = null;
515     	try {
516 	    	FileReader reader = new FileReader(f);
517 	    	br = new BufferedReader(reader);
518 	    	String line = br.readLine();  // Ignore heading
519 	    	line = br.readLine();
520 	    	while (line != null) {    	  
            // Get num urls, num bytes, and host name (separated by a space)
522 	    		// Example: 9 7468 www.blogger.com
523 
524 	    		String[] items = line.split(" ");
525 	    		if (items.length < 3) {
526 	    			logger.log(Level.WARNING,
527                             "Unexpected formatting on line [" + line + "]");
528 	    		}
529 	    		else {
530 	    			long total = Long.parseLong(items[0]);
531 	    			long bytes = Long.parseLong(items[1]);
532 	    			String host = items[2];
533 
                // Separate DNS records from HTTP
535 	    			if (host.startsWith("dns:", 0)) {
536 	    				hostsDnsDistribution.put(host, new AtomicLong(total));
537 	    				hostsDnsBytes.put(host, new AtomicLong(bytes));
538 	    				totalDnsHostDocuments += total;
539 	    				totalDnsHostSize += bytes;
540 	    			}
541 	    			else {
542 	    				hostsDistribution.put(host, new AtomicLong(total));
543 	    				hostsBytes.put(host, new AtomicLong(bytes));
544 	    				totalHostDocuments += total;
545 	    				totalHostSize += bytes;
546 
547 	    				// Count top level domain (TLD)
548 	    				String tld = host.substring(host.lastIndexOf('.')+1);
549 	    				incrementMapCount(tldDistribution, tld, total);   
550 	    				incrementMapCount(tldBytes, tld, bytes);
551 	    				incrementMapCount(tldHostDistribution, tld);
552 	    				totalTldDocuments += total;
553 	    				totalTldSize += bytes;
554 
555 	    				totalHosts++;
556 	    			}
557 	    		}
558 	    		line = br.readLine();
559 	    	}
560     	} catch (IOException e) {
561     		logger.log(Level.SEVERE, "Reading " + f.getAbsolutePath(), e);
562     	} finally {
563     		if (br != null) {
564     			try {
565     				br.close();
566     			} catch (IOException e) {
567     				logger.log(Level.SEVERE,
568     					"Closing " + f.getAbsolutePath(), e);
569     			}
570     		}
571     	}
572     	return true;
573     }
574 
575     /***
576      * Returns the accumulated number of bytes downloaded from a given host.
577      * @param host name of the host
578      * @return the accumulated number of bytes downloaded from a given host
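     * @throws NullPointerException if <code>host</code> is null or does not
     *              appear in the hosts report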
579      */
580     public long getBytesPerHost(String host) { 
581     	long bytes = -1;
582     	
583     	bytes = host != null && host.startsWith("dns:", 0) ? 
584 	    	((AtomicLong)hostsDnsBytes.get(host)).get() :
585 	    	((AtomicLong)hostsBytes.get(host)).get();	    
586     	
587     	return bytes;
588     }
589     
590     /***
591      * Returns the total number of bytes downloaded for a given TLD.
592      * @param tld TLD
593      * @return the total number of bytes downloaded for a given TLD
594      */
595     public long getBytesPerTld(String tld) {
596     	AtomicLong lw = (AtomicLong)tldBytes.get(tld);
597     	return (lw == null ? 0 : lw.get());
598     }
599 
600     /***
601      * Returns the accumulated number of bytes from files of a given file type.
602      * @param filetype Filetype to check.
603      * @return the accumulated number of bytes from files of a given mime type
604      */
605     public long getBytesPerMimeType(String filetype) {
606     	long bytes = -1;
607     	
608     	if (filetype != null) {    	
609 	    	if (filetype.equals("text/dns")) {	    		
610 	    		bytes = mimeTypeDnsBytes.get(filetype) == null ? 0 :
611 	    			((AtomicLong)mimeTypeDnsBytes.get(filetype)).get();
612 	    	}
613 	    	else {
614 	    		bytes = mimeTypeBytes.get(filetype) == null ? 0 :
615 	    			((AtomicLong)mimeTypeBytes.get(filetype)).get();
616 	    	}
617     	}
618     	return bytes;
619     }
620     
621     /***
622      * Reads duration time, processed docs/sec, bandwidth, and total size
623      * of crawl from crawl-report.txt.
624      * @return true if stats found.
625      */
626     public boolean readCrawlReport() {
627     	File f = new File(cjob.getDirectory(), "crawl-report.txt");
628     	if (!f.exists()) {
629     		return false;
630     	}
631     	BufferedReader br = null;
632     	try {
633 	    	FileReader reader = new FileReader(f);
634 	    	br = new BufferedReader(reader);
635 	    	String line = br.readLine();  
636 	    	while (line != null) {
637 	    		if (line.startsWith("Duration Time")) {
638 	    			durationTime = line.substring(line.indexOf(':')+1);
639 	    		}
640 	    		else if (line.startsWith("Processed docs/sec")) {
641 	    			processedDocsPerSec = line.substring(line.indexOf(':')+1);
642 	    		}
643 	    		else if (line.startsWith("Bandwidth in Kbytes/sec")) {
644 	    			bandwidthKbytesPerSec = line.substring(line.indexOf(':')+1);
645 	    		}
646 	    		else if (line.startsWith("Total Raw Data Size in Bytes")) {
647 	    			totalDataWritten = line.substring(line.indexOf(':')+1);
648 	    		}
649 
650 	    		line = br.readLine();
651 	    	}
652     	}
653     	catch (IOException e) {
654     		logger.log(Level.SEVERE, "Reading " + f.getAbsolutePath(), e);		
655     	} finally {
656     		if (br != null) {
657     			try {
658 					br.close();
659 				} catch (IOException e) {
660 					logger.log(Level.SEVERE,
661 					    "Failed close of " + f.getAbsolutePath(), e);
662 				}
663     		}
664     	}
665     	return true;
666     }
667   
    /***
     * Returns a sorted Iterator of seed records, ordered by status code.
     * @return sorted Iterator of seed records
     */
672     public Iterator<SeedRecord> getSeedRecordsSortedByStatusCode() {
673         TreeSet<SeedRecord> sortedSet = new TreeSet<SeedRecord>(
674           new Comparator<SeedRecord>() {
675             public int compare(SeedRecord sr1, SeedRecord sr2) {
676                 int code1 = sr1.getStatusCode();
677                 int code2 = sr2.getStatusCode();
678                 if (code1 == code2) {
679                     // If the values are equal, sort by URIs.
680                     return sr1.getUri().compareTo(sr2.getUri());
681                 }
                // mirror and shift the number line so as to
                // place zero at the beginning, then all negatives
                // in order of ascending absolute value, then all
                // positives descending
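                // (for typical positive status codes, e.g. 200 or 404, the
                // subtraction below overflows and wraps to a large positive
                // int, which is what places them after the zero and
                // negative codes)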
686                 code1 = -code1 - Integer.MAX_VALUE;
687                 code2 = -code2 - Integer.MAX_VALUE;
688                 
689                 return new Integer(code1).compareTo(new Integer(code2));
690             }
691         });
692         for (SeedRecord sr: processedSeedsRecords.values()) {
693             sortedSet.add(sr);
694         }
695         
696         return sortedSet.iterator();
697     }
698     
699     /***
700      * Reads seed data from seeds-report.txt.
701      * @return True if stats found.
702      */
703     private boolean readSeedReport() {
704     	File f = new File(cjob.getDirectory(), "seeds-report.txt");
705     	if (!f.exists()) {
706     		return false;
707     	}
708     	BufferedReader br = null;
709     	try {
710 	    	FileReader reader = new FileReader(f);
711 	    	br = new BufferedReader(reader);
712 	    	
713 	    	// Ignore heading: [code] [status] [seed] [redirect]
714 	    	String line = br.readLine();  
715 	    	line = br.readLine();
716 	    	while (line != null) {
717 	    		// Example lines:
718 	    		// 302 CRAWLED http://www.ashlandcitytimes.com/ http://www.ashlandcitytimes.com/apps/pbcs.dll/section?Category=MTCN01
719 	    		// 200 CRAWLED http://noleeo.com/
720 
721 	    		String[] items = line.split(" ");
722 
723 	    		if (items.length < 3) {
724 	    			logger.log(Level.WARNING,
725                             "Unexpected formatting on line [" + line + "]");
726 	    		}
727 	    		else {
728 	    			String statusCode = items[0];
729 	    			String crawlStatus = items[1];
730 	    			String seed = items[2];
731 	    			String redirect = items.length > 3 ? items[3] : null;
732 
733 	    			// All values should be CRAWLED or NOTCRAWLED
734 	    			if (crawlStatus.equals("CRAWLED")) {
                    crawlStatus = org.archive.crawler.framework.StatisticsTracking.SEED_DISPOSITION_SUCCESS;
736 	    			}
737 	    			else {
738 	    				crawlStatus = org.archive.crawler.framework.StatisticsTracking.SEED_DISPOSITION_FAILURE;
739 	    			}
740 	    			SeedRecord sr = new SeedRecord(seed, crawlStatus, 
741 	    					Integer.parseInt(statusCode), redirect);
742 	    			processedSeedsRecords.put(seed, sr);
743 	    		}
744 
745 	    		line = br.readLine();
746 	    	}
747     	} catch (IOException e) {
748     		logger.log(Level.SEVERE, "Reading " + f.getAbsolutePath(), e);   		
749     	} finally {
750     		if (br != null) {
751     			try {
752 					br.close();
753 				} catch (IOException e) {
754 					logger.log(Level.SEVERE,
755 						"Closing " + f.getAbsolutePath(), e);
756 				}
757     		}
758     	}
759     	return true;
760     }
761         
762     /***
763      * Return a copy of the hosts distribution in reverse-sorted
764      * (largest first) order.
765      *  
766      * @return SortedMap of hosts distribution
767      */
768     public SortedMap getReverseSortedHostsDistribution() {
769         return getReverseSortedCopy(hostsDistribution);  
770     }    
771     
772     /***
773      * @return True if we compiled stats, false if none to compile (e.g.
     * there are no report files on disk).
775      */
776     public boolean isStats() {
777     	return this.stats;
778     }
779 }