1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler.admin;
26
27 import java.io.File;
28 import java.io.FileReader;
29 import java.io.BufferedReader;
30 import java.io.IOException;
31 import java.util.Comparator;
32 import java.util.Hashtable;
33 import java.util.Iterator;
34 import java.util.Map;
35 import java.util.SortedMap;
36 import java.util.TreeMap;
37 import java.util.TreeSet;
38 import java.util.concurrent.atomic.AtomicLong;
39 import java.util.logging.Level;
40 import java.util.logging.Logger;
41
42
43
44 /***
45 * This class provides descriptive statistics of a finished crawl job by
46 * using the crawl report files generated by StatisticsTracker. Any formatting
47 * changes to the way StatisticsTracker writes to the summary crawl reports will
48 * require changes to this class.
49 * <p>
50 * The following statistics are accessible from this class:
51 * <ul>
52 * <li> Successfully downloaded documents per fetch status code
53 * <li> Successfully downloaded documents per document mime type
54 * <li> Amount of data per mime type
55 * <li> Successfully downloaded documents per host
56 * <li> Amount of data per host
57 * <li> Successfully downloaded documents per top-level domain name (TLD)
58 * <li> Disposition of all seeds
59 * <li> Successfully downloaded documents per host per source
60 * </ul>
61 *
62 * <p>TODO: Make it so summarizing is not done all in RAM so we avoid
63 * OOME.
64 *
65 * @author Frank McCown
66 *
67 * @see org.archive.crawler.admin.StatisticsTracker
68 */
69 public class StatisticsSummary {
70 /***
71 * Messages from the StatisticsSummary.
72 */
73 private final static Logger logger =
74 Logger.getLogger(StatisticsSummary.class.getName());
75
76 private boolean stats = true;
77
78 /*** Crawl job whose summary we want to view */
79 private CrawlJob cjob;
80
81 protected long totalDnsStatusCodeDocuments = 0;
82 protected long totalStatusCodeDocuments = 0;
83 protected long totalFileTypeDocuments = 0;
84 protected long totalMimeTypeDocuments = 0;
85 protected long totalDnsMimeTypeDocuments = 0;
86 protected long totalDnsHostDocuments = 0;
87 protected long totalHostDocuments = 0;
88 protected long totalMimeSize = 0;
89 protected long totalDnsMimeSize = 0;
90 protected long totalHostSize = 0;
91 protected long totalDnsHostSize = 0;
92 protected long totalTldDocuments = 0;
93 protected long totalTldSize = 0;
94 protected long totalHosts = 0;
95
96 protected String durationTime;
97 protected String processedDocsPerSec;
98 protected String bandwidthKbytesPerSec;
99 protected String totalDataWritten;
100
101 /*** Keep track of the file types we see (mime type -> count) */
102 protected Hashtable<String,AtomicLong> mimeTypeDistribution = new Hashtable<String,AtomicLong>();
103 protected Hashtable<String,AtomicLong> mimeTypeBytes = new Hashtable<String,AtomicLong>();
104 protected Hashtable<String,AtomicLong> mimeTypeDnsDistribution = new Hashtable<String,AtomicLong>();
105 protected Hashtable<String,AtomicLong> mimeTypeDnsBytes = new Hashtable<String,AtomicLong>();
106
107 /*** Keep track of status codes */
108 protected Hashtable<String,AtomicLong> statusCodeDistribution = new Hashtable<String,AtomicLong>();
109 protected Hashtable<String,AtomicLong> dnsStatusCodeDistribution
110 = new Hashtable<String,AtomicLong>();
111
112 /*** Keep track of hosts */
113 protected Hashtable<String,AtomicLong> hostsDistribution = new Hashtable<String,AtomicLong>();
114 protected Hashtable<String,AtomicLong> hostsBytes = new Hashtable<String,AtomicLong>();
115 protected Hashtable<String,AtomicLong> hostsDnsDistribution = new Hashtable<String,AtomicLong>();
116 protected Hashtable<String,AtomicLong> hostsDnsBytes = new Hashtable<String,AtomicLong>();
117
118 /*** Keep track of TLDs */
119 protected Hashtable<String,AtomicLong> tldDistribution = new Hashtable<String,AtomicLong>();
120 protected Hashtable<String,AtomicLong> tldBytes = new Hashtable<String,AtomicLong>();
121 protected Hashtable<String,AtomicLong> tldHostDistribution = new Hashtable<String,AtomicLong>();
122
123 /*** Keep track of processed seeds */
124 protected transient Map<String,SeedRecord> processedSeedsRecords
125 = new Hashtable<String,SeedRecord>();
126
127 /***
128 * Constructor
129 *
130 * @param cjob
131 * Completed crawl job
132 */
133 public StatisticsSummary(CrawlJob cjob) {
134 this.cjob = cjob;
135
136
137 this.stats = calculateStatusCodeDistribution();
138 if (calculateMimeTypeDistribution()) {
139 this.stats = true;
140 }
141 if (calculateHostsDistribution()) {
142 this.stats = true;
143 }
144 if (readCrawlReport()) {
145 this.stats = true;
146 }
147 if (readSeedReport()) {
148 this.stats = true;
149 }
150 }
151
152
153 /***
154 * Increment a counter for a key in a given HashMap. Used for various
155 * aggregate data.
156 *
157 * @param map The HashMap
158 * @param key The key for the counter to be incremented, if it does not
159 * exist it will be added (set to 1). If null it will
160 * increment the counter "unknown".
161 */
162 protected static void incrementMapCount(Map<String,AtomicLong> map,
163 String key) {
164 incrementMapCount(map,key,1);
165 }
166
167 /***
168 * Increment a counter for a key in a given HashMap by an arbitrary amount.
169 * Used for various aggregate data. The increment amount can be negative.
170 *
171 * @param map
172 * The HashMap
173 * @param key
174 * The key for the counter to be incremented, if it does not
175 * exist it will be added (set to equal to
176 * <code>increment</code>).
177 * If null it will increment the counter "unknown".
178 * @param increment
179 * The amount to increment counter related to the
180 * <code>key</code>.
181 */
182 protected static void incrementMapCount(Map<String,AtomicLong> map,
183 String key, long increment) {
184 if (key == null) {
185 key = "unknown";
186 }
187
188 AtomicLong lw = map.get(key);
189 if(lw == null) {
190 map.put(key, new AtomicLong(increment));
191 } else {
192 lw.addAndGet(increment);
193 }
194 }
195
196 /*** Returns a HashMap that contains information about distributions of
197 * encountered mime types. Key/value pairs represent
198 * mime type -> count.
199 * <p>
200 * <b>Note:</b> All the values are wrapped with a
201 * {@link AtomicLong AtomicLong}
202 * @return mimeTypeDistribution
203 */
204 public Hashtable getMimeDistribution() {
205 return mimeTypeDistribution;
206 }
207
208 public long getTotalMimeTypeDocuments() {
209 return totalMimeTypeDocuments;
210 }
211
212 public long getTotalDnsMimeTypeDocuments() {
213 return totalDnsMimeTypeDocuments;
214 }
215
216 public long getTotalMimeSize() {
217 return totalMimeSize;
218 }
219
220 public long getTotalDnsMimeSize() {
221 return totalDnsMimeSize;
222 }
223
224 /***
225 * Return a HashMap representing the distribution of HTTP status codes for
226 * successfully fetched curis, as represented by a hashmap where key ->
227 * val represents (string)code -> (integer)count.
228 *
229 * <b>Note: </b> All the values are wrapped with a
230 * {@link AtomicLong AtomicLong}
231 *
232 * @return statusCodeDistribution
233 */
234 public Hashtable getStatusCodeDistribution() {
235 return statusCodeDistribution;
236 }
237
238 /***
239 * Return a HashMap representing the distribution of DNS status codes for
240 * successfully fetched curis, as represented by a hashmap where key ->
241 * val represents (string)code -> (integer)count.
242 *
243 * <b>Note: </b> All the values are wrapped with a
244 * {@link AtomicLong AtomicLong}
245 *
246 * @return dnsStatusCodeDistribution
247 */
248 public Hashtable getDnsStatusCodeDistribution() {
249 return dnsStatusCodeDistribution;
250 }
251
252 public Hashtable getDnsMimeDistribution() {
253 return mimeTypeDnsDistribution;
254 }
255
256 public long getTotalDnsStatusCodeDocuments() {
257 return totalDnsStatusCodeDocuments;
258 }
259
260 public long getTotalStatusCodeDocuments() {
261 return totalStatusCodeDocuments;
262 }
263
264 public long getTotalHostDocuments() {
265 return totalHostDocuments;
266 }
267
268 public long getTotalDnsHostDocuments() {
269 return totalDnsHostDocuments;
270 }
271
272 public Hashtable getHostsDnsDistribution() {
273 return hostsDnsDistribution;
274 }
275
276 public long getTotalHostDnsDocuments() {
277 return totalDnsHostDocuments;
278 }
279
280 public long getTotalHostSize() {
281 return totalHostSize;
282 }
283
284 public long getTotalDnsHostSize() {
285 return totalDnsHostSize;
286 }
287
288 public Hashtable getTldDistribution() {
289 return tldDistribution;
290 }
291
292 public Hashtable getTldBytes() {
293 return tldBytes;
294 }
295
296 public long getTotalTldDocuments() {
297 return totalTldDocuments;
298 }
299
300 public long getTotalTldSize() {
301 return totalTldSize;
302 }
303
304 public Hashtable getTldHostDistribution() {
305 return tldHostDistribution;
306 }
307
308 public long getTotalHosts() {
309 return totalHosts;
310 }
311
312 public String getDurationTime() {
313 return durationTime;
314 }
315
316 public String getProcessedDocsPerSec() {
317 return processedDocsPerSec;
318 }
319
320 public String getBandwidthKbytesPerSec() {
321 return bandwidthKbytesPerSec;
322 }
323
324 public String getTotalDataWritten() {
325 return totalDataWritten;
326 }
327
328 /***
329 * Sort the entries of the given HashMap in descending order by their
330 * values, which must be <code>AtomicLong</code>s.
331 * <p>
332 * Elements are sorted by value from largest to smallest. Equal values are
333 * sorted in an arbitrary, but consistent manner by their keys. Only items
334 * with identical value and key are considered equal.
335 *
336 * If the passed-in map requires access to be synchronized, the caller
337 * should ensure this synchronization.
338 *
339 * @param mapOfAtomicLongValues
340 * Assumes values are AtomicLongs.
341 * @return a sorted set containing the same elements as the map.
342 */
343 public TreeMap<String,AtomicLong> getReverseSortedCopy(
344 final Map<String,AtomicLong> mapOfAtomicLongValues) {
345 TreeMap<String,AtomicLong> sortedMap = new TreeMap<String,AtomicLong>(
346 new Comparator<String>() {
347 public int compare(String e1, String e2) {
348 long firstVal = mapOfAtomicLongValues.get(e1).get();
349 long secondVal = mapOfAtomicLongValues.get(e2).get();
350 if (firstVal < secondVal) {
351 return 1;
352 }
353 if (secondVal < firstVal) {
354 return -1;
355 }
356
357 return e1.compareTo(e2);
358 }
359 });
360 try {
361 sortedMap.putAll(mapOfAtomicLongValues);
362 } catch (UnsupportedOperationException e) {
363 for (String key: mapOfAtomicLongValues.keySet()) {
364 sortedMap.put(key, mapOfAtomicLongValues.get(key));
365 }
366 }
367 return sortedMap;
368 }
369
370 /***
371 * Get the number of hosts with a particular TLD.
372 * @param tld
373 * top-level domain name
374 * @return Total crawled hosts
375 */
376 public long getHostsPerTld(String tld) {
377 AtomicLong lw = (AtomicLong)tldHostDistribution.get(tld);
378 return (lw == null ? 0 : lw.get());
379 }
380
381 /***
382 * Read status code distribution from responsecode-report.txt.
383 * DNS and HTTP status codes are separated when read.
384 * @return True if we found some stats.
385 */
386 private boolean calculateStatusCodeDistribution() {
387
388 File f = new File(cjob.getDirectory(), "responsecode-report.txt");
389 if (!f.exists()) {
390 return false;
391 }
392 BufferedReader br = null;
393 try {
394 FileReader reader = new FileReader(f);
395 br = new BufferedReader(reader);
396 String line = br.readLine();
397 line = br.readLine();
398 while (line != null) {
399
400
401 String[] items = line.split(" ");
402 if (items.length < 2) {
403 logger.log(Level.WARNING,
404 "Unexpected formatting on line [" + line + "]");
405 }
406 else {
407
408 if (items[0].length() < 3) {
409
410 long total = Long.parseLong(items[1]);
411 dnsStatusCodeDistribution.put(items[0],
412 new AtomicLong(total));
413 totalDnsStatusCodeDocuments += total;
414 }
415 else {
416
417 long total = Long.parseLong(items[1]);
418 statusCodeDistribution.put(items[0],
419 new AtomicLong(total));
420 totalStatusCodeDocuments += total;
421 }
422 }
423 line = br.readLine();
424 }
425 } catch (IOException e) {
426 logger.log(Level.SEVERE, "Unable to read " + f.getAbsolutePath(),
427 e);
428 } finally {
429 if (br != null) {
430 try {
431 br.close();
432 } catch (IOException e) {
433 logger.log(Level.SEVERE,
434 "Closing " + f.getAbsolutePath(), e);
435 }
436 }
437 }
438 return true;
439 }
440
441 /***
442 * Read MIME type data from mimetype-report.txt.
443 * MIME type of text/dns is separated from other MIME types.
444 * @return True if we found some stats.
445 */
446 private boolean calculateMimeTypeDistribution() {
447 File f = new File(cjob.getDirectory(), "mimetype-report.txt");
448 if (!f.exists()) {
449 return false;
450 }
451 BufferedReader br = null;
452 try {
453 FileReader reader = new FileReader(f);
454 br = new BufferedReader(reader);
455 String line = br.readLine();
456 line = br.readLine();
457 while (line != null) {
458
459
460
461 String[] items = line.split(" ");
462 if (items.length < 3) {
463 logger.log(Level.WARNING,
464 "Unexpected formatting on line [" + line + "]");
465 }
466 else {
467 long total = Long.parseLong(items[0]);
468 long bytes = Long.parseLong(items[1]);
469 String mime = items[2];
470
471
472 if (mime.equalsIgnoreCase("text/dns")) {
473 mimeTypeDnsDistribution.put(mime,
474 new AtomicLong(total));
475 mimeTypeDnsBytes.put(mime, new AtomicLong(bytes));
476 totalDnsMimeTypeDocuments += total;
477 totalDnsMimeSize += bytes;
478 }
479 else {
480 mimeTypeDistribution.put(mime, new AtomicLong(total));
481 mimeTypeBytes.put(mime, new AtomicLong(bytes));
482 totalMimeTypeDocuments += total;
483 totalMimeSize += bytes;
484 }
485 }
486 line = br.readLine();
487 }
488 } catch (IOException e) {
489 logger.log(Level.SEVERE, "Reading " + f.getAbsolutePath(), e);
490 } finally {
491 if (br != null) {
492 try {
493 br.close();
494 } catch (IOException e) {
495 logger.log(Level.SEVERE,
496 "Closing " + f.getAbsolutePath(), e);
497 }
498 }
499 }
500 return true;
501 }
502
503 /***
504 * Read number of URLs and total bytes for each host name from
505 * hosts-report.txt.
506 * Host name of "dns:" is separated from others.
507 * @return true if stats found.
508 */
509 private boolean calculateHostsDistribution() {
510 File f = new File(cjob.getDirectory(), "hosts-report.txt");
511 if (!f.exists()) {
512 return false;
513 }
514 BufferedReader br = null;
515 try {
516 FileReader reader = new FileReader(f);
517 br = new BufferedReader(reader);
518 String line = br.readLine();
519 line = br.readLine();
520 while (line != null) {
521
522
523
524 String[] items = line.split(" ");
525 if (items.length < 3) {
526 logger.log(Level.WARNING,
527 "Unexpected formatting on line [" + line + "]");
528 }
529 else {
530 long total = Long.parseLong(items[0]);
531 long bytes = Long.parseLong(items[1]);
532 String host = items[2];
533
534
535 if (host.startsWith("dns:", 0)) {
536 hostsDnsDistribution.put(host, new AtomicLong(total));
537 hostsDnsBytes.put(host, new AtomicLong(bytes));
538 totalDnsHostDocuments += total;
539 totalDnsHostSize += bytes;
540 }
541 else {
542 hostsDistribution.put(host, new AtomicLong(total));
543 hostsBytes.put(host, new AtomicLong(bytes));
544 totalHostDocuments += total;
545 totalHostSize += bytes;
546
547
548 String tld = host.substring(host.lastIndexOf('.')+1);
549 incrementMapCount(tldDistribution, tld, total);
550 incrementMapCount(tldBytes, tld, bytes);
551 incrementMapCount(tldHostDistribution, tld);
552 totalTldDocuments += total;
553 totalTldSize += bytes;
554
555 totalHosts++;
556 }
557 }
558 line = br.readLine();
559 }
560 } catch (IOException e) {
561 logger.log(Level.SEVERE, "Reading " + f.getAbsolutePath(), e);
562 } finally {
563 if (br != null) {
564 try {
565 br.close();
566 } catch (IOException e) {
567 logger.log(Level.SEVERE,
568 "Closing " + f.getAbsolutePath(), e);
569 }
570 }
571 }
572 return true;
573 }
574
575 /***
576 * Returns the accumulated number of bytes downloaded from a given host.
577 * @param host name of the host
578 * @return the accumulated number of bytes downloaded from a given host
579 */
580 public long getBytesPerHost(String host) {
581 long bytes = -1;
582
583 bytes = host != null && host.startsWith("dns:", 0) ?
584 ((AtomicLong)hostsDnsBytes.get(host)).get() :
585 ((AtomicLong)hostsBytes.get(host)).get();
586
587 return bytes;
588 }
589
590 /***
591 * Returns the total number of bytes downloaded for a given TLD.
592 * @param tld TLD
593 * @return the total number of bytes downloaded for a given TLD
594 */
595 public long getBytesPerTld(String tld) {
596 AtomicLong lw = (AtomicLong)tldBytes.get(tld);
597 return (lw == null ? 0 : lw.get());
598 }
599
600 /***
601 * Returns the accumulated number of bytes from files of a given file type.
602 * @param filetype Filetype to check.
603 * @return the accumulated number of bytes from files of a given mime type
604 */
605 public long getBytesPerMimeType(String filetype) {
606 long bytes = -1;
607
608 if (filetype != null) {
609 if (filetype.equals("text/dns")) {
610 bytes = mimeTypeDnsBytes.get(filetype) == null ? 0 :
611 ((AtomicLong)mimeTypeDnsBytes.get(filetype)).get();
612 }
613 else {
614 bytes = mimeTypeBytes.get(filetype) == null ? 0 :
615 ((AtomicLong)mimeTypeBytes.get(filetype)).get();
616 }
617 }
618 return bytes;
619 }
620
621 /***
622 * Reads duration time, processed docs/sec, bandwidth, and total size
623 * of crawl from crawl-report.txt.
624 * @return true if stats found.
625 */
626 public boolean readCrawlReport() {
627 File f = new File(cjob.getDirectory(), "crawl-report.txt");
628 if (!f.exists()) {
629 return false;
630 }
631 BufferedReader br = null;
632 try {
633 FileReader reader = new FileReader(f);
634 br = new BufferedReader(reader);
635 String line = br.readLine();
636 while (line != null) {
637 if (line.startsWith("Duration Time")) {
638 durationTime = line.substring(line.indexOf(':')+1);
639 }
640 else if (line.startsWith("Processed docs/sec")) {
641 processedDocsPerSec = line.substring(line.indexOf(':')+1);
642 }
643 else if (line.startsWith("Bandwidth in Kbytes/sec")) {
644 bandwidthKbytesPerSec = line.substring(line.indexOf(':')+1);
645 }
646 else if (line.startsWith("Total Raw Data Size in Bytes")) {
647 totalDataWritten = line.substring(line.indexOf(':')+1);
648 }
649
650 line = br.readLine();
651 }
652 }
653 catch (IOException e) {
654 logger.log(Level.SEVERE, "Reading " + f.getAbsolutePath(), e);
655 } finally {
656 if (br != null) {
657 try {
658 br.close();
659 } catch (IOException e) {
660 logger.log(Level.SEVERE,
661 "Failed close of " + f.getAbsolutePath(), e);
662 }
663 }
664 }
665 return true;
666 }
667
668 /***
669 * Returns sorted Iterator of seeds records based on status code.
670 * @return sorted Iterator of seeds records
671 */
672 public Iterator<SeedRecord> getSeedRecordsSortedByStatusCode() {
673 TreeSet<SeedRecord> sortedSet = new TreeSet<SeedRecord>(
674 new Comparator<SeedRecord>() {
675 public int compare(SeedRecord sr1, SeedRecord sr2) {
676 int code1 = sr1.getStatusCode();
677 int code2 = sr2.getStatusCode();
678 if (code1 == code2) {
679
680 return sr1.getUri().compareTo(sr2.getUri());
681 }
682
683
684
685
686 code1 = -code1 - Integer.MAX_VALUE;
687 code2 = -code2 - Integer.MAX_VALUE;
688
689 return new Integer(code1).compareTo(new Integer(code2));
690 }
691 });
692 for (SeedRecord sr: processedSeedsRecords.values()) {
693 sortedSet.add(sr);
694 }
695
696 return sortedSet.iterator();
697 }
698
699 /***
700 * Reads seed data from seeds-report.txt.
701 * @return True if stats found.
702 */
703 private boolean readSeedReport() {
704 File f = new File(cjob.getDirectory(), "seeds-report.txt");
705 if (!f.exists()) {
706 return false;
707 }
708 BufferedReader br = null;
709 try {
710 FileReader reader = new FileReader(f);
711 br = new BufferedReader(reader);
712
713
714 String line = br.readLine();
715 line = br.readLine();
716 while (line != null) {
717
718
719
720
721 String[] items = line.split(" ");
722
723 if (items.length < 3) {
724 logger.log(Level.WARNING,
725 "Unexpected formatting on line [" + line + "]");
726 }
727 else {
728 String statusCode = items[0];
729 String crawlStatus = items[1];
730 String seed = items[2];
731 String redirect = items.length > 3 ? items[3] : null;
732
733
734 if (crawlStatus.equals("CRAWLED")) {
735 crawlStatus =org.archive.crawler.framework.StatisticsTracking.SEED_DISPOSITION_SUCCESS;
736 }
737 else {
738 crawlStatus = org.archive.crawler.framework.StatisticsTracking.SEED_DISPOSITION_FAILURE;
739 }
740 SeedRecord sr = new SeedRecord(seed, crawlStatus,
741 Integer.parseInt(statusCode), redirect);
742 processedSeedsRecords.put(seed, sr);
743 }
744
745 line = br.readLine();
746 }
747 } catch (IOException e) {
748 logger.log(Level.SEVERE, "Reading " + f.getAbsolutePath(), e);
749 } finally {
750 if (br != null) {
751 try {
752 br.close();
753 } catch (IOException e) {
754 logger.log(Level.SEVERE,
755 "Closing " + f.getAbsolutePath(), e);
756 }
757 }
758 }
759 return true;
760 }
761
762 /***
763 * Return a copy of the hosts distribution in reverse-sorted
764 * (largest first) order.
765 *
766 * @return SortedMap of hosts distribution
767 */
768 public SortedMap getReverseSortedHostsDistribution() {
769 return getReverseSortedCopy(hostsDistribution);
770 }
771
772 /***
773 * @return True if we compiled stats, false if none to compile (e.g.
774 * there are no reports files on disk).
775 */
776 public boolean isStats() {
777 return this.stats;
778 }
779 }