package org.archive.crawler.util;

import org.apache.commons.httpclient.HttpStatus;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.deciderules.recrawl.IdenticalDigestDecideRule;
import org.archive.util.Accumulator;
import org.archive.util.ArchiveUtils;
import org.archive.util.Histotable;

/**
 * Accumulates crawled-bytes totals for a crawl, bucketed by whether the
 * fetched content was novel, a duplicate by content digest, or a
 * not-modified (HTTP 304) response.
 */
public class CrawledBytesHistotable extends Histotable<String> 
implements Accumulator<CrawlURI>, CoreAttributeConstants {
    private static final long serialVersionUID = 7923431123239026213L;

    /** bucket keys used when tallying content bytes */
    public static final String NOTMODIFIED = "not-modified";
    public static final String DUPLICATE = "dup-by-hash";
    public static final String NOVEL = "novel";

    public CrawledBytesHistotable() {
        super();
        // prime the novel bucket so summary() always reports a novel figure
        tally(NOVEL, 0);
    }

    /**
     * Tally the URI's content bytes into the appropriate bucket:
     * not-modified (304) responses first, then digest-identical
     * duplicates, and everything else as novel content.
     */
    public void accumulate(CrawlURI curi) {
        if (curi.getFetchStatus() == HttpStatus.SC_NOT_MODIFIED) {
            tally(NOTMODIFIED, curi.getContentSize());
        } else if (IdenticalDigestDecideRule.hasIdenticalDigest(curi)) {
            tally(DUPLICATE, curi.getContentSize());
        } else {
            tally(NOVEL, curi.getContentSize());
        }
    }

    /**
     * @return human-readable byte totals for the crawl; the duplicate and
     * not-modified figures are included only when those buckets have been
     * tallied.
     */
    public String summary() {
        StringBuilder sb = new StringBuilder();
        sb.append(ArchiveUtils.formatBytesForDisplay(getTotal()));
        sb.append(" crawled (");
        sb.append(ArchiveUtils.formatBytesForDisplay(get(NOVEL)));
        sb.append(" novel");
        if (get(DUPLICATE) != null) {
            sb.append(", ");
            sb.append(ArchiveUtils.formatBytesForDisplay(get(DUPLICATE)));
            sb.append(" ");
            sb.append(DUPLICATE);
        }
        if (get(NOTMODIFIED) != null) {
            sb.append(", ");
            sb.append(ArchiveUtils.formatBytesForDisplay(get(NOTMODIFIED)));
            sb.append(" ");
            sb.append(NOTMODIFIED);
        }
        sb.append(")");
        return sb.toString();
    }
}
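
// ---------------------------------------------------------------------------
// Illustrative usage sketch (not part of the original class). It relies only
// on the accumulate() and summary() methods defined above; how the finished
// CrawlURIs are obtained depends on the surrounding Heritrix configuration,
// so the Iterable parameter here is a hypothetical stand-in for that source.
// ---------------------------------------------------------------------------
class CrawledBytesHistotableUsageSketch {
    /** Feed finished URIs into the histotable and return its report line. */
    String report(Iterable<CrawlURI> finishedUris) {
        CrawledBytesHistotable crawledBytes = new CrawledBytesHistotable();
        for (CrawlURI curi : finishedUris) {
            crawledBytes.accumulate(curi); // tallies content bytes into a bucket
        }
        return crawledBytes.summary(); // e.g. "... crawled (... novel, ...)"
    }
}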