1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package org.archive.crawler.extractor;
23
24 import java.util.logging.Logger;
25
26 import org.archive.crawler.datamodel.CoreAttributeConstants;
27 import org.archive.crawler.datamodel.CrawlURI;
28
29
30 /***
31 * Pseudo-extractor that suppresses link-extraction of likely trap pages,
32 * by noticing when content's digest is identical to that of its 'via'.
33 *
34 * @author gojomo
35 *
36 */
37 public class TrapSuppressExtractor extends Extractor implements CoreAttributeConstants {
38 private static final long serialVersionUID = -1028783453022579530L;
39
40 private static final Logger LOGGER =
41 Logger.getLogger(TrapSuppressExtractor.class.getName());
42
43 /*** ALIst attribute key for carrying-forward content-digest from 'via'*/
44 public static String A_VIA_DIGEST = "via-digest";
45
46 protected long numberOfCURIsHandled = 0;
47 protected long numberOfCURIsSuppressed = 0;
48
49 /***
50 * Usual constructor.
51 * @param name
52 */
53 public TrapSuppressExtractor(String name) {
54 super(name, "TrapSuppressExtractor. Prevent extraction of likely " +
55 "trap content.");
56 }
57
58 @Override
59 protected void initialTasks() {
60 super.initialTasks();
61 }
62
63 protected void extract(CrawlURI curi){
64 numberOfCURIsHandled++;
65
66 String currentDigest = curi.getContentDigestSchemeString();
67 String viaDigest = null;
68 if(curi.containsKey(A_VIA_DIGEST)) {
69 viaDigest = curi.getString(A_VIA_DIGEST);
70 }
71
72 if(currentDigest!=null) {
73 if(currentDigest.equals(viaDigest)) {
74
75 curi.linkExtractorFinished();
76 curi.addAnnotation("trapSuppressExtractor");
77 numberOfCURIsSuppressed++;
78 }
79
80
81 curi.putString(A_VIA_DIGEST, currentDigest);
82 curi.makeHeritable(A_VIA_DIGEST);
83 }
84 }
85
86 /***
87 * Provide a human-readable textual summary of this Processor's state.
88 *
89 * @see org.archive.crawler.framework.Processor#report()
90 */
91 public String report() {
92 StringBuffer ret = new StringBuffer();
93 ret.append("Processor: org.archive.crawler.extractor.TrapSuppressExtractor\n");
94 ret.append(" Function: Suppress extraction on likely traps\n");
95 ret.append(" CrawlURIs handled: " + numberOfCURIsHandled + "\n");
96 ret.append(" CrawlURIs suppressed: " + numberOfCURIsSuppressed + "\n\n");
97
98 return ret.toString();
99 }
100 }