1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler.deciderules;
26
27 import org.archive.crawler.datamodel.CandidateURI;
28 import org.archive.crawler.settings.SimpleType;
29 import org.archive.crawler.settings.Type;
30
31
32
33 /***
34 * Rule REJECTs any CrawlURIs whose total number of hops (length of the
35 * hopsPath string, traversed links of any type) is over a threshold.
36 * Otherwise returns PASS.
37 *
38 * @author gojomo
39 */
40 public class TooManyHopsDecideRule extends PredicatedDecideRule {
41
42 private static final long serialVersionUID = -5429536193865916670L;
43
44 private static final String ATTR_MAX_HOPS = "max-hops";
45
46 /***
47 * Default access so available to test code.
48 */
49 static final Integer DEFAULT_MAX_HOPS = new Integer(20);
50
51 /***
52 * Usual constructor.
53 * @param name Name of this DecideRule.
54 */
55 public TooManyHopsDecideRule(String name) {
56 super(name);
57 setDescription("TooManyHopsDecideRule. REJECTs URIs discovered " +
58 "after too many hops (followed links of any type) from seed.");
59 addElementToDefinition(new SimpleType(ATTR_MAX_HOPS, "Max path" +
60 " depth for which this filter will match", DEFAULT_MAX_HOPS));
61
62 Type type = addElementToDefinition(new SimpleType(ATTR_DECISION,
63 "Decision to be applied", REJECT, ALLOWED_TYPES));
64 type.setTransient(true);
65 }
66
67 /***
68 * Evaluate whether given object is over the threshold number of
69 * hops.
70 *
71 * @param object
72 * @return true if the mx-hops is exceeded
73 */
74 protected boolean evaluate(Object object) {
75 try {
76 CandidateURI curi = (CandidateURI)object;
77 return curi.getPathFromSeed() != null &&
78 curi.getPathFromSeed().length() > getThresholdHops(object);
79 } catch (ClassCastException e) {
80
81 return false;
82 }
83 }
84
85 /***
86 * @param obj Conext object.
87 * @return hops cutoff threshold
88 */
89 private int getThresholdHops(Object obj) {
90 return ((Integer)getUncheckedAttribute(obj,ATTR_MAX_HOPS)).intValue();
91 }
92 }