1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler.deciderules;
26
27 import org.archive.crawler.datamodel.CandidateURI;
28 import org.archive.crawler.settings.SimpleType;
29 import org.archive.crawler.settings.Type;
30
31
32
33 /***
34 * Rule REJECTs any CrawlURIs whose total number of path-segments (as
35 * indicated by the count of '/' characters not including the first '//')
36 * is over a given threshold.
37 *
38 * @author gojomo
39 */
40 public class TooManyPathSegmentsDecideRule extends PredicatedDecideRule {
41
42 private static final long serialVersionUID = 147079100367815075L;
43
44 public static final String ATTR_MAX_PATH_DEPTH = "max-path-depth";
45
46 /***
47 * Default maximum value.
48 * Default access so available to unit test.
49 */
50 static final Integer DEFAULT_MAX_PATH_DEPTH = new Integer(20);
51
52 /***
53 * Usual constructor.
54 * @param name Name of this DecideRule.
55 */
56 public TooManyPathSegmentsDecideRule(String name) {
57 super(name);
58 setDescription("TooManyPathSegmentsDecideRule. REJECTs URIs with " +
59 "more total path-segments (as indicated by '/' characters) " +
60 "than the configured '" + ATTR_MAX_PATH_DEPTH + "'.");
61
62
63 Type type = addElementToDefinition(new SimpleType(ATTR_DECISION,
64 "Decision to be applied", REJECT, ALLOWED_TYPES));
65 type.setTransient(true);
66
67 addElementToDefinition(new SimpleType(ATTR_MAX_PATH_DEPTH, "Number of" +
68 " path segments beyond which this rule will reject URIs.",
69 DEFAULT_MAX_PATH_DEPTH));
70
71 }
72
73 /***
74 * Evaluate whether given object is over the threshold number of
75 * path-segments.
76 *
77 * @param object
78 * @return true if the path-segments is exceeded
79 */
80 protected boolean evaluate(Object object) {
81 boolean result = false;
82 CandidateURI curi = null;
83 try {
84 curi = (CandidateURI)object;
85 } catch (ClassCastException e) {
86
87 return result;
88 }
89 String uri = curi.toString();
90 int count = 0;
91 int threshold = getThresholdSegments(object);
92 for (int i = 0; i < uri.length(); i++) {
93 if (uri.charAt(i) == '/') {
94 count++;
95 }
96 if (count > threshold) {
97 result = true;
98 break;
99 }
100 }
101 return result;
102 }
103
104 /***
105 * @param obj
106 * @return path-segments cutoff threshold
107 */
108 private int getThresholdSegments(Object obj) {
109
110 return ((Integer) getUncheckedAttribute(obj, ATTR_MAX_PATH_DEPTH))
111 .intValue() + 2;
112 }
113 }