package org.archive.crawler.deciderules;

import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;

import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;

/**
 * Rule REJECTs any URI which contains an excessive number of identical,
 * consecutive path-segments (e.g. http://example.com/a/a/a/boo.html == 3 '/a'
 * segments).
 *
 * @author gojomo
 */
public class PathologicalPathDecideRule extends MatchesRegExpDecideRule {

    private static final long serialVersionUID = -1803997581321178499L;

    private static final Logger logger =
        Logger.getLogger(PathologicalPathDecideRule.class.getName());

    public static final String ATTR_REPETITIONS = "max-repetitions";

    /**
     * Default maximum repetitions.
     * Package access so it is accessible from the unit test.
     */
    static final Integer DEFAULT_REPETITIONS = new Integer(2);

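    /** Regexp built from the configured repetitions value; constructed lazily and refreshed by kickUpdate(). */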
    protected String constructedRegexp;

    /** Constructs a new PathologicalPathDecideRule.
     *
     * @param name the name of the rule.
     */
    public PathologicalPathDecideRule(String name) {
        super(name);
        setDescription("PathologicalPathDecideRule. This rule" +
            " is used to avoid crawler traps by adding a constraint on" +
            " how many times a path-segment pattern in the URI may be" +
            " repeated. A URI will be REJECTed if the same path-segment" +
            " repeats more than '" + ATTR_REPETITIONS + "' times in a row.");

        Type type = addElementToDefinition(new SimpleType(ATTR_DECISION,
            "Decision to be applied", REJECT, ALLOWED_TYPES));
        type.setTransient(true);

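        // The inherited regexp setting is not meant to be set by the operator;
        // this rule derives its pattern from the repetitions setting instead.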
        type = getElementFromDefinition(ATTR_REGEXP);
        type.setTransient(true);

        type = addElementToDefinition(new SimpleType(ATTR_REPETITIONS,
            "Number of times a path-segment pattern is allowed to repeat. " +
            "This rule returns its decision (usually REJECT) if a " +
            "path-segment is repeated more than this number of times.",
            DEFAULT_REPETITIONS));
        type.setOverrideable(false);
    }

    /**
     * Get the regexp string to be matched against the URI, constructing it
     * from the current settings on first use.
     * @param o an object to extract a URI from (not used by this rule).
     * @return the regexp pattern.
     */
    protected String getRegexp(Object o) {
        if (constructedRegexp == null) {
            constructedRegexp = constructRegexp();
        }
        return constructedRegexp;
    }

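    /**
     * Build the regexp from the configured '<code>max-repetitions</code>'
     * value. With the default of 2, the resulting pattern matches any URI in
     * which the same path-segment occurs three or more times in a row, e.g.
     * http://example.com/a/a/a/boo.html.
     *
     * @return the regexp pattern, or null if the repetitions setting could
     * not be read.
     */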
    protected String constructRegexp() {
        int rep = 0;
        try {
            rep = ((Integer) getAttribute(null, ATTR_REPETITIONS)).intValue();
        } catch (AttributeNotFoundException e) {
            logger.severe(e.getMessage());
        }
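        // Capture one path-segment, then require at least 'rep' further
        // consecutive copies of that same segment via the \1 backreference.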
        return (rep == 0) ? null : ".*?/(.*?/)\\1{" + rep + ",}.*";
    }

    /**
     * Repetitions may have changed; refresh constructedRegexp.
     *
     * @see org.archive.crawler.deciderules.DecideRule#kickUpdate()
     */
    public void kickUpdate() {
        super.kickUpdate();
        constructedRegexp = constructRegexp();
    }
}