1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler.deciderules;
26
27 import org.archive.crawler.datamodel.CandidateURI;
28 import org.archive.crawler.extractor.Link;
29 import org.archive.crawler.settings.SimpleType;
30 import org.archive.crawler.settings.Type;
31
32
33
34 /***
35 * Rule ACCEPTs any CrawlURIs whose path-from-seed ('hopsPath' -- see
36 * {@link CandidateURI#getPathFromSeed()}) ends
37 * with at least one, but not more than, the given number of
38 * non-navlink ('L') hops.
39 *
40 * Otherwise, if the path-from-seed is empty or if a navlink ('L') occurs
41 * within max-trans-hops of the tail of the path-from-seed, this rule
42 * returns PASS.
43 *
44 * <p>Thus, it allows things like embedded resources (frames/images/media)
45 * and redirects to be transitively included ('transcluded') in a crawl,
46 * even if they otherwise would not, for some reasonable number of hops
47 * (1-4).
48 *
49 * @see <a href="http://www.google.com/search?q=define%3Atransclusion&sourceid=mozilla&start=0&start=0&ie=utf-8&oe=utf-8">Transclusion</a>
50 *
51 * @author gojomo
52 */
53 public class TransclusionDecideRule extends PredicatedDecideRule {
54
55 private static final long serialVersionUID = -3975688876990558918L;
56
57 private static final String ATTR_MAX_TRANS_HOPS = "max-trans-hops";
58
59 private static final String ATTR_MAX_SPECULATIVE_HOPS = "max-speculative-hops";
60
61 /***
62 * Default maximum transitive hops -- any type
63 * Default access so can be accessed by unit tests.
64 */
65 static final Integer DEFAULT_MAX_TRANS_HOPS = new Integer(3);
66
67 /***
68 * Default maximum speculative ('X') hops.
69 * Default access so can be accessed by unit tests.
70 */
71 static final Integer DEFAULT_MAX_SPECULATIVE_HOPS = new Integer(1);
72
73 /***
74 * Usual constructor.
75 * @param name Name of this DecideRule.
76 */
77 public TransclusionDecideRule(String name) {
78 super(name);
79 setDescription("TransclusionDecideRule. ACCEPTs URIs whose path " +
80 "from the seed ends with up to (but not more than) the " +
81 "configured '" + ATTR_MAX_TRANS_HOPS +
82 "' number of non-navlink ('L') hops.");
83
84 Type type = getElementFromDefinition(ATTR_DECISION);
85 type.setTransient(true);
86 addElementToDefinition(new SimpleType(ATTR_MAX_TRANS_HOPS,
87 "Maximum number of non-navlink (non-'L') hops to ACCEPT.",
88 DEFAULT_MAX_TRANS_HOPS));
89 addElementToDefinition(new SimpleType(ATTR_MAX_SPECULATIVE_HOPS,
90 "Maximum number of speculative ('X') hops to ACCEPT.",
91 DEFAULT_MAX_SPECULATIVE_HOPS));
92 }
93
94 /***
95 * Evaluate whether given object is within the threshold number of
96 * transitive hops.
97 *
98 * @param object Object to make decision on.
99 * @return true if the transitive hops >0 and <= max
100 */
101 protected boolean evaluate(Object object) {
102 CandidateURI curi = null;
103 try {
104 curi = (CandidateURI)object;
105 } catch (ClassCastException e) {
106
107 return false;
108 }
109 String hopsPath = curi.getPathFromSeed();
110 if (hopsPath == null || hopsPath.length() == 0) {
111 return false;
112 }
113 int allCount = 0;
114 int nonrefCount = 0;
115 int specCount = 0;
116 for (int i = hopsPath.length() - 1; i >= 0; i--) {
117 char c = hopsPath.charAt(i);
118 if (c == Link.NAVLINK_HOP) {
119
120 break;
121 }
122 allCount++;
123 if(c != Link.REFER_HOP) {
124 nonrefCount++;
125 }
126 if(c == Link.SPECULATIVE_HOP) {
127 specCount++;
128 }
129 }
130
131 if (allCount <= 0) {
132 return false;
133 }
134
135
136 if (specCount > getThresholdSpeculativeHops(object)) {
137 return false;
138 }
139
140
141 return nonrefCount <= getThresholdHops(object);
142 }
143
144 /***
145 * @param obj Context object.
146 * @return hops cutoff threshold
147 */
148 private int getThresholdHops(Object obj) {
149 return ((Integer)getUncheckedAttribute(obj,ATTR_MAX_TRANS_HOPS)).
150 intValue();
151 }
152
153 /***
154 * @param obj Context object.
155 * @return hops cutoff threshold
156 */
157 private int getThresholdSpeculativeHops(Object obj) {
158 return ((Integer)getUncheckedAttribute(obj,ATTR_MAX_SPECULATIVE_HOPS)).
159 intValue();
160 }
161 }