1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 package org.archive.crawler.scope;
25
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;

import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.extractor.Link;
import org.archive.crawler.framework.CrawlScope;
import org.archive.crawler.settings.SimpleType;
33
34 /***
35 * ClassicScope: superclass with shared Scope behavior for
36 * most common scopes.
37 *
38 * Roughly, its logic is captured in innerAccept(). A URI is
39 * included if:
40 * <pre>
41 * forceAccepts(uri)
42 * || (((isSeed(uri)
43 * || focusAccepts(uri))
44 * || additionalFocusAccepts(uri)
45 * || transitiveAccepts(uri))
46 * && !excludeAccepts(uri));</pre>
47 *
48 * Subclasses should override focusAccepts, additionalFocusAccepts,
49 * and transitiveAccepts.
50 *
51 * The excludeFilter may be specified by supplying
52 * a <code>exclude</code> subelement. If unspecified, a
53 * accepts-none filter will be used -- meaning that
54 * no URIs will pass the filter and thus be excluded.
55 *
56 * @author gojomo
57 */
58 public class ClassicScope extends CrawlScope {
59
60 private static final long serialVersionUID = 4494905304855590002L;
61
62
63
64
65 public static final String ATTR_EXCLUDE_FILTER = "exclude-filter";
66
67 public static final String ATTR_MAX_LINK_HOPS = "max-link-hops";
68
69 public static final String ATTR_MAX_TRANS_HOPS = "max-trans-hops";
70
71
72
73 @SuppressWarnings("deprecation")
74 private org.archive.crawler.filter.OrFilter excludeFilter;
75
76 /***
77 * @param name
78 * ignored by superclass
79 */
80 @SuppressWarnings("deprecation")
81 public ClassicScope(String name) {
82 super(name);
83 addElementToDefinition(new SimpleType(ATTR_MAX_LINK_HOPS,
84 "Max link hops to include. URIs more than this number "
85 + "of links from a seed will not be ruled in-scope. (Such "
86 + "determination does not preclude later inclusion if a "
87 + "shorter path is later discovered.)", new Integer(25)));
88 addElementToDefinition(new SimpleType(ATTR_MAX_TRANS_HOPS,
89 "Max transitive hops (embeds, referrals, preconditions) to " +
90 "include. URIs reached by more than this number of transitive " +
91 "hops will not be ruled in-scope, even if otherwise on an " +
92 "in-focus site. (Such determination does not preclude later " +
93 " inclusion if a shorter path is later discovered.)",
94 new Integer(5)));
95 this.excludeFilter = (org.archive.crawler.filter.OrFilter)
96 addElementToDefinition(new org.archive.crawler.filter.OrFilter(
97 ATTR_EXCLUDE_FILTER));
98
99
100
101 setPreservedFields(new String[] { ATTR_SEEDS, ATTR_MAX_LINK_HOPS,
102 ATTR_MAX_TRANS_HOPS, ATTR_EXCLUDE_FILTER });
103 }
104
105 /***
106 * Default constructor.
107 */
108 public ClassicScope() {
109 this(CrawlScope.ATTR_NAME);
110 }
111
112 /***
113 * Returns whether the given object (typically a CandidateURI) falls within
114 * this scope.
115 *
116 * @param o
117 * Object to test.
118 * @return Whether the given object (typically a CandidateURI) falls within
119 * this scope.
120 */
121 protected final boolean innerAccepts(Object o) {
122 return (((isSeed(o) || focusAccepts(o)) ||
123 additionalFocusAccepts(o) || transitiveAccepts(o)) &&
124 !excludeAccepts(o));
125 }
126
127 /***
128 * Check if URI is accepted by the additional focus of this scope.
129 *
130 * This method should be overridden in subclasses.
131 *
132 * @param o
133 * the URI to check.
134 * @return True if additional focus filter accepts passed object.
135 */
136 protected boolean additionalFocusAccepts(Object o) {
137 return false;
138 }
139
140 /***
141 * @param o
142 * the URI to check.
143 * @return True if transitive filter accepts passed object.
144 */
145 protected boolean transitiveAccepts(Object o) {
146 return false;
147 }
148
149 /***
150 * @param o the URI to check.
151 * @return True if force-accepts filter accepts passed object.
152 */
153 protected boolean xforceAccepts(Object o) {
154 return false;
155 }
156
157 /***
158 * Check if URI is accepted by the focus of this scope.
159 *
160 * This method should be overridden in subclasses.
161 *
162 * @param o
163 * the URI to check.
164 * @return True if focus filter accepts passed object.
165 */
166 protected boolean focusAccepts(Object o) {
167
168 return false;
169 }
170
171 /***
172 * Check if URI is excluded by any filters.
173 *
174 * @param o
175 * the URI to check.
176 * @return True if exclude filter accepts passed object.
177 */
178 @SuppressWarnings("deprecation")
179 protected boolean excludeAccepts(Object o) {
180 return (this.excludeFilter.isEmpty(o)) ? exceedsMaxHops(o)
181 : this.excludeFilter.accepts(o) || exceedsMaxHops(o);
182 }
183
184 /***
185 * Check if there are too many hops
186 *
187 * @param o
188 * URI to check.
189 * @return true if too many hops.
190 */
191 protected boolean exceedsMaxHops(Object o) {
192 if (!(o instanceof CandidateURI)) {
193 return false;
194 }
195
196 int maxLinkHops = 0;
197
198
199 try {
200 maxLinkHops = ((Integer) getAttribute(o, ATTR_MAX_LINK_HOPS))
201 .intValue();
202
203
204 } catch (AttributeNotFoundException e) {
205
206 e.printStackTrace();
207 }
208
209 CandidateURI cand = (CandidateURI) o;
210
211 String path = cand.getPathFromSeed();
212 int linkCount = 0;
213 int transCount = 0;
214 for (int i = path.length() - 1; i >= 0; i--) {
215 if (path.charAt(i) == Link.NAVLINK_HOP) {
216 linkCount++;
217 } else if (linkCount == 0) {
218 transCount++;
219 }
220 }
221
222
223 return (linkCount > maxLinkHops);
224 }
225
226 /***
227 * Take note of a situation (such as settings edit) where involved
228 * reconfiguration (such as reading from external files) may be necessary.
229 */
230 @SuppressWarnings("deprecation")
231 public void kickUpdate() {
232 super.kickUpdate();
233 excludeFilter.kickUpdate();
234 }
235 }