1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 package org.archive.crawler.scope;
25
26
27 /***
28 * A CrawlScope instance defines which URIs are "in"
29 * a particular crawl.
30 *
31 * It is essentially a Filter which determines, looking at
32 * the totality of information available about a
33 * CandidateURI/CrawlURI instamce, if that URI should be
34 * scheduled for crawling.
35 *
36 * <p>Dynamic information inherent in the discovery of the
37 * URI -- such as the path by which it was discovered --
38 * may be considered.
39 *
40 * <p>Dynamic information which requires the consultation
41 * of external and potentially volatile information --
42 * such as current robots.txt requests and the history
43 * of attempts to crawl the same URI -- should NOT be
44 * considered. Those potentially high-latency decisions
45 * should be made at another step. .
46 *
47 * @author gojomo
48 *
49 */
50 public class BroadScope extends ClassicScope {
51
52 private static final long serialVersionUID = -2354234238454865888L;
53
54 /***
55 * Constructor.
56 *
57 * @param name Name of this crawlscope.
58 */
59 public BroadScope(String name) {
60 super(name);
61 setDescription("BroadScope: A scope for broad crawls. Crawls made" +
62 " with this scope will not be limited to the hosts or domains of" +
63 " its seeds. NOTE: BroadScoped crawls will eventually run out of" +
64 " memory (See Release Notes).");
65 }
66
67 /***
68 * @param o the URI to check.
69 * @return True if transitive filter accepts passed object.
70 */
71 protected boolean transitiveAccepts(Object o) {
72 return true;
73 }
74
75 /*** Check if URI is accepted by the focus of this scope.
76 *
77 * This method should be overridden in subclasses.
78 *
79 * @param o the URI to check.
80 * @return True if focus filter accepts passed object.
81 */
82 protected boolean focusAccepts(Object o) {
83 return true;
84 }
85 }