1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 package org.archive.crawler.scope;
25
26 import java.util.Iterator;
27
28 import org.archive.crawler.deciderules.DecidingScope;
29 import org.archive.crawler.filter.FilePatternFilter;
30 import org.archive.crawler.filter.TransclusionFilter;
31 import org.archive.crawler.framework.Filter;
32 import org.archive.net.UURI;
33
34 /***
35 * A core CrawlScope suitable for the most common
36 * crawl needs.
37 *
38 * Roughly, its logic is that a URI is included if:
39 *
40 * (( isSeed(uri) || focusFilter.accepts(uri) )
41 * || transitiveFilter.accepts(uri) )
42 * && ! excludeFilter.accepts(uri)
43 *
44 * The focusFilter may be specified by either:
45 * - adding a 'mode' attribute to the
46 * <code>scope</code> element. mode="broad" is equivalent
47 * to no focus; modes "path", "host", and "domain"
48 * imply a SeedExtensionFilter will be used, with
49 * the <code>scope</code> element providing its configuration
50 * - adding a <code>focus</code> subelement
51 * If unspecified, the focusFilter will default to
52 * an accepts-all filter.
53 *
54 * The transitiveFilter may be specified by supplying
55 * a <code>transitive</code> subelement. If unspecified, a
56 * TransclusionFilter will be used, with the <code>scope</code>
57 * element providing its configuration.
58 *
59 * The excludeFilter may be specified by supplying
60 * a <code>exclude</code> subelement. If unspecified, a
61 * accepts-none filter will be used -- meaning that
62 * no URIs will pass the filter and thus be excluded.
63 *
64 * @author gojomo
65 * @deprecated As of release 1.10.0. Replaced by {@link DecidingScope}.
66 */
67 public class HostScope extends SeedCachingScope {
68
69 private static final long serialVersionUID = -6257664892667267266L;
70
71 public static final String ATTR_TRANSITIVE_FILTER = "transitiveFilter";
72 public static final String ATTR_ADDITIONAL_FOCUS_FILTER =
73 "additionalScopeFocus";
74
75 Filter additionalFocusFilter;
76 Filter transitiveFilter;
77
78 public HostScope(String name) {
79 super(name);
80 setDescription(
81 "HostScope: A scope for host crawls *Deprecated* Use " +
82 "DecidingScope instead. Crawls made with this scope" +
83 " will be limited to the hosts its seeds. Thus if one of" +
84 " the seeds is 'archive.org' the subdomain" +
85 " 'crawler.archive.org' will not be crawled." +
86 " 'www.host' is considered to be the same as host.");
87 additionalFocusFilter = (Filter) addElementToDefinition(
88 new FilePatternFilter(ATTR_ADDITIONAL_FOCUS_FILTER));
89 this.transitiveFilter = (Filter) addElementToDefinition(
90 new TransclusionFilter(ATTR_TRANSITIVE_FILTER));
91 }
92
93 /***
94 * @param o
95 * @return True if transitive filter accepts passed object.
96 */
97 protected boolean transitiveAccepts(Object o) {
98 if (this.transitiveFilter == null) {
99 return true;
100 }
101 return this.transitiveFilter.accepts(o);
102 }
103
104 /***
105 * @param o
106 * @return True if focus filter accepts passed object.
107 */
108 protected boolean focusAccepts(Object o) {
109 UURI u = UURI.from(o);
110 if (u == null) {
111 return false;
112 }
113
114 Iterator iter = seedsIterator();
115 while(iter.hasNext()) {
116 if (isSameHost((UURI)iter.next(), u)) {
117 checkClose(iter);
118 return true;
119 }
120 }
121
122 checkClose(iter);
123 return false;
124 }
125
126
127
128 @Override
129 protected boolean additionalFocusAccepts(Object o) {
130 return additionalFocusFilter.accepts(o);
131 }
132
133 }