1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 package org.archive.crawler.scope;
25
26 import java.util.Iterator;
27 import java.util.logging.Logger;
28
29 import org.apache.commons.httpclient.URIException;
30 import org.archive.crawler.deciderules.DecidingScope;
31 import org.archive.crawler.filter.FilePatternFilter;
32 import org.archive.crawler.filter.TransclusionFilter;
33 import org.archive.crawler.framework.Filter;
34 import org.archive.net.UURI;
35
36 /***
37 * A core CrawlScope suitable for the most common
38 * crawl needs.
39 *
40 * Roughly, its logic is that a URI is included if:
41 *
42 * (( isSeed(uri) || focusFilter.accepts(uri) )
43 * || transitiveFilter.accepts(uri) )
44 * && ! excludeFilter.accepts(uri)
45 *
46 * The focusFilter may be specified by either:
47 * - adding a 'mode' attribute to the
48 * <code>scope</code> element. mode="broad" is equivalent
49 * to no focus; modes "path", "host", and "domain"
50 * imply a SeedExtensionFilter will be used, with
51 * the <code>scope</code> element providing its configuration
52 * - adding a <code>focus</code> subelement
53 * If unspecified, the focusFilter will default to
54 * an accepts-all filter.
55 *
56 * The transitiveFilter may be specified by supplying
57 * a <code>transitive</code> subelement. If unspecified, a
58 * TransclusionFilter will be used, with the <code>scope</code>
59 * element providing its configuration.
60 *
61 * The excludeFilter may be specified by supplying
62 * an <code>exclude</code> subelement. If unspecified, an
63 * accepts-none filter will be used -- meaning that
64 * no URIs will pass the filter and thus be excluded.
65 *
66 * @author gojomo
67 * @deprecated As of release 1.10.0. Replaced by {@link DecidingScope}.
68 */
69 public class PathScope extends SeedCachingScope {
70
71 private static final long serialVersionUID = -2217024073240277527L;
72
73 private static Logger logger =
74 Logger.getLogger("org.archive.crawler.basic.PathScope");
75
76 public static final String ATTR_TRANSITIVE_FILTER = "transitiveFilter";
77 public static final String ATTR_ADDITIONAL_FOCUS_FILTER =
78 "additionalScopeFocus";
79
80 Filter additionalFocusFilter;
81 Filter transitiveFilter;
82
83 public PathScope(String name) {
84 super(name);
85 setDescription(
86 "PathScope: A scope for path crawls *Deprecated* Use " +
87 "DecidingScope instead. Crawls made with this scope" +
88 " will be limited to a specific portion of the hosts its seeds" +
89 " provide. More specifically the paths those seeds provide." +
90 " For example if one of the seeds is 'archive.org/example/'" +
91 " all URIs under the path 'examples' will be crawled (like" +
92 " 'archive.org/examples/hello.html') but not URIs in other" +
93 " paths or root (i.e. 'archive.org/index.html).");
94 this.additionalFocusFilter = (Filter) addElementToDefinition(
95 new FilePatternFilter(ATTR_ADDITIONAL_FOCUS_FILTER));
96 this.transitiveFilter = (Filter) addElementToDefinition(
97 new TransclusionFilter(ATTR_TRANSITIVE_FILTER));
98 }
99
100 /***
101 * @param o
102 * @return True if transitive filter accepts passed object.
103 */
104 protected boolean transitiveAccepts(Object o) {
105 if (this.transitiveFilter == null) {
106 return true;
107 }
108 return this.transitiveFilter.accepts(o);
109 }
110
111 /***
112 * @param o
113 * @return True if focus filter accepts passed object.
114 */
115 protected boolean focusAccepts(Object o) {
116 UURI u = UURI.from(o);
117 if (u == null) {
118 return false;
119 }
120
121 Iterator iter = seedsIterator();
122 while(iter.hasNext()) {
123 UURI s = (UURI) iter.next();
124 if (isSameHost(s, u)) {
125 try {
126
127
128
129 if (s.getPath() == null || u.getPath() == null) {
130 continue;
131 }
132 }
133 catch (URIException e) {
134 logger.severe("Failed get path on " + u + " or " + s +
135 ": " + e.getMessage());
136 }
137 try {
138 if (s.getPath().regionMatches(0, u.getPath(), 0,
139 s.getPath().lastIndexOf('/'))) {
140
141 checkClose(iter);
142 return true;
143 } else {
144
145 continue;
146 }
147 }
148 catch (URIException e) {
149 logger.severe("Failed get path on " + u + " or " + s +
150 ": " + e.getMessage());
151 }
152 }
153 }
154
155 checkClose(iter);
156 return false;
157 }
158
159
160 @Override
161 protected boolean additionalFocusAccepts(Object o) {
162 return this.additionalFocusFilter.accepts(o);
163 }
164
165 }