1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.crawler.deciderules;
24
25 import java.util.logging.Level;
26 import java.util.logging.Logger;
27
28 import javax.management.AttributeNotFoundException;
29
30 import org.archive.crawler.datamodel.CandidateURI;
31 import org.archive.crawler.settings.SimpleType;
32 import org.archive.net.UURI;
33 import org.archive.util.SurtPrefixSet;
34
35 /***
36 * Rule allows one level of discovery beyond configured scope
37 * (e.g. Domain, plus the first otherwise out-of-scope link from an
38 * in-scope page, but not further hops from that first page)
39 *
40 * @author Shifra Raffel
41 * @version $Date: 2006-09-25 17:16:55 +0000 (Mon, 25 Sep 2006) $ $Revision: 4649 $
42 */
43 public class ScopePlusOneDecideRule extends SurtPrefixedDecideRule {
44
45 private static final long serialVersionUID = -6344162369024146340L;
46
47 public static final String ATTR_SCOPE = "host-or-domain-scope";
48 public static final String HOST = "Host";
49 public static final String DOMAIN = "Domain";
50
51 private static final Logger logger =
52 Logger.getLogger(ScopePlusOneDecideRule.class.getName());
53
54 /***
55 * Constructor.
56 * @param name
57 */
58 public ScopePlusOneDecideRule(String name) {
59 super(name);
60 setDescription(
61 "ScopePlusOneDecideRule. Rule allows one level of discovery " +
62 "beyond configured scope (e.g. Domain, plus the first " +
63 "otherwise out-of-scope link from an in-scope page, but " +
64 "no further hops from that first otherwise-out-of-scope page). " +
65 "surts-source-file is optional. Use surts-dump-file option " +
66 "when testing.");
67 addElementToDefinition(new SimpleType(ATTR_SCOPE,
68 "Restrict to host, e.g. archive.org excludes audio.archive.org, " +
69 "or expand to domain as well, e.g. archive.org includes all " +
70 "*.archive.org", DOMAIN, new String[] {HOST, DOMAIN}));
71 }
72
73 /***
74 * Evaluate whether given object comes from a URI which is in scope
75 *
76 * @param object to evaluate
77 * @return true if URI is either in scope or its via is
78 */
79 protected boolean evaluate(Object object) {
80 boolean result = false;
81 if (!(object instanceof CandidateURI)) {
82
83 return false;
84 }
85 SurtPrefixSet set = getPrefixes(object);
86 UURI u = UURI.from(object);
87
88 boolean firstResult = isInScope(u, set);
89 if (logger.isLoggable(Level.FINE)) {
90 logger.fine("Tested scope of UURI itself '" + u +
91 " and result was " + firstResult);
92 }
93 if (firstResult == true) {
94 result = true;
95 } else {
96
97
98 UURI via = getVia(object);
99 if (via == null) {
100
101 return false;
102 }
103
104 result = isInScope (via, set);
105 if (logger.isLoggable(Level.FINE)) {
106 logger.fine("Tested via UURI '" + via +
107 " and result was " + result);
108 }
109 }
110 return result;
111 }
112
113 /***
114 * Synchronized get of prefix set to use
115 *
116 * @return SurtPrefixSet to use for check
117 *@see org.archive.crawler.deciderules.SurtPrefixedDecideRule#getPrefixes()
118 */
119 protected synchronized SurtPrefixSet getPrefixes() {
120 return getPrefixes(null);
121 }
122
123 /***
124 * Synchronized get of prefix set to use.
125 * @param o Context object.
126 *
127 * @return SurtPrefixSet to use for check
128 * @see org.archive.crawler.deciderules.SurtPrefixedDecideRule#getPrefixes()
129 */
130 protected synchronized SurtPrefixSet getPrefixes(Object o) {
131 if (surtPrefixes == null) {
132 readPrefixes(o);
133 }
134 return surtPrefixes;
135 }
136
137 /***
138 * Patch the SURT prefix set so that it only includes the appropriate
139 * prefixes.
140 * @param o Context object.
141 * @see org.archive.crawler.deciderules.SurtPrefixedDecideRule#readPrefixes()
142 */
143 protected void readPrefixes(Object o) {
144 buildSurtPrefixSet();
145
146 String scope = this.getScope(o);
147 if (scope.equals(HOST)){
148 surtPrefixes.convertAllPrefixesToHosts();
149 } else if (scope.equals(DOMAIN)) {
150 surtPrefixes.convertAllPrefixesToDomains();
151 }
152 dumpSurtPrefixSet();
153 }
154
155 private UURI getVia(Object o){
156 return (o instanceof CandidateURI)? ((CandidateURI)o).getVia(): null;
157 }
158
159 /***
160 * Decide whether using host or domain scope
161 * @param o Context
162 * @return String Host or domain
163 *
164 */
165 protected String getScope(Object o) {
166 try {
167 String scope = (String)getAttribute(o, ATTR_SCOPE);
168 if (scope.equals(HOST)) {
169 return HOST;
170 } else if (scope.equals(DOMAIN)) {
171 return DOMAIN;
172 } else {
173 assert false : "Unrecognized scope " + scope
174 + ". Should never happen!";
175 }
176 } catch (AttributeNotFoundException e) {
177 logger.severe(e.getMessage());
178 }
179 return null;
180 }
181
182
183 private boolean isInScope (Object o, SurtPrefixSet set) {
184 boolean iResult = false;
185 UURI u = (UURI)o;
186 if (u == null) {
187 return false;
188 }
189 String candidateSurt = u.getSurtForm();
190
191 if (candidateSurt.startsWith("https:")) {
192 candidateSurt = "http:" + candidateSurt.substring(6);
193 }
194 if (set.containsPrefixOf(candidateSurt)){
195 iResult = true;
196 }
197 return iResult;
198 }
199 }