package org.archive.crawler.prefetch;

import java.util.Iterator;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlHost;
import org.archive.crawler.datamodel.CrawlServer;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.CredentialStore;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.datamodel.credential.Credential;
import org.archive.crawler.datamodel.credential.CredentialAvatar;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
import org.archive.net.UURI;

/**
 * Ensures the preconditions for a fetch -- such as DNS lookup
 * or acquiring and respecting a robots.txt policy -- are
 * satisfied before a URI is passed to subsequent stages.
 *
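 * <p>An illustrative sketch of where this processor sits; the actual
 * chains are assembled from the crawl order, and the names below follow
 * the default Heritrix profile:
 * <pre>
 * pre-fetch chain:  Preselector -&gt; PreconditionEnforcer
 * fetch chain:      FetchDNS, FetchHTTP
 * </pre>
 *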
 * @author gojomo
 */
public class PreconditionEnforcer
        extends Processor
        implements CoreAttributeConstants, FetchStatusCodes {

    private static final long serialVersionUID = 4636474153589079615L;

    private static final Logger logger =
        Logger.getLogger(PreconditionEnforcer.class.getName());

    private final static Integer DEFAULT_IP_VALIDITY_DURATION =
        Integer.valueOf(60 * 60 * 6);
    private final static Integer DEFAULT_ROBOTS_VALIDITY_DURATION =
        Integer.valueOf(60 * 60 * 24);

    /** Seconds to keep IP information for. */
    public final static String ATTR_IP_VALIDITY_DURATION
        = "ip-validity-duration-seconds";
    /** Seconds to cache fetched robots.txt information for. */
    public final static String ATTR_ROBOTS_VALIDITY_DURATION
        = "robot-validity-duration-seconds";

    /** Whether to calculate robots exclusion without applying it. */
    public final static Boolean DEFAULT_CALCULATE_ROBOTS_ONLY = Boolean.FALSE;
    public final static String ATTR_CALCULATE_ROBOTS_ONLY
        = "calculate-robots-only";

    public PreconditionEnforcer(String name) {
        super(name, "Precondition enforcer");

        Type e;

        e = addElementToDefinition(new SimpleType(ATTR_IP_VALIDITY_DURATION,
            "The minimum interval for which a dns-record will be considered " +
            "valid (in seconds). " +
            "If the record's DNS TTL is larger, that will be used instead.",
            DEFAULT_IP_VALIDITY_DURATION));
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_ROBOTS_VALIDITY_DURATION,
            "The time in seconds that fetched robots.txt information is " +
            "considered to be valid. " +
            "If the value is set to '0', then the robots.txt information" +
            " will never expire.",
            DEFAULT_ROBOTS_VALIDITY_DURATION));
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_CALCULATE_ROBOTS_ONLY,
            "Whether to only calculate the robots status of a URI, " +
            "without actually applying any exclusions found. If true, " +
            "excluded URIs will only be annotated in the crawl.log, but " +
            "still fetched. Default is false.",
            DEFAULT_CALCULATE_ROBOTS_ONLY));
        e.setExpertSetting(true);
    }
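
    /*
     * Illustrative settings fragment (hypothetical values; the layout
     * follows Heritrix 1.x order.xml conventions and is not verified
     * against a specific release):
     *
     *   <newObject name="Preconditions"
     *       class="org.archive.crawler.prefetch.PreconditionEnforcer">
     *     <integer name="ip-validity-duration-seconds">21600</integer>
     *     <integer name="robot-validity-duration-seconds">86400</integer>
     *     <boolean name="calculate-robots-only">false</boolean>
     *   </newObject>
     */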

    protected void innerProcess(CrawlURI curi) {

        if (considerDnsPreconditions(curi)) {
            return;
        }

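        // Only HTTP(S) URIs are subject to the robots and credential
        // preconditions below; other schemes pass through untouched.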
        String scheme = curi.getUURI().getScheme().toLowerCase();
        if (!(scheme.equals("http") || scheme.equals("https"))) {
            logger.fine("PreconditionEnforcer doesn't understand URIs of " +
                "scheme " + scheme + " (ignoring)");
            return;
        }

        if (considerRobotsPreconditions(curi)) {
            return;
        }

        if (!curi.isPrerequisite() && credentialPrecondition(curi)) {
            return;
        }
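
        // All preconditions are satisfied; let the URI proceed to the
        // fetch processors.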
        return;
    }

    /**
     * Consider the robots precondition.
     *
     * @param curi CrawlURI we're checking for any required preconditions.
     * @return True, if this <code>curi</code> has a precondition or processing
     *         should be terminated for some other reason. False if
     *         we can proceed to process this URI.
     */
    private boolean considerRobotsPreconditions(CrawlURI curi) {
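        // Treat /robots.txt fetches specially: the robots.txt URI is itself
        // the prerequisite and has no robots precondition of its own.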
        UURI uuri = curi.getUURI();
        try {
            if (uuri != null && uuri.getPath() != null &&
                    curi.getUURI().getPath().equals("/robots.txt")) {
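                // Allow processing of the robots.txt fetch to continue.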
                curi.setPrerequisite(true);
                return false;
            }
        } catch (URIException e) {
            logger.severe("Failed get of path for " + curi);
        }

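        // If no current, valid robots.txt is on hand for this server,
        // require one before this URI may be fetched.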
        if (isRobotsExpired(curi)) {
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("No valid robots for " +
                    getController().getServerCache().getServerFor(curi) +
                    "; deferring " + curi);
            }

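            // Schedule the robots.txt fetch as a prerequisite of this URI.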
            try {
                String prereq = curi.getUURI().resolve("/robots.txt").toString();
                curi.markPrerequisite(prereq,
                    getController().getPostprocessorChain());
            } catch (URIException e1) {
                logger.severe("Failed resolve using " + curi);
                throw new RuntimeException(e1); // shouldn't ever happen
            }
            return true;
        }

        CrawlServer cs = getController().getServerCache().getServerFor(curi);
        if (cs.isValidRobots()) {
            String ua = getController().getOrder().getUserAgent(curi);
            if (cs.getRobots().disallows(curi, ua)) {
                if (((Boolean)getUncheckedAttribute(curi,
                        ATTR_CALCULATE_ROBOTS_ONLY)).booleanValue()) {
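                    // Annotate the URI as robots-excluded, but let it be
                    // fetched anyway.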
                    curi.addAnnotation("robotExcluded");
                    return false;
                }

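                // Mark the URI as precluded by robots.txt; downstream fetch
                // processors will see this status and skip the fetch.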
                curi.setFetchStatus(S_ROBOTS_PRECLUDED);
                curi.putString("error", "robots.txt exclusion");
                logger.fine("robots.txt precluded " + curi);
                return true;
            }
            return false;
        }

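        // A robots.txt prerequisite was attempted but yielded no valid
        // robots information; give up on this URI.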
        curi.skipToProcessorChain(getController().getPostprocessorChain());
        curi.setFetchStatus(S_ROBOTS_PREREQUISITE_FAILURE);
        curi.putString("error", "robots.txt prerequisite failed");
        if (logger.isLoggable(Level.FINE)) {
            logger.fine("robots.txt prerequisite failed " + curi);
        }
        return true;
    }

    /**
     * @param curi CrawlURI whose dns prerequisite we're to check.
     * @return true if no further processing in this module should occur
     */
    private boolean considerDnsPreconditions(CrawlURI curi) {
        if (curi.getUURI().getScheme().equals("dns")) {
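            // A dns: URI is itself a prerequisite and needs no DNS
            // precondition of its own.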
            curi.setPrerequisite(true);
            return false;
        }

        CrawlServer cs = getController().getServerCache().getServerFor(curi);
        if (cs == null) {
            curi.setFetchStatus(S_UNFETCHABLE_URI);
            curi.skipToProcessorChain(getController().getPostprocessorChain());
            return true;
        }

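        // If the host has been looked up and no IP was found, this URI can
        // never be fetched; fail it now.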
        CrawlHost ch = getController().getServerCache().getHostFor(curi);
        if (ch == null || ch.hasBeenLookedUp() && ch.getIP() == null) {
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("no dns for " + ch +
                    " cancelling processing for CrawlURI " + curi.toString());
            }
            curi.setFetchStatus(S_DOMAIN_PREREQUISITE_FAILURE);
            curi.skipToProcessorChain(getController().getPostprocessorChain());
            return true;
        }

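        // If the IP has never been looked up, or the cached lookup has
        // expired, schedule a dns: prerequisite ahead of this URI.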
        if (isIpExpired(curi) && !curi.getUURI().getScheme().equals("dns")) {
            logger.fine("Deferring processing of CrawlURI " + curi.toString()
                + " for dns lookup.");
            String preq = "dns:" + ch.getHostName();
            try {
                curi.markPrerequisite(preq,
                    getController().getPostprocessorChain());
            } catch (URIException e) {
                throw new RuntimeException(e); // shouldn't ever happen
            }
            return true;
        }

        return false;
    }

    /**
     * Get the maximum time a dns-record is valid.
     *
     * @param curi the URI this duration applies to.
     * @return the maximum time a dns-record is valid -- in seconds -- or
     *         negative if the record's TTL should be used instead.
     */
    public long getIPValidityDuration(CrawlURI curi) {
        Integer d;
        try {
            d = (Integer)getAttribute(ATTR_IP_VALIDITY_DURATION, curi);
        } catch (AttributeNotFoundException e) {
            d = DEFAULT_IP_VALIDITY_DURATION;
        }

        return d.longValue();
    }

    /** Return true if ip should be looked up.
     *
     * @param curi the URI to check.
     * @return true if ip should be looked up.
     */
    public boolean isIpExpired(CrawlURI curi) {
        CrawlHost host = getController().getServerCache().getHostFor(curi);
        if (!host.hasBeenLookedUp()) {
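            // The IP has not been looked up yet.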
            return true;
        }

        if (host.getIpTTL() == CrawlHost.IP_NEVER_EXPIRES) {
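            // The IP never expires (e.g. the hostname is a literal IP).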
            return false;
        }

        long duration = getIPValidityDuration(curi);
        if (duration == 0) {
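            // A duration of zero means the lookup never expires.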
            return false;
        }

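        // A negative setting means "defer to the DNS record's TTL"; start
        // from the default minimum duration.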
        if (duration <= 0) {
            duration = DEFAULT_IP_VALIDITY_DURATION.intValue();
        }

        long ttl = host.getIpTTL();
        if (ttl > duration) {
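            // Respect a DNS TTL longer than the configured minimum.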
            duration = ttl;
        }

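        // Convert seconds to milliseconds for the comparison below.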
        if (duration > 0) {
            duration *= 1000;
        }

        return (duration + host.getIpFetched()) < System.currentTimeMillis();
    }

    /** Get the maximum time fetched robots.txt information is valid.
     *
     * @param curi the URI this duration applies to.
     * @return the time fetched robots.txt information is valid, in
     *         milliseconds.
     */
    public long getRobotsValidityDuration(CrawlURI curi) {
        Integer d;
        try {
            d = (Integer)getAttribute(ATTR_ROBOTS_VALIDITY_DURATION, curi);
        } catch (AttributeNotFoundException e) {
            // This should never happen, but fall back to the default.
            logger.severe(e.getLocalizedMessage());
            d = DEFAULT_ROBOTS_VALIDITY_DURATION;
        }
        // The setting is in seconds; convert to milliseconds.
        return d.longValue() * 1000;
    }

    /**
     * Is the robots policy expired?
     *
     * This method also returns true if we haven't yet tried to fetch the
     * robots.txt for this server.
     *
     * @param curi the URI to check.
     * @return true if the robots policy is expired.
     */
    public boolean isRobotsExpired(CrawlURI curi) {
        CrawlServer server =
            getController().getServerCache().getServerFor(curi);
        long robotsFetched = server.getRobotsFetchedTime();
        if (robotsFetched == CrawlServer.ROBOTS_NOT_FETCHED) {
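            // Robots have not been fetched for this server yet.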
            return true;
        }
        long duration = getRobotsValidityDuration(curi);
        if (duration == 0) {
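            // A duration of zero means the robots information never expires.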
            return false;
        }
        if (robotsFetched + duration < System.currentTimeMillis()) {
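            // The validity window has passed; the information is expired.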
            return true;
        }
        return false;
    }

    /**
     * Consider credential preconditions.
     *
     * Looks to see if there are any credential preconditions (e.g. html form
     * login credentials) for this <code>CrawlServer</code>. If there are,
     * have they been run already? If not, make the running of these logins a
     * precondition of accessing any other URI on this
     * <code>CrawlServer</code>.
     *
     * <p>
     * One day, do optimization and avoid running the bulk of the code below.
     * The argument for running the code every time is that overrides and
     * refinements may change what comes back from the credential store.
     *
     * @param curi CrawlURI we're checking for any required preconditions.
     * @return True, if this <code>curi</code> has a precondition that needs to
     *         be met before we can proceed. False if we can proceed to process
     *         this URI.
     */
    @SuppressWarnings("unchecked")
    private boolean credentialPrecondition(final CrawlURI curi) {

        boolean result = false;

        CredentialStore cs =
            CredentialStore.getCredentialStore(getSettingsHandler());
        if (cs == null) {
            logger.severe("No credential store for " + curi);
            return result;
        }

        Iterator i = cs.iterator(curi);
        if (i == null) {
            return result;
        }

        while (i.hasNext()) {
            Credential c = (Credential)i.next();

            if (c.isPrerequisite(curi)) {
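                // This curi is itself the credential's prerequisite (e.g.
                // a login URI); attach the credential so the fetcher can
                // apply it, and note whether the fetch must be a POST.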
                c.attach(curi);
                curi.setPost(c.isPost(curi));
                break;
            }

            if (!c.rootUriMatch(getController(), curi)) {
                continue;
            }

            if (!c.hasPrerequisite(curi)) {
                continue;
            }

            if (!authenticated(c, curi)) {
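                // The credential has not yet run against this server;
                // schedule its prerequisite (e.g. a login page) ahead of
                // this URI.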
                String prereq = c.getPrerequisite(curi);
                if (prereq == null || prereq.length() <= 0) {
                    CrawlServer server =
                        getController().getServerCache().getServerFor(curi);
                    logger.severe(server.getName() + " has credential(s) of " +
                        "type " + c + " but prereq is null.");
                } else {
                    try {
                        curi.markPrerequisite(prereq,
                            getController().getPostprocessorChain());
                    } catch (URIException e) {
                        logger.severe("unable to set credentials prerequisite " +
                            prereq);
                        getController().logUriError(e, curi.getUURI(), prereq);
                        return false;
                    }
                    result = true;
                    if (logger.isLoggable(Level.FINE)) {
                        logger.fine("Queueing prereq " + prereq + " of type " +
                            c + " for " + curi);
                    }
                    break;
                }
            }
        }
        return result;
    }

    /**
     * Has the passed credential already been authenticated?
     *
     * @param credential Credential to test.
     * @param curi CrawlURI.
     * @return True if already run.
     */
    @SuppressWarnings("unchecked")
    private boolean authenticated(final Credential credential,
            final CrawlURI curi) {
        boolean result = false;
        CrawlServer server =
            getController().getServerCache().getServerFor(curi);
        if (!server.hasCredentialAvatars()) {
            return result;
        }
        Set avatars = server.getCredentialAvatars();
        for (Iterator i = avatars.iterator(); i.hasNext();) {
            CredentialAvatar ca = (CredentialAvatar)i.next();
            String key = null;
            try {
                key = credential.getKey(curi);
            } catch (AttributeNotFoundException e) {
                logger.severe("Failed getting key for " + credential +
                    " for " + curi);
                continue;
            }
            // An avatar matching this credential's type and key means the
            // credential has already been run against this server.
            if (ca.match(credential.getClass(), key)) {
                result = true;
            }
        }
        return result;
    }
}