View Javadoc

1   /* Copyright (C) 2003 Internet Archive.
2    *
3    * This file is part of the Heritrix web crawler (crawler.archive.org).
4    *
5    * Heritrix is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU Lesser Public License as published by
7    * the Free Software Foundation; either version 2.1 of the License, or
8    * any later version.
9    *
10   * Heritrix is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU Lesser Public License for more details.
14   *
15   * You should have received a copy of the GNU Lesser Public License
16   * along with Heritrix; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   *
19   * SimplePolitenessEnforcer.java
20   * Created on May 22, 2003
21   *
22   * $Header$
23   */
24  package org.archive.crawler.prefetch;
25  
26  import java.util.Iterator;
27  import java.util.Set;
28  import java.util.logging.Level;
29  import java.util.logging.Logger;
30  
31  import javax.management.AttributeNotFoundException;
32  
33  import org.apache.commons.httpclient.URIException;
34  import org.archive.crawler.datamodel.CoreAttributeConstants;
35  import org.archive.crawler.datamodel.CrawlHost;
36  import org.archive.crawler.datamodel.CrawlServer;
37  import org.archive.crawler.datamodel.CrawlURI;
38  import org.archive.crawler.datamodel.CredentialStore;
39  import org.archive.crawler.datamodel.FetchStatusCodes;
40  import org.archive.crawler.datamodel.credential.Credential;
41  import org.archive.crawler.datamodel.credential.CredentialAvatar;
42  import org.archive.crawler.framework.Processor;
43  import org.archive.crawler.settings.SimpleType;
44  import org.archive.crawler.settings.Type;
45  import org.archive.net.UURI;
46  
47  /***
48   * Ensures the preconditions for a fetch -- such as DNS lookup 
49   * or acquiring and respecting a robots.txt policy -- are
50   * satisfied before a URI is passed to subsequent stages.
51   *
52   * @author gojomo
53   */
54  public class PreconditionEnforcer
55          extends Processor
56          implements CoreAttributeConstants, FetchStatusCodes {
57  
    private static final long serialVersionUID = 4636474153589079615L;

    private static final Logger logger =
        Logger.getLogger(PreconditionEnforcer.class.getName());

    // Fallback values (in seconds) used when the settings framework
    // cannot supply an operator-configured value.
    private final static Integer DEFAULT_IP_VALIDITY_DURATION =
        new Integer(60*60*6); // six hours
    private final static Integer DEFAULT_ROBOTS_VALIDITY_DURATION =
        new Integer(60*60*24); // one day

    /** seconds to keep IP information for */
    public final static String ATTR_IP_VALIDITY_DURATION
        = "ip-validity-duration-seconds";
    /** seconds to cache robots info */
    public final static String ATTR_ROBOTS_VALIDITY_DURATION
        = "robot-validity-duration-seconds";

    /** whether to calculate robots exclusion without applying */
    public final static Boolean DEFAULT_CALCULATE_ROBOTS_ONLY = Boolean.FALSE;
    public final static String ATTR_CALCULATE_ROBOTS_ONLY
        = "calculate-robots-only";
80      public PreconditionEnforcer(String name) {
81          super(name, "Precondition enforcer");
82  
83          Type e;
84  
85          e = addElementToDefinition(new SimpleType(ATTR_IP_VALIDITY_DURATION,
86                  "The minimum interval for which a dns-record will be considered " +
87                  "valid (in seconds). " +
88                  "If the record's DNS TTL is larger, that will be used instead.",
89                  DEFAULT_IP_VALIDITY_DURATION));
90          e.setExpertSetting(true);
91  
92          e = addElementToDefinition(new SimpleType(ATTR_ROBOTS_VALIDITY_DURATION,
93                  "The time in seconds that fetched robots.txt information is " +
94                  "considered to be valid. " +
95                  "If the value is set to '0', then the robots.txt information" +
96                  " will never expire.",
97                  DEFAULT_ROBOTS_VALIDITY_DURATION));
98          e.setExpertSetting(true);
99          
100         e = addElementToDefinition(new SimpleType(ATTR_CALCULATE_ROBOTS_ONLY,
101                 "Whether to only calculate the robots status of an URI, " +
102                 "without actually applying any exclusions found. If true, " +
103                 "exlcuded URIs will only be annotated in the crawl.log, but " +
104                 "still fetched. Default is false. ",
105                 DEFAULT_CALCULATE_ROBOTS_ONLY));
106         e.setExpertSetting(true);
107     }
108 
109     protected void innerProcess(CrawlURI curi) {
110 
111         if (considerDnsPreconditions(curi)) {
112             return;
113         }
114 
115         // make sure we only process schemes we understand (i.e. not dns)
116         String scheme = curi.getUURI().getScheme().toLowerCase();
117         if (! (scheme.equals("http") || scheme.equals("https"))) {
118             logger.fine("PolitenessEnforcer doesn't understand uri's of type " +
119                 scheme + " (ignoring)");
120             return;
121         }
122 
123         if (considerRobotsPreconditions(curi)) {
124             return;
125         }
126 
127         if (!curi.isPrerequisite() && credentialPrecondition(curi)) {
128             return;
129         }
130 
131         // OK, it's allowed
132 
133         // For all curis that will in fact be fetched, set appropriate delays.
134         // TODO: SOMEDAY: allow per-host, per-protocol, etc. factors
135         // curi.setDelayFactor(getDelayFactorFor(curi));
136         // curi.setMinimumDelay(getMinimumDelayFor(curi));
137 
138         return;
139     }
140 
141     /***
142      * Consider the robots precondition.
143      *
144      * @param curi CrawlURI we're checking for any required preconditions.
145      * @return True, if this <code>curi</code> has a precondition or processing
146      *         should be terminated for some other reason.  False if
147      *         we can precede to process this url.
148      */
149     private boolean considerRobotsPreconditions(CrawlURI curi) {
150         // treat /robots.txt fetches specially
151         UURI uuri = curi.getUURI();
152         try {
153             if (uuri != null && uuri.getPath() != null &&
154                     curi.getUURI().getPath().equals("/robots.txt")) {
155                 // allow processing to continue
156                 curi.setPrerequisite(true);
157                 return false;
158             }
159         }
160         catch (URIException e) {
161             logger.severe("Failed get of path for " + curi);
162         }
163         // require /robots.txt if not present
164         if (isRobotsExpired(curi)) {
165         	// Need to get robots
166             if (logger.isLoggable(Level.FINE)) {
167                 logger.fine( "No valid robots for " +
168                     getController().getServerCache().getServerFor(curi) +
169                     "; deferring " + curi);
170             }
171 
172             // Robots expired - should be refetched even though its already
173             // crawled.
174             try {
175                 String prereq = curi.getUURI().resolve("/robots.txt").toString();
176                 curi.markPrerequisite(prereq,
177                     getController().getPostprocessorChain());
178             }
179             catch (URIException e1) {
180                 logger.severe("Failed resolve using " + curi);
181                 throw new RuntimeException(e1); // shouldn't ever happen
182             }
183             return true;
184         }
185         // test against robots.txt if available
186         CrawlServer cs = getController().getServerCache().getServerFor(curi);
187         if(cs.isValidRobots()){
188             String ua = getController().getOrder().getUserAgent(curi);
189             if(cs.getRobots().disallows(curi, ua)) {
190                 if(((Boolean)getUncheckedAttribute(curi,ATTR_CALCULATE_ROBOTS_ONLY)).booleanValue() == true) {
191                     // annotate URI as excluded, but continue to process normally
192                     curi.addAnnotation("robotExcluded");
193                     return false; 
194                 }
195                 // mark as precluded; in FetchHTTP, this will
196                 // prevent fetching and cause a skip to the end
197                 // of processing (unless an intervening processor
198                 // overrules)
199                 curi.setFetchStatus(S_ROBOTS_PRECLUDED);
200                 curi.putString("error","robots.txt exclusion");
201                 logger.fine("robots.txt precluded " + curi);
202                 return true;
203             }
204             return false;
205         }
206         // No valid robots found => Attempt to get robots.txt failed
207         curi.skipToProcessorChain(getController().getPostprocessorChain());
208         curi.setFetchStatus(S_ROBOTS_PREREQUISITE_FAILURE);
209         curi.putString("error","robots.txt prerequisite failed");
210         if (logger.isLoggable(Level.FINE)) {
211             logger.fine("robots.txt prerequisite failed " + curi);
212         }
213         return true;
214     }
215 
216     /***
217      * @param curi CrawlURI whose dns prerequisite we're to check.
218      * @return true if no further processing in this module should occur
219      */
220     private boolean considerDnsPreconditions(CrawlURI curi) {
221         if(curi.getUURI().getScheme().equals("dns")){
222             // DNS URIs never have a DNS precondition
223             curi.setPrerequisite(true);
224             return false; 
225         }
226         
227         CrawlServer cs = getController().getServerCache().getServerFor(curi);
228         if(cs == null) {
229             curi.setFetchStatus(S_UNFETCHABLE_URI);
230             curi.skipToProcessorChain(getController().getPostprocessorChain());
231             return true;
232         }
233 
234         // If we've done a dns lookup and it didn't resolve a host
235         // cancel further fetch-processing of this URI, because
236         // the domain is unresolvable
237         CrawlHost ch = getController().getServerCache().getHostFor(curi);
238         if (ch == null || ch.hasBeenLookedUp() && ch.getIP() == null) {
239             if (logger.isLoggable(Level.FINE)) {
240                 logger.fine( "no dns for " + ch +
241                     " cancelling processing for CrawlURI " + curi.toString());
242             }
243             curi.setFetchStatus(S_DOMAIN_PREREQUISITE_FAILURE);
244             curi.skipToProcessorChain(getController().getPostprocessorChain());
245             return true;
246         }
247 
248         // If we haven't done a dns lookup  and this isn't a dns uri
249         // shoot that off and defer further processing
250         if (isIpExpired(curi) && !curi.getUURI().getScheme().equals("dns")) {
251             logger.fine("Deferring processing of CrawlURI " + curi.toString()
252                 + " for dns lookup.");
253             String preq = "dns:" + ch.getHostName();
254             try {
255                 curi.markPrerequisite(preq,
256                     getController().getPostprocessorChain());
257             } catch (URIException e) {
258                 throw new RuntimeException(e); // shouldn't ever happen
259             }
260             return true;
261         }
262         
263         // DNS preconditions OK
264         return false;
265     }
266 
267     /***
268      * Get the maximum time a dns-record is valid.
269      *
270      * @param curi the uri this time is valid for.
271      * @return the maximum time a dns-record is valid -- in seconds -- or
272      * negative if record's ttl should be used.
273      */
274     public long getIPValidityDuration(CrawlURI curi) {
275         Integer d;
276         try {
277             d = (Integer)getAttribute(ATTR_IP_VALIDITY_DURATION, curi);
278         } catch (AttributeNotFoundException e) {
279             d = DEFAULT_IP_VALIDITY_DURATION;
280         }
281 
282         return d.longValue();
283     }
284 
    /** Return true if ip should be looked up.
     *
     * @param curi the URI to check.
     * @return true if ip should be looked up.
     */
    public boolean isIpExpired(CrawlURI curi) {
        CrawlHost host = getController().getServerCache().getHostFor(curi);
        if (!host.hasBeenLookedUp()) {
            // IP has not been looked up yet.
            return true;
        }

        if (host.getIpTTL() == CrawlHost.IP_NEVER_EXPIRES) {
            // IP never expires (numeric IP)
            return false;
        }

        long duration = getIPValidityDuration(curi);
        if (duration == 0) {
            // Never expire ip if duration is null (set by user or more likely,
            // set to zero in case where we tried in FetchDNS but failed).
            return false;
        }

        // catch old "default" -1 settings that are now problematic,
        // convert to new minimum
        if (duration <= 0) {
            duration = DEFAULT_IP_VALIDITY_DURATION.intValue();
        }

        long ttl = host.getIpTTL();
        if (ttl > duration) {
            // Use the larger of the operator-set minimum duration
            // or the DNS record TTL
            duration = ttl;
        }

        // Duration and ttl are in seconds.  Convert to millis.
        // (duration is always positive by this point; the guard is
        // retained for safety.)
        if (duration > 0) {
            duration *= 1000;
        }

        // Expired when fetch time plus validity window lies in the past.
        return (duration + host.getIpFetched()) < System.currentTimeMillis();
    }
329 
330     /*** Get the maximum time a robots.txt is valid.
331      *
332      * @param curi
333      * @return the time a robots.txt is valid in milliseconds.
334      */
335     public long getRobotsValidityDuration(CrawlURI curi) {
336         Integer d;
337         try {
338             d = (Integer) getAttribute(ATTR_ROBOTS_VALIDITY_DURATION, curi);
339         } catch (AttributeNotFoundException e) {
340             // This should never happen, but if it does, return default
341             logger.severe(e.getLocalizedMessage());
342             d = DEFAULT_ROBOTS_VALIDITY_DURATION;
343         }
344         // convert from seconds to milliseconds
345         return d.longValue() * 1000;
346     }
347 
348     /***
349      * Is the robots policy expired.
350      *
351      * This method will also return true if we haven't tried to get the
352      * robots.txt for this server.
353      *
354      * @param curi
355      * @return true if the robots policy is expired.
356      */
357     public boolean isRobotsExpired(CrawlURI curi) {
358         CrawlServer server =
359             getController().getServerCache().getServerFor(curi);
360         long robotsFetched = server.getRobotsFetchedTime();
361         if (robotsFetched == CrawlServer.ROBOTS_NOT_FETCHED) {
362             // Have not attempted to fetch robots
363             return true;
364         }
365         long duration = getRobotsValidityDuration(curi);
366         if (duration == 0) {
367             // When zero, robots should be valid forever
368             return false;
369         }
370         if (robotsFetched + duration < System.currentTimeMillis()) {
371             // Robots is still valid
372             return true;
373         }
374         return false;
375     }
376 
   /**
    * Consider credential preconditions.
    *
    * Looks to see if any credential preconditions (e.g. html form login
    * credentials) for this <code>CrawlServer</code>. If there are, have they
    * been run already? If not, make the running of these logins a precondition
    * of accessing any other url on this <code>CrawlServer</code>.
    *
    * <p>
    * One day, do optimization and avoid running the bulk of the code below.
    * Argument for running the code everytime is that overrides and refinements
    * may change what comes back from credential store.
    *
    * @param curi CrawlURI we're checking for any required preconditions.
    * @return True, if this <code>curi</code> has a precondition that needs to
    *         be met before we can proceed. False if we can proceed to process
    *         this url.
    */
    @SuppressWarnings("unchecked")
    private boolean credentialPrecondition(final CrawlURI curi) {

        boolean result = false;

        // Resolve the credential store from the settings handler; without
        // one there is nothing to check.
        CredentialStore cs =
            CredentialStore.getCredentialStore(getSettingsHandler());
        if (cs == null) {
            logger.severe("No credential store for " + curi);
            return result;
        }

        // Credentials applicable in this URI's settings context.
        Iterator i = cs.iterator(curi);
        if (i == null) {
            return result;
        }

        while (i.hasNext()) {
            Credential c = (Credential)i.next();

            if (c.isPrerequisite(curi)) {
                // This credential has a prereq. and this curi is it.  Let it
                // through.  Add its avatar to the curi as a mark.  Also, does
                // this curi need to be posted?  Note, we do this test for
                // is it a prereq BEFORE we do the check that curi is of the
                // credential domain because such as yahoo have you go to
                // another domain altogether to login.
                c.attach(curi);
                curi.setPost(c.isPost(curi));
                break;
            }

            // Skip credentials whose root URI doesn't cover this URI.
            if (!c.rootUriMatch(getController(), curi)) {
                continue;
            }

            // Skip credentials that declare no login prerequisite at all.
            if (!c.hasPrerequisite(curi)) {
                continue;
            }

            if (!authenticated(c, curi)) {
                // Han't been authenticated.  Queue it and move on (Assumption
                // is that we can do one authentication at a time -- usually one
                // html form).
                String prereq = c.getPrerequisite(curi);
                if (prereq == null || prereq.length() <= 0) {
                    // Misconfigured credential: log and fall through.
                    CrawlServer server =
                        getController().getServerCache().getServerFor(curi);
                    logger.severe(server.getName() + " has "
                        + " credential(s) of type " + c + " but prereq"
                        + " is null.");
                } else {
                    try {
                        curi.markPrerequisite(prereq,
                            getController().getPostprocessorChain());
                    } catch (URIException e) {
                        logger.severe("unable to set credentials prerequisite "+prereq);
                        getController().logUriError(e,curi.getUURI(),prereq);
                        return false;
                    }
                    result = true;
                    if (logger.isLoggable(Level.FINE)) {
                        logger.fine("Queueing prereq " + prereq + " of type " +
                            c + " for " + curi);
                    }
                    break;
                }
            }
        }
        return result;
    }
466 
467     /***
468      * Has passed credential already been authenticated.
469      *
470      * @param credential Credential to test.
471      * @param curi CrawlURI.
472      * @return True if already run.
473      */
474     @SuppressWarnings("unchecked")
475 	private boolean authenticated(final Credential credential,
476             final CrawlURI curi) {
477         boolean result = false;
478         CrawlServer server =
479             getController().getServerCache().getServerFor(curi);
480         if (!server.hasCredentialAvatars()) {
481             return result;
482         }
483         Set avatars = server.getCredentialAvatars();
484         for (Iterator i = avatars.iterator(); i.hasNext();) {
485             CredentialAvatar ca = (CredentialAvatar)i.next();
486             String key = null;
487             try {
488                 key = credential.getKey(curi);
489             } catch (AttributeNotFoundException e) {
490                 logger.severe("Failed getting key for " + credential +
491                     " for " + curi);
492                 continue;
493             }
494             if (ca.match(credential.getClass(), key)) {
495                 result = true;
496             }
497         }
498         return result;
499     }
500 }