View Javadoc

1   /* TooManyHopsDecideRule
2   *
3   * $Id: TooManyHopsDecideRule.java 4649 2006-09-25 17:16:55Z paul_jack $
4   *
5   * Created on Apr 1, 2005
6   *
7   * Copyright (C) 2005 Internet Archive.
8   *
9   * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24  */
25  package org.archive.crawler.deciderules;
26  
27  import org.archive.crawler.datamodel.CandidateURI;
28  import org.archive.crawler.settings.SimpleType;
29  import org.archive.crawler.settings.Type;
30  
31  
32  
33  /***
34   * Rule REJECTs any CrawlURIs whose total number of hops (length of the 
35   * hopsPath string, traversed links of any type) is over a threshold.
36   * Otherwise returns PASS.
37   *
38   * @author gojomo
39   */
40  public class TooManyHopsDecideRule extends PredicatedDecideRule {
41  
42      private static final long serialVersionUID = -5429536193865916670L;
43  
44      private static final String ATTR_MAX_HOPS = "max-hops";
45      
46      /***
47       * Default access so available to test code.
48       */
49      static final Integer DEFAULT_MAX_HOPS = new Integer(20);
50  
51      /***
52       * Usual constructor. 
53       * @param name Name of this DecideRule.
54       */
55      public TooManyHopsDecideRule(String name) {
56          super(name);
57          setDescription("TooManyHopsDecideRule. REJECTs URIs discovered " +
58                  "after too many hops (followed links of any type) from seed.");
59          addElementToDefinition(new SimpleType(ATTR_MAX_HOPS, "Max path" +
60                  " depth for which this filter will match", DEFAULT_MAX_HOPS));
61          // make default REJECT (overriding superclass) & always-default
62          Type type = addElementToDefinition(new SimpleType(ATTR_DECISION,
63                  "Decision to be applied", REJECT, ALLOWED_TYPES));
64          type.setTransient(true);
65      }
66  
67      /***
68       * Evaluate whether given object is over the threshold number of
69       * hops.
70       * 
71       * @param object
72       * @return true if the mx-hops is exceeded
73       */
74      protected boolean evaluate(Object object) {
75          try {
76              CandidateURI curi = (CandidateURI)object;
77              return curi.getPathFromSeed() != null &&
78                  curi.getPathFromSeed().length() > getThresholdHops(object);
79          } catch (ClassCastException e) {
80              // if not CrawlURI, always disregard
81              return false; 
82          }
83      }
84  
85      /***
86       * @param obj Conext object.
87       * @return hops cutoff threshold
88       */
89      private int getThresholdHops(Object obj) {
90          return ((Integer)getUncheckedAttribute(obj,ATTR_MAX_HOPS)).intValue();
91      }
92  }