View Javadoc

1   /* TooManyPathSegmentsDecideRule
2   *
3   * $Id: TooManyPathSegmentsDecideRule.java 4649 2006-09-25 17:16:55Z paul_jack $
4   *
5   * Created on Apr 1, 2005
6   *
7   * Copyright (C) 2005 Internet Archive.
8   *
9   * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24  */
25  package org.archive.crawler.deciderules;
26  
27  import org.archive.crawler.datamodel.CandidateURI;
28  import org.archive.crawler.settings.SimpleType;
29  import org.archive.crawler.settings.Type;
30  
31  
32  
33  /***
34   * Rule REJECTs any CrawlURIs whose total number of path-segments (as
35   * indicated by the count of '/' characters not including the first '//')
36   * is over a given threshold.
37   *
38   * @author gojomo
39   */
40  public class TooManyPathSegmentsDecideRule extends PredicatedDecideRule {
41  
42      private static final long serialVersionUID = 147079100367815075L;
43  
44      public static final String ATTR_MAX_PATH_DEPTH = "max-path-depth";
45      
46      /***
47       * Default maximum value.
48       * Default access so available to unit test.
49       */
50      static final Integer DEFAULT_MAX_PATH_DEPTH = new Integer(20);
51  
52      /***
53       * Usual constructor. 
54       * @param name Name of this DecideRule.
55       */
56      public TooManyPathSegmentsDecideRule(String name) {
57          super(name);
58          setDescription("TooManyPathSegmentsDecideRule. REJECTs URIs with " +
59                  "more total path-segments (as indicated by '/' characters) " +
60                  "than the configured '" + ATTR_MAX_PATH_DEPTH + "'.");
61          
62          // make default REJECT (overriding superclass) & always-default
63          Type type = addElementToDefinition(new SimpleType(ATTR_DECISION,
64                  "Decision to be applied", REJECT, ALLOWED_TYPES));
65          type.setTransient(true);
66          
67          addElementToDefinition(new SimpleType(ATTR_MAX_PATH_DEPTH, "Number of" +
68                  " path segments beyond which this rule will reject URIs.", 
69                  DEFAULT_MAX_PATH_DEPTH));
70          
71      }
72  
73      /***
74       * Evaluate whether given object is over the threshold number of
75       * path-segments.
76       * 
77       * @param object
78       * @return true if the path-segments is exceeded
79       */
80      protected boolean evaluate(Object object) {
81          boolean result = false;
82          CandidateURI curi = null;
83          try {
84              curi = (CandidateURI)object;
85          } catch (ClassCastException e) {
86              // if not CrawlURI, always disregard
87              return result;
88          }
89          String uri = curi.toString();
90          int count = 0;
91          int threshold = getThresholdSegments(object);
92          for (int i = 0; i < uri.length(); i++) {
93              if (uri.charAt(i) == '/') {
94                  count++;
95              }
96              if (count > threshold) {
97                  result = true;
98                  break;
99              }
100         }
101         return result;
102     }
103 
104     /***
105      * @param obj
106      * @return path-segments cutoff threshold
107      */
108     private int getThresholdSegments(Object obj) {
109         // add 2 for start-of-authority slashes (not path segments)
110         return ((Integer) getUncheckedAttribute(obj, ATTR_MAX_PATH_DEPTH))
111                 .intValue() + 2;
112     }
113 }