View Javadoc

1   /* PathologicalPathDecideRule
2   *
3   * $Id: PathologicalPathDecideRule.java 4649 2006-09-25 17:16:55Z paul_jack $
4   *
5   * Created on Apr 1, 2005
6   *
7   * Copyright (C) 2005 Internet Archive.
8   *
9   * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24  */
25  package org.archive.crawler.deciderules;
26  
27  import java.util.logging.Logger;
28  
29  import javax.management.AttributeNotFoundException;
30  
31  import org.archive.crawler.settings.SimpleType;
32  import org.archive.crawler.settings.Type;
33  
34  
35  
36  /***
37   * Rule REJECTs any URI which contains an excessive number of identical, 
38   * consecutive path-segments (eg http://example.com/a/a/a/boo.html == 3 '/a' 
39   * segments)
40   *
41   * @author gojomo
42   */
43  public class PathologicalPathDecideRule extends MatchesRegExpDecideRule {
44  
45      private static final long serialVersionUID = -1803997581321178499L;
46  
47      private static final Logger logger =
48          Logger.getLogger(PathologicalPathDecideRule.class.getName());
49  
50      public static final String ATTR_REPETITIONS = "max-repetitions";
51  
52      /***
53       * Default maximum repetitions.
54       * Default access so accessible by unit test.
55       */
56      static final Integer DEFAULT_REPETITIONS = new Integer(2);
57  
58      protected String constructedRegexp;
59      
60      /*** Constructs a new PathologicalPathFilter.
61       *
62       * @param name the name of the filter.
63       */
64      public PathologicalPathDecideRule(String name) {
65          super(name);
66          setDescription("PathologicalPathDecideRule. This rule" +
67                  " is used to avoid crawler traps by adding a constraint on" +
68                  " how many times a path-segment pattern in the URI may be" +
69                  " repeated. A URI will be REJECTed if the same path-segment" +
70                  " repeats more than '" + ATTR_REPETITIONS + "' in a row.");
71  
72          // make default REJECT (overriding superclass) & always-default
73          Type type = addElementToDefinition(new SimpleType(ATTR_DECISION,
74                  "Decision to be applied", REJECT, ALLOWED_TYPES));
75          type.setTransient(true);
76          
77          // disable direct setting of regexp from superclass
78          type = getElementFromDefinition(ATTR_REGEXP);
79          type.setTransient(true);
80          
81          type = addElementToDefinition(new SimpleType(ATTR_REPETITIONS,
82                  "Number of times the pattern should be allowed to occur. " +
83                  "This rule returns its decision (usually REJECT) if a " +
84                  "path-segment is repeated more than number of times.",
85                  DEFAULT_REPETITIONS));
86          // overriding would require reconstruction of regexp every test
87          type.setOverrideable(false); 
88      }
89  
90      /*** 
91       * Construct the regexp string to be matched against the URI.
92       * @param o an object to extract a URI from.
93       * @return the regexp pattern.
94       */
95      protected String getRegexp(Object o) {
96          if (constructedRegexp == null) {
97              // race no concern: assignment is atomic, happy with any last value
98              constructedRegexp = constructRegexp();
99          }
100         return constructedRegexp;
101     }
102     
103     protected String constructRegexp() {
104         int rep = 0;
105         try {
106             rep = ((Integer) getAttribute(null, ATTR_REPETITIONS)).intValue();
107         } catch (AttributeNotFoundException e) {
108             logger.severe(e.getMessage());
109         }
110         return (rep == 0) ? null : ".*?/(.*?/)//1{" + rep + ",}.*";
111     }
112     
113     
114     /***
115      * Repetitions may have changed; refresh constructedRegexp
116      * 
117      * @see org.archive.crawler.deciderules.DecideRule#kickUpdate()
118      */
119     public void kickUpdate() {
120         super.kickUpdate();
121         constructedRegexp = constructRegexp();
122     }
123 }