View Javadoc

1   /* TransclusionDecideRule
2   *
3   * $Id: TransclusionDecideRule.java 6895 2010-06-16 21:49:33Z gojomo $
4   *
5   * Created on Apr 1, 2005
6   *
7   * Copyright (C) 2005 Internet Archive.
8   *
9   * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24  */
25  package org.archive.crawler.deciderules;
26  
27  import org.archive.crawler.datamodel.CandidateURI;
28  import org.archive.crawler.extractor.Link;
29  import org.archive.crawler.settings.SimpleType;
30  import org.archive.crawler.settings.Type;
31  
32  
33  
34  /***
35   * Rule ACCEPTs any CrawlURIs whose path-from-seed ('hopsPath' -- see
36   * {@link CandidateURI#getPathFromSeed()}) ends 
37   * with at least one, but not more than, the given number of 
38   * non-navlink ('L') hops. 
39   * 
40   * Otherwise, if the path-from-seed is empty or if a navlink ('L') occurs
41   * within max-trans-hops of the tail of the path-from-seed, this rule
42   * returns PASS.
43   *  
44   * <p>Thus, it allows things like embedded resources (frames/images/media) 
45   * and redirects to be transitively included ('transcluded') in a crawl, 
46   * even if they otherwise would not, for some reasonable number of hops
47   * (1-4).
48   *
49   * @see <a href="http://www.google.com/search?q=define%3Atransclusion&sourceid=mozilla&start=0&start=0&ie=utf-8&oe=utf-8">Transclusion</a>
50   *
51   * @author gojomo
52   */
53  public class TransclusionDecideRule extends PredicatedDecideRule {
54  
55      private static final long serialVersionUID = -3975688876990558918L;
56  
57      private static final String ATTR_MAX_TRANS_HOPS = "max-trans-hops";
58  
59      private static final String ATTR_MAX_SPECULATIVE_HOPS = "max-speculative-hops";
60  
61      /***
62       * Default maximum transitive hops -- any type
63       * Default access so can be accessed by unit tests.
64       */
65      static final Integer DEFAULT_MAX_TRANS_HOPS = new Integer(3);
66  
67      /***
68       * Default maximum speculative ('X') hops.
69       * Default access so can be accessed by unit tests.
70       */
71      static final Integer DEFAULT_MAX_SPECULATIVE_HOPS = new Integer(1);
72  
73      /***
74       * Usual constructor. 
75       * @param name Name of this DecideRule.
76       */
77      public TransclusionDecideRule(String name) {
78          super(name);
79          setDescription("TransclusionDecideRule. ACCEPTs URIs whose path " +
80                  "from the seed ends with up to (but not more than) the " +
81                  "configured '" + ATTR_MAX_TRANS_HOPS +
82                  "' number of non-navlink ('L') hops.");
83          // make default ACCEPT unchangeable 
84          Type type = getElementFromDefinition(ATTR_DECISION);
85          type.setTransient(true);
86          addElementToDefinition(new SimpleType(ATTR_MAX_TRANS_HOPS,
87              "Maximum number of non-navlink (non-'L') hops to ACCEPT.", 
88              DEFAULT_MAX_TRANS_HOPS));
89          addElementToDefinition(new SimpleType(ATTR_MAX_SPECULATIVE_HOPS,
90              "Maximum number of speculative ('X') hops to ACCEPT.", 
91              DEFAULT_MAX_SPECULATIVE_HOPS));
92      }
93  
94      /***
95       * Evaluate whether given object is within the threshold number of
96       * transitive hops.
97       * 
98       * @param object Object to make decision on.
99       * @return true if the transitive hops >0 and <= max
100      */
101     protected boolean evaluate(Object object) {
102         CandidateURI curi = null;
103         try {
104             curi = (CandidateURI)object;
105         } catch (ClassCastException e) {
106             // if not CrawlURI, always disregard.
107             return false;
108         }
109         String hopsPath = curi.getPathFromSeed();
110         if (hopsPath == null || hopsPath.length() == 0) {
111             return false; 
112         }
113         int allCount = 0;
114         int nonrefCount = 0; 
115         int specCount = 0; 
116         for (int i = hopsPath.length() - 1; i >= 0; i--) {
117             char c = hopsPath.charAt(i);
118             if (c == Link.NAVLINK_HOP) {
119                 // end of hops counted here
120                 break;
121             }
122             allCount++;
123             if(c != Link.REFER_HOP) {
124                 nonrefCount++;
125             }
126             if(c == Link.SPECULATIVE_HOP) {
127                 specCount++;
128             }
129         }
130         // transclusion doesn't apply if there isn't at least one non-nav-hop
131         if (allCount <= 0) {
132             return false;
133         }
134         
135         // too many speculative hops disqualify from transclusion
136         if (specCount > getThresholdSpeculativeHops(object)) {
137             return false;
138         }
139         
140         // transclusion applies as long as non-ref hops less than max
141         return nonrefCount <= getThresholdHops(object);
142     }
143 
144     /***
145      * @param obj Context object.
146      * @return hops cutoff threshold
147      */
148     private int getThresholdHops(Object obj) {
149         return ((Integer)getUncheckedAttribute(obj,ATTR_MAX_TRANS_HOPS)).
150             intValue();
151     }
152     
153     /***
154      * @param obj Context object.
155      * @return hops cutoff threshold
156      */
157     private int getThresholdSpeculativeHops(Object obj) {
158         return ((Integer)getUncheckedAttribute(obj,ATTR_MAX_SPECULATIVE_HOPS)).
159             intValue();
160     }
161 }