View Javadoc

1   /* Copyright (C) 2005 Internet Archive.
2    *
3    * This file is part of the Heritrix web crawler (crawler.archive.org).
4    *
5    * Heritrix is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU Lesser Public License as published by
7    * the Free Software Foundation; either version 2.1 of the License, or
8    * any later version.
9    *
10   * Heritrix is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU Lesser Public License for more details.
14   *
15   * You should have received a copy of the GNU Lesser Public License
16   * along with Heritrix; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   *
19   * ClassicScope.java
20   * Created on Apr 1, 2003
21   *
22   * $Header$
23   */
24  package org.archive.crawler.scope;
25  
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;

import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.extractor.Link;
//import org.archive.crawler.filter.OrFilter;
import org.archive.crawler.framework.CrawlScope;
import org.archive.crawler.settings.SimpleType;
33  
34  /***
35   * ClassicScope: superclass with shared Scope behavior for
36   * most common scopes. 
37   *
38   * Roughly, its logic is captured in innerAccept(). A URI is 
39   * included if:
40   * <pre>
41   *    forceAccepts(uri)
42   *    || (((isSeed(uri) 
43   *         || focusAccepts(uri)) 
44   *         || additionalFocusAccepts(uri) 
45   *         || transitiveAccepts(uri))
46   *       && !excludeAccepts(uri));</pre>
47   *
48   * Subclasses should override focusAccepts, additionalFocusAccepts,
49   * and transitiveAccepts. 
50   *
51   * The excludeFilter may be specified by supplying
52   * a <code>exclude</code> subelement. If unspecified, a
53   * accepts-none filter will be used -- meaning that
54   * no URIs will pass the filter and thus be excluded.
55   *  
56   * @author gojomo
57   */
58  public class ClassicScope extends CrawlScope {
59  
60      private static final long serialVersionUID = 4494905304855590002L;
61  
62      //private static final Logger logger = Logger.getLogger(ClassicScope.class
63      //        .getName());
64  
65      public static final String ATTR_EXCLUDE_FILTER = "exclude-filter";
66  
67      public static final String ATTR_MAX_LINK_HOPS = "max-link-hops";
68  
69      public static final String ATTR_MAX_TRANS_HOPS = "max-trans-hops";
70  
71      // FIXME: Replace deprecated OrFilter with non-deprecated something
72      
73      @SuppressWarnings("deprecation")
74      private org.archive.crawler.filter.OrFilter excludeFilter;
75  
76      /***
77       * @param name
78       *            ignored by superclass
79       */
80      @SuppressWarnings("deprecation")
81      public ClassicScope(String name) {
82          super(name);
83          addElementToDefinition(new SimpleType(ATTR_MAX_LINK_HOPS,
84              "Max link hops to include. URIs more than this number "
85              + "of links from a seed will not be ruled in-scope. (Such "
86              + "determination does not preclude later inclusion if a "
87              + "shorter path is later discovered.)", new Integer(25)));
88          addElementToDefinition(new SimpleType(ATTR_MAX_TRANS_HOPS,
89              "Max transitive hops (embeds, referrals, preconditions) to " +
90              "include. URIs reached by more than this number of transitive " +
91              "hops will not be ruled in-scope, even if otherwise on an " +
92              "in-focus site. (Such determination does not preclude later " +
93              " inclusion if a shorter path is later discovered.)", 
94              new Integer(5)));
95          this.excludeFilter = (org.archive.crawler.filter.OrFilter)
96              addElementToDefinition(new org.archive.crawler.filter.OrFilter(
97                  ATTR_EXCLUDE_FILTER));
98  
99          // Try to preserve the values of these attributes when we exchange
100         // scopes.
101         setPreservedFields(new String[] { ATTR_SEEDS, ATTR_MAX_LINK_HOPS,
102             ATTR_MAX_TRANS_HOPS, ATTR_EXCLUDE_FILTER });
103     }
104 
105     /***
106      * Default constructor.
107      */
108     public ClassicScope() {
109         this(CrawlScope.ATTR_NAME);
110     }
111 
112     /***
113      * Returns whether the given object (typically a CandidateURI) falls within
114      * this scope.
115      * 
116      * @param o
117      *            Object to test.
118      * @return Whether the given object (typically a CandidateURI) falls within
119      *         this scope.
120      */
121     protected final boolean innerAccepts(Object o) {
122         return (((isSeed(o) || focusAccepts(o)) ||
123             additionalFocusAccepts(o) || transitiveAccepts(o)) &&
124             !excludeAccepts(o));
125     }
126 
127     /***
128      * Check if URI is accepted by the additional focus of this scope.
129      * 
130      * This method should be overridden in subclasses.
131      * 
132      * @param o
133      *            the URI to check.
134      * @return True if additional focus filter accepts passed object.
135      */
136     protected boolean additionalFocusAccepts(Object o) {
137         return false;
138     }
139 
140     /***
141      * @param o
142      *            the URI to check.
143      * @return True if transitive filter accepts passed object.
144      */
145     protected boolean transitiveAccepts(Object o) {
146         return false;
147     }
148 
149     /***
150      * @param o the URI to check.
151      * @return True if force-accepts filter accepts passed object.
152      */
153     protected boolean xforceAccepts(Object o) {
154         return false;
155     }
156     
157     /***
158      * Check if URI is accepted by the focus of this scope.
159      * 
160      * This method should be overridden in subclasses.
161      * 
162      * @param o
163      *            the URI to check.
164      * @return True if focus filter accepts passed object.
165      */
166     protected boolean focusAccepts(Object o) {
167         // The CrawlScope doesn't accept any URIs
168         return false;
169     }
170 
171     /***
172      * Check if URI is excluded by any filters.
173      * 
174      * @param o
175      *            the URI to check.
176      * @return True if exclude filter accepts passed object.
177      */
178     @SuppressWarnings("deprecation")
179     protected boolean excludeAccepts(Object o) {
180         return (this.excludeFilter.isEmpty(o)) ? exceedsMaxHops(o)
181                 : this.excludeFilter.accepts(o) || exceedsMaxHops(o);
182     }
183 
184     /***
185      * Check if there are too many hops
186      * 
187      * @param o
188      *            URI to check.
189      * @return true if too many hops.
190      */
191     protected boolean exceedsMaxHops(Object o) {
192         if (!(o instanceof CandidateURI)) {
193             return false;
194         }
195 
196         int maxLinkHops = 0;
197 //        int maxTransHops = 0;
198 
199         try {
200             maxLinkHops = ((Integer) getAttribute(o, ATTR_MAX_LINK_HOPS))
201                     .intValue();
202 //            maxTransHops = ((Integer) getAttribute(o, ATTR_MAX_TRANS_HOPS))
203 //                    .intValue();
204         } catch (AttributeNotFoundException e) {
205             // TODO Auto-generated catch block
206             e.printStackTrace();
207         }
208 
209         CandidateURI cand = (CandidateURI) o;
210 
211         String path = cand.getPathFromSeed();
212         int linkCount = 0;
213         int transCount = 0;
214         for (int i = path.length() - 1; i >= 0; i--) {
215             if (path.charAt(i) == Link.NAVLINK_HOP) {
216                 linkCount++;
217             } else if (linkCount == 0) {
218                 transCount++;
219             }
220         }
221 //      return (linkCount > maxLinkHops) || (transCount > maxTransHops);
222         // base only on links, don't treat trans count as hard max
223         return (linkCount > maxLinkHops);
224     }
225 
226     /***
227      * Take note of a situation (such as settings edit) where involved
228      * reconfiguration (such as reading from external files) may be necessary.
229      */
230     @SuppressWarnings("deprecation")
231     public void kickUpdate() {
232         super.kickUpdate();
233         excludeFilter.kickUpdate();
234     }
235 }