View Javadoc

1   /* Copyright (C) 2003 Internet Archive.
2    *
3    * This file is part of the Heritrix web crawler (crawler.archive.org).
4    *
5    * Heritrix is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU Lesser Public License as published by
7    * the Free Software Foundation; either version 2.1 of the License, or
8    * any later version.
9    *
10   * Heritrix is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU Lesser Public License for more details.
14   *
15   * You should have received a copy of the GNU Lesser Public License
16   * along with Heritrix; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   *
19   * BasicScope.java
20   * Created on Oct 1, 2003
21   *
22   * $Header$
23   */
24  package org.archive.crawler.scope;
25  
26  import java.util.Iterator;
27  import java.util.logging.Logger;
28  
29  import org.apache.commons.httpclient.URIException;
30  import org.archive.crawler.deciderules.DecidingScope;
31  import org.archive.crawler.filter.FilePatternFilter;
32  import org.archive.crawler.filter.TransclusionFilter;
33  import org.archive.crawler.framework.Filter;
34  import org.archive.net.UURI;
35  
36  /***
37   * A core CrawlScope suitable for the most common
38   * crawl needs.
39   *
40   * Roughly, its logic is that a URI is included if:
41   *
42   *    (( isSeed(uri) || focusFilter.accepts(uri) )
43   *      || transitiveFilter.accepts(uri) )
44   *     && ! excludeFilter.accepts(uri)
45   *
46   * The focusFilter may be specified by either:
47   *   - adding a 'mode' attribute to the
48   *     <code>scope</code> element. mode="broad" is equivalent
49   *     to no focus; modes "path", "host", and "domain"
50   *     imply a SeedExtensionFilter will be used, with
51   *     the <code>scope</code> element providing its configuration
52   *   - adding a <code>focus</code> subelement
53   * If unspecified, the focusFilter will default to
54   * an accepts-all filter.
55   *
56   * The transitiveFilter may be specified by supplying
57   * a <code>transitive</code> subelement. If unspecified, a
58   * TransclusionFilter will be used, with the <code>scope</code>
59   * element providing its configuration.
60   *
61   * The excludeFilter may be specified by supplying
62   * a <code>exclude</code> subelement. If unspecified, a
63   * accepts-none filter will be used -- meaning that
64   * no URIs will pass the filter and thus be excluded.
65   *
66   * @author gojomo
67   * @deprecated As of release 1.10.0.  Replaced by {@link DecidingScope}.
68   */
69  public class PathScope extends SeedCachingScope {
70  
71      private static final long serialVersionUID = -2217024073240277527L;
72  
73      private static Logger logger =
74          Logger.getLogger("org.archive.crawler.basic.PathScope");
75  
76      public static final String ATTR_TRANSITIVE_FILTER = "transitiveFilter";
77      public static final String ATTR_ADDITIONAL_FOCUS_FILTER =
78          "additionalScopeFocus";
79  
80      Filter additionalFocusFilter;
81      Filter transitiveFilter;
82  
83      public PathScope(String name) {
84          super(name);
85          setDescription(
86              "PathScope: A scope for path crawls *Deprecated* Use " +
87              "DecidingScope instead. Crawls made with this scope" +
88              " will be limited to a specific portion of the hosts its seeds" +
89              " provide. More specifically the paths those seeds provide." +
90              " For example if one of the seeds is 'archive.org/example/'" + 
91              " all URIs under the path 'examples' will be crawled (like" +
92              " 'archive.org/examples/hello.html') but not URIs in other" +
93              " paths or root (i.e. 'archive.org/index.html).");
94          this.additionalFocusFilter = (Filter) addElementToDefinition(
95                  new FilePatternFilter(ATTR_ADDITIONAL_FOCUS_FILTER));
96          this.transitiveFilter = (Filter) addElementToDefinition(
97                  new TransclusionFilter(ATTR_TRANSITIVE_FILTER));
98      }
99  
100     /***
101      * @param o
102      * @return True if transitive filter accepts passed object.
103      */
104     protected boolean transitiveAccepts(Object o) {
105         if (this.transitiveFilter == null) {
106             return true;
107         }
108         return this.transitiveFilter.accepts(o);
109     }
110 
111     /***
112      * @param o
113      * @return True if focus filter accepts passed object.
114      */
115     protected boolean focusAccepts(Object o) {
116         UURI u = UURI.from(o);
117         if (u == null) {
118             return false;
119         }
120         // Get the seeds to refresh 
121         Iterator iter = seedsIterator();
122         while(iter.hasNext()) {
123             UURI s = (UURI) iter.next();
124             if (isSameHost(s, u)) {
125                 try {
126                     // Protect against non-parseable URIs. See
127                     // "[ 910120 ] java.net.URI#getHost fails when
128                     // leading digit"
129                     if (s.getPath() == null || u.getPath() == null) {
130                         continue;
131                     }
132                 }
133                 catch (URIException e) {
134                     logger.severe("Failed get path on " + u + " or " + s +
135                         ": " + e.getMessage());
136                 }
137                 try {
138                     if (s.getPath().regionMatches(0, u.getPath(), 0,
139                         s.getPath().lastIndexOf('/'))) {
140                         // matches up to last '/'
141                         checkClose(iter);
142                         return true;
143                     } else {
144                         // no match; try next seed
145                         continue;
146                     }
147                 }
148                 catch (URIException e) {
149                     logger.severe("Failed get path on " + u + " or " + s +
150                         ": " + e.getMessage());
151                 }
152             }
153         }
154         // if none found, fail
155         checkClose(iter);
156         return false;
157     }
158 
159     // Javadoc inherited
160     @Override
161     protected boolean additionalFocusAccepts(Object o) {
162         return this.additionalFocusFilter.accepts(o);
163     }
164 
165 }