View Javadoc

1   /* Copyright (C) 2003 Internet Archive.
2    *
3    * This file is part of the Heritrix web crawler (crawler.archive.org).
4    *
5    * Heritrix is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU Lesser Public License as published by
7    * the Free Software Foundation; either version 2.1 of the License, or
8    * any later version.
9    *
10   * Heritrix is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU Lesser Public License for more details.
14   *
15   * You should have received a copy of the GNU Lesser Public License
16   * along with Heritrix; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   *
19   * HostScope.java
20   * Created on Oct 1, 2003
21   *
22   * $Header$
23   */
24  package org.archive.crawler.scope;
25  
26  import java.util.Iterator;
27  
28  import org.archive.crawler.deciderules.DecidingScope;
29  import org.archive.crawler.filter.FilePatternFilter;
30  import org.archive.crawler.filter.TransclusionFilter;
31  import org.archive.crawler.framework.Filter;
32  import org.archive.net.UURI;
33  
34  /***
35   * A core CrawlScope suitable for the most common
36   * crawl needs.
37   *
38   * Roughly, its logic is that a URI is included if:
39   *
40   *    (( isSeed(uri) || focusFilter.accepts(uri) )
41   *      || transitiveFilter.accepts(uri) )
42   *     && ! excludeFilter.accepts(uri)
43   *
44   * The focusFilter may be specified by either:
45   *   - adding a 'mode' attribute to the
46   *     <code>scope</code> element. mode="broad" is equivalent
47   *     to no focus; modes "path", "host", and "domain"
48   *     imply a SeedExtensionFilter will be used, with
49   *     the <code>scope</code> element providing its configuration
50   *   - adding a <code>focus</code> subelement
51   * If unspecified, the focusFilter will default to
52   * an accepts-all filter.
53   *
54   * The transitiveFilter may be specified by supplying
55   * a <code>transitive</code> subelement. If unspecified, a
56   * TransclusionFilter will be used, with the <code>scope</code>
57   * element providing its configuration.
58   *
59   * The excludeFilter may be specified by supplying
60   * a <code>exclude</code> subelement. If unspecified, a
61   * accepts-none filter will be used -- meaning that
62   * no URIs will pass the filter and thus be excluded.
63   *
64   * @author gojomo
65   * @deprecated As of release 1.10.0.  Replaced by {@link DecidingScope}.
66   */
67  public class HostScope extends SeedCachingScope {
68  
69      private static final long serialVersionUID = -6257664892667267266L;
70  
71      public static final String ATTR_TRANSITIVE_FILTER = "transitiveFilter";
72      public static final String ATTR_ADDITIONAL_FOCUS_FILTER =
73          "additionalScopeFocus";
74  
75      Filter additionalFocusFilter;
76      Filter transitiveFilter;
77  
78      public HostScope(String name) {
79          super(name);
80          setDescription(
81              "HostScope: A scope for host crawls *Deprecated* Use " +
82              "DecidingScope instead. Crawls made with this scope" +
83              " will be limited to the hosts its seeds. Thus if one of" +
84              " the seeds is 'archive.org' the subdomain" +
85              " 'crawler.archive.org' will not be crawled." +
86              " 'www.host' is considered to be the same as host.");
87         additionalFocusFilter = (Filter) addElementToDefinition(
88                  new FilePatternFilter(ATTR_ADDITIONAL_FOCUS_FILTER));
89          this.transitiveFilter = (Filter) addElementToDefinition(
90                  new TransclusionFilter(ATTR_TRANSITIVE_FILTER));
91      }
92  
93      /***
94       * @param o
95       * @return True if transitive filter accepts passed object.
96       */
97      protected boolean transitiveAccepts(Object o) {
98          if (this.transitiveFilter == null) {
99              return true;
100         }
101         return this.transitiveFilter.accepts(o);
102     }
103 
104     /***
105      * @param o
106      * @return True if focus filter accepts passed object.
107      */
108     protected boolean focusAccepts(Object o) {
109         UURI u = UURI.from(o);
110         if (u == null) {
111             return false;
112         }
113         // Get the seeds to refresh 
114         Iterator iter = seedsIterator();
115         while(iter.hasNext()) {
116             if (isSameHost((UURI)iter.next(), u)) {
117                 checkClose(iter);
118                 return true;
119             }
120         }
121         // if none found, fail
122         checkClose(iter);
123         return false;
124     }
125 
126    
127     // Javadoc inherited.
128     @Override
129     protected boolean additionalFocusAccepts(Object o) {
130         return additionalFocusFilter.accepts(o);
131     }
132 
133 }