View Javadoc

1   /* Copyright (C) 2003 Internet Archive.
2    *
3    * This file is part of the Heritrix web crawler (crawler.archive.org).
4    *
5    * Heritrix is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU Lesser Public License as published by
7    * the Free Software Foundation; either version 2.1 of the License, or
8    * any later version.
9    *
10   * Heritrix is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU Lesser Public License for more details.
14   *
15   * You should have received a copy of the GNU Lesser Public License
16   * along with Heritrix; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   *
19   * CrawlScope.java
20   * Created on Oct 1, 2003
21   *
22   * $Header$
23   */
24  package org.archive.crawler.scope;
25  
26  
27  /***
28   * A CrawlScope instance defines which URIs are "in"
29   * a particular crawl.
30   *
31   * It is essentially a Filter which determines, looking at
32   * the totality of information available about a
33   * CandidateURI/CrawlURI instance, if that URI should be
34   * scheduled for crawling.
35   *
36   * <p>Dynamic information inherent in the discovery of the
37   * URI -- such as the path by which it was discovered --
38   * may be considered.
39   *
40   * <p>Dynamic information which requires the consultation
41   * of external and potentially volatile information --
42   * such as current robots.txt requests and the history
43   * of attempts to crawl the same URI -- should NOT be
44   * considered. Those potentially high-latency decisions
45   * should be made at another step.
46   *
47   * @author gojomo
48   *
49   */
50  public class BroadScope extends ClassicScope {
51  
52      private static final long serialVersionUID = -2354234238454865888L;
53  
54      /***
55       * Constructor.
56       *
57       * @param name Name of this crawlscope.
58       */
59      public BroadScope(String name) {
60          super(name);
61          setDescription("BroadScope: A scope for broad crawls. Crawls made" +
62          " with this scope will not be limited to the hosts or domains of" +
63          " its seeds. NOTE: BroadScoped crawls will eventually run out of" +
64          " memory (See Release Notes).");
65      }
66  
67      /***
68       * @param o the URI to check.
69       * @return True if transitive filter accepts passed object.
70       */
71      protected boolean transitiveAccepts(Object o) {
72          return true;
73      }
74  
75      /*** Check if URI is accepted by the focus of this scope.
76       *
77       * This method should be overridden in subclasses.
78       *
79       * @param o the URI to check.
80       * @return True if focus filter accepts passed object.
81       */
82      protected boolean focusAccepts(Object o) {
83          return true;
84      }
85  }