View Javadoc

1   /* OnHostsDecideRule
2   *
3   * $Id: OnHostsDecideRule.java 4649 2006-09-25 17:16:55Z paul_jack $
4   *
5   * Created on Apr 5, 2005
6   *
7   * Copyright (C) 2005 Internet Archive.
8   *
9   * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24  */
25  package org.archive.crawler.deciderules;
26  
27  
28  import org.archive.util.SurtPrefixSet;
29  
30  
31  /***
32   * Rule applies configured decision to any URIs that
33   * are on one of the hosts in the configured set of
34   * hosts, filled from the seed set. 
35   *
36   * @author gojomo
37   */
38  public class OnHostsDecideRule extends SurtPrefixedDecideRule {
39  
40      private static final long serialVersionUID = -7566348189389792625L;
41  
42      //private static final Logger logger =
43      //    Logger.getLogger(OnHostsDecideRule.class.getName());
44      /***
45       * Usual constructor. 
46       * @param name
47       */
48      public OnHostsDecideRule(String name) {
49          super(name);
50          setDescription(
51                   "OnHostsDecideRule. Makes the configured decision " +
52                   "for any URI which is on one of the hosts in the " +
53                   "configured set of hostnames (derived from the seed" +
54                   "list).");
55          // disable direct setting of SURTs-related options
56         getElementFromDefinition(ATTR_SEEDS_AS_SURT_PREFIXES).setTransient(true);
57         getElementFromDefinition(ATTR_SURTS_SOURCE_FILE).setTransient(true);
58         // leaving surts-dump as option helpful for debugging/learning, for now
59         //getElementFromDefinition(ATTR_SURTS_DUMP_FILE).setTransient(true);
60      }
61  
62      /***
63       * Patch the SURT prefix set so that it only includes host-enforcing prefixes
64       * 
65       * @see org.archive.crawler.deciderules.SurtPrefixedDecideRule#readPrefixes()
66       */
67      protected void readPrefixes() {
68          buildSurtPrefixSet();
69          surtPrefixes.convertAllPrefixesToHosts();
70          dumpSurtPrefixSet();
71      }
72  
73  	protected String prefixFrom(String uri) {
74  		return SurtPrefixSet.convertPrefixToHost(super.prefixFrom(uri));
75  	}
76  }