1 /* OnHostsDecideRule
2 *
3 * $Id: OnHostsDecideRule.java 4649 2006-09-25 17:16:55Z paul_jack $
4 *
5 * Created on Apr 5, 2005
6 *
7 * Copyright (C) 2005 Internet Archive.
8 *
9 * This file is part of the Heritrix web crawler (crawler.archive.org).
10 *
11 * Heritrix is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU Lesser Public License as published by
13 * the Free Software Foundation; either version 2.1 of the License, or
14 * any later version.
15 *
16 * Heritrix is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU Lesser Public License for more details.
20 *
21 * You should have received a copy of the GNU Lesser Public License
22 * along with Heritrix; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 */
25 package org.archive.crawler.deciderules;
26
27
28 import org.archive.util.SurtPrefixSet;
29
30
31 /***
32 * Rule applies configured decision to any URIs that
33 * are on one of the hosts in the configured set of
34 * hosts, filled from the seed set.
35 *
36 * @author gojomo
37 */
38 public class OnHostsDecideRule extends SurtPrefixedDecideRule {
39
40 private static final long serialVersionUID = -7566348189389792625L;
41
42 //private static final Logger logger =
43 // Logger.getLogger(OnHostsDecideRule.class.getName());
44 /***
45 * Usual constructor.
46 * @param name
47 */
48 public OnHostsDecideRule(String name) {
49 super(name);
50 setDescription(
51 "OnHostsDecideRule. Makes the configured decision " +
52 "for any URI which is on one of the hosts in the " +
53 "configured set of hostnames (derived from the seed" +
54 "list).");
55 // disable direct setting of SURTs-related options
56 getElementFromDefinition(ATTR_SEEDS_AS_SURT_PREFIXES).setTransient(true);
57 getElementFromDefinition(ATTR_SURTS_SOURCE_FILE).setTransient(true);
58 // leaving surts-dump as option helpful for debugging/learning, for now
59 //getElementFromDefinition(ATTR_SURTS_DUMP_FILE).setTransient(true);
60 }
61
62 /***
63 * Patch the SURT prefix set so that it only includes host-enforcing prefixes
64 *
65 * @see org.archive.crawler.deciderules.SurtPrefixedDecideRule#readPrefixes()
66 */
67 protected void readPrefixes() {
68 buildSurtPrefixSet();
69 surtPrefixes.convertAllPrefixesToHosts();
70 dumpSurtPrefixSet();
71 }
72
73 protected String prefixFrom(String uri) {
74 return SurtPrefixSet.convertPrefixToHost(super.prefixFrom(uri));
75 }
76 }