View Javadoc

1   /* SurtPrefixedDecideRule
2   *
3   * $Id: SurtPrefixedDecideRule.java 6704 2009-11-25 01:38:55Z gojomo $
4   *
5   * Created on Apr 5, 2005
6   *
7   * Copyright (C) 2005 Internet Archive.
8   *
9   * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24  */
25  package org.archive.crawler.deciderules;
26  
27  import java.io.File;
28  import java.io.FileOutputStream;
29  import java.io.FileReader;
30  import java.io.IOException;
31  import java.io.OutputStreamWriter;
32  
33  import org.archive.crawler.datamodel.CandidateURI;
34  import org.archive.crawler.framework.CrawlScope;
35  import org.archive.crawler.scope.SeedListener;
36  import org.archive.crawler.settings.SimpleType;
37  import org.archive.crawler.settings.Type;
38  import org.archive.util.SurtPrefixSet;
39  
40  
41  
42  /***
43   * Rule applies configured decision to any URIs that, when 
44   * expressed in SURT form, begin with one of the prefixes
45   * in the configured set. 
46   * 
47   * The set can be filled with SURT prefixes implied or
48   * listed in the seeds file, or another external file. 
49   *
50   * The "also-check-via" option to implement "one hop off" 
51   * scoping derives from a contribution by Shifra Raffel
52   * of the California Digital Library. 
53   * 
54   * @author gojomo
55   */
56  public class SurtPrefixedDecideRule extends PredicatedDecideRule 
57          implements SeedListener {
58  
59      private static final long serialVersionUID = 2075790126085405015L;
60  
61      //private static final Logger logger =
62      //    Logger.getLogger(SurtPrefixedDecideRule.class.getName());
63      
64      public static final String ATTR_SURTS_SOURCE_FILE = "surts-source-file";
65      public static final String ATTR_SEEDS_AS_SURT_PREFIXES =
66          "seeds-as-surt-prefixes";
67      public static final String ATTR_SURTS_DUMP_FILE = "surts-dump-file";
68      
69      private static final Boolean DEFAULT_SEEDS_AS_SURT_PREFIXES =
70          new Boolean(true);
71  
72      /***
73       * Whether every config change should trigger a 
74       * rebuilding of the prefix set.
75       */
76      public static final String 
77          ATTR_REBUILD_ON_RECONFIG = "rebuild-on-reconfig";
78      public static final Boolean
79          DEFAULT_REBUILD_ON_RECONFIG = Boolean.TRUE;
80      
81      /***
82       * Whether the 'via' of CrawlURIs should also be checked
83       * to see if it is prefixed by the set of SURT prefixes
84       */
85      public static final String 
86          ATTR_ALSO_CHECK_VIA = "also-check-via";
87      public static final Boolean
88          DEFAULT_ALSO_CHECK_VIA = Boolean.FALSE;
89      
90      protected SurtPrefixSet surtPrefixes = null;
91  
92      /***
93       * Usual constructor. 
94       * @param name
95       */
96      public SurtPrefixedDecideRule(String name) {
97          super(name);
98          setDescription("SurtPrefixedDecideRule. Makes the configured decision "
99                  + "for any URI which, when expressed in SURT form, begins "
100                 + "with any of the established prefixes (from either seeds "
101                 + "specification or an external file).");
102         addElementToDefinition(new SimpleType(ATTR_SURTS_SOURCE_FILE,
103                 "Source file from which to infer SURT prefixes. Any URLs " +
104                 "in file will be converted to the implied SURT prefix, and " +
105                 "literal SURT prefixes may be listed on lines beginning " +
106                 "with a '+' character.",
107                 ""));
108         addElementToDefinition(new SimpleType(ATTR_SEEDS_AS_SURT_PREFIXES,
109                 "Should seeds also be interpreted as SURT prefixes.",
110                 DEFAULT_SEEDS_AS_SURT_PREFIXES));
111         Type t = addElementToDefinition(new SimpleType(ATTR_SURTS_DUMP_FILE,
112                 "Dump file to save SURT prefixes actually used: " +
113                 "Useful debugging SURTs.", ""));
114         t.setExpertSetting(true);
115         t = addElementToDefinition(new SimpleType(ATTR_ALSO_CHECK_VIA,
116                 "Whether to also make the configured decision if a " +
117                 "URI's 'via' URI (the URI from which it was discovered) " +
118                 "in SURT form begins with any of the established prefixes. " +
119                 "For example, can be used to ACCEPT URIs that are 'one hop " +
120                 "off' URIs fitting the SURT prefixes. Default is false.",
121                 DEFAULT_ALSO_CHECK_VIA));
122         t.setOverrideable(false);
123         t.setExpertSetting(true);
124         t = addElementToDefinition(new SimpleType(ATTR_REBUILD_ON_RECONFIG,
125                 "Whether to rebuild the internal structures from source " +
126                 "files (including seeds if appropriate) every time any " +
127                 "configuration change occurs. If true, " +
128                 "rule is rebuilt from sources even when (for example) " +
129                 "unrelated new domain overrides are set. Rereading large" +
130                 "source files can take a long time.", 
131                 DEFAULT_REBUILD_ON_RECONFIG));
132         t.setOverrideable(false);
133         t.setExpertSetting(true);
134     }
135 
136     /***
137      * Evaluate whether given object's URI is covered by the SURT prefix set
138      * 
139      * @param object Item to evaluate.
140      * @return true if item, as SURT form URI, is prefixed by an item in the set
141      */
142     protected boolean evaluate(Object object) {
143         if ( (object instanceof CandidateURI) && 
144                 ((Boolean) getUncheckedAttribute(null, ATTR_ALSO_CHECK_VIA))
145                     .booleanValue()) {
146             if(evaluate(((CandidateURI)object).getVia())) {
147                 return true;
148             }
149         }
150         String candidateSurt;
151         candidateSurt = SurtPrefixSet.getCandidateSurt(object);
152         if (candidateSurt == null) {
153             return false;
154         }
155         return getPrefixes().containsPrefixOf(candidateSurt);
156     }
157 
158     /***
159      * Synchronized get of prefix set to use
160      * 
161      * @return SurtPrefixSet to use for check
162      */
163     private synchronized SurtPrefixSet getPrefixes() {
164         if (surtPrefixes == null) {
165             readPrefixes();
166         }
167         return surtPrefixes;
168     }
169 
170     protected void readPrefixes() {
171         buildSurtPrefixSet();
172         dumpSurtPrefixSet();
173     }
174     
175     /***
176      * Dump the current prefixes in use to configured dump file (if any)
177      */
178     protected void dumpSurtPrefixSet() {
179         // dump surts to file, if appropriate
180         String dumpPath = (String)getUncheckedAttribute(null,
181             ATTR_SURTS_DUMP_FILE);
182         if (dumpPath.length() > 0) {
183             File dump = new File(dumpPath);
184             if (!dump.isAbsolute()) {
185                 dump = new File(getSettingsHandler().getOrder().getController()
186                     .getDisk(), dumpPath);
187             }
188             try {
189                 OutputStreamWriter fw = 
190                     new OutputStreamWriter(new FileOutputStream(dump),"UTF-8");
191                 try {
192                     surtPrefixes.exportTo(fw);
193                 } finally {
194                     fw.close();
195                 }
196             } catch (IOException e) {
197                 e.printStackTrace();
198                 throw new RuntimeException(e);
199             }
200         }
201     }
202 
203     /***
204      * Construct the set of prefixes to use, from the seed list (
205      * which may include both URIs and '+'-prefixed directives).
206      */
207     protected void buildSurtPrefixSet() {
208         SurtPrefixSet newSurtPrefixes = new SurtPrefixSet();
209         FileReader fr = null;
210 
211         // read SURTs from file, if appropriate
212         String sourcePath = (String)getUncheckedAttribute(null,
213                 ATTR_SURTS_SOURCE_FILE);
214         if (sourcePath.length() > 0) {
215             File source = new File(sourcePath);
216             if (!source.isAbsolute()) {
217                 source = new File(getSettingsHandler().getOrder()
218                     .getController().getDisk(), sourcePath);
219             }
220             try {
221                 fr = new FileReader(source);
222                 try {
223                     newSurtPrefixes.importFromMixed(fr, true);
224                 } finally {
225                     fr.close();
226                 }
227             } catch (IOException e) {
228                 e.printStackTrace();
229                 throw new RuntimeException(e);
230             }
231         }
232         
233         // interpret seeds as surts, if appropriate
234         boolean deduceFromSeeds = ((Boolean)getUncheckedAttribute(null,
235                 ATTR_SEEDS_AS_SURT_PREFIXES)).booleanValue();
236         if(deduceFromSeeds) {
237             try {
238                 fr = new FileReader(getSeedfile());
239                 try {
240                     newSurtPrefixes.importFromMixed(fr, deduceFromSeeds);
241                 } finally {
242                     fr.close();
243                 }
244             } catch (IOException e) {
245                 e.printStackTrace();
246                 throw new RuntimeException(e);
247             }
248         }
249 
250         surtPrefixes = newSurtPrefixes;
251     }
252 
253     /***
254      * Re-read prefixes after an update.
255      * 
256      * @see org.archive.crawler.framework.CrawlScope#kickUpdate()
257      */
258     public synchronized void kickUpdate() {
259         super.kickUpdate();
260         if (((Boolean) getUncheckedAttribute(null, ATTR_REBUILD_ON_RECONFIG))
261                 .booleanValue()) {
262             readPrefixes();
263         }
264         // TODO: make conditional on file having actually changed,
265         // perhaps by remembering mod-time
266     }
267 
268     /***
269      * Dig through everything to get the crawl-global seeds file. 
270      * Add self as listener while at it. 
271      * 
272      * @return Seed list file
273      */
274     protected File getSeedfile() {
275         CrawlScope scope =
276             getSettingsHandler().getOrder().getController().getScope();
277         scope.addSeedListener(this);
278         return scope.getSeedfile();
279     }
280 
281     public synchronized void addedSeed(final CandidateURI curi) {
282         SurtPrefixSet newSurtPrefixes = (SurtPrefixSet) surtPrefixes.clone();
283         newSurtPrefixes.add(prefixFrom(curi.toString()));
284         surtPrefixes = newSurtPrefixes;
285     }
286     
287     protected String prefixFrom(String uri) {
288     	return SurtPrefixSet.prefixFromPlain(uri);
289     }
290 }