View Javadoc

1   /* ClassKeyMatchesRegExpDecideRule
2   *
3   * $Id: ClassKeyMatchesRegExpDecideRule.java 4649 2006-09-25 17:16:55Z paul_jack $
4   *
5   * Created on Apr 4, 2005
6   *
7   * Copyright (C) 2005 Internet Archive.
8   *
9   * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24  */
25  package org.archive.crawler.deciderules;
26  
27  import java.util.logging.Level;
28  import java.util.logging.Logger;
29  
30  import org.archive.crawler.datamodel.CandidateURI;
31  import org.archive.util.TextUtils;
32  
33  
34  
35  /***
36   * Rule applies configured decision to any CrawlURI class key -- i.e.
37   * {@link CandidateURI#getClassKey()} -- matches matches supplied regexp.
38   *
39   * @author gojomo
40   */
41  public class ClassKeyMatchesRegExpDecideRule extends MatchesRegExpDecideRule {
42  
43      private static final long serialVersionUID = 1178873944436973294L;
44  
45      private static final Logger logger =
46          Logger.getLogger(ClassKeyMatchesRegExpDecideRule.class.getName());
47  
48      /***
49       * Usual constructor. 
50       * @param name
51       */
52      public ClassKeyMatchesRegExpDecideRule(String name) {
53          super(name);
54          setDescription("ClassKeyMatchesRegExpDecideRule. " +
55              "Applies the configured " +
56              "decision to class keys matching the supplied " +
57              "regular expression. Class keys are values set into " +
58              "an URL by the Frontier. They are usually the names " +
59              "of queues used by the Frontier. Class keys can " +
60              "look like hostname + port or be plain IPs (It will " +
61              "depend on the Frontier implementation/configuration).");
62      }
63  
64      /***
65       * Evaluate passed object.
66       * Test first that its CandidateURI.  If so, does it have a class key.
67       * If not, ask frontier for its classkey.  Then test against regex.
68       * 
69       * @param object
70       * @return true if regexp is matched
71       */
72      protected boolean evaluate(Object object) {
73          try {
74              CandidateURI cauri = (CandidateURI)object;
75              String classKey = cauri.getClassKey();
76              if (classKey == null || classKey.length() <= 0) {
77                  classKey = getSettingsHandler().getOrder().getController().
78                      getFrontier().getClassKey(cauri);
79                  cauri.setClassKey(classKey);
80              }
81              String regexp = getRegexp(cauri);
82              boolean result = (regexp == null)?
83                  false: TextUtils.matches(regexp, cauri.getClassKey());
84              if (logger.isLoggable(Level.FINE)) {
85                  logger.fine("Tested '" + cauri.getClassKey() +
86                      "' match with regex '" + regexp + " and result was " +
87                      result);
88              }
89              return result;
90          } catch (ClassCastException e) {
91              // if not CrawlURI, always disregard
92              return false; 
93          }
94      }
95  }