View Javadoc

1   /* HopsPathMatchesRegExpDecideRule
2   *
3   * $Id: HopsPathMatchesRegExpDecideRule.java 4649 2006-09-25 17:16:55Z paul_jack $
4   *
5   * Created on June 23, 2005
6   *
7   * Copyright (C) 2005 Internet Archive.
8   *
9   * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24  */
25  package org.archive.crawler.deciderules;
26  
27  import java.util.logging.Level;
28  import java.util.logging.Logger;
29  
30  import javax.management.AttributeNotFoundException;
31  
32  import org.archive.crawler.datamodel.CandidateURI;
33  import org.archive.crawler.settings.SimpleType;
34  import org.archive.util.TextUtils;
35  
36  /***
37   * Rule applies configured decision to any CrawlURIs whose 'hops-path'
38   * (string like "LLXE" etc.) matches the supplied regexp.
39   *
40   * @author gojomo
41   */
42  public class HopsPathMatchesRegExpDecideRule extends PredicatedDecideRule {
43  
44      private static final long serialVersionUID = -8881013912393934053L;
45  
46      private static final Logger logger =
47          Logger.getLogger(HopsPathMatchesRegExpDecideRule.class.getName());
48      
49      public static final String ATTR_REGEXP = "regexp";
50  
51      /***
52       * Usual constructor. 
53       * @param name
54       */
55      public HopsPathMatchesRegExpDecideRule(String name) {
56          super(name);
57          setDescription("HopsPathMatchesRegExpDecideRule. Applies the " +
58                  "configured decision to URIs whose hops-path (string with " +
59                  "L E R X P etc) matches the supplied regular expression.");
60          addElementToDefinition(new SimpleType(ATTR_REGEXP, "Java regular" +
61              "expression to match.", ""));
62      }
63  
64      /***
65       * Evaluate whether given object (if CandidateURI) has hops-path
66       * matching configured regexp
67       * 
68       * @param object
69       * @return true if regexp is matched
70       */
71      protected boolean evaluate(Object object) {
72          try {
73              String regexp = getRegexp(object);
74              String str = ((CandidateURI)object).getPathFromSeed();
75              boolean result = (regexp == null)?
76                      false: TextUtils.matches(regexp, str);
77              if (logger.isLoggable(Level.FINE)) {
78                  logger.fine("Tested '" + str + "' match with regex '" +
79                          regexp + " and result was " + result);
80              }
81              return result;
82          } catch (ClassCastException e) {
83              // if not CrawlURI, always disregard
84              return false; 
85          }
86      }
87      
88      /*** 
89       * Get the regular expression string to match the URI against.
90       *
91       * @param o the object for which the regular expression should be
92       *          matched against.
93       * @return the regular expression to match against.
94       */
95      protected String getRegexp(Object o) {
96          try {
97              return (String) getAttribute(o, ATTR_REGEXP);
98          } catch (AttributeNotFoundException e) {
99              logger.severe(e.getMessage());
100             return null;  // Basically the filter is inactive if this occurs.
101         }
102     }
103 }