View Javadoc

1   /* MatchesRegExpDecideRule
2   *
3   * $Id: MatchesRegExpDecideRule.java 4649 2006-09-25 17:16:55Z paul_jack $
4   *
5   * Created on Apr 4, 2005
6   *
7   * Copyright (C) 2005 Internet Archive.
8   *
9   * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24  */
25  package org.archive.crawler.deciderules;
26  
27  import java.util.logging.Level;
28  import java.util.logging.Logger;
29  
30  import javax.management.AttributeNotFoundException;
31  
32  import org.archive.crawler.settings.SimpleType;
33  import org.archive.util.TextUtils;
34  
35  
36  
37  /***
38   * Rule applies configured decision to any CrawlURIs whose String URI
39   * matches the supplied regexp.
40   *
41   * @author gojomo
42   */
43  public class MatchesRegExpDecideRule extends PredicatedDecideRule {
44  
45      private static final long serialVersionUID = 6441410917074319295L;
46  
47      private static final Logger logger =
48          Logger.getLogger(MatchesRegExpDecideRule.class.getName());
49      
50      public static final String ATTR_REGEXP = "regexp";
51  
52      /***
53       * Usual constructor. 
54       * @param name
55       */
56      public MatchesRegExpDecideRule(String name) {
57          super(name);
58          setDescription("MatchesRegExpDecideRule. Applies the configured " +
59              "decision to URIs matching the supplied regular expression.");
60          addElementToDefinition(new SimpleType(ATTR_REGEXP, "Java regular" +
61              "expression to match.", ""));
62      }
63  
64      /***
65       * Evaluate whether given object's string version
66       * matches configured regexp
67       * 
68       * @param object
69       * @return true if regexp is matched
70       */
71      protected boolean evaluate(Object object) {
72          try {
73              String regexp = getRegexp(object);
74              String str = object.toString();
75              boolean result = (regexp == null)?
76                      false: TextUtils.matches(regexp, str);
77              if (logger.isLoggable(Level.FINE)) {
78                  logger.fine("Tested '" + str + "' match with regex '" +
79                          regexp + " and result was " + result);
80              }
81              return result;
82          } catch (ClassCastException e) {
83              // if not CrawlURI, always disregard
84              return false; 
85          }
86      }
87      
88      /*** 
89       * Get the regular expression string to match the URI against.
90       *
91       * @param o the object for which the regular expression should be
92       *          matched against.
93       * @return the regular expression to match against.
94       */
95      protected String getRegexp(Object o) {
96          try {
97              return (String) getAttribute(o, ATTR_REGEXP);
98          } catch (AttributeNotFoundException e) {
99              logger.severe(e.getMessage());
100             return null;  // Basically the filter is inactive if this occurs.
101         }
102     }
103 }