View Javadoc

1   /* MatchesListRegExpDecideRule
2    * 
3    * $Id: MatchesListRegExpDecideRule.java 4649 2006-09-25 17:16:55Z paul_jack $
4    * 
5    * Created on 30.5.2005
6    *
7    * Copyright (C) 2005 Kristinn Sigurdsson
8    * 
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   * 
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   * 
16   * Heritrix is distributed in the hope that it will be useful, 
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   * 
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.crawler.deciderules;
26  
27  import java.util.Iterator;
28  import java.util.List;
29  import java.util.logging.Level;
30  import java.util.logging.Logger;
31  
32  import javax.management.AttributeNotFoundException;
33  
34  import org.archive.crawler.settings.SimpleType;
35  import org.archive.crawler.settings.StringList;
36  import org.archive.util.TextUtils;
37  
38  
39  /***
40   * Rule applies configured decision to any CrawlURIs whose String URI
41   * matches the supplied regexps.
42   * <p>
43   * The list of regular expressions can be considered logically AND or OR.
44   *
45   * @author Kristinn Sigurdsson
46   * 
47   * @see MatchesRegExpDecideRule
48   */
49  public class MatchesListRegExpDecideRule extends PredicatedDecideRule {
50  
51      private static final long serialVersionUID = 3011579758573454930L;
52  
53      private static final Logger logger =
54          Logger.getLogger(MatchesListRegExpDecideRule.class.getName());
55      
56      public static final String ATTR_REGEXP_LIST = "regexp-list";
57      public static final String ATTR_LIST_LOGIC= "list-logic";
58      
59      public static final String DEFAULT_LIST_LOGIC = "OR";
60      public static final String[] LEGAL_LIST_LOGIC = {"OR","AND"};
61  
62      /***
63       * Usual constructor. 
64       * @param name
65       */
66      public MatchesListRegExpDecideRule(String name) {
67          super(name);
68          setDescription("MatchesListRegExpDecideRule. Applies the configured " +
69              "decision to URIs matching the supplied regular expressions.\n" +
70              "The list of regular expressions can be considered logically AND " +
71              "or OR.");
72          addElementToDefinition(
73                  new SimpleType(ATTR_LIST_LOGIC, "Should the list of regular " +
74                      "expressions be considered as logically AND or OR when " +
75                      "matching.", 
76                      DEFAULT_LIST_LOGIC, LEGAL_LIST_LOGIC));
77          addElementToDefinition(new StringList(ATTR_REGEXP_LIST,"The list of " +
78               "regular expressions to evalute against the URI."));
79      }
80  
81      /***
82       * Evaluate whether given object's string version
83       * matches configured regexps
84       * 
85       * @param o
86       * @return true if regexps are matched
87       */
88      protected boolean evaluate(Object o) {
89          try {
90              List regexps = getRegexp(o);
91              if(regexps.size()==0){
92                  return false;
93              }
94              String str = o.toString();
95              Iterator it = regexps.iterator();
96              
97              boolean listLogicOR = isListLogicOR(o);
98              // Result is initialized so that if OR based the default assumption is
99              // false (find no matches) but if AND based the default assumption is
100             // true (finds no non-matches)
101             boolean result = listLogicOR == false;
102             
103             while(it.hasNext()){
104                 String regexp = (String)it.next();
105                 boolean matches = TextUtils.matches(regexp, str);
106 
107                 if (logger.isLoggable(Level.FINER)) {
108                     logger.finer("Tested '" + str + "' match with regex '" +
109                         regexp + " and result was " + matches);
110                 }
111                 
112                 if(matches){
113                     if(listLogicOR){
114                         // OR based and we just got a match, done!
115                         result = true;
116                         break;
117                     }
118                 } else {
119                     if(listLogicOR == false){
120                         // AND based and we just found a non-match, done!
121                         result = false;
122                         break;
123                     }
124                 }
125             }
126             
127             if (logger.isLoggable(Level.FINE) && result){
128                 logger.fine("Matched: " + str);
129             }
130             
131             return result;
132         } catch (ClassCastException e) {
133             // if not CrawlURI, always disregard
134             return false; 
135         }
136     }
137     
138     /*** 
139      * Get the regular expressions list to match the URI against.
140      *
141      * @param o the object for which the regular expression should be
142      *          matched against.
143      * @return the regular expression to match against.
144      */
145     protected List getRegexp(Object o) {
146         try {
147             return (StringList) getAttribute(o, ATTR_REGEXP_LIST);
148         } catch (AttributeNotFoundException e) {
149             logger.severe(e.getMessage());
150             // Basically the filter is inactive if this occurs
151             // (The caller should be returning false when regexp is null).
152             return null;  
153         }
154     }
155     
156     protected boolean isListLogicOR(Object o){
157         String logic = DEFAULT_LIST_LOGIC;
158         try {
159             logic = (String) getAttribute(o, ATTR_LIST_LOGIC);
160         } catch (AttributeNotFoundException e) {
161             logger.severe(e.getMessage());
162         }
163         return logic.equals("OR") ? true : false;
164     }
165 }