View Javadoc

1   /* $Id: FetchStatusMatchesRegExpDecideRule.java 4649 2006-09-25 17:16:55Z paul_jack $
2   *
3   * Created on Sep 4, 2006
4   *
5   * Copyright (C) 2006 Olaf Freyer.
6   *
7   * This file is part of the Heritrix web crawler (crawler.archive.org).
8   *
9   * Heritrix is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU Lesser Public License as published by
11  * the Free Software Foundation; either version 2.1 of the License, or
12  * any later version.
13  *
14  * Heritrix is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU Lesser Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser Public License
20  * along with Heritrix; if not, write to the Free Software
21  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22  */
23  package org.archive.crawler.deciderules;
24  
25  import java.util.logging.Level;
26  import java.util.logging.Logger;
27  import javax.management.AttributeNotFoundException;
28  
29  import org.archive.crawler.datamodel.CrawlURI;
30  import org.archive.crawler.settings.SimpleType;
31  import org.archive.util.TextUtils;
32  
33  
34  public class FetchStatusMatchesRegExpDecideRule extends PredicatedDecideRule {
35  
36      private static final long serialVersionUID = -3088156729860241312L;
37  
38      private  final Logger logger = Logger.getLogger(this.getClass().getName());
39      
40      public static final String ATTR_REGEXP = "regexp";
41      
42      /***
43       * Usual constructor. 
44       * @param name Name of this DecideRule.
45       */
46      public FetchStatusMatchesRegExpDecideRule(String name) {
47          super(name);
48          setDescription("FetchStatusMatchesRegExpDecideRule. Applies " +
49          	"configured decision to any URI that has a fetch status matching " +
50          	"the given regular expression.");
51          addElementToDefinition(new SimpleType(ATTR_REGEXP, "Java regular" +
52                  "expression to match.", ""));
53      }
54  
55      protected boolean evaluate(Object object) {
56          try {
57              String regexp = getRegexp(object);
58              CrawlURI curi = (CrawlURI)object;
59              String str = String.valueOf(curi.getFetchStatus());
60              boolean result = (regexp == null)?
61                      false: TextUtils.matches(regexp, str);
62              if (logger.isLoggable(Level.FINE)) {
63                  logger.fine("Tested '" + str + "' match with regex '" +
64                          regexp + " and result was " + result);
65              }
66              return result;
67          } catch (ClassCastException e) {
68              // if not CrawlURI, always disregard
69              return false; 
70          }
71      }
72      
73      /*** 
74       * Get the regular expression string to match the URI against.
75       *
76       * @param o the object for which the regular expression should be
77       *          matched against.
78       * @return the regular expression to match against.
79       */
80      protected String getRegexp(Object o) {
81          try {
82              return (String) getAttribute(o, ATTR_REGEXP);
83          } catch (AttributeNotFoundException e) {
84              logger.severe(e.getMessage());
85              return null;  // Basically the filter is inactive if this occurs.
86          }
87      }
88  }