View Javadoc

1   /* RegularExpressionCriteria
2    *
3    * $Id: RegularExpressionCriteria.java 3704 2005-07-18 17:30:21Z stack-sf $
4    *
5    * Created on Apr 8, 2004
6    *
7    * Copyright (C) 2004 Internet Archive.
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.crawler.settings.refinements;
26  
27  import org.archive.net.UURI;
28  import org.archive.util.TextUtils;
29  
30  
31  /***
32   * A refinement criteria that test if a URI matches a regular expression.
33   *
34   * @author John Erik Halse
35   */
36  public class RegularExpressionCriteria implements Criteria {
37      private String regexp = "";
38  
39      /***
40       * Create a new instance of RegularExpressionCriteria.
41       */
42      public RegularExpressionCriteria() {
43          super();
44      }
45  
46      /***
47       * Create a new instance of RegularExpressionCriteria initializing it with
48       * a regular expression.
49       *
50       * @param regexp the regular expression for this criteria.
51       */
52      public RegularExpressionCriteria(String regexp) {
53          setRegexp(regexp);
54      }
55  
56      /* (non-Javadoc)
57       * @see org.archive.crawler.settings.refinements.Criteria#isWithinRefinementBounds(org.archive.crawler.datamodel.UURI, int)
58       */
59      public boolean isWithinRefinementBounds(UURI uri) {
60          return (uri == null || uri == null)?
61              false: TextUtils.matches(regexp, uri.toString());
62      }
63  
64      /***
65       * Get the regular expression to be matched against a URI.
66       *
67       * @return Returns the regexp.
68       */
69      public String getRegexp() {
70          return regexp;
71      }
72      /***
73       * Set the regular expression to be matched against a URI.
74       *
75       * @param regexp The regexp to set.
76       */
77      public void setRegexp(String regexp) {
78          this.regexp = regexp;
79      }
80  
81      /* (non-Javadoc)
82       * @see org.archive.crawler.settings.refinements.Criteria#getName()
83       */
84      public String getName() {
85          return "Regular expression criteria";
86      }
87  
88      /* (non-Javadoc)
89       * @see org.archive.crawler.settings.refinements.Criteria#getDescription()
90       */
91      public String getDescription() {
92          return "Accept URIs that match the following regular expression: "
93              + getRegexp();
94      }
95  }