1 /* RegularExpressionCriteria
2 *
3 * $Id: RegularExpressionCriteria.java 3704 2005-07-18 17:30:21Z stack-sf $
4 *
5 * Created on Apr 8, 2004
6 *
7 * Copyright (C) 2004 Internet Archive.
8 *
9 * This file is part of the Heritrix web crawler (crawler.archive.org).
10 *
11 * Heritrix is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU Lesser Public License as published by
13 * the Free Software Foundation; either version 2.1 of the License, or
14 * any later version.
15 *
16 * Heritrix is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU Lesser Public License for more details.
20 *
21 * You should have received a copy of the GNU Lesser Public License
22 * along with Heritrix; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 */
25 package org.archive.crawler.settings.refinements;
26
27 import org.archive.net.UURI;
28 import org.archive.util.TextUtils;
29
30
31 /***
32 * A refinement criteria that test if a URI matches a regular expression.
33 *
34 * @author John Erik Halse
35 */
36 public class RegularExpressionCriteria implements Criteria {
37 private String regexp = "";
38
39 /***
40 * Create a new instance of RegularExpressionCriteria.
41 */
42 public RegularExpressionCriteria() {
43 super();
44 }
45
46 /***
47 * Create a new instance of RegularExpressionCriteria initializing it with
48 * a regular expression.
49 *
50 * @param regexp the regular expression for this criteria.
51 */
52 public RegularExpressionCriteria(String regexp) {
53 setRegexp(regexp);
54 }
55
56 /* (non-Javadoc)
57 * @see org.archive.crawler.settings.refinements.Criteria#isWithinRefinementBounds(org.archive.crawler.datamodel.UURI, int)
58 */
59 public boolean isWithinRefinementBounds(UURI uri) {
60 return (uri == null || uri == null)?
61 false: TextUtils.matches(regexp, uri.toString());
62 }
63
64 /***
65 * Get the regular expression to be matched against a URI.
66 *
67 * @return Returns the regexp.
68 */
69 public String getRegexp() {
70 return regexp;
71 }
72 /***
73 * Set the regular expression to be matched against a URI.
74 *
75 * @param regexp The regexp to set.
76 */
77 public void setRegexp(String regexp) {
78 this.regexp = regexp;
79 }
80
81 /* (non-Javadoc)
82 * @see org.archive.crawler.settings.refinements.Criteria#getName()
83 */
84 public String getName() {
85 return "Regular expression criteria";
86 }
87
88 /* (non-Javadoc)
89 * @see org.archive.crawler.settings.refinements.Criteria#getDescription()
90 */
91 public String getDescription() {
92 return "Accept URIs that match the following regular expression: "
93 + getRegexp();
94 }
95 }