1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler.deciderules;
26
27 import java.util.Iterator;
28 import java.util.List;
29 import java.util.logging.Level;
30 import java.util.logging.Logger;
31
32 import javax.management.AttributeNotFoundException;
33
34 import org.archive.crawler.settings.SimpleType;
35 import org.archive.crawler.settings.StringList;
36 import org.archive.util.TextUtils;
37
38
39 /***
40 * Rule applies configured decision to any CrawlURIs whose String URI
41 * matches the supplied regexps.
42 * <p>
43 * The list of regular expressions can be considered logically AND or OR.
44 *
45 * @author Kristinn Sigurdsson
46 *
47 * @see MatchesRegExpDecideRule
48 */
49 public class MatchesListRegExpDecideRule extends PredicatedDecideRule {
50
51 private static final long serialVersionUID = 3011579758573454930L;
52
53 private static final Logger logger =
54 Logger.getLogger(MatchesListRegExpDecideRule.class.getName());
55
56 public static final String ATTR_REGEXP_LIST = "regexp-list";
57 public static final String ATTR_LIST_LOGIC= "list-logic";
58
59 public static final String DEFAULT_LIST_LOGIC = "OR";
60 public static final String[] LEGAL_LIST_LOGIC = {"OR","AND"};
61
62 /***
63 * Usual constructor.
64 * @param name
65 */
66 public MatchesListRegExpDecideRule(String name) {
67 super(name);
68 setDescription("MatchesListRegExpDecideRule. Applies the configured " +
69 "decision to URIs matching the supplied regular expressions.\n" +
70 "The list of regular expressions can be considered logically AND " +
71 "or OR.");
72 addElementToDefinition(
73 new SimpleType(ATTR_LIST_LOGIC, "Should the list of regular " +
74 "expressions be considered as logically AND or OR when " +
75 "matching.",
76 DEFAULT_LIST_LOGIC, LEGAL_LIST_LOGIC));
77 addElementToDefinition(new StringList(ATTR_REGEXP_LIST,"The list of " +
78 "regular expressions to evalute against the URI."));
79 }
80
81 /***
82 * Evaluate whether given object's string version
83 * matches configured regexps
84 *
85 * @param o
86 * @return true if regexps are matched
87 */
88 protected boolean evaluate(Object o) {
89 try {
90 List regexps = getRegexp(o);
91 if(regexps.size()==0){
92 return false;
93 }
94 String str = o.toString();
95 Iterator it = regexps.iterator();
96
97 boolean listLogicOR = isListLogicOR(o);
98
99
100
101 boolean result = listLogicOR == false;
102
103 while(it.hasNext()){
104 String regexp = (String)it.next();
105 boolean matches = TextUtils.matches(regexp, str);
106
107 if (logger.isLoggable(Level.FINER)) {
108 logger.finer("Tested '" + str + "' match with regex '" +
109 regexp + " and result was " + matches);
110 }
111
112 if(matches){
113 if(listLogicOR){
114
115 result = true;
116 break;
117 }
118 } else {
119 if(listLogicOR == false){
120
121 result = false;
122 break;
123 }
124 }
125 }
126
127 if (logger.isLoggable(Level.FINE) && result){
128 logger.fine("Matched: " + str);
129 }
130
131 return result;
132 } catch (ClassCastException e) {
133
134 return false;
135 }
136 }
137
138 /***
139 * Get the regular expressions list to match the URI against.
140 *
141 * @param o the object for which the regular expression should be
142 * matched against.
143 * @return the regular expression to match against.
144 */
145 protected List getRegexp(Object o) {
146 try {
147 return (StringList) getAttribute(o, ATTR_REGEXP_LIST);
148 } catch (AttributeNotFoundException e) {
149 logger.severe(e.getMessage());
150
151
152 return null;
153 }
154 }
155
156 protected boolean isListLogicOR(Object o){
157 String logic = DEFAULT_LIST_LOGIC;
158 try {
159 logic = (String) getAttribute(o, ATTR_LIST_LOGIC);
160 } catch (AttributeNotFoundException e) {
161 logger.severe(e.getMessage());
162 }
163 return logic.equals("OR") ? true : false;
164 }
165 }