1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.crawler.url.canonicalize;
24
25 import java.util.logging.Logger;
26 import java.util.regex.Matcher;
27
28 import org.archive.crawler.settings.SimpleType;
29 import org.archive.util.TextUtils;
30
31 /***
32 * General conversion rule.
33 * @author stack
34 * @version $Date: 2006-09-25 20:27:35 +0000 (Mon, 25 Sep 2006) $, $Revision: 4655 $
35 */
36 public class RegexRule
37 extends BaseRule {
38
39 private static final long serialVersionUID = -2658094415450237847L;
40
41 protected static Logger logger =
42 Logger.getLogger(BaseRule.class.getName());
43 private static final String DESCRIPTION = "General regex rule. " +
44 "Specify a matching regex and a format string used outputting" +
45 " result if a match was found. If problem compiling regex or" +
46 " interpreting format, problem is logged, and this rule does" +
47 " nothing. See User Manual for example usage.";
48 private static final String ATTR_REGEX = "matching-regex";
49 private static final String ATTR_FORMAT = "format";
50 private static final String ATTR_COMMENT = "comment";
51
52 public RegexRule(String name) {
53 this(name, "(.*)", "${1}");
54 }
55
56 protected RegexRule(String name, String defaultRegex,
57 String defaultFormat) {
58 super(name, DESCRIPTION);
59 addElementToDefinition(new SimpleType(ATTR_REGEX,
60 "Java regular expression. If the regex matches, we'll rewrite" +
61 " the passed url using the specified format pattern.",
62 defaultRegex));
63 addElementToDefinition(
64 new SimpleType(ATTR_FORMAT, "Pattern to use rewriting matched" +
65 "url. Use '${1}' to match first regex group, '${2}' for" +
66 "next group, etc.", defaultFormat));
67 addElementToDefinition(new SimpleType(ATTR_COMMENT,
68 "Free-text comment on why this rule was added.", ""));
69 }
70
71 public String canonicalize(String url, Object context) {
72 String regex = getNullOrAttribute(ATTR_REGEX, context);
73 if (regex == null) {
74 return url;
75 }
76 String format = getNullOrAttribute(ATTR_FORMAT, context);
77 if (format == null) {
78 return url;
79 }
80 Matcher matcher = TextUtils.getMatcher(regex, url);
81 String retVal;
82 if (matcher == null || !matcher.matches()) {
83 retVal = url;
84 } else {
85 StringBuffer buffer = new StringBuffer(url.length() * 2);
86 format(matcher, format, buffer);
87 retVal = buffer.toString();
88 }
89 TextUtils.recycleMatcher(matcher);
90 return retVal;
91 }
92
93 /***
94 * @param matcher Matched matcher.
95 * @param format Output format specifier.
96 * @param buffer Buffer to append output to.
97 */
98 protected void format(Matcher matcher, String format,
99 StringBuffer buffer) {
100 for (int i = 0; i < format.length(); i++) {
101 switch(format.charAt(i)) {
102 case '//':
103 if ((i + 1) < format.length() &&
104 format.charAt(i + 1) == '$') {
105
106 continue;
107 }
108
109 case '$':
110
111 if (i == 0 || (i > 0 && (format.charAt(i - 1) != '//'))) {
112
113
114 int start = i + 1;
115 boolean curlyBraceStart = false;
116 if (format.charAt(start) == '{') {
117 start++;
118 curlyBraceStart = true;
119 }
120 int j = start;
121 for (; j < format.length() &&
122 Character.isDigit(format.charAt(j)); j++) {
123
124 }
125 if (j > start) {
126 int groupIndex = Integer.
127 parseInt(format.substring(start, j));
128 if (groupIndex >= 0 && groupIndex < 256) {
129 String g = null;
130 try {
131 g = matcher.group(groupIndex);
132 } catch (IndexOutOfBoundsException e) {
133 logger.warning("IndexOutOfBoundsException" +
134 " getting group " + groupIndex +
135 " from " + matcher.group(0) +
136 " with format of " + format);
137 }
138 if (g != null) {
139 buffer.append(g);
140 }
141
142 if (curlyBraceStart &&
143 format.charAt(j) == '}') {
144 j++;
145 }
146
147
148 i = (j - 1);
149
150 continue;
151 }
152 }
153
154 }
155
156
157 default:
158 buffer.append(format.charAt(i));
159 }
160 }
161 }
162
163 protected String getNullOrAttribute(String name, Object context) {
164 try {
165 return (String)getAttribute(context, name);
166 } catch (Exception e) {
167 logger.severe(e.getMessage());
168 return null;
169 }
170 }
171 }