View Javadoc

1   /* RegexRule
2    * 
3    * Created on Oct 6, 2004
4    *
5    * Copyright (C) 2004 Internet Archive.
6    * 
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    * 
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   * 
14   * Heritrix is distributed in the hope that it will be useful, 
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   * 
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.crawler.url.canonicalize;
24  
25  import java.util.logging.Logger;
26  import java.util.regex.Matcher;
27  
28  import org.archive.crawler.settings.SimpleType;
29  import org.archive.util.TextUtils;
30  
31  /***
32   * General conversion rule.
33   * @author stack
34   * @version $Date: 2006-09-25 20:27:35 +0000 (Mon, 25 Sep 2006) $, $Revision: 4655 $
35   */
36  public class RegexRule
37  extends BaseRule {
38  
39      private static final long serialVersionUID = -2658094415450237847L;
40  
41      protected static Logger logger =
42          Logger.getLogger(BaseRule.class.getName());
43      private static final String DESCRIPTION = "General regex rule. " +
44          "Specify a matching regex and a format string used outputting" +
45          " result if a match was found.  If problem compiling regex or" +
46          " interpreting format, problem is logged, and this rule does" +
47          " nothing.  See User Manual for example usage.";
48      private static final String ATTR_REGEX = "matching-regex";
49      private static final String ATTR_FORMAT = "format";
50      private static final String ATTR_COMMENT = "comment";
51      
52      public RegexRule(String name) {
53          this(name, "(.*)", "${1}");
54      }
55      
56      protected RegexRule(String name, String defaultRegex,
57              String defaultFormat) {
58          super(name, DESCRIPTION);
59          addElementToDefinition(new SimpleType(ATTR_REGEX,
60              "Java regular expression. If the regex matches, we'll rewrite" +
61              " the passed url using the specified format pattern.",
62              defaultRegex));
63          addElementToDefinition(
64              new SimpleType(ATTR_FORMAT, "Pattern to use rewriting matched" +
65                  "url. Use '${1}' to match first regex group, '${2}' for" +
66                  "next group, etc.", defaultFormat));
67          addElementToDefinition(new SimpleType(ATTR_COMMENT,
68              "Free-text comment on why this rule was added.", ""));
69      }
70  
71      public String canonicalize(String url, Object context) {
72          String regex = getNullOrAttribute(ATTR_REGEX, context);
73          if (regex == null) {
74              return url;
75          }
76          String format = getNullOrAttribute(ATTR_FORMAT, context);
77          if (format == null) {
78              return url;
79          }
80          Matcher matcher = TextUtils.getMatcher(regex, url);
81          String retVal; 
82          if (matcher == null || !matcher.matches()) {
83              retVal = url;
84          } else {
85              StringBuffer buffer = new StringBuffer(url.length() * 2);
86              format(matcher, format, buffer);
87              retVal = buffer.toString();
88          }
89          TextUtils.recycleMatcher(matcher);
90          return retVal;
91      }
92      
93      /***
94       * @param matcher Matched matcher.
95       * @param format Output format specifier.
96       * @param buffer Buffer to append output to.
97       */
98      protected void format(Matcher matcher, String format,
99              StringBuffer buffer) {
100         for (int i = 0; i < format.length(); i++) {
101             switch(format.charAt(i)) {
102                 case '//':
103                     if ((i + 1) < format.length() &&
104                             format.charAt(i + 1) == '$') {
105                         // Don't write the escape character in output.
106                         continue;
107                     }
108                     
109                 case '$':
110                     // Check to see if its not been escaped.
111                     if (i == 0 || (i > 0 && (format.charAt(i - 1) != '//'))) {
112                         // Looks like we have a matching group specifier in
113                         // our format string, something like '$2' or '${2}'.
114                         int start = i + 1;
115                         boolean curlyBraceStart = false;
116                         if (format.charAt(start) == '{') {
117                             start++;
118                             curlyBraceStart = true;
119                         }
120                         int j = start;
121                         for (; j < format.length() &&
122                                 Character.isDigit(format.charAt(j)); j++) {
123                             // While a digit, increment.
124                         }
125                         if (j > start) {
126                             int groupIndex = Integer.
127                                 parseInt(format.substring(start, j));
128                             if (groupIndex >= 0 && groupIndex < 256) {
129                                 String g = null;
130                                 try {
131                                     g = matcher.group(groupIndex);
132                                 } catch (IndexOutOfBoundsException e) {
133                                     logger.warning("IndexOutOfBoundsException" +
134                                         " getting group " + groupIndex +
135                                         " from " + matcher.group(0) +
136                                         " with format of " + format);
137                                 }
138                                 if (g != null) {
139                                     buffer.append(g);
140                                 }
141                                 // Skip closing curly bracket if one.
142                                 if (curlyBraceStart &&
143                                         format.charAt(j) == '}') {
144                                     j++;
145                                 }
146                                 // Update the loop index so that we skip over
147                                 // the ${x} group item.
148                                 i = (j - 1);
149                                 // Don't fall through to the default.
150                                 continue;
151                             }
152                         }
153                         
154                     }
155                     // Let fall through to default rule.  The '$' was escaped.
156                     
157                 default:
158                     buffer.append(format.charAt(i));
159             }
160         }
161     }
162 
163     protected String getNullOrAttribute(String name, Object context) {
164         try {
165             return (String)getAttribute(context, name);
166         } catch (Exception e) {
167             logger.severe(e.getMessage());
168             return null;
169         }
170     }
171 }