1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 package org.archive.crawler.url.canonicalize;
21
22 import java.util.regex.Matcher;
23 import java.util.regex.Pattern;
24
25 public class StripExtraSlashes extends BaseRule {
26
27 private static final String DESCRIPTION =
28 "Strip any extra slashes, '/', found in the path. " +
29 "Use this rule to equate 'http://www.archive.org//A//B/index.html' and " +
30 "'http://www.archive.org/A/B/index.html'.";
31
32 private static final Pattern REGEX = Pattern.compile("(^https?://.*?)//+(.*)");
33
34 public StripExtraSlashes(String name) {
35 super(name, DESCRIPTION);
36 }
37
38 public String canonicalize(String url, Object context) {
39 Matcher matcher = REGEX.matcher(url);
40 while (matcher.matches()) {
41 url = matcher.group(1) + "/" + matcher.group(2);
42 matcher = REGEX.matcher(url);
43 }
44 return url;
45 }
46 }