1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.crawler.url.canonicalize;
24
25 import java.util.regex.Pattern;
26
27
28
29 /***
30 * Strip any 'www[0-9]*' found on http/https URLs IF they have some
31 * path/query component (content after third slash). Top 'slash page'
32 * URIs are left unstripped: we prefer crawling redundant
33 * top pages to missing an entire site only available from either
34 * the www-full or www-less hostname, but not both.
35 * @author stack
36 * @version $Date: 2006-09-18 20:32:47 +0000 (Mon, 18 Sep 2006) $, $Revision: 4634 $
37 */
38 public class StripWWWNRule extends BaseRule {
39 private static final long serialVersionUID = 3619916990307308590L;
40
41 private static final String DESCRIPTION = "Strip any 'www[0-9]*' found. " +
42 "Use this rule to equate 'http://www.archive.org/index.html' and " +
43 "'http://www0001.archive.org/index.html' with " +
44 "'http://archive.org/index.html'. The resulting canonicalization " +
45 "returns 'http://archive.org/index.html'. It removes any www's " +
46 "or wwwNNN's found, where 'N' is one or more numerics, EXCEPT " +
47 "on URIs that have no path/query component " +
48 ". Top-level 'slash page' URIs are left unstripped: we prefer " +
49 "crawling redundant top pages to missing an entire site only " +
50 "available from either the www-full or www-less hostname, but not " +
51 "both. Operates on http and https schemes only. " +
52 "Use StripWWWRule to strip a lone 'www' only (This rule is a " +
53 "more general version of StripWWWRule).";
54
55 private static final Pattern REGEX =
56 Pattern.compile("(?i)^(https?://)(?:www[0-9]*//.)([^/]*/.+)$");
57
58 public StripWWWNRule(String name) {
59 super(name, DESCRIPTION);
60 }
61
62 public String canonicalize(String url, Object context) {
63 return doStripRegexMatch(url, REGEX.matcher(url));
64 }
65 }