1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.crawler.url.canonicalize;
24
25 import java.util.regex.Pattern;
26
27
28
29 /***
30 * Strip any 'www' found on http/https URLs, IF they have some
31 * path/query component (content after third slash). (Top 'slash page'
32 * URIs are left unstripped, so that we prefer crawling redundant
33 * top pages to missing an entire site only available from either
34 * the www-full or www-less hostname, but not both).
35 * @author stack
36 * @version $Date: 2006-09-25 20:27:35 +0000 (Mon, 25 Sep 2006) $, $Revision: 4655 $
37 */
38 public class StripWWWRule extends BaseRule {
39
40 private static final long serialVersionUID = -5416391108485746976L;
41
42 private static final String DESCRIPTION = "Strip any 'www' found. " +
43 "Use this rule to equate 'http://www.archive.org/index.html' and" +
44 " 'http://archive.org/index.html'. The resulting canonicalization" +
45 " returns 'http://archive.org/index.html'. It removes any www's " +
46 "found, except on URIs that have no path/query component " +
47 "('slash' pages). Operates on http and https schemes only. " +
48 "Use the more general StripWWWNRule if you want to strip both 'www' " +
49 "and 'www01', 'www02', etc.";
50
51 private static final Pattern REGEX =
52 Pattern.compile("(?i)^(https?://)(?:www//.)([^/]*/.+)$");
53
54 public StripWWWRule(String name) {
55 super(name, DESCRIPTION);
56 }
57
58 public String canonicalize(String url, Object context) {
59 return doStripRegexMatch(url, REGEX.matcher(url));
60 }
61 }