StripWWWRule xref

View Javadoc

1   /* StripWWWRule
2    * 
3    * Created on Oct 5, 2004
4    *
5    * Copyright (C) 2004 Internet Archive.
6    * 
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    * 
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   * 
14   * Heritrix is distributed in the hope that it will be useful, 
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   * 
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.crawler.url.canonicalize;
24  
25  import java.util.regex.Pattern;
26  
27  
28  
29  /***
30   * Strip any 'www' found on http/https URLs, IF they have some
31   * path/query component (content after third slash). (Top 'slash page' 
32   * URIs are left unstripped, so that we prefer crawling redundant
33   * top pages to missing an entire site only available from either
34   * the www-full or www-less hostname, but not both). 
35   * @author stack
36   * @version $Date: 2006-09-25 20:27:35 +0000 (Mon, 25 Sep 2006) $, $Revision: 4655 $
37   */
38  public class StripWWWRule extends BaseRule {
39  
40      private static final long serialVersionUID = -5416391108485746976L;
41  
42      private static final String DESCRIPTION = "Strip any 'www' found. " +
43          "Use this rule to equate 'http://www.archive.org/index.html' and" +
44          " 'http://archive.org/index.html'. The resulting canonicalization" +
45          " returns 'http://archive.org/index.html'.  It removes any www's " +
46          "found, except on URIs that have no path/query component " +
47          "('slash' pages).  Operates on http and https schemes only. " +
48          "Use the more general StripWWWNRule if you want to strip both 'www' " +
49          "and 'www01', 'www02', etc.";
50      
51      private static final Pattern REGEX =
52          Pattern.compile("(?i)^(https?://)(?:www//.)([^/]*/.+)$");
53  
54      public StripWWWRule(String name) {
55          super(name, DESCRIPTION);
56      }
57  
58      public String canonicalize(String url, Object context) {
59          return doStripRegexMatch(url, REGEX.matcher(url));
60      }
61  }