View Javadoc

1   /* StripSessionIDs
2    * 
3    * Created on Oct 6, 2004
4    *
5    * Copyright (C) 2004 Internet Archive.
6    * 
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    * 
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   * 
14   * Heritrix is distributed in the hope that it will be useful, 
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   * 
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.crawler.url.canonicalize;
24  
25  import java.util.regex.Pattern;
26  
27  
28  /***
29   * Strip known session ids.
30   * @author stack
31   * @version $Date: 2006-09-25 20:27:35 +0000 (Mon, 25 Sep 2006) $, $Revision: 4655 $
32   */
33  public class StripSessionIDs
34  extends BaseRule {
35  
36      private static final long serialVersionUID = -3737115200690525641L;
37  
38      private static final String DESCRIPTION = "Strip known session IDs. " +
39          "Use this rule to remove all of a set of known session IDs." +
40          " For example, this rule will strip JSESSIONID and its value from" +
41          " 'http://archive.org/index.html?" +
42          "JSESSIONID=DDDSSE233232333355FFSXXXXDSDSDS'.  The resulting" +
43          " canonicalization returns 'http://archive.org/index.html'." +
44          " This rule strips JSESSIONID, ASPSESSIONID, PHPSESSID, and 'sid'" +
45          " session ids.";
46      
47      /***
48       * Example: jsessionid=999A9EF028317A82AC83F0FDFE59385A.
49       * Example: PHPSESSID=9682993c8daa2c5497996114facdc805.
50       */
51      private static final Pattern BASE_PATTERN = Pattern.compile("^(.+)" +
52              "(?:(?:(?:jsessionid)|(?:phpsessid))=" +
53                   "[0-9a-zA-Z]{32})(?:&(.*))?$",  Pattern.CASE_INSENSITIVE);
54      
55      /***
56       * Example: sid=9682993c8daa2c5497996114facdc805. 
57       * 'sid=' can be tricky but all sid= followed by 32 byte string
58       * so far seen have been session ids.  Sid is a 32 byte string
59       * like the BASE_PATTERN only 'sid' is the tail of 'phpsessid'
60       * so have to have it run after the phpsessid elimination.
61       */
62      private static final Pattern SID_PATTERN =
63          Pattern.compile("^(.+)" +
64              "(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", Pattern.CASE_INSENSITIVE);
65      
66      /***
67       * Example:ASPSESSIONIDAQBSDSRT=EOHBLBDDPFCLHKPGGKLILNAM.
68       */
69      private static final Pattern ASPSESSION_PATTERN =
70          Pattern.compile("^(.+)" +
71              "(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$",
72                  Pattern.CASE_INSENSITIVE);
73      
74  
75      public StripSessionIDs(String name) {
76          super(name, DESCRIPTION);
77      }
78  
79      public String canonicalize(String url, Object context) {
80          url = doStripRegexMatch(url, BASE_PATTERN.matcher(url));
81          url = doStripRegexMatch(url, SID_PATTERN.matcher(url));
82          url = doStripRegexMatch(url, ASPSESSION_PATTERN.matcher(url));
83          return url;
84      }
85  }