1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.crawler.url.canonicalize;
24
25 import java.util.regex.Pattern;
26
27
28 /***
29 * Strip known session ids.
30 * @author stack
31 * @version $Date: 2006-09-25 20:27:35 +0000 (Mon, 25 Sep 2006) $, $Revision: 4655 $
32 */
33 public class StripSessionIDs
34 extends BaseRule {
35
36 private static final long serialVersionUID = -3737115200690525641L;
37
38 private static final String DESCRIPTION = "Strip known session IDs. " +
39 "Use this rule to remove all of a set of known session IDs." +
40 " For example, this rule will strip JSESSIONID and its value from" +
41 " 'http://archive.org/index.html?" +
42 "JSESSIONID=DDDSSE233232333355FFSXXXXDSDSDS'. The resulting" +
43 " canonicalization returns 'http://archive.org/index.html'." +
44 " This rule strips JSESSIONID, ASPSESSIONID, PHPSESSID, and 'sid'" +
45 " session ids.";
46
47 /***
48 * Example: jsessionid=999A9EF028317A82AC83F0FDFE59385A.
49 * Example: PHPSESSID=9682993c8daa2c5497996114facdc805.
50 */
51 private static final Pattern BASE_PATTERN = Pattern.compile("^(.+)" +
52 "(?:(?:(?:jsessionid)|(?:phpsessid))=" +
53 "[0-9a-zA-Z]{32})(?:&(.*))?$", Pattern.CASE_INSENSITIVE);
54
55 /***
56 * Example: sid=9682993c8daa2c5497996114facdc805.
57 * 'sid=' can be tricky but all sid= followed by 32 byte string
58 * so far seen have been session ids. Sid is a 32 byte string
59 * like the BASE_PATTERN only 'sid' is the tail of 'phpsessid'
60 * so have to have it run after the phpsessid elimination.
61 */
62 private static final Pattern SID_PATTERN =
63 Pattern.compile("^(.+)" +
64 "(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", Pattern.CASE_INSENSITIVE);
65
66 /***
67 * Example:ASPSESSIONIDAQBSDSRT=EOHBLBDDPFCLHKPGGKLILNAM.
68 */
69 private static final Pattern ASPSESSION_PATTERN =
70 Pattern.compile("^(.+)" +
71 "(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$",
72 Pattern.CASE_INSENSITIVE);
73
74
75 public StripSessionIDs(String name) {
76 super(name, DESCRIPTION);
77 }
78
79 public String canonicalize(String url, Object context) {
80 url = doStripRegexMatch(url, BASE_PATTERN.matcher(url));
81 url = doStripRegexMatch(url, SID_PATTERN.matcher(url));
82 url = doStripRegexMatch(url, ASPSESSION_PATTERN.matcher(url));
83 return url;
84 }
85 }