1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.crawler.url;
24
25 import java.io.File;
26
27 import org.apache.commons.httpclient.URIException;
28 import org.archive.crawler.datamodel.CrawlOrder;
29 import org.archive.crawler.settings.MapType;
30 import org.archive.crawler.settings.XMLSettingsHandler;
31 import org.archive.crawler.url.canonicalize.FixupQueryStr;
32 import org.archive.crawler.url.canonicalize.LowercaseRule;
33 import org.archive.crawler.url.canonicalize.StripSessionIDs;
34 import org.archive.crawler.url.canonicalize.StripUserinfoRule;
35 import org.archive.crawler.url.canonicalize.StripWWWRule;
36 import org.archive.net.UURIFactory;
37 import org.archive.util.TmpDirTestCase;
38
39 /***
40 * Test canonicalization.
41 * @author stack
42 * @version $Date: 2006-09-26 20:38:48 +0000 (Tue, 26 Sep 2006) $, $Revision: 4667 $
43 */
44 public class CanonicalizerTest extends TmpDirTestCase {
45 private File orderFile;
46 protected XMLSettingsHandler settingsHandler;
47
48 private MapType rules = null;
49
50 protected void setUp() throws Exception {
51 super.setUp();
52 this.orderFile = new File(getTmpDir(), this.getClass().getName() +
53 ".order.xml");
54 this.settingsHandler = new XMLSettingsHandler(orderFile);
55 this.settingsHandler.initialize();
56
57 this.rules = (MapType)(settingsHandler.getSettingsObject(null)).
58 getModule(CrawlOrder.ATTR_NAME).
59 getAttribute(CrawlOrder.ATTR_RULES);
60 this.rules.addElement(null, new LowercaseRule("lowercase"));
61 this.rules.addElement(null, new StripUserinfoRule("userinfo"));
62 this.rules.addElement(null, new StripWWWRule("www"));
63 this.rules.addElement(null, new StripSessionIDs("ids"));
64 this.rules.addElement(null, new FixupQueryStr("querystr"));
65 }
66
67 public void testCanonicalize() throws URIException {
68 final String scheme = "http://";
69 final String nonQueryStr = "archive.org/index.html";
70 final String result = scheme + nonQueryStr;
71 assertTrue("Mangled original", result.equals(
72 Canonicalizer.canonicalize(UURIFactory.getInstance(result),
73 this.rules.iterator(UURIFactory.getInstance(result)))));
74 String tmp = scheme + "www." + nonQueryStr;
75 assertTrue("Mangled www", result.equals(
76 Canonicalizer.canonicalize(UURIFactory.getInstance(tmp),
77 this.rules.iterator(UURIFactory.getInstance(result)))));
78 tmp = scheme + "www." + nonQueryStr +
79 "?jsessionid=01234567890123456789012345678901";
80 assertTrue("Mangled sessionid", result.equals(
81 Canonicalizer.canonicalize(UURIFactory.getInstance(tmp),
82 this.rules.iterator(UURIFactory.getInstance(result)))));
83 tmp = scheme + "www." + nonQueryStr +
84 "?jsessionid=01234567890123456789012345678901";
85 assertTrue("Mangled sessionid", result.equals(
86 Canonicalizer.canonicalize(UURIFactory.getInstance(tmp),
87 this.rules.iterator(UURIFactory.getInstance(result)))));
88 }
89 }