1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.crawler.url.canonicalize;
24
25 import java.io.File;
26
27 import javax.management.InvalidAttributeValueException;
28
29 import org.apache.commons.httpclient.URIException;
30 import org.archive.crawler.datamodel.CrawlOrder;
31 import org.archive.crawler.settings.MapType;
32 import org.archive.crawler.settings.XMLSettingsHandler;
33 import org.archive.net.UURIFactory;
34 import org.archive.util.TmpDirTestCase;
35
36
37 /***
38 * Test the regex rule.
39 * @author stack
40 * @version $Date: 2005-07-18 17:30:21 +0000 (Mon, 18 Jul 2005) $, $Revision: 3704 $
41 */
42 public class RegexRuleTest extends TmpDirTestCase {
43 private File orderFile;
44 protected XMLSettingsHandler settingsHandler;
45 private MapType rules = null;
46
47 protected void setUp() throws Exception {
48 super.setUp();
49 this.orderFile = new File(getTmpDir(), this.getClass().getName() +
50 ".order.xml");
51 this.settingsHandler = new XMLSettingsHandler(orderFile);
52 this.settingsHandler.initialize();
53 this.rules = (MapType)(settingsHandler.getSettingsObject(null)).
54 getModule(CrawlOrder.ATTR_NAME).
55 getAttribute(CrawlOrder.ATTR_RULES);
56 }
57
58 public void testCanonicalize()
59 throws URIException, InvalidAttributeValueException {
60 final String url = "http://www.aRchive.Org/index.html";
61 RegexRule rr = new RegexRule("Test " + this.getClass().getName());
62 this.rules.addElement(null, rr);
63 rr.canonicalize(url, UURIFactory.getInstance(url));
64 String product = rr.canonicalize(url, null);
65 assertTrue("Default doesn't work.", url.equals(product));
66 }
67
68 public void testSessionid()
69 throws InvalidAttributeValueException {
70 final String urlBase = "http://joann.com/catalog.jhtml";
71 final String urlMinusSessionid = urlBase + "?CATID=96029";
72 final String url = urlBase +
73 ";$sessionid$JKOFFNYAAKUTIP4SY5NBHOR50LD3OEPO?CATID=96029";
74 RegexRule rr = new RegexRule("Test",
75 "^(.+)(?:;//$sessionid//$[A-Z0-9]{32})(//?.*)+$",
76 "$1$2");
77 this.rules.addElement(null, rr);
78 String product = rr.canonicalize(url, null);
79 assertTrue("Failed " + url, urlMinusSessionid.equals(product));
80 }
81
82 public void testNullFormat()
83 throws InvalidAttributeValueException {
84 final String urlBase = "http://joann.com/catalog.jhtml";
85 final String url = urlBase +
86 ";$sessionid$JKOFFNYAAKUTIP4SY5NBHOR50LD3OEPO";
87 RegexRule rr = new RegexRule("Test",
88 "^(.+)(?:;//$sessionid//$[A-Z0-9]{32})$",
89 "$1$2");
90 this.rules.addElement(null, rr);
91 String product = rr.canonicalize(url, null);
92 assertTrue("Failed " + url, urlBase.equals(product));
93 }
94 }