View Javadoc

1   /* RegexRuleTest
2    * 
3    * Created on Oct 6, 2004
4    *
5    * Copyright (C) 2004 Internet Archive.
6    * 
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    * 
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   * 
14   * Heritrix is distributed in the hope that it will be useful, 
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   * 
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.crawler.url.canonicalize;
24  
25  import java.io.File;
26  
27  import javax.management.InvalidAttributeValueException;
28  
29  import org.apache.commons.httpclient.URIException;
30  import org.archive.crawler.datamodel.CrawlOrder;
31  import org.archive.crawler.settings.MapType;
32  import org.archive.crawler.settings.XMLSettingsHandler;
33  import org.archive.net.UURIFactory;
34  import org.archive.util.TmpDirTestCase;
35  
36  
37  /***
38   * Test the regex rule.
39   * @author stack
40   * @version $Date: 2005-07-18 17:30:21 +0000 (Mon, 18 Jul 2005) $, $Revision: 3704 $
41   */
42  public class RegexRuleTest extends TmpDirTestCase {
43      private File orderFile;
44      protected XMLSettingsHandler settingsHandler;
45      private MapType rules = null;
46      
47      protected void setUp() throws Exception {
48          super.setUp();
49          this.orderFile = new File(getTmpDir(), this.getClass().getName() +
50              ".order.xml");
51          this.settingsHandler = new XMLSettingsHandler(orderFile);
52          this.settingsHandler.initialize();
53          this.rules = (MapType)(settingsHandler.getSettingsObject(null)).
54              getModule(CrawlOrder.ATTR_NAME).
55                 getAttribute(CrawlOrder.ATTR_RULES);
56      }
57      
58      public void testCanonicalize()
59      throws URIException, InvalidAttributeValueException {
60          final String url = "http://www.aRchive.Org/index.html";
61          RegexRule rr = new RegexRule("Test " + this.getClass().getName());
62          this.rules.addElement(null, rr);
63          rr.canonicalize(url, UURIFactory.getInstance(url));
64          String product = rr.canonicalize(url, null);
65          assertTrue("Default doesn't work.",  url.equals(product));
66      }
67  
68      public void testSessionid()
69      throws InvalidAttributeValueException {
70          final String urlBase = "http://joann.com/catalog.jhtml";
71          final String urlMinusSessionid = urlBase + "?CATID=96029";
72          final String url = urlBase +
73  		    ";$sessionid$JKOFFNYAAKUTIP4SY5NBHOR50LD3OEPO?CATID=96029";
74          RegexRule rr = new RegexRule("Test",
75              "^(.+)(?:;//$sessionid//$[A-Z0-9]{32})(//?.*)+$",
76          	"$1$2");
77          this.rules.addElement(null, rr);
78          String product = rr.canonicalize(url, null);
79          assertTrue("Failed " + url, urlMinusSessionid.equals(product));
80      }
81      
82      public void testNullFormat()
83      throws InvalidAttributeValueException {
84          final String urlBase = "http://joann.com/catalog.jhtml";
85          final String url = urlBase +
86              ";$sessionid$JKOFFNYAAKUTIP4SY5NBHOR50LD3OEPO";
87          RegexRule rr = new RegexRule("Test",
88              "^(.+)(?:;//$sessionid//$[A-Z0-9]{32})$",
89              "$1$2");
90          this.rules.addElement(null, rr);
91          String product = rr.canonicalize(url, null);
92          assertTrue("Failed " + url, urlBase.equals(product));
93      }
94  }