View Javadoc

1   /* CanonicalizerTest
2    * 
3    * Created on Oct 7, 2004
4    *
5    * Copyright (C) 2004 Internet Archive.
6    * 
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    * 
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   * 
14   * Heritrix is distributed in the hope that it will be useful, 
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   * 
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.crawler.url;
24  
25  import java.io.File;
26  
27  import org.apache.commons.httpclient.URIException;
28  import org.archive.crawler.datamodel.CrawlOrder;
29  import org.archive.crawler.settings.MapType;
30  import org.archive.crawler.settings.XMLSettingsHandler;
31  import org.archive.crawler.url.canonicalize.FixupQueryStr;
32  import org.archive.crawler.url.canonicalize.LowercaseRule;
33  import org.archive.crawler.url.canonicalize.StripSessionIDs;
34  import org.archive.crawler.url.canonicalize.StripUserinfoRule;
35  import org.archive.crawler.url.canonicalize.StripWWWRule;
36  import org.archive.net.UURIFactory;
37  import org.archive.util.TmpDirTestCase;
38  
39  /***
40   * Test canonicalization.
41   * @author stack
42   * @version $Date: 2006-09-26 20:38:48 +0000 (Tue, 26 Sep 2006) $, $Revision: 4667 $
43   */
44  public class CanonicalizerTest extends TmpDirTestCase {
45      private File orderFile;
46      protected XMLSettingsHandler settingsHandler;
47  
48      private MapType rules = null;
49      
50      protected void setUp() throws Exception {
51          super.setUp();
52          this.orderFile = new File(getTmpDir(), this.getClass().getName() +
53              ".order.xml");
54          this.settingsHandler = new XMLSettingsHandler(orderFile);
55          this.settingsHandler.initialize();
56          
57          this.rules = (MapType)(settingsHandler.getSettingsObject(null)).
58              getModule(CrawlOrder.ATTR_NAME).
59                 getAttribute(CrawlOrder.ATTR_RULES);
60          this.rules.addElement(null, new LowercaseRule("lowercase"));
61          this.rules.addElement(null, new StripUserinfoRule("userinfo"));
62          this.rules.addElement(null, new StripWWWRule("www"));
63          this.rules.addElement(null, new StripSessionIDs("ids"));
64          this.rules.addElement(null, new FixupQueryStr("querystr"));
65      }
66      
67      public void testCanonicalize() throws URIException {
68          final String scheme = "http://";
69          final String nonQueryStr = "archive.org/index.html";
70          final String result = scheme + nonQueryStr;
71          assertTrue("Mangled original", result.equals(
72              Canonicalizer.canonicalize(UURIFactory.getInstance(result),
73                  this.rules.iterator(UURIFactory.getInstance(result)))));
74          String tmp = scheme + "www." + nonQueryStr;
75          assertTrue("Mangled www", result.equals(
76              Canonicalizer.canonicalize(UURIFactory.getInstance(tmp),
77                  this.rules.iterator(UURIFactory.getInstance(result)))));
78          tmp = scheme + "www." + nonQueryStr +
79              "?jsessionid=01234567890123456789012345678901";
80          assertTrue("Mangled sessionid", result.equals(
81              Canonicalizer.canonicalize(UURIFactory.getInstance(tmp),
82                  this.rules.iterator(UURIFactory.getInstance(result)))));
83          tmp = scheme + "www." + nonQueryStr +
84              "?jsessionid=01234567890123456789012345678901";
85          assertTrue("Mangled sessionid", result.equals(
86               Canonicalizer.canonicalize(UURIFactory.getInstance(tmp),
87                     this.rules.iterator(UURIFactory.getInstance(result)))));       
88      }
89  }