View Javadoc

1   /* PublicSuffixesTest.java
2    *
3    * $Id: BloomFilter32bitSplit.java 5197 2007-06-06 01:31:46Z gojomo $
4    *
5    * Created on Jun 13, 2007
6    *
7    * Copyright (C) 2007 Internet Archive
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  
26  package org.archive.net;
27  
28  import java.util.regex.Matcher;
29  
30  import junit.framework.TestCase;
31  
32  /***
33   * Test cases for PublicSuffixes utility. Confirm expected matches/nonmatches
34   * from constructed regex.
35   * 
36   * @author gojomo
37   */
38  public class PublicSuffixesTest extends TestCase {
39      Matcher m = PublicSuffixes.getTopmostAssignedSurtPrefixPattern()
40              .matcher("");
41  
42      public void testBasics() {
43          matchPrefix("com,example,www,", "com,example,");
44          matchPrefix("com,example,", "com,example,");
45          matchPrefix("org,archive,www,", "org,archive,");
46          matchPrefix("org,archive,", "org,archive,");
47          matchPrefix("fr,yahoo,www,", "fr,yahoo,");
48          matchPrefix("fr,yahoo,", "fr,yahoo,");
49          matchPrefix("au,com,foobar,www,", "au,com,foobar,");
50          matchPrefix("au,com,foobar,", "au,com,foobar,");
51          matchPrefix("uk,co,virgin,www,", "uk,co,virgin,");
52          matchPrefix("uk,co,virgin,", "uk,co,virgin,");
53          matchPrefix("au,com,example,www,", "au,com,example,");
54          matchPrefix("au,com,example,", "au,com,example,");
55          matchPrefix("jp,tokyo,public,assigned,www,",
56                  "jp,tokyo,public,assigned,");
57          matchPrefix("jp,tokyo,public,assigned,", "jp,tokyo,public,assigned,");
58      }
59  
60      public void testDomainWithDash() {
61          matchPrefix("de,bad-site,www", "de,bad-site,");
62      }
63      
64      public void testDomainWithNumbers() {
65          matchPrefix("de,archive4u,www", "de,archive4u,");
66      }
67      
68      public void testIPV4() {
69          assertEquals("unexpected reduction", 
70                  "1.2.3.4",
71                  PublicSuffixes.reduceSurtToTopmostAssigned("1.2.3.4"));
72      }
73      
74      public void testIPV6() {
75          assertEquals("unexpected reduction", 
76                  "[2001:0db8:85a3:08d3:1319:8a2e:0370:7344]",
77                  PublicSuffixes.reduceSurtToTopmostAssigned(
78                          "[2001:0db8:85a3:08d3:1319:8a2e:0370:7344]"));
79      }
80      
81      public void testExceptions() {
82          matchPrefix("uk,bl,www,", "uk,bl,");
83          matchPrefix("uk,bl,", "uk,bl,");
84          matchPrefix("jp,tokyo,metro,subdomain,", "jp,tokyo,metro,");
85          matchPrefix("jp,tokyo,metro,", "jp,tokyo,metro,");
86      }
87  
88      public void testFakeTLD() {
89          // we assume any new/unknonwn TLD should be assumed as 2-level;
90          // this is preferable for our grouping purpose but might not be
91          // for a cookie-assigning browser (original purpose of publicsuffixlist)
92          matchPrefix("zzz,example,www,", "zzz,example,");
93      }
94  
95      public void testUnsegmentedHostname() {
96          m.reset("example");
97          assertFalse("unexpected match found in 'example'", m.find());
98      }
99  
100     public void testTopmostAssignedCaching() {
101         assertSame("topmostAssignedSurtPrefixPattern not cached",PublicSuffixes.getTopmostAssignedSurtPrefixPattern(),PublicSuffixes.getTopmostAssignedSurtPrefixPattern());
102         assertSame("topmostAssignedSurtPrefixRegex not cached",PublicSuffixes.getTopmostAssignedSurtPrefixRegex(),PublicSuffixes.getTopmostAssignedSurtPrefixRegex()); 
103     }
104     
105     // TODO: test UTF domains?
106 
107     protected void matchPrefix(String surtDomain, String expectedAssignedPrefix) {
108         m.reset(surtDomain);
109         assertTrue("expected match not found in '" + surtDomain, m.find());
110         assertEquals("expected match not found", expectedAssignedPrefix, m
111                 .group());
112     }
113 }