View Javadoc

1   /* SeedFileIteratorTest
2    *
3    * $Id: SeedFileIteratorTest.java 6910 2010-07-02 17:46:12Z gojomo $
4    *
5    * Created on May 31, 2005
6    *
7    * Copyright (C) 2005 Internet Archive.
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.crawler.scope;
26  
27  import java.io.BufferedReader;
28  import java.io.BufferedWriter;
29  import java.io.IOException;
30  import java.io.StringReader;
31  import java.io.StringWriter;
32  import java.util.LinkedList;
33  
34  import junit.framework.TestCase;
35  
36  import org.archive.net.UURI;
37  
38  /***
39   * Test {@link SeedFileIterator}.
40   * @author gojomo
41   * @version $Revision: 6910 $, $Date: 2010-07-02 17:46:12 +0000 (Fri, 02 Jul 2010) $
42   */
43  public class SeedFileIteratorTest extends TestCase {
44      public void testHyphenInHost() {
45          final String seedFileContent = "http://www.examp-le.com/";
46          StringWriter sw = new StringWriter();
47          StringReader sr = new StringReader(seedFileContent);
48          UURI seed = 
49              (UURI)(new SeedFileIterator(new BufferedReader(sr), sw)).next();
50          assertEquals("Hyphen is problem", seed.toString(),
51              seedFileContent);
52      }
53  
54      public void testGeneral() throws IOException {
55          String seedFile = "# comment\n" + // comment
56                  "\n" + // blank line
57                  "www.example.com\n" + // naked host, implied scheme
58                  "www.example.org/foo\n" + // naked host+path, implied scheme
59                  "http://www.example.net\n" + // full HTTP URL
60                  "+http://www.example.us"; // 'directive' (should be ignored)
61          StringWriter ignored = new StringWriter();
62          SeedFileIterator iter = new SeedFileIterator(new BufferedReader(
63                  new StringReader(seedFile)), new BufferedWriter(ignored));
64          LinkedList<String> seeds = new LinkedList<String>();
65          while (iter.hasNext()) {
66              UURI n = iter.next();
67              if (n instanceof UURI) {
68                  seeds.add(n.getURI());
69              }
70          }
71          assertTrue("didn't get naked host", seeds
72                  .contains("http://www.example.com/"));
73          assertTrue("didn't get naked host+path", seeds
74                  .contains("http://www.example.org/foo"));
75          assertTrue("didn't get full http URL", seeds
76                  .contains("http://www.example.net/"));
77          assertTrue("got wrong number of URLs", seeds.size() == 3);
78          assertTrue("ignored entry not reported", ignored.toString().indexOf(
79                  "+http://www.example.us") >= 0);
80      }
81      
82      public void testIgnoreBom() {
83          String bomString = "\ufeffhttp://www.example.com/";
84          SeedFileIterator si = new SeedFileIterator(new java.io.BufferedReader(new java.io.StringReader(bomString)));
85          UURI uuri = si.next();
86          assertEquals("bom not ignored","http://www.example.com/",uuri.toString());
87      }
88  }
89