1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler.scope;
26
27 import java.io.BufferedReader;
28 import java.io.BufferedWriter;
29 import java.io.IOException;
30 import java.io.StringReader;
31 import java.io.StringWriter;
32 import java.util.LinkedList;
33
34 import junit.framework.TestCase;
35
36 import org.archive.net.UURI;
37
38 /***
39 * Test {@link SeedFileIterator}.
40 * @author gojomo
41 * @version $Revision: 6910 $, $Date: 2010-07-02 17:46:12 +0000 (Fri, 02 Jul 2010) $
42 */
43 public class SeedFileIteratorTest extends TestCase {
44 public void testHyphenInHost() {
45 final String seedFileContent = "http://www.examp-le.com/";
46 StringWriter sw = new StringWriter();
47 StringReader sr = new StringReader(seedFileContent);
48 UURI seed =
49 (UURI)(new SeedFileIterator(new BufferedReader(sr), sw)).next();
50 assertEquals("Hyphen is problem", seed.toString(),
51 seedFileContent);
52 }
53
54 public void testGeneral() throws IOException {
55 String seedFile = "# comment\n" +
56 "\n" +
57 "www.example.com\n" +
58 "www.example.org/foo\n" +
59 "http://www.example.net\n" + // full HTTP URL
60 "+http://www.example.us"; // 'directive' (should be ignored)
61 StringWriter ignored = new StringWriter();
62 SeedFileIterator iter = new SeedFileIterator(new BufferedReader(
63 new StringReader(seedFile)), new BufferedWriter(ignored));
64 LinkedList<String> seeds = new LinkedList<String>();
65 while (iter.hasNext()) {
66 UURI n = iter.next();
67 if (n instanceof UURI) {
68 seeds.add(n.getURI());
69 }
70 }
71 assertTrue("didn't get naked host", seeds
72 .contains("http://www.example.com/"));
73 assertTrue("didn't get naked host+path", seeds
74 .contains("http://www.example.org/foo"));
75 assertTrue("didn't get full http URL", seeds
76 .contains("http://www.example.net/"));
77 assertTrue("got wrong number of URLs", seeds.size() == 3);
78 assertTrue("ignored entry not reported", ignored.toString().indexOf(
79 "+http://www.example.us") >= 0);
80 }
81
82 public void testIgnoreBom() {
83 String bomString = "\ufeffhttp://www.example.com/";
84 SeedFileIterator si = new SeedFileIterator(new java.io.BufferedReader(new java.io.StringReader(bomString)));
85 UURI uuri = si.next();
86 assertEquals("bom not ignored","http://www.example.com/",uuri.toString());
87 }
88 }
89