1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler.util;
26
27 import java.io.FileNotFoundException;
28 import java.io.IOException;
29 import java.util.ArrayList;
30 import java.util.Iterator;
31 import java.util.logging.Logger;
32
33 import junit.framework.TestCase;
34
35 import org.apache.commons.httpclient.URIException;
36 import org.archive.crawler.datamodel.CandidateURI;
37 import org.archive.crawler.datamodel.UriUniqFilter;
38 import org.archive.net.UURI;
39 import org.archive.net.UURIFactory;
40
41
42 /***
43 * Test BloomUriUniqFilter.
44 * @author gojomo
45 */
46 public class BloomUriUniqFilterTest extends TestCase
47 implements UriUniqFilter.HasUriReceiver {
48 private Logger logger =
49 Logger.getLogger(BloomUriUniqFilterTest.class.getName());
50
51 private BloomUriUniqFilter filter = null;
52
53 /***
54 * Set to true if we visited received.
55 */
56 private boolean received = false;
57
58 protected void setUp() throws Exception {
59 super.setUp();
60 this.filter = new BloomUriUniqFilter(2000,24);
61 this.filter.setDestination(this);
62 }
63
64 public void testAdding() throws URIException {
65 this.filter.add(this.getUri(),
66 new CandidateURI(UURIFactory.getInstance(this.getUri())));
67 this.filter.addNow(this.getUri(),
68 new CandidateURI(UURIFactory.getInstance(this.getUri())));
69 this.filter.addForce(this.getUri(),
70 new CandidateURI(UURIFactory.getInstance(this.getUri())));
71
72 assertTrue("Count is off", this.filter.count() == 1);
73 }
74
75 /***
76 * Test inserting.
77 * @throws URIException
78 * @throws IOException
79 * @throws FileNotFoundException
80 */
81 public void testWriting() throws URIException {
82 long start = System.currentTimeMillis();
83 ArrayList<UURI> list = new ArrayList<UURI>(1000);
84 int count = 0;
85 final int MAX_COUNT = 1000;
86 for (; count < MAX_COUNT; count++) {
87 assertEquals("count off",count,filter.count());
88 UURI u = UURIFactory.getInstance("http://www" +
89 count + ".archive.org/" + count + "/index.html");
90 assertFalse("already contained "+u.toString(),filter.bloom.contains(u.toString()));
91 logger.fine("adding "+u.toString());
92 filter.add(u.toString(), new CandidateURI(u));
93 assertTrue("not in bloom",filter.bloom.contains(u.toString()));
94 if (count > 0 && ((count % 100) == 0)) {
95 list.add(u);
96 }
97 }
98 logger.fine("Added " + count + " in " +
99 (System.currentTimeMillis() - start));
100
101 start = System.currentTimeMillis();
102 for (Iterator i = list.iterator(); i.hasNext();) {
103 UURI uuri = (UURI)i.next();
104 filter.add(uuri.toString(), new CandidateURI(uuri));
105 }
106 logger.fine("Readded subset " + list.size() + " in " +
107 (System.currentTimeMillis() - start));
108
109 assertTrue("Count is off: " + filter.count(),
110 filter.count() == MAX_COUNT);
111 }
112
113 public void testNote() {
114 filter.note(this.getUri());
115 assertFalse("Receiver was called", this.received);
116 }
117
118
119
120
121
122
123
124
125 public void receive(CandidateURI item) {
126 this.received = true;
127 }
128
129 public String getUri() {
130 return "http://www.archive.org";
131 }
132 }