View Javadoc

1   /* BloomUriUniqFilterTest
2    *
3    * $Id: BloomUriUniqFilterTest.java 4647 2006-09-22 18:39:39Z paul_jack $
4    *
5    * Created on Sep 15, 2004.
6    *
7    * Copyright (C) 2003 Internet Archive.
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.crawler.util;
26  
27  import java.io.FileNotFoundException;
28  import java.io.IOException;
29  import java.util.ArrayList;
30  import java.util.Iterator;
31  import java.util.logging.Logger;
32  
33  import junit.framework.TestCase;
34  
35  import org.apache.commons.httpclient.URIException;
36  import org.archive.crawler.datamodel.CandidateURI;
37  import org.archive.crawler.datamodel.UriUniqFilter;
38  import org.archive.net.UURI;
39  import org.archive.net.UURIFactory;
40  
41  
42  /***
43   * Test BloomUriUniqFilter.
44   * @author gojomo
45   */
46  public class BloomUriUniqFilterTest extends TestCase
47  implements UriUniqFilter.HasUriReceiver {
48      private Logger logger =
49          Logger.getLogger(BloomUriUniqFilterTest.class.getName());
50  
51      private BloomUriUniqFilter filter = null;
52  
53      /***
54       * Set to true if we visited received.
55       */
56      private boolean received = false;
57  
58      protected void setUp() throws Exception {
59          super.setUp();
60          this.filter = new BloomUriUniqFilter(2000,24);
61          this.filter.setDestination(this);
62      }
63  
64      public void testAdding() throws URIException {
65          this.filter.add(this.getUri(),
66              new CandidateURI(UURIFactory.getInstance(this.getUri())));
67          this.filter.addNow(this.getUri(),
68              new CandidateURI(UURIFactory.getInstance(this.getUri())));
69          this.filter.addForce(this.getUri(),
70              new CandidateURI(UURIFactory.getInstance(this.getUri())));
71          // Should only have add 'this' once.
72          assertTrue("Count is off", this.filter.count() == 1);
73      }
74  
75      /***
76       * Test inserting.
77       * @throws URIException
78       * @throws IOException
79       * @throws FileNotFoundException
80       */
81      public void testWriting() throws URIException {
82          long start = System.currentTimeMillis();
83          ArrayList<UURI> list = new ArrayList<UURI>(1000);
84          int count = 0;
85          final int MAX_COUNT = 1000;
86          for (; count < MAX_COUNT; count++) {
87              assertEquals("count off",count,filter.count());
88              UURI u = UURIFactory.getInstance("http://www" +
89                      count + ".archive.org/" + count + "/index.html");
90              assertFalse("already contained "+u.toString(),filter.bloom.contains(u.toString()));
91              logger.fine("adding "+u.toString());
92              filter.add(u.toString(), new CandidateURI(u));
93              assertTrue("not in bloom",filter.bloom.contains(u.toString()));
94              if (count > 0 && ((count % 100) == 0)) {
95                  list.add(u);
96              }
97          }
98          logger.fine("Added " + count + " in " +
99                  (System.currentTimeMillis() - start));
100 
101         start = System.currentTimeMillis();
102         for (Iterator i = list.iterator(); i.hasNext();) {
103             UURI uuri = (UURI)i.next();
104             filter.add(uuri.toString(), new CandidateURI(uuri));
105         }
106         logger.fine("Readded subset " + list.size() + " in " +
107                 (System.currentTimeMillis() - start));
108 
109         assertTrue("Count is off: " + filter.count(),
110             filter.count() == MAX_COUNT);
111     }
112 
113     public void testNote() {
114         filter.note(this.getUri());
115         assertFalse("Receiver was called", this.received);
116     }
117 
118 // FORGET CURRENTLY UNSUPPORTED IN BloomUriUniqFilter
119 //    public void testForget() throws URIException {
120 //        this.filter.forget(this.getUri(),
121 //                new CandidateURI(UURIFactory.getInstance(this.getUri())));
122 //        assertTrue("Didn't forget", this.filter.count() == 0);
123 //    }
124 
125     public void receive(CandidateURI item) {
126         this.received = true;
127     }
128 
129     public String getUri() {
130         return "http://www.archive.org";
131     }
132 }