View Javadoc

1   /* BloomUriUniqFilter
2   *
3   * $Id: BloomUriUniqFilter.java 6891 2010-06-15 22:22:25Z gojomo $
4   *
5   * Created on June 21, 2005
6   *
7   * Copyright (C) 2005 Internet Archive.
8   *
9   * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24  */
25  package org.archive.crawler.util;
26  
27  import java.io.Serializable;
28  import java.util.logging.Logger;
29  
30  import org.archive.crawler.datamodel.CandidateURI;
31  import org.archive.util.BloomFilter;
32  import org.archive.util.BloomFilter64bit;
33  
34  
35  /***
36   * A MG4J BloomFilter-based implementation of an AlreadySeen list.
37   *
38   * This implementation performs adequately without blowing out
39   * the heap through to very large numbers of URIs. See
40   * <a href="http://crawler.archive.org/cgi-bin/wiki.pl?AlreadySeen">AlreadySeen</a>.
41   * 
42   * It is inherent to Bloom filters that as they get 'saturated', their
43   * false-positive rate rises. The default parameters used by this class
44   * attempt to maintain a 1-in-4 million (1 in 2^22) false-positive chance
45   * through 125 million unique inserts, which creates a filter structure 
46   * about 495MB in size. 
47   * 
48   * You may use the following system properties to tune the size and 
49   * false-positive rate of the bloom filter structure used by this class:
50   * 
51   *  org.archive.crawler.util.BloomUriUniqFilter.expected-size (default 125000000)
52   *  org.archive.crawler.util.BloomUriUniqFilter.hash-count (default 22)
53   * 
54   * The resulting filter will take up approximately...
55   * 
56   *    1.44 * expected-size * hash-count / 8 
57   *    
58   * ...bytes. 
59   * 
60   * The default size is very close to the maximum practical size of the 
61   * Bloom filter implementation, BloomFilter32bitSplit, created in the
62   * initialize() method, due to integer arithmetic limits. 
63   * 
64   * If you need a larger filter, you should edit the initialize 
65   * method to intantiate a BloomFilter64bit instead.  
66   * 
67   * @author gojomo
68   * @version $Date: 2010-06-15 22:22:25 +0000 (Tue, 15 Jun 2010) $, $Revision: 6891 $
69   */
70  public class BloomUriUniqFilter extends SetBasedUriUniqFilter
71  implements Serializable {
72  	private static final long serialVersionUID = 1061526253773091309L;
73  
74  	private static Logger LOGGER =
75          Logger.getLogger(BloomUriUniqFilter.class.getName());
76  
77      BloomFilter bloom; // package access for testing convenience
78      protected int expected_n; // remember bloom contruction param
79  
80      protected static final String EXPECTED_SIZE_KEY = ".expected-size";
81      protected static final String HASH_COUNT_KEY = ".hash-count";
82  
83      // these defaults create a bloom filter that is
84      // 1.44*125mil*22/8 ~= 495MB in size, and at full
85      // capacity will give a false contained indication
86      // 1/(2^22) ~= 1 in every 4 million probes
87      private static final int DEFAULT_EXPECTED_SIZE = 125000000; // 125 million
88      private static final int DEFAULT_HASH_COUNT = 22; // 1 in 4 million false pos
89  
90      /***
91       * Default constructor
92       */
93      public BloomUriUniqFilter() {
94          super();
95          String ns = System.getProperty(this.getClass().getName() + EXPECTED_SIZE_KEY);
96          int n = (ns == null) ? DEFAULT_EXPECTED_SIZE : Integer.parseInt(ns);
97          String ds = System.getProperty(this.getClass().getName() + HASH_COUNT_KEY);
98          int d = (ds == null) ? DEFAULT_HASH_COUNT : Integer.parseInt(ds);
99          initialize(n,d);
100     }
101 
102     /***
103      * Constructor.
104      *
105      * @param n the expected number of elements.
106      * @param d the number of hash functions; if the filter adds not more
107      * than <code>n</code> elements, false positives will happen with
108      * probability 2<sup>-<var>d</var></sup>.
109      */
110     public BloomUriUniqFilter( final int n, final int d ) {
111         super();
112         initialize(n, d);
113     }
114 
115     /***
116      * Initializer shared by constructors.
117      *
118      * @param n the expected number of elements.
119      * @param d the number of hash functions; if the filter adds not more
120      * than <code>n</code> elements, false positives will happen with
121      * probability 2<sup>-<var>d</var></sup>.
122      */
123     protected void initialize(final int n, final int d) {
124         this.expected_n = n;
125         bloom = new BloomFilter64bit(n,d);
126     }
127 
128     public void forget(String canonical, CandidateURI item) {
129         // TODO? could use in-memory exception list of currently-forgotten items
130         LOGGER.severe("forget(\""+canonical+"\",CandidateURI) not supported");
131     }
132 
133     
134     protected boolean setAdd(CharSequence uri) {
135         boolean added = bloom.add(uri);
136         // warn if bloom has reached its expected size (and its false-pos
137         // rate will now exceed the theoretical/designed level)
138         if( added && (count() == expected_n)) {
139             LOGGER.warning("Bloom has reached expected limit "+expected_n);
140         }
141         return added;
142     }
143 
144     protected long setCount() {
145         return bloom.size();
146     }
147 
148     protected boolean setRemove(CharSequence uri) {
149         throw new UnsupportedOperationException();
150     }
151 }