1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler.util;
26
27 import java.io.Serializable;
28 import java.util.logging.Logger;
29
30 import org.archive.crawler.datamodel.CandidateURI;
31 import org.archive.util.BloomFilter;
32 import org.archive.util.BloomFilter64bit;
33
34
35 /***
36 * A MG4J BloomFilter-based implementation of an AlreadySeen list.
37 *
38 * This implementation performs adequately without blowing out
39 * the heap through to very large numbers of URIs. See
40 * <a href="http://crawler.archive.org/cgi-bin/wiki.pl?AlreadySeen">AlreadySeen</a>.
41 *
42 * It is inherent to Bloom filters that as they get 'saturated', their
43 * false-positive rate rises. The default parameters used by this class
44 * attempt to maintain a 1-in-4 million (1 in 2^22) false-positive chance
45 * through 125 million unique inserts, which creates a filter structure
46 * about 495MB in size.
47 *
48 * You may use the following system properties to tune the size and
49 * false-positive rate of the bloom filter structure used by this class:
50 *
51 * org.archive.crawler.util.BloomUriUniqFilter.expected-size (default 125000000)
52 * org.archive.crawler.util.BloomUriUniqFilter.hash-count (default 22)
53 *
54 * The resulting filter will take up approximately...
55 *
56 * 1.44 * expected-size * hash-count / 8
57 *
58 * ...bytes.
59 *
60 * The default size is very close to the maximum practical size of the
61 * Bloom filter implementation, BloomFilter32bitSplit, created in the
62 * initialize() method, due to integer arithmetic limits.
63 *
64 * If you need a larger filter, you should edit the initialize
65 * method to intantiate a BloomFilter64bit instead.
66 *
67 * @author gojomo
68 * @version $Date: 2010-06-15 22:22:25 +0000 (Tue, 15 Jun 2010) $, $Revision: 6891 $
69 */
70 public class BloomUriUniqFilter extends SetBasedUriUniqFilter
71 implements Serializable {
72 private static final long serialVersionUID = 1061526253773091309L;
73
74 private static Logger LOGGER =
75 Logger.getLogger(BloomUriUniqFilter.class.getName());
76
77 BloomFilter bloom;
78 protected int expected_n;
79
80 protected static final String EXPECTED_SIZE_KEY = ".expected-size";
81 protected static final String HASH_COUNT_KEY = ".hash-count";
82
83
84
85
86
87 private static final int DEFAULT_EXPECTED_SIZE = 125000000;
88 private static final int DEFAULT_HASH_COUNT = 22;
89
90 /***
91 * Default constructor
92 */
93 public BloomUriUniqFilter() {
94 super();
95 String ns = System.getProperty(this.getClass().getName() + EXPECTED_SIZE_KEY);
96 int n = (ns == null) ? DEFAULT_EXPECTED_SIZE : Integer.parseInt(ns);
97 String ds = System.getProperty(this.getClass().getName() + HASH_COUNT_KEY);
98 int d = (ds == null) ? DEFAULT_HASH_COUNT : Integer.parseInt(ds);
99 initialize(n,d);
100 }
101
102 /***
103 * Constructor.
104 *
105 * @param n the expected number of elements.
106 * @param d the number of hash functions; if the filter adds not more
107 * than <code>n</code> elements, false positives will happen with
108 * probability 2<sup>-<var>d</var></sup>.
109 */
110 public BloomUriUniqFilter( final int n, final int d ) {
111 super();
112 initialize(n, d);
113 }
114
115 /***
116 * Initializer shared by constructors.
117 *
118 * @param n the expected number of elements.
119 * @param d the number of hash functions; if the filter adds not more
120 * than <code>n</code> elements, false positives will happen with
121 * probability 2<sup>-<var>d</var></sup>.
122 */
123 protected void initialize(final int n, final int d) {
124 this.expected_n = n;
125 bloom = new BloomFilter64bit(n,d);
126 }
127
128 public void forget(String canonical, CandidateURI item) {
129
130 LOGGER.severe("forget(\""+canonical+"\",CandidateURI) not supported");
131 }
132
133
134 protected boolean setAdd(CharSequence uri) {
135 boolean added = bloom.add(uri);
136
137
138 if( added && (count() == expected_n)) {
139 LOGGER.warning("Bloom has reached expected limit "+expected_n);
140 }
141 return added;
142 }
143
144 protected long setCount() {
145 return bloom.size();
146 }
147
148 protected boolean setRemove(CharSequence uri) {
149 throw new UnsupportedOperationException();
150 }
151 }