1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.crawler.datamodel;
24
25 import java.io.File;
26
27 /***
28 * A UriUniqFilter passes URI objects to a destination
29 * (receiver) if the passed URI object has not been previously seen.
30 *
31 * If already seen, the passed URI object is dropped.
32 *
33 * <p>For efficiency in comparison against a large history of
34 * seen URIs, URI objects may not be passed immediately, unless
35 * the addNow() is used or a flush() is forced.
36 *
37 * @author gojomo
38 * @version $Date: 2005-12-16 03:10:54 +0000 (Fri, 16 Dec 2005) $, $Revision: 4036 $
39 */
40 public interface UriUniqFilter {
41 /***
42 * @return Count of already seen URIs.
43 */
44 public long count();
45
46 /***
47 * Count of items added, but not yet filtered in or out.
48 *
49 * Some implementations may buffer up large numbers of pending
50 * items to be evaluated in a later large batch/scan/merge with
51 * disk files.
52 *
53 * @return Count of items added not yet evaluated
54 */
55 public long pending();
56
57 /***
58 * Receiver of uniq URIs.
59 *
60 * Items that have not been seen before are pass through to this object.
61 * @param receiver Object that will be passed items. Must implement
62 * HasUriReceiver interface.
63 */
64 public void setDestination(HasUriReceiver receiver);
65
66 /***
67 * Add given uri, if not already present.
68 * @param key Usually a canonicalized version of <code>value</code>.
69 * This is the key used doing lookups, forgets and insertions on the
70 * already included list.
71 * @param value item to add.
72 */
73 public void add(String key, CandidateURI value);
74
75 /***
76 * Immediately add uri.
77 * @param key Usually a canonicalized version of <code>uri</code>.
78 * This is the key used doing lookups, forgets and insertions on the
79 * already included list.
80 * @param value item to add.
81 */
82 public void addNow(String key, CandidateURI value);
83
84 /***
85 * Add given uri, all the way through to underlying destination, even
86 * if already present.
87 *
88 * (Sometimes a URI must be fetched, or refetched, for example when
89 * DNS or robots info expires or the operator forces a refetch. A
90 * normal add() or addNow() would drop the URI without forwarding
91 * on once it is determmined to already be in the filter.)
92 *
93 * @param key Usually a canonicalized version of <code>uri</code>.
94 * This is the key used doing lookups, forgets and insertions on the
95 * already included list.
96 * @param value item to add.
97 */
98 public void addForce(String key, CandidateURI value);
99
100 /***
101 * Note item as seen, without passing through to receiver.
102 * @param key Usually a canonicalized version of an <code>URI</code>.
103 * This is the key used doing lookups, forgets and insertions on the
104 * already included list.
105 */
106 public void note(String key);
107
108 /***
109 * Forget item was seen
110 * @param key Usually a canonicalized version of an <code>URI</code>.
111 * This is the key used doing lookups, forgets and insertions on the
112 * already included list.
113 * @param value item to add.
114 */
115 public void forget(String key, CandidateURI value);
116
117 /***
118 * Request that any pending items be added/dropped. Implementors
119 * may ignore the request if a flush would be too expensive/too
120 * soon.
121 *
122 * @return Number added.
123 */
124 public long requestFlush();
125
126 /***
127 * Close down any allocated resources.
128 * Makes sense calling this when checkpointing.
129 */
130 public void close();
131
132 /***
133 * Set a File to receive a log for replay profiling.
134 */
135 public void setProfileLog(File logfile);
136
137 /***
138 * URIs that have not been seen before 'visit' this 'Visitor'.
139 *
140 * Usually implementations of Frontier implement this interface.
141 * @author gojomo
142 */
143 public interface HasUriReceiver {
144 /***
145 * @param item Candidate uri tem that is 'visiting'.
146 */
147 public void receive(CandidateURI item);
148 }
149 }