View Javadoc

1   /* UriUniqFilter
2    * 
3    * Created on Apr 17, 2003
4    * 
5    * Copyright (C) 2003 Internet Archive.
6    *
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    *
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   *
14   * Heritrix is distributed in the hope that it will be useful,
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   *
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.crawler.datamodel;
24  
25  import java.io.File;
26  
27  /***
28   * A UriUniqFilter passes URI objects to a destination
29   * (receiver) if the passed URI object has not been previously seen.
30   * 
31   * If already seen, the passed URI object is dropped.
32   *
33   * <p>For efficiency in comparison against a large history of
34   * seen URIs, URI objects may not be passed immediately, unless 
35   * the addNow() is used or a flush() is forced.
36   * 
37   * @author gojomo
38   * @version $Date: 2005-12-16 03:10:54 +0000 (Fri, 16 Dec 2005) $, $Revision: 4036 $
39   */
40  public interface UriUniqFilter {
41      /***
42       * @return Count of already seen URIs.
43       */
44      public long count();
45      
46      /***
47       * Count of items added, but not yet filtered in or out. 
48       * 
49       * Some implementations may buffer up large numbers of pending
50       * items to be evaluated in a later large batch/scan/merge with 
51       * disk files. 
52       * 
53       * @return Count of items added not yet evaluated 
54       */
55      public long pending();
56  
57      /***
58       * Receiver of uniq URIs.
59       * 
60       * Items that have not been seen before are pass through to this object.
61       * @param receiver Object that will be passed items. Must implement
62       * HasUriReceiver interface.
63       */
64      public void setDestination(HasUriReceiver receiver);
65      
66      /***
67       * Add given uri, if not already present.
68       * @param key Usually a canonicalized version of <code>value</code>.
69       * This is the key used doing lookups, forgets and insertions on the
70       * already included list.
71       * @param value item to add.
72       */
73      public void add(String key, CandidateURI value);
74      
75      /***
76       * Immediately add uri.
77       * @param key Usually a canonicalized version of <code>uri</code>.
78       * This is the key used doing lookups, forgets and insertions on the
79       * already included list.
80       * @param value item to add.
81       */
82      public void addNow(String key, CandidateURI value);
83      
84      /***
85       * Add given uri, all the way through to underlying destination, even 
86       * if already present.
87       * 
88       * (Sometimes a URI must be fetched, or refetched, for example when
89       * DNS or robots info expires or the operator forces a refetch. A
90       * normal add() or addNow() would drop the URI without forwarding
91       * on once it is determmined to already be in the filter.) 
92       * 
93       * @param key Usually a canonicalized version of <code>uri</code>.
94       * This is the key used doing lookups, forgets and insertions on the
95       * already included list.
96       * @param value item to add.
97       */
98      public void addForce(String key, CandidateURI value);
99      
100     /***
101      * Note item as seen, without passing through to receiver.
102      * @param key Usually a canonicalized version of an <code>URI</code>.
103      * This is the key used doing lookups, forgets and insertions on the
104      * already included list.
105      */
106     public void note(String key);
107     
108     /***
109      * Forget item was seen
110      * @param key Usually a canonicalized version of an <code>URI</code>.
111      * This is the key used doing lookups, forgets and insertions on the
112      * already included list.
113      * @param value item to add.
114      */
115     public void forget(String key, CandidateURI value);
116     
117     /***
118      * Request that any pending items be added/dropped. Implementors
119      * may ignore the request if a flush would be too expensive/too 
120      * soon. 
121      * 
122      * @return Number added.
123      */
124     public long requestFlush();
125     
126     /***
127      * Close down any allocated resources.
128      * Makes sense calling this when checkpointing.
129      */
130     public void close();
131     
132     /***
133      * Set a File to receive a log for replay profiling. 
134      */
135     public void setProfileLog(File logfile);
136     
137     /***
138      * URIs that have not been seen before 'visit' this 'Visitor'.
139      * 
140      * Usually implementations of Frontier implement this interface.
141      * @author gojomo
142      */
143     public interface HasUriReceiver {
144         /***
145          * @param item Candidate uri tem that is 'visiting'.
146          */
147         public void receive(CandidateURI item);
148     }
149 }