package org.archive.crawler.frontier;

import java.io.IOException;
import java.util.Hashtable;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;
import javax.management.MBeanException;
import javax.management.ReflectionException;

import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.event.CrawlURIDispositionListener;
import org.archive.crawler.filter.OrFilter;
import org.archive.crawler.filter.URIRegExpFilter;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.prefetch.QuotaEnforcer;
import org.archive.crawler.scope.ClassicScope;
import org.archive.crawler.settings.CrawlerSettings;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;

/**
 * Behaves like {@link BdbFrontier} (i.e., a basic, mostly breadth-first
 * frontier), but adds the ability to set the number of documents to
 * download on a per-site basis.
 *
 * Useful for frequent revisits of sites that change frequently.
 *
 * <p>Choose the number of documents you want to download and specify the
 * count in <code>max-docs</code>. The <code>counter-mode</code> attribute
 * controls how that count is applied. With <code>count-per-override</code>
 * (the default), <code>max-docs</code> acts like the crawl order's total
 * download limit, and an override sets its own total for the overridden
 * domain or host. With <code>count-per-host</code> or
 * <code>count-per-domain</code>, the crawler downloads up to
 * <code>max-docs</code> documents from each host or domain in scope. An
 * overridden <code>max-docs</code> count is used instead, whether it is
 * higher or lower.
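 *
 * <p>For example, to cap each host at 30 documents, set
 * <code>counter-mode</code> to <code>count-per-host</code> and
 * <code>max-docs</code> to 30. The sketch below shows the general shape
 * of such a frontier definition in a Heritrix 1.x order file; the exact
 * element layout varies between versions, so treat it as illustrative
 * rather than definitive:
 *
 * <pre>
 * &lt;newObject name="frontier"
 *     class="org.archive.crawler.frontier.DomainSensitiveFrontier"&gt;
 *   &lt;string name="counter-mode"&gt;count-per-host&lt;/string&gt;
 *   &lt;long name="max-docs"&gt;30&lt;/long&gt;
 * &lt;/newObject&gt;
 * </pre>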
 *
 * @author Oskar Grenholm <oskar dot grenholm at kb dot se>
 * @deprecated As of release 1.10.0. Replaced by {@link BdbFrontier} and
 * {@link QuotaEnforcer}.
 */
public class DomainSensitiveFrontier extends BdbFrontier
implements CrawlURIDispositionListener {

    private static final long serialVersionUID = -3330190056282726202L;

    private static final Logger logger =
        Logger.getLogger(DomainSensitiveFrontier.class.getName());

    public static final String ATTR_MAX_DOCS = "max-docs";
    public static final String ATTR_COUNTER_MODE = "counter-mode";
    public static final String COUNT_OVERRIDE = "count-per-override";
    public static final String COUNT_HOST = "count-per-host";
    public static final String COUNT_DOMAIN = "count-per-domain";
    public static final String[] ATTR_AVAILABLE_MODES = new String[] {
        COUNT_OVERRIDE, COUNT_HOST, COUNT_DOMAIN };
    public static final String DEFAULT_MODE = COUNT_OVERRIDE;

    /** Download counts so far, keyed by override scope, host, or domain,
     *  depending on the configured counter-mode. */
    private Hashtable<String,Long> hostCounters = new Hashtable<String,Long>();
    /** True in count-per-override mode, where counts are also kept for
     *  parent (less specific) settings scopes. */
    private boolean countPerOverride = true;
    /** Cached value of the counter-mode attribute. */
    private String counterMode;

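    /**
     * Sets up the module description and defines the
     * <code>max-docs</code> and <code>counter-mode</code> attributes.
     */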
    public DomainSensitiveFrontier(String name) {
        super(ATTR_NAME, "DomainSensitiveFrontier. *Deprecated* Use " +
            "BdbFrontier+QuotaEnforcer instead. " +
            "Overrides BdbFrontier to add specification of number of " +
            "documents to download (Expects 'exclude-filter' " +
            "to be part of CrawlScope).");
        Type e = addElementToDefinition(new SimpleType(ATTR_MAX_DOCS,
            "Maximum number of documents to download for host or domain" +
            " (zero means no limit).", Long.valueOf(0)));
        e.setOverrideable(true);
        e = addElementToDefinition(new SimpleType(ATTR_COUNTER_MODE,
            "If " + COUNT_OVERRIDE + ", acts like the crawl " +
            "order maximum download count and the crawler will download " +
            "this total amount of docs only. Override to change the max " +
            "count for the overridden domain or host. " +
            "Else if " + COUNT_HOST + ", the crawler will download " +
            ATTR_MAX_DOCS + " per host. Add an override to change the " +
            "max count on a per-domain or a per-host basis. For " +
            "example, if you set " + ATTR_MAX_DOCS + " to 30 in " +
            "this mode, the crawler will download 30 docs from " +
            "each host in scope. If you override for kb.se, setting " +
            ATTR_MAX_DOCS +
            " to 20, it will instead download only 20 docs from each " +
            "host of kb.se. (The override value may be larger or " +
            "smaller.) " +
            "Finally, " + COUNT_DOMAIN + " behaves similarly to " +
            COUNT_HOST +
            ", but instead sets the max on a per-domain basis. " +
            "Here you can do overrides on the domain level, but " +
            "not on the host level. So if you here set " +
            ATTR_MAX_DOCS +
            " to 30, the crawler will download 30 docs from each " +
            "domain in scope. If you override for kb.se, setting " +
            ATTR_MAX_DOCS + " to 20, it will instead download only " +
            "20 docs in total from the whole kb.se domain. (The " +
            "override value may be larger or smaller.)",
            DEFAULT_MODE, ATTR_AVAILABLE_MODES));
        e.setOverrideable(false);
    }

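    /**
     * Registers this frontier as a {@link CrawlURIDispositionListener},
     * so it is told the final disposition of every crawled URI, and
     * caches the configured counter-mode.
     */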
    public void initialize(CrawlController c)
    throws FatalConfigurationException, IOException {
        super.initialize(c);
        this.controller.addCrawlURIDispositionListener(this);
        try {
            counterMode = (String) getAttribute(ATTR_COUNTER_MODE);
            // Counts are propagated to parent settings scopes only in
            // count-per-override mode.
            countPerOverride = !(counterMode.equalsIgnoreCase(COUNT_DOMAIN)
                || counterMode.equalsIgnoreCase(COUNT_HOST));
        } catch (AttributeNotFoundException e) {
            logger.log(Level.SEVERE, "Could not read " + ATTR_COUNTER_MODE, e);
        } catch (MBeanException e) {
            logger.log(Level.SEVERE, "Could not read " + ATTR_COUNTER_MODE, e);
        } catch (ReflectionException e) {
            logger.log(Level.SEVERE, "Could not read " + ATTR_COUNTER_MODE, e);
        }
    }

    /**
     * Check if the max document download limit for this host or domain has
     * been reached.
     *
     * If so, delete the rest of the URIs for this host or domain waiting in
     * the queue, then add a URIRegExpFilter for this host or domain so we
     * won't get any more URIs from it later on.
     *
     * @param curi CrawlURI.
     * @return True if the queue was discarded.
     */
    private synchronized boolean checkDownloadLimits(CrawlURI curi) {
        long thisMaxDocs = 0;
        long thisCounter = 0;
        boolean discarded = false;
        boolean retVal = false;
        // DNS lookups never count against the document limits.
        if (curi.getUURI().getScheme().equals("dns")) {
            return false;
        }
        try {
            String host = curi.getUURI().getHost();
            CrawlerSettings cs = controller.getSettingsHandler().
                getSettings(host);
            // In count-per-override mode, walk up the settings hierarchy
            // so the limit is checked at every scope this host belongs to.
            do {
                String scope;
                if (counterMode.equalsIgnoreCase(COUNT_OVERRIDE)) {
                    scope = cs.getScope() != null ? cs.getScope() : "root";
                } else if (counterMode.equalsIgnoreCase(COUNT_HOST)) {
                    scope = host;
                } else {
                    // Naive domain extraction: keep the last two labels,
                    // e.g. "www.kb.se" -> "kb.se". Multi-label public
                    // suffixes such as "co.uk" are not handled.
                    int i = host.lastIndexOf(".");
                    i = host.lastIndexOf(".", i - 1);
                    scope = host.substring(i + 1);
                }
                thisMaxDocs =
                    ((Long) getAttribute(cs, ATTR_MAX_DOCS)).longValue();
                Long cached = this.hostCounters.get(scope);
                thisCounter = cached != null ? cached.longValue() : 0;

                if (thisMaxDocs > 0 && thisCounter >= thisMaxDocs) {
                    logger.fine("Discarding Queue: " + host + " ");
                    curi.addAnnotation("dsfLimit");
                    if (!discarded) {
                        // Empty this URI's work queue so nothing more is
                        // scheduled from it.
                        long count = 0;
                        WorkQueue wq = getQueueFor(curi);
                        wq.unpeek();
                        count += wq.deleteMatching(this, ".*");
                        decrementQueuedCount(count);
                        discarded = true;
                    }
                    OrFilter or = (OrFilter) this.controller.getScope()
                        .getAttribute(ClassicScope.ATTR_EXCLUDE_FILTER);
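                    // Exclude any future URI from this host or domain: the
                    // pattern matches an authority ending in the scope, or
                    // everything for the "root" scope. Dots in the scope
                    // string are not regex-escaped, so the filter is
                    // slightly broader than the literal name.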
                    String filter = scope.equalsIgnoreCase("root") ?
                        ".*" : "^((https?://)?[a-zA-Z0-9./]*)" + scope +
                        "($|/.*)";
                    logger.fine("Adding filter: [" + filter + "].");
                    URIRegExpFilter urf =
                        new URIRegExpFilter(curi.toString(), filter);
                    or.addFilter(this.controller.getSettingsHandler().
                        getSettings(null), urf);
                    thisMaxDocs = 0;
                    thisCounter = 0;
                    retVal = true;
                }
            } while ((cs = cs.getParent()) != null && countPerOverride);
        } catch (Exception e) {
            logger.log(Level.SEVERE, "checkDownloadLimits() failed " +
                "while processing {" + curi + "}: " + e.getMessage(), e);
        }
        return retVal;
    }

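    /**
     * Bump the download counter for this URI's host, domain, or override
     * scope (and, in count-per-override mode, for every parent scope as
     * well). DNS URIs are ignored.
     */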
    protected synchronized void incrementHostCounters(CrawlURI curi) {
        if (!curi.getUURI().toString().startsWith("dns:")) {
            try {
                String host = curi.getUURI().getHost();
                CrawlerSettings cs =
                    controller.getSettingsHandler().getSettings(host);
                do {
                    String scope;
                    if (counterMode.equalsIgnoreCase(COUNT_OVERRIDE)) {
                        scope = cs.getScope() != null ?
                            cs.getScope() : "root";
                    } else if (counterMode.equalsIgnoreCase(COUNT_HOST)) {
                        scope = host;
                    } else {
                        // Same naive two-label domain extraction as in
                        // checkDownloadLimits().
                        int i = host.lastIndexOf(".");
                        i = host.lastIndexOf(".", i - 1);
                        scope = host.substring(i + 1);
                    }
                    Long cached = this.hostCounters.get(scope);
                    long counter = cached != null ? cached.longValue() : 0;
                    this.hostCounters.put(scope, Long.valueOf(++counter));
                } while ((cs = cs.getParent()) != null && countPerOverride);
            } catch (Exception e) {
                logger.log(Level.SEVERE, "incrementHostCounters() failed: " +
                    e.getMessage(), e);
            }
        }
    }

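    /**
     * Called by the CrawlController for each successfully crawled URI:
     * count the download, then enforce the configured limits.
     */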
    public void crawledURISuccessful(CrawlURI curi) {
        incrementHostCounters(curi);
        checkDownloadLimits(curi);
    }

    // Retries, disregards, and failures do not count against the limits,
    // so the remaining dispositions are no-ops.
    public void crawledURINeedRetry(CrawlURI curi) {
    }

    public void crawledURIDisregard(CrawlURI curi) {
    }

    public void crawledURIFailure(CrawlURI curi) {
    }
}