1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26 package org.archive.crawler.frontier;
27
28 import org.apache.commons.httpclient.URIException;
29 import org.archive.crawler.datamodel.CandidateURI;
30 import org.archive.crawler.datamodel.CrawlHost;
31 import org.archive.crawler.framework.CrawlController;
32
33 /***
34 * Uses the target IPs as basis for queue-assignment,
35 * distributing them over a fixed number of sub-queues.
36 *
37 * @author Christian Kohlschuetter
38 */
39 public class BucketQueueAssignmentPolicy extends QueueAssignmentPolicy {
40 private static final int DEFAULT_NOIP_BITMASK = 1023;
41 private static final int DEFAULT_QUEUES_HOSTS_MODULO = 1021;
42
43 public String getClassKey(final CrawlController controller,
44 final CandidateURI curi) {
45
46 CrawlHost host;
47 try {
48 host = controller.getServerCache().getHostFor(
49 curi.getUURI().getReferencedHost());
50 } catch (URIException e) {
51
52 e.printStackTrace();
53 host = null;
54 }
55 if(host == null) {
56 return "NO-HOST";
57 } else if(host.getIP() == null) {
58 return "NO-IP-".concat(Integer.toString(Math.abs(host.getHostName()
59 .hashCode())
60 & DEFAULT_NOIP_BITMASK));
61 } else {
62 return Integer.toString(Math.abs(host.getIP().hashCode())
63 % DEFAULT_QUEUES_HOSTS_MODULO);
64 }
65 }
66
67 public int maximumNumberOfKeys() {
68 return DEFAULT_NOIP_BITMASK + DEFAULT_QUEUES_HOSTS_MODULO + 2;
69 }
70 }