1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler.frontier;
26
27 import java.util.logging.Level;
28 import java.util.logging.Logger;
29
30 import org.apache.commons.httpclient.URIException;
31 import org.archive.crawler.datamodel.CandidateURI;
32 import org.archive.crawler.framework.CrawlController;
33 import org.archive.net.UURI;
34 import org.archive.net.UURIFactory;
35
36 /***
37 * QueueAssignmentPolicy based on the hostname:port evident in the given
38 * CrawlURI.
39 *
40 * @author gojomo
41 */
42 public class HostnameQueueAssignmentPolicy extends QueueAssignmentPolicy {
43 private static final Logger logger = Logger
44 .getLogger(HostnameQueueAssignmentPolicy.class.getName());
45 /***
46 * When neat host-based class-key fails us
47 */
48 private static String DEFAULT_CLASS_KEY = "default...";
49
50 private static final String DNS = "dns";
51
52 public String getClassKey(CrawlController controller, CandidateURI cauri) {
53 String scheme = cauri.getUURI().getScheme();
54 String candidate = null;
55 try {
56 if (scheme.equals(DNS)){
57 if (cauri.getVia() != null) {
58
59
60
61
62
63
64 UURI viaUuri = UURIFactory.getInstance(cauri.flattenVia());
65 candidate = viaUuri.getAuthorityMinusUserinfo();
66
67 scheme = viaUuri.getScheme();
68 } else {
69 candidate= cauri.getUURI().getReferencedHost();
70 }
71 } else {
72 candidate = cauri.getUURI().getAuthorityMinusUserinfo();
73 }
74
75 if(candidate == null || candidate.length() == 0) {
76 candidate = DEFAULT_CLASS_KEY;
77 }
78 } catch (URIException e) {
79 logger.log(Level.INFO,
80 "unable to extract class key; using default", e);
81 candidate = DEFAULT_CLASS_KEY;
82 }
83 if (scheme != null && scheme.equals(UURIFactory.HTTPS)) {
84
85
86 if (!candidate.matches(".+:[0-9]+")) {
87 candidate += UURIFactory.HTTPS_PORT;
88 }
89 }
90
91 return candidate.replace(':','#');
92 }
93
94 }