1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler.frontier;
26
27 import java.util.logging.Level;
28 import java.util.logging.Logger;
29
30 import org.apache.commons.httpclient.URIException;
31 import org.archive.crawler.datamodel.CandidateURI;
32 import org.archive.crawler.framework.CrawlController;
33 import org.archive.net.UURI;
34 import org.archive.net.UURIFactory;
35
36 /***
37 * SurtAuthorityQueueAssignmentPolicy based on the surt form of hostname.
38 */
39 public class SurtAuthorityQueueAssignmentPolicy
40 extends QueueAssignmentPolicy {
41 private static final Logger logger = Logger
42 .getLogger(SurtAuthorityQueueAssignmentPolicy.class.getName());
43 /***
44 * When neat host-based class-key fails us
45 */
46 private static String DEFAULT_CLASS_KEY = "default...";
47
48 private static final String DNS = "dns";
49
50 public String getClassKey(CrawlController controller, CandidateURI cauri) {
51 String scheme = cauri.getUURI().getScheme();
52 String candidate = null;
53 try {
54 if (scheme.equals(DNS)) {
55 UURI effectiveuuri;
56 if (cauri.getVia() != null) {
57
58
59
60
61
62
63 effectiveuuri = UURIFactory.getInstance(cauri.flattenVia());
64 } else {
65
66
67 effectiveuuri = UURIFactory.getInstance("http://" +
68 cauri.getUURI().getPath());
69 }
70 candidate = getSurtAuthority(effectiveuuri.getSurtForm());
71 } else {
72 candidate = getSurtAuthority(cauri.getUURI().getSurtForm());
73 }
74
75 if(candidate == null || candidate.length() == 0) {
76 candidate = DEFAULT_CLASS_KEY;
77 }
78 } catch (URIException e) {
79 logger.log(Level.INFO,
80 "unable to extract class key; using default", e);
81 candidate = DEFAULT_CLASS_KEY;
82 }
83
84 return candidate.replace(':','#');
85 }
86
87 protected String getSurtAuthority(String surt) {
88 int indexOfOpen = surt.indexOf("://(");
89 int indexOfClose = surt.indexOf(")");
90 if (indexOfOpen == -1 || indexOfClose == -1
91 || ((indexOfOpen + 4) >= indexOfClose)) {
92 return DEFAULT_CLASS_KEY;
93 }
94 return surt.substring(indexOfOpen + 4, indexOfClose);
95 }
96 }