1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.crawler.deciderules;
24
25 import java.net.InetAddress;
26 import java.net.UnknownHostException;
27 import java.util.logging.Level;
28 import java.util.logging.Logger;
29
30
31 import org.apache.commons.httpclient.URIException;
32 import org.archive.crawler.datamodel.CandidateURI;
33 import org.archive.crawler.datamodel.CrawlHost;
34 import org.archive.crawler.settings.SimpleType;
35 import org.xbill.DNS.Address;
36
37 /***
38 * A rule that can be configured to take alternate implementations
39 * of the ExternalGeoLocationInterface.
40 * If no implementation specified, or none found, returns configured decision.
41 * If host in URI has been resolved checks CrawlHost for the country code
42 * determination.
43 * If country code is not present, does country lookup, and saves the country
44 * code to <code>CrawlHost</code> for future consultation.
45 * If country code is present in <code>CrawlHost</code>, compares it against
46 * the configured code.
47 * Note that if a host's IP address changes during the crawl, we still consider
48 * the associated hostname to be in the country of its original IP address.
49 *
50 * @author Igor Ranitovic
51 */
52 public class ExternalGeoLocationDecideRule
53 extends PredicatedDecideRule {
54
55 private static final long serialVersionUID = -32974116429860725L;
56
57 private static final Logger LOGGER =
58 Logger.getLogger(ExternalGeoLocationDecideRule.class.getName());
59 static final String ATTR_IMPLEMENTATION = "implementation-class";
60 static final String ATTR_COUNTRY_CODE = "country-code";
61 static final String DEFAULT_COUNTRY_CODE = "--";
62 private String countryCode;
63 private ExternalGeoLookupInterface implementation = null;
64
65 /***
66 * @param name Name of this rule.
67 */
68 public ExternalGeoLocationDecideRule(String name) {
69 super(name);
70 setDescription("ExternalGeoLocationDecideRule. Rule that " +
71 "instantiates implementations of the ExternalGeoLookupInterface. " +
72 "The implementation needs to be present on the classpath. " +
73 "On initialization, the implementation is instantiated (" +
74 "assumption is that there is public constructor that takes +" +
75 "country code).");
76 addElementToDefinition(new SimpleType(ATTR_IMPLEMENTATION,
77 "Name of implementation of ExternalGeoLookupInterface class to " +
78 "instantiate.", ""));
79 addElementToDefinition(new SimpleType(ATTR_COUNTRY_CODE,
80 "Country code name.", ""));
81
82 }
83
84 protected boolean evaluate(Object obj) {
85 ExternalGeoLookupInterface impl = getConfiguredImplementation(obj);
86 if (impl == null) {
87 return false;
88 }
89 CrawlHost crawlHost = null;
90 String host;
91 InetAddress address;
92 try {
93 if (obj instanceof CandidateURI) {
94 host = ((CandidateURI) obj).getUURI().getHost();
95 crawlHost = getSettingsHandler().getOrder()
96 .getController().getServerCache().getHostFor(host);
97 if (crawlHost.getCountryCode() != null){
98 return (crawlHost.getCountryCode().equals(countryCode))
99 ? true : false;
100 }
101 address = crawlHost.getIP();
102 if (address == null) {
103 address = Address.getByName(host);
104 }
105 crawlHost.setCountryCode((String)impl.lookup(address));
106 if (crawlHost.getCountryCode().equals(countryCode)){
107 LOGGER.fine("Country Code Lookup: " + " " + host +
108 crawlHost.getCountryCode());
109 return true;
110 }
111 }
112 } catch (UnknownHostException e) {
113 LOGGER.log(Level.FINE, "Failed dns lookup " + obj, e);
114 if (crawlHost != null){
115 crawlHost.setCountryCode(DEFAULT_COUNTRY_CODE);
116 }
117 } catch (URIException e) {
118 LOGGER.log(Level.FINE, "Failed to parse hostname " + obj, e);
119 }
120
121 return false;
122 }
123
124 /***
125 * Get implementation, if one specified. If none specified, will keep trying
126 * to find one. Will be messy if the provided class is not-instantiable
127 *
128 * @param o A context object.
129 * @return Instance of <code>ExternalGeoLookupInterface</code> or null.
130 */
131 protected synchronized ExternalGeoLookupInterface
132 getConfiguredImplementation(Object o) {
133 if (this.implementation != null) {
134 return this.implementation;
135 }
136 ExternalGeoLookupInterface result = null;
137 try {
138 String className =
139 (String)getAttribute(o, ATTR_IMPLEMENTATION);
140 countryCode = (String)getAttribute(o, ATTR_COUNTRY_CODE);
141 if (className != null && className.length() != 0) {
142 Object obj = Class.forName(className).getConstructor(new Class[]
143 {String.class}).newInstance(new Object[] {countryCode});
144 if (!(obj instanceof ExternalGeoLookupInterface)) {
145 LOGGER.severe("Implementation " + className +
146 " does not implement ExternalGeoLookupInterface");
147 }
148 result = (ExternalGeoLookupInterface)obj;
149 this.implementation = result;
150 }
151 } catch (Exception e) {
152 LOGGER.severe(e.getMessage());
153 }
154 return result;
155 }
156 }