View Javadoc

1   /* ExternalGeoLocationDecideRule
2    * 
3    * Created on May 25, 2005
4    *
5    * Copyright (C) 2005 Internet Archive.
6    * 
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    * 
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   * 
14   * Heritrix is distributed in the hope that it will be useful, 
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   * 
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.crawler.deciderules;
24  
25  import java.net.InetAddress;
26  import java.net.UnknownHostException;
27  import java.util.logging.Level;
28  import java.util.logging.Logger;
29  
30  
31  import org.apache.commons.httpclient.URIException;
32  import org.archive.crawler.datamodel.CandidateURI;
33  import org.archive.crawler.datamodel.CrawlHost;
34  import org.archive.crawler.settings.SimpleType;
35  import org.xbill.DNS.Address;
36  
37  /***
38   * A rule that can be configured to take alternate implementations
39   * of the ExternalGeoLocationInterface.
40   * If no implementation specified, or none found, returns configured decision.
41   * If host in URI has been resolved checks CrawlHost for the country code
42   * determination.
43   * If country code is not present, does country lookup, and saves the country
44   * code to <code>CrawlHost</code> for future consultation.
45   * If country code is present in <code>CrawlHost</code>, compares it against
46   * the configured code.
47   * Note that if a host's IP address changes during the crawl, we still consider
48   * the associated hostname to be in the country of its original IP address.
49   * 
50   * @author Igor Ranitovic
51   */
52  public class ExternalGeoLocationDecideRule
53  extends PredicatedDecideRule {
54  
55      private static final long serialVersionUID = -32974116429860725L;
56  
57      private static final Logger LOGGER =
58          Logger.getLogger(ExternalGeoLocationDecideRule.class.getName());
59      static final String ATTR_IMPLEMENTATION = "implementation-class";
60      static final String ATTR_COUNTRY_CODE = "country-code";
61      static final String DEFAULT_COUNTRY_CODE = "--";
62      private String countryCode;
63      private ExternalGeoLookupInterface implementation = null;
64  
65      /***
66       * @param name Name of this rule.
67       */
68      public ExternalGeoLocationDecideRule(String name) {
69          super(name);
70          setDescription("ExternalGeoLocationDecideRule. Rule that " +
71              "instantiates implementations of the ExternalGeoLookupInterface. " +
72              "The implementation needs to be present on the classpath. " +
73              "On initialization, the implementation is instantiated (" +
74              "assumption is that there is public constructor that takes +" +
75              "country code).");
76          addElementToDefinition(new SimpleType(ATTR_IMPLEMENTATION,
77              "Name of implementation of ExternalGeoLookupInterface class to " +
78              "instantiate.", ""));
79          addElementToDefinition(new SimpleType(ATTR_COUNTRY_CODE,
80                  "Country code name.", ""));
81  
82      }
83      
84      protected boolean evaluate(Object obj) {
85          ExternalGeoLookupInterface impl = getConfiguredImplementation(obj);
86          if (impl == null) {
87              return false;
88          }
89          CrawlHost crawlHost = null;
90          String host;
91          InetAddress address;
92          try {
93  			if (obj instanceof CandidateURI) {
94  				host = ((CandidateURI) obj).getUURI().getHost();
95  				crawlHost = getSettingsHandler().getOrder()
96  				   .getController().getServerCache().getHostFor(host);
97  				if (crawlHost.getCountryCode() != null){
98  				   return (crawlHost.getCountryCode().equals(countryCode))
99  				   				? true : false;
100 				}
101 				address = crawlHost.getIP();
102 				if (address == null) {
103 					address = Address.getByName(host); 
104 				}
105 				crawlHost.setCountryCode((String)impl.lookup(address));
106 				if (crawlHost.getCountryCode().equals(countryCode)){
107 					LOGGER.fine("Country Code Lookup: " + " " + host +
108 							crawlHost.getCountryCode());
109 					return true;
110 				}
111 			}
112 		} catch (UnknownHostException e) {
113 			LOGGER.log(Level.FINE, "Failed dns lookup " + obj, e);
114 			if (crawlHost != null){
115 				crawlHost.setCountryCode(DEFAULT_COUNTRY_CODE);
116 			}
117 		} catch (URIException e) {
118 			LOGGER.log(Level.FINE, "Failed to parse hostname " + obj, e);
119 		}
120 		
121 		return false;
122     }
123     
124     /***
125 	 * Get implementation, if one specified. If none specified, will keep trying
126 	 * to find one. Will be messy if the provided class is not-instantiable
127 	 * 
128 	 * @param o A context object.
129 	 * @return Instance of <code>ExternalGeoLookupInterface</code> or null.
130 	 */
131     protected synchronized ExternalGeoLookupInterface
132             getConfiguredImplementation(Object o) {
133         if (this.implementation != null) {
134             return this.implementation;
135         }
136         ExternalGeoLookupInterface result = null;
137         try {
138         	String className =
139                 (String)getAttribute(o, ATTR_IMPLEMENTATION);
140             countryCode = (String)getAttribute(o, ATTR_COUNTRY_CODE);
141             if (className != null && className.length() != 0) {
142                 Object obj = Class.forName(className).getConstructor(new Class[]
143                       {String.class}).newInstance(new Object[] {countryCode});
144                 if (!(obj instanceof ExternalGeoLookupInterface)) {
145                     LOGGER.severe("Implementation " + className + 
146                         " does not implement ExternalGeoLookupInterface");
147                 }
148                 result = (ExternalGeoLookupInterface)obj;
149                 this.implementation = result;
150             }
151         } catch (Exception e) {
152             LOGGER.severe(e.getMessage());
153         } 
154         return result;
155     }
156 }