View Javadoc

1   /* Copyright (C) 2009 Internet Archive
2    *
3    * This file is part of the Heritrix web crawler (crawler.archive.org).
4    *
5    * Heritrix is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU Lesser Public License as published by
7    * the Free Software Foundation; either version 2.1 of the License, or
8    * any later version.
9    *
10   * Heritrix is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU Lesser Public License for more details.
14   *
15   * You should have received a copy of the GNU Lesser Public License
16   * along with Heritrix; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   *
19   * CrawlHost.java
20   * Created on Aug 5, 2003
21   *
22   * $Header$
23   */
24  package org.archive.crawler.datamodel;
25  
26  import java.io.Serializable;
27  import java.net.InetAddress;
28  import java.util.logging.Level;
29  import java.util.logging.Logger;
30  
31  import org.archive.util.InetAddressUtil;
32  
33  /*** 
34   * Represents a single remote "host".
35   *
36   * An host is a name for which there is a dns record or an IP-address. This
37   * might be a machine or a virtual host.
38   *
39   * @author gojomo
40   */
41  public class CrawlHost implements Serializable, CrawlSubstats.HasCrawlSubstats {
42  
43      private static final long serialVersionUID = -5494573967890942895L;
44  
45      private static final Logger logger = Logger.getLogger(CrawlHost.class.getName());
46      /*** Flag value indicating always-valid IP */
47      public static final long IP_NEVER_EXPIRES = -1;
48      /*** Flag value indicating an IP has not yet been looked up */
49      public static final long IP_NEVER_LOOKED_UP = -2;
50      private String hostname;
51      private String countryCode;
52      private InetAddress ip;
53      private long ipFetched = IP_NEVER_LOOKED_UP;
54      protected CrawlSubstats substats = new CrawlSubstats(); 
55      /***
56       * TTL gotten from dns record.
57       *
58       * From rfc2035:
59       * <pre>
60       * TTL       a 32 bit unsigned integer that specifies the time
61       *           interval (in seconds) that the resource record may be
62       *           cached before it should be discarded.  Zero values are
63       *           interpreted to mean that the RR can only be used for the
64       *           transaction in progress, and should not be cached.
65       * </pre>
66       */
67      private long ipTTL = IP_NEVER_LOOKED_UP;
68  
69      // Used when bandwith constraint are used
70      private long earliestNextURIEmitTime = 0;
71      
72      /*** 
73       * Create a new CrawlHost object.
74       *
75       * @param hostname the host name for this host.
76       */
77      public CrawlHost(String hostname) {
78      		this(hostname, null);
79      }
80  
81      /*** 
82       * Create a new CrawlHost object.
83       *
84       * @param hostname the host name for this host.
85       * @param countryCode the country code for this host.
86       */
87      public CrawlHost(String hostname, String countryCode) {
88          this.hostname = hostname;
89          this.countryCode = countryCode;
90          InetAddress tmp = InetAddressUtil.getIPHostAddress(hostname);
91          if (tmp != null) {
92              setIP(tmp, IP_NEVER_EXPIRES);
93          }
94      }
95  
96      /*** Return true if the IP for this host has been looked up.
97       *
98       * Returns true even if the lookup failed.
99       *
100      * @return true if the IP for this host has been looked up.
101      */
102     public boolean hasBeenLookedUp() {
103         return ipFetched != IP_NEVER_LOOKED_UP;
104     }
105 
106     /***
107      * Set the IP address for this host.
108      *
109      * @param address
110      * @param ttl the TTL from the dns record in seconds or -1 if it should live
111      * forever (is a numeric IP).
112      */
113     public void setIP(InetAddress address, long ttl) {
114         this.ip = address;
115         // Assume that a lookup as occurred by the time
116         // a caller decides to set this (even to null)
117         this.ipFetched = System.currentTimeMillis();
118         this.ipTTL = ttl;
119         if (logger.isLoggable(Level.FINE)) {
120             logger.fine(hostname + ": " +
121                 ((address != null)? address.toString(): "null"));
122         }
123     }
124 
125     /*** Get the IP address for this host.
126      *
127      * @return the IP address for this host.
128      */
129     public InetAddress getIP() {
130         return ip;
131     }
132 
133     /*** Get the time when the IP address for this host was last looked up.
134      *
135      * @return the time when the IP address for this host was last looked up.
136      */
137     public long getIpFetched() {
138         return ipFetched;
139     }
140 
141     /***
142      * Get the TTL value from the dns record for this host.
143      *
144      * @return the TTL value from the dns record for this host -- in seconds --
145      * or -1 if this lookup should be valid forever (numeric ip).
146      */
147     public long getIpTTL() {
148         return this.ipTTL;
149     }
150 
151     public String toString() {
152         return "CrawlHost<" + hostname + "(ip:" + ip + ")>";
153     }
154 
155     @Override
156     public int hashCode() {
157         return this.hostname != null ? this.hostname.hashCode() : 0;
158     }
159 
160     @Override
161     public boolean equals(Object obj) {
162         if (obj == null) {
163             return false;
164         }
165         if (getClass() != obj.getClass()) {
166             return false;
167         }
168         final CrawlHost other = (CrawlHost) obj;
169         if (this.hostname != other.hostname   // identity compare
170                 && (this.hostname == null 
171                     || !this.hostname.equals(other.hostname))) {
172             return false;
173         }
174         return true;
175     }
176 
177     /***
178      * Get the host name.
179      * @return Returns the host name.
180      */
181     public String getHostName() {
182         return hostname;
183     }
184 
185     /*** 
186      * Get the earliest time a URI for this host could be emitted.
187      * This only has effect if constraints on bandwidth per host is set.
188      *
189      * @return Returns the earliestNextURIEmitTime.
190      */
191     public long getEarliestNextURIEmitTime() {
192         return earliestNextURIEmitTime;
193     }
194 
195     /*** 
196      * Set the earliest time a URI for this host could be emitted.
197      * This only has effect if constraints on bandwidth per host is set.
198      *
199      * @param earliestNextURIEmitTime The earliestNextURIEmitTime to set.
200      */
201     public void setEarliestNextURIEmitTime(long earliestNextURIEmitTime) {
202         this.earliestNextURIEmitTime = earliestNextURIEmitTime;
203     }
204 
205     /***
206      * Get country code of this host
207      * 
208      * @return Retruns country code or null if not availabe 
209      */
210 	public String getCountryCode() {
211 		return countryCode;
212 	}
213 
214 	/***
215 	 * Set country code for this hos
216 	 * 
217 	 * @param countryCode The country code of this host
218 	 */
219 	public void setCountryCode(String countryCode) {
220 		this.countryCode = countryCode;
221 	}
222     
223     /* (non-Javadoc)
224      * @see org.archive.crawler.datamodel.CrawlSubstats.HasCrawlSubstats#getSubstats()
225      */
226     public CrawlSubstats getSubstats() {
227         return substats;
228     }
229 }