1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 package org.archive.crawler.datamodel;
25
26 import java.io.Serializable;
27 import java.net.InetAddress;
28 import java.util.logging.Level;
29 import java.util.logging.Logger;
30
31 import org.archive.util.InetAddressUtil;
32
33 /***
34 * Represents a single remote "host".
35 *
36 * An host is a name for which there is a dns record or an IP-address. This
37 * might be a machine or a virtual host.
38 *
39 * @author gojomo
40 */
41 public class CrawlHost implements Serializable, CrawlSubstats.HasCrawlSubstats {
42
43 private static final long serialVersionUID = -5494573967890942895L;
44
45 private static final Logger logger = Logger.getLogger(CrawlHost.class.getName());
46 /*** Flag value indicating always-valid IP */
47 public static final long IP_NEVER_EXPIRES = -1;
48 /*** Flag value indicating an IP has not yet been looked up */
49 public static final long IP_NEVER_LOOKED_UP = -2;
50 private String hostname;
51 private String countryCode;
52 private InetAddress ip;
53 private long ipFetched = IP_NEVER_LOOKED_UP;
54 protected CrawlSubstats substats = new CrawlSubstats();
55 /***
56 * TTL gotten from dns record.
57 *
58 * From rfc2035:
59 * <pre>
60 * TTL a 32 bit unsigned integer that specifies the time
61 * interval (in seconds) that the resource record may be
62 * cached before it should be discarded. Zero values are
63 * interpreted to mean that the RR can only be used for the
64 * transaction in progress, and should not be cached.
65 * </pre>
66 */
67 private long ipTTL = IP_NEVER_LOOKED_UP;
68
69
70 private long earliestNextURIEmitTime = 0;
71
72 /***
73 * Create a new CrawlHost object.
74 *
75 * @param hostname the host name for this host.
76 */
77 public CrawlHost(String hostname) {
78 this(hostname, null);
79 }
80
81 /***
82 * Create a new CrawlHost object.
83 *
84 * @param hostname the host name for this host.
85 * @param countryCode the country code for this host.
86 */
87 public CrawlHost(String hostname, String countryCode) {
88 this.hostname = hostname;
89 this.countryCode = countryCode;
90 InetAddress tmp = InetAddressUtil.getIPHostAddress(hostname);
91 if (tmp != null) {
92 setIP(tmp, IP_NEVER_EXPIRES);
93 }
94 }
95
96 /*** Return true if the IP for this host has been looked up.
97 *
98 * Returns true even if the lookup failed.
99 *
100 * @return true if the IP for this host has been looked up.
101 */
102 public boolean hasBeenLookedUp() {
103 return ipFetched != IP_NEVER_LOOKED_UP;
104 }
105
106 /***
107 * Set the IP address for this host.
108 *
109 * @param address
110 * @param ttl the TTL from the dns record in seconds or -1 if it should live
111 * forever (is a numeric IP).
112 */
113 public void setIP(InetAddress address, long ttl) {
114 this.ip = address;
115
116
117 this.ipFetched = System.currentTimeMillis();
118 this.ipTTL = ttl;
119 if (logger.isLoggable(Level.FINE)) {
120 logger.fine(hostname + ": " +
121 ((address != null)? address.toString(): "null"));
122 }
123 }
124
125 /*** Get the IP address for this host.
126 *
127 * @return the IP address for this host.
128 */
129 public InetAddress getIP() {
130 return ip;
131 }
132
133 /*** Get the time when the IP address for this host was last looked up.
134 *
135 * @return the time when the IP address for this host was last looked up.
136 */
137 public long getIpFetched() {
138 return ipFetched;
139 }
140
141 /***
142 * Get the TTL value from the dns record for this host.
143 *
144 * @return the TTL value from the dns record for this host -- in seconds --
145 * or -1 if this lookup should be valid forever (numeric ip).
146 */
147 public long getIpTTL() {
148 return this.ipTTL;
149 }
150
151 public String toString() {
152 return "CrawlHost<" + hostname + "(ip:" + ip + ")>";
153 }
154
155 @Override
156 public int hashCode() {
157 return this.hostname != null ? this.hostname.hashCode() : 0;
158 }
159
160 @Override
161 public boolean equals(Object obj) {
162 if (obj == null) {
163 return false;
164 }
165 if (getClass() != obj.getClass()) {
166 return false;
167 }
168 final CrawlHost other = (CrawlHost) obj;
169 if (this.hostname != other.hostname
170 && (this.hostname == null
171 || !this.hostname.equals(other.hostname))) {
172 return false;
173 }
174 return true;
175 }
176
177 /***
178 * Get the host name.
179 * @return Returns the host name.
180 */
181 public String getHostName() {
182 return hostname;
183 }
184
185 /***
186 * Get the earliest time a URI for this host could be emitted.
187 * This only has effect if constraints on bandwidth per host is set.
188 *
189 * @return Returns the earliestNextURIEmitTime.
190 */
191 public long getEarliestNextURIEmitTime() {
192 return earliestNextURIEmitTime;
193 }
194
195 /***
196 * Set the earliest time a URI for this host could be emitted.
197 * This only has effect if constraints on bandwidth per host is set.
198 *
199 * @param earliestNextURIEmitTime The earliestNextURIEmitTime to set.
200 */
201 public void setEarliestNextURIEmitTime(long earliestNextURIEmitTime) {
202 this.earliestNextURIEmitTime = earliestNextURIEmitTime;
203 }
204
205 /***
206 * Get country code of this host
207 *
208 * @return Retruns country code or null if not availabe
209 */
210 public String getCountryCode() {
211 return countryCode;
212 }
213
214 /***
215 * Set country code for this hos
216 *
217 * @param countryCode The country code of this host
218 */
219 public void setCountryCode(String countryCode) {
220 this.countryCode = countryCode;
221 }
222
223
224
225
226 public CrawlSubstats getSubstats() {
227 return substats;
228 }
229 }