1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 package org.archive.crawler.datamodel;
25
26 import java.io.BufferedReader;
27 import java.io.IOException;
28 import java.io.InputStreamReader;
29 import java.io.ObjectInputStream;
30 import java.io.Serializable;
31 import java.io.StringReader;
32 import java.util.HashSet;
33 import java.util.Set;
34 import java.util.zip.Checksum;
35
36 import org.apache.commons.httpclient.URIException;
37 import org.archive.crawler.datamodel.credential.CredentialAvatar;
38 import org.archive.crawler.settings.CrawlerSettings;
39 import org.archive.crawler.settings.SettingsHandler;
40 import org.archive.io.ReplayInputStream;
41 import org.archive.net.UURIFactory;
42
43 /***
44 * Represents a single remote "server".
45 *
46 * A server is a service on a host. There might be more than one service on a
47 * host differentiated by a port number.
48 *
49 * @author gojomo
50 */
51 public class CrawlServer implements Serializable, CrawlSubstats.HasCrawlSubstats, FetchStatusCodes {
52
53 private static final long serialVersionUID = -989714570750970369L;
54
55 public static final long ROBOTS_NOT_FETCHED = -1;
56 /*** only check if robots-fetch is perhaps superfluous
57 * after this many tries */
58 public static final long MIN_ROBOTS_RETRIES = 3;
59
60 private final String server;
61 private int port;
62 private transient SettingsHandler settingsHandler;
63 private RobotsExclusionPolicy robots;
64 long robotsFetched = ROBOTS_NOT_FETCHED;
65 boolean validRobots = false;
66 Checksum robotstxtChecksum;
67 CrawlSubstats substats = new CrawlSubstats();
68
69
70
71
72 protected int consecutiveConnectionErrors = 0;
73
74 /***
75 * Set of credential avatars.
76 */
77 private transient Set<CredentialAvatar> avatars = null;
78
79 /***
80 * Creates a new CrawlServer object.
81 *
82 * @param h the host string for the server.
83 */
84 public CrawlServer(String h) {
85
86 server = h;
87 int colonIndex = server.lastIndexOf(":");
88 if (colonIndex < 0) {
89 port = -1;
90 } else {
91 try {
92 port = Integer.parseInt(server.substring(colonIndex + 1));
93 } catch (NumberFormatException e) {
94 port = -1;
95 }
96 }
97 }
98
99 /*** Get the robots exclusion policy for this server.
100 *
101 * @return the robots exclusion policy for this server.
102 */
103 public RobotsExclusionPolicy getRobots() {
104 return robots;
105 }
106
107 /*** Set the robots exclusion policy for this server.
108 *
109 * @param policy the policy to set.
110 */
111 public void setRobots(RobotsExclusionPolicy policy) {
112 robots = policy;
113 }
114
115 public String toString() {
116 return "CrawlServer("+server+")";
117 }
118
119 @Override
120 public int hashCode() {
121 return this.server != null ? this.server.hashCode() : 0;
122 }
123
124 @Override
125 public boolean equals(Object obj) {
126 if (obj == null) {
127 return false;
128 }
129 if (getClass() != obj.getClass()) {
130 return false;
131 }
132 final CrawlServer other = (CrawlServer) obj;
133 if (this.server != other.server
134 && (this.server == null
135 || !this.server.equals(other.server))) {
136 return false;
137 }
138 return true;
139 }
140
141 /*** Update the robots exclusion policy.
142 *
143 * @param curi the crawl URI containing the fetched robots.txt
144 * @throws IOException
145 */
146 public void updateRobots(CrawlURI curi) {
147 RobotsHonoringPolicy honoringPolicy =
148 settingsHandler.getOrder().getRobotsHonoringPolicy();
149
150 robotsFetched = System.currentTimeMillis();
151
152 boolean gotSomething = curi.isHttpTransaction() &&
153 (curi.getFetchStatus() > 0 || curi.getFetchStatus() == S_DEEMED_NOT_FOUND );
154
155 if (!gotSomething && curi.getFetchAttempts() < MIN_ROBOTS_RETRIES) {
156
157
158 validRobots = false;
159 return;
160 }
161
162 CrawlerSettings settings = getSettings(curi);
163 int type = honoringPolicy.getType(settings);
164 if (type == RobotsHonoringPolicy.IGNORE) {
165
166 robots = RobotsExclusionPolicy.ALLOWALL;
167 validRobots = true;
168 if(curi.getFetchStatus() < 0) {
169
170 curi.setFetchStatus(S_DEEMED_NOT_FOUND);
171 }
172 return;
173 }
174
175
176 if(curi.getFetchStatus() == S_CONNECT_LOST && curi.annotationContains("NoHttpResponseException")) {
177 curi.setFetchStatus(S_DEEMED_NOT_FOUND);
178 gotSomething = true;
179 }
180
181 if (!gotSomething) {
182
183 validRobots = false;
184 return;
185 }
186
187 if (!curi.is2XXSuccess()) {
188
189
190
191
192
193
194
195 robots = RobotsExclusionPolicy.ALLOWALL;
196 validRobots = true;
197 return;
198 }
199
200 ReplayInputStream contentBodyStream = null;
201 try {
202 try {
203 BufferedReader reader;
204 if (type == RobotsHonoringPolicy.CUSTOM) {
205 reader = new BufferedReader(new StringReader(honoringPolicy
206 .getCustomRobots(settings)));
207 } else {
208 contentBodyStream = curi.getHttpRecorder()
209 .getRecordedInput().getContentReplayInputStream();
210
211 contentBodyStream.setToResponseBodyStart();
212 reader = new BufferedReader(new InputStreamReader(
213 contentBodyStream));
214 }
215 robots = RobotsExclusionPolicy.policyFor(settings,
216 reader, honoringPolicy);
217 validRobots = true;
218 } finally {
219 if (contentBodyStream != null) {
220 contentBodyStream.close();
221 }
222 }
223 } catch (IOException e) {
224 robots = RobotsExclusionPolicy.ALLOWALL;
225 validRobots = true;
226 curi.addLocalizedError(getName(), e,
227 "robots.txt parsing IOException");
228 }
229 }
230
231 /***
232 * @return Returns the time when robots.txt was fetched.
233 */
234 public long getRobotsFetchedTime() {
235 return robotsFetched;
236 }
237
238 /***
239 * @return The server string which might include a port number.
240 */
241 public String getName() {
242 return server;
243 }
244
245 /*** Get the port number for this server.
246 *
247 * @return the port number or -1 if not known (uses default for protocol)
248 */
249 public int getPort() {
250 return port;
251 }
252
253 /***
254 * Called when object is being deserialized.
255 * In addition to the default java deserialization, this method
256 * re-establishes the references to settings handler and robots honoring
257 * policy.
258 *
259 * @param stream the stream to deserialize from.
260 * @throws IOException if I/O errors occur
261 * @throws ClassNotFoundException If the class for an object being restored
262 * cannot be found.
263 */
264 private void readObject(ObjectInputStream stream)
265 throws IOException, ClassNotFoundException {
266 stream.defaultReadObject();
267 settingsHandler = SettingsHandler.getThreadContextSettingsHandler();
268 postDeserialize();
269 }
270
271 private void postDeserialize() {
272 if (this.robots != null) {
273 RobotsHonoringPolicy honoringPolicy =
274 settingsHandler.getOrder().getRobotsHonoringPolicy();
275 this.robots.honoringPolicy = honoringPolicy;
276 }
277 }
278
279 /*** Get the settings handler.
280 *
281 * @return the settings handler.
282 */
283 public SettingsHandler getSettingsHandler() {
284 return this.settingsHandler;
285 }
286
287 /*** Get the settings object in effect for this server.
288 * @param curi
289 *
290 * @return the settings object in effect for this server.
291 * @throws URIException
292 */
293 private CrawlerSettings getSettings(CandidateURI curi) {
294 try {
295 return this.settingsHandler.
296 getSettings(curi.getUURI().getReferencedHost(),
297 curi.getUURI());
298 } catch (URIException e) {
299 return null;
300 }
301 }
302
303 /*** Set the settings handler to be used by this server.
304 *
305 * @param settingsHandler the settings handler to be used by this server.
306 */
307 public void setSettingsHandler(SettingsHandler settingsHandler) {
308 this.settingsHandler = settingsHandler;
309 }
310
311 public void incrementConsecutiveConnectionErrors() {
312 this.consecutiveConnectionErrors++;
313 }
314
315 public void resetConsecutiveConnectionErrors() {
316 this.consecutiveConnectionErrors = 0;
317 }
318
319 /***
320 * @return Credential avatars for this server. Returns null if none.
321 */
322 public Set<CredentialAvatar> getCredentialAvatars() {
323 return this.avatars;
324 }
325
326 /***
327 * @return True if there are avatars attached to this instance.
328 */
329 public boolean hasCredentialAvatars() {
330 return this.avatars != null && this.avatars.size() > 0;
331 }
332
333 /***
334 * Add an avatar.
335 *
336 * @param ca Credential avatar to add to set of avatars.
337 */
338 public void addCredentialAvatar(CredentialAvatar ca) {
339 if (this.avatars == null) {
340 this.avatars = new HashSet<CredentialAvatar>();
341 }
342 this.avatars.add(ca);
343 }
344
345 /***
346 * If true then valid robots.txt information has been retrieved. If false
347 * either no attempt has been made to fetch robots.txt or the attempt
348 * failed.
349 *
350 * @return Returns the validRobots.
351 */
352 public boolean isValidRobots() {
353 return validRobots;
354 }
355
356 /***
357 * Get key to use doing lookup on server instances.
358 * @param cauri CandidateURI we're to get server key for.
359 * @return String to use as server key.
360 * @throws URIException
361 */
362 public static String getServerKey(CandidateURI cauri)
363 throws URIException {
364
365
366
367 String key = cauri.getUURI().getAuthorityMinusUserinfo();
368 if (key == null) {
369
370
371
372 key = cauri.getUURI().getCurrentHierPath();
373 if(key != null && !key.matches("[-_//w//.:]+")) {
374
375
376 key = null;
377 }
378 }
379 if (key != null &&
380 cauri.getUURI().getScheme().equals(UURIFactory.HTTPS)) {
381
382
383 if (!key.matches(".+:[0-9]+")) {
384 key += UURIFactory.HTTPS_PORT;
385 }
386 }
387 return key;
388 }
389
390
391
392
393 public CrawlSubstats getSubstats() {
394 return substats;
395 }
396 }