View Javadoc

1   /* Copyright (C) 2009 Internet Archive
2    *
3    * This file is part of the Heritrix web crawler (crawler.archive.org).
4    *
5    * Heritrix is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU Lesser Public License as published by
7    * the Free Software Foundation; either version 2.1 of the License, or
8    * any later version.
9    *
10   * Heritrix is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU Lesser Public License for more details.
14   *
15   * You should have received a copy of the GNU Lesser Public License
16   * along with Heritrix; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   *
19   * CrawlServer.java
20   * Created on Apr 17, 2003
21   *
22   * $Header$
23   */
24  package org.archive.crawler.datamodel;
25  
26  import java.io.BufferedReader;
27  import java.io.IOException;
28  import java.io.InputStreamReader;
29  import java.io.ObjectInputStream;
30  import java.io.Serializable;
31  import java.io.StringReader;
32  import java.util.HashSet;
33  import java.util.Set;
34  import java.util.zip.Checksum;
35  
36  import org.apache.commons.httpclient.URIException;
37  import org.archive.crawler.datamodel.credential.CredentialAvatar;
38  import org.archive.crawler.settings.CrawlerSettings;
39  import org.archive.crawler.settings.SettingsHandler;
40  import org.archive.io.ReplayInputStream;
41  import org.archive.net.UURIFactory;
42  
/**
 * Represents a single remote "server".
 *
 * A server is a service on a host. There might be more than one service on a
 * host differentiated by a port number.
 *
 * @author gojomo
 */
public class CrawlServer implements Serializable, CrawlSubstats.HasCrawlSubstats, FetchStatusCodes {

    private static final long serialVersionUID = -989714570750970369L;

    /** Sentinel for {@link #robotsFetched}: robots.txt never fetched. */
    public static final long ROBOTS_NOT_FETCHED = -1;
    /** only check if robots-fetch is perhaps superfluous
     * after this many tries */
    public static final long MIN_ROBOTS_RETRIES = 3;

    private final String server; // actually, host+port in the https case
    private int port;
    // Transient: re-established from thread context in readObject().
    private transient SettingsHandler settingsHandler;
    private RobotsExclusionPolicy robots;
    // Time (ms since epoch) robots.txt was last fetched; ROBOTS_NOT_FETCHED if never.
    long robotsFetched = ROBOTS_NOT_FETCHED;
    // True only once usable robots information is in hand (see updateRobots()).
    boolean validRobots = false;
    Checksum robotstxtChecksum;
    CrawlSubstats substats = new CrawlSubstats();

    // how many consecutive connection errors have been encountered;
    // could be used to drive exponentially increasing retry timeout or decision
    // to 'freeze' entire class (queue) of URIs (but isn't yet)
    protected int consecutiveConnectionErrors = 0;

    /**
     * Set of credential avatars. Lazily created; null when none.
     */
    private transient Set<CredentialAvatar> avatars = null;
78  
79      /***
80       * Creates a new CrawlServer object.
81       *
82       * @param h the host string for the server.
83       */
84      public CrawlServer(String h) {
85          // TODO: possibly check for illegal host string
86          server = h;
87          int colonIndex = server.lastIndexOf(":");
88          if (colonIndex < 0) {
89              port = -1;
90          } else {
91              try {
92                  port = Integer.parseInt(server.substring(colonIndex + 1));
93              } catch (NumberFormatException e) {
94                  port = -1;
95              }
96          }
97      }
98  
99      /*** Get the robots exclusion policy for this server.
100      *
101      * @return the robots exclusion policy for this server.
102      */
103     public RobotsExclusionPolicy getRobots() {
104         return robots;
105     }
106 
107     /*** Set the robots exclusion policy for this server.
108      *
109      * @param policy the policy to set.
110      */
111     public void setRobots(RobotsExclusionPolicy policy) {
112         robots = policy;
113     }
114 
115     public String toString() {
116         return "CrawlServer("+server+")";
117     }
118 
119     @Override
120     public int hashCode() {
121         return this.server != null ? this.server.hashCode() : 0;
122     }
123 
124     @Override
125     public boolean equals(Object obj) {
126         if (obj == null) {
127             return false;
128         }
129         if (getClass() != obj.getClass()) {
130             return false;
131         }
132         final CrawlServer other = (CrawlServer) obj;
133         if (this.server != other.server   // identity compare
134                 && (this.server == null 
135                     || !this.server.equals(other.server))) {
136             return false;
137         }
138         return true;
139     }
140 
    /**
     * Update the robots exclusion policy from the outcome of a robots.txt
     * fetch attempt.
     *
     * Side effects: always refreshes the robotsFetched timestamp; may set
     * the robots policy and validRobots flag; may rewrite curi's fetch
     * status to S_DEEMED_NOT_FOUND to suppress further retries.
     *
     * @param curi the crawl URI containing the fetched robots.txt
     */
    public void updateRobots(CrawlURI curi) {
        RobotsHonoringPolicy honoringPolicy =
            settingsHandler.getOrder().getRobotsHonoringPolicy();

        // Record the attempt time even if the fetch ultimately failed.
        robotsFetched = System.currentTimeMillis();

        // "Got something" = an HTTP transaction that produced any positive
        // status, or one previously deemed not-found.
        boolean gotSomething = curi.isHttpTransaction() &&
            (curi.getFetchStatus() > 0 || curi.getFetchStatus() == S_DEEMED_NOT_FOUND );

        if (!gotSomething && curi.getFetchAttempts() < MIN_ROBOTS_RETRIES) {
            // robots.txt lookup failed, but still trying at least a few times
            // no reason to consider IGNORE yet
            validRobots = false;
            return;
        }

        CrawlerSettings settings = getSettings(curi);
        int type = honoringPolicy.getType(settings);
        if (type == RobotsHonoringPolicy.IGNORE) {
            // IGNORE = ALLOWALL
            robots = RobotsExclusionPolicy.ALLOWALL;
            validRobots = true;
            if(curi.getFetchStatus() < 0) {
                // prevent the rest of the usual retries
                curi.setFetchStatus(S_DEEMED_NOT_FOUND);
            }
            return;
        }

        // special deeming for a particular kind of connection-lost (empty server response)
        if(curi.getFetchStatus() == S_CONNECT_LOST && curi.annotationContains("NoHttpResponseException")) {
            curi.setFetchStatus(S_DEEMED_NOT_FOUND);
            gotSomething = true;
        }

        if (!gotSomething) {
            // robots.txt fetch failed and exceptions (ignore/deeming) don't apply; no valid robots info yet
            validRobots = false;
            return;
        }

        if (!curi.is2XXSuccess()) {
            // Not found or any other HTTP status code outside the 2xx range is
            // treated as giving access to all of a sites' content.
            // This is the prevailing practice of Google, since 4xx
            // responses on robots.txt are usually indicative of a
            // misconfiguration or blanket-block, not an intentional
            // indicator of partial blocking.
            // TODO: consider handling server errors, redirects differently
            robots = RobotsExclusionPolicy.ALLOWALL;
            validRobots = true;
            return;
        }

        ReplayInputStream contentBodyStream = null;
        try {
            try {
                BufferedReader reader;
                if (type == RobotsHonoringPolicy.CUSTOM) {
                    // CUSTOM: parse operator-supplied robots text instead
                    // of the fetched response body.
                    reader = new BufferedReader(new StringReader(honoringPolicy
                            .getCustomRobots(settings)));
                } else {
                    contentBodyStream = curi.getHttpRecorder()
                            .getRecordedInput().getContentReplayInputStream();

                    contentBodyStream.setToResponseBodyStart();
                    // NOTE(review): InputStreamReader uses the platform
                    // default charset here -- confirm that is intended.
                    reader = new BufferedReader(new InputStreamReader(
                            contentBodyStream));
                }
                robots = RobotsExclusionPolicy.policyFor(settings,
                        reader, honoringPolicy);
                validRobots = true;
            } finally {
                if (contentBodyStream != null) {
                    contentBodyStream.close();
                }
            }
        } catch (IOException e) {
            // Parsing failure: fail open (allow all) rather than block crawl.
            robots = RobotsExclusionPolicy.ALLOWALL;
            validRobots = true;
            curi.addLocalizedError(getName(), e,
                    "robots.txt parsing IOException");
        }
    }
230 
231     /***
232      * @return Returns the time when robots.txt was fetched.
233      */
234     public long getRobotsFetchedTime() {
235         return robotsFetched;
236     }
237 
238     /***
239      * @return The server string which might include a port number.
240      */
241     public String getName() {
242        return server;
243     }
244 
245     /*** Get the port number for this server.
246      *
247      * @return the port number or -1 if not known (uses default for protocol)
248      */
249     public int getPort() {
250         return port;
251     }
252 
    /**
     * Called when object is being deserialized.
     * In addition to the default java deserialization, this method
     * re-establishes the references to settings handler and robots honoring
     * policy (both transient).
     *
     * @param stream the stream to deserialize from.
     * @throws IOException if I/O errors occur
     * @throws ClassNotFoundException If the class for an object being restored
     *         cannot be found.
     */
    private void readObject(ObjectInputStream stream)
            throws IOException, ClassNotFoundException {
        stream.defaultReadObject();
        // settingsHandler is transient; recover it from the deserializing
        // thread's context.
        settingsHandler = SettingsHandler.getThreadContextSettingsHandler();
        postDeserialize();
    }
270     
271     private void postDeserialize() {
272     	if (this.robots != null) {
273     		RobotsHonoringPolicy honoringPolicy =
274                 settingsHandler.getOrder().getRobotsHonoringPolicy();
275     		this.robots.honoringPolicy = honoringPolicy;
276     	}
277     }
278 
279     /*** Get the settings handler.
280      *
281      * @return the settings handler.
282      */
283     public SettingsHandler getSettingsHandler() {
284         return this.settingsHandler;
285     }
286 
287     /*** Get the settings object in effect for this server.
288      * @param curi
289      *
290      * @return the settings object in effect for this server.
291      * @throws URIException
292      */
293     private CrawlerSettings getSettings(CandidateURI curi) {
294         try {
295             return this.settingsHandler.
296                 getSettings(curi.getUURI().getReferencedHost(),
297                     curi.getUURI());
298         } catch (URIException e) {
299             return null;
300         }
301     }
302 
303     /*** Set the settings handler to be used by this server.
304      *
305      * @param settingsHandler the settings handler to be used by this server.
306      */
307     public void setSettingsHandler(SettingsHandler settingsHandler) {
308         this.settingsHandler = settingsHandler;
309     }
310 
311     public void incrementConsecutiveConnectionErrors() {
312         this.consecutiveConnectionErrors++;
313     }
314 
315     public void resetConsecutiveConnectionErrors() {
316         this.consecutiveConnectionErrors = 0;
317     }
318 
319     /***
320      * @return Credential avatars for this server.  Returns null if none.
321      */
322     public Set<CredentialAvatar> getCredentialAvatars() {
323         return this.avatars;
324     }
325 
326     /***
327      * @return True if there are avatars attached to this instance.
328      */
329     public boolean hasCredentialAvatars() {
330         return this.avatars != null && this.avatars.size() > 0;
331     }
332 
333     /***
334      * Add an avatar.
335      *
336      * @param ca Credential avatar to add to set of avatars.
337      */
338     public void addCredentialAvatar(CredentialAvatar ca) {
339         if (this.avatars == null) {
340             this.avatars = new HashSet<CredentialAvatar>();
341         }
342         this.avatars.add(ca);
343     }
344     
345 	/***
346      * If true then valid robots.txt information has been retrieved. If false
347      * either no attempt has been made to fetch robots.txt or the attempt
348      * failed.
349      *
350 	 * @return Returns the validRobots.
351 	 */
352 	public boolean isValidRobots() {
353 		return validRobots;
354 	}
355     
356     /***
357      * Get key to use doing lookup on server instances.
358      * @param cauri CandidateURI we're to get server key for.
359      * @return String to use as server key.
360      * @throws URIException
361      */
362 	public static String getServerKey(CandidateURI cauri)
363 	throws URIException {
364 	    // TODO: evaluate if this is really necessary -- why not 
365 	    // make the server of a dns CandidateURI the looked-up domain,
366 	    // also simplifying FetchDNS?
367 	    String key = cauri.getUURI().getAuthorityMinusUserinfo();
368 	    if (key == null) {
369 	        // Fallback for cases where getAuthority() fails (eg 'dns:'.
370 	        // DNS UURIs have the 'domain' in the 'path' parameter, not
371 	        // in the authority).
372 	        key = cauri.getUURI().getCurrentHierPath();
373 	        if(key != null && !key.matches("[-_//w//.:]+")) {
374 	            // Not just word chars and dots and colons and dashes and
375 	            // underscores; throw away
376 	            key = null;
377 	        }
378 	    }
379 	    if (key != null &&
380 	            cauri.getUURI().getScheme().equals(UURIFactory.HTTPS)) {
381 	        // If https and no port specified, add default https port to
382 	        // distinuish https from http server without a port.
383 	        if (!key.matches(".+:[0-9]+")) {
384 	            key += UURIFactory.HTTPS_PORT;
385 	        }
386 	    }
387 	    return key;
388 	}
389 
390     /* (non-Javadoc)
391      * @see org.archive.crawler.datamodel.CrawlSubstats.HasCrawlSubstats#getSubstats()
392      */
393     public CrawlSubstats getSubstats() {
394         return substats;
395     }
396 }