View Javadoc

1   /* BucketQueueAssignmentPolicy
2    * 
3    * $Header$
4    * 
5    * Created on May 06, 2005
6    *
7    *  Copyright (C) 2005 Christian Kohlschuetter
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   *
25   */
26  package org.archive.crawler.frontier;
27  
28  import org.apache.commons.httpclient.URIException;
29  import org.archive.crawler.datamodel.CandidateURI;
30  import org.archive.crawler.datamodel.CrawlHost;
31  import org.archive.crawler.framework.CrawlController;
32  
33  /***
34   * Uses the target IPs as basis for queue-assignment,
35   * distributing them over a fixed number of sub-queues.
36   * 
37   * @author Christian Kohlschuetter
38   */
39  public class BucketQueueAssignmentPolicy extends QueueAssignmentPolicy {
40      private static final int DEFAULT_NOIP_BITMASK = 1023;
41      private static final int DEFAULT_QUEUES_HOSTS_MODULO = 1021;
42  
43      public String getClassKey(final CrawlController controller,
44          final CandidateURI curi) {
45          
46          CrawlHost host;
47          try {
48              host = controller.getServerCache().getHostFor(
49                  curi.getUURI().getReferencedHost());
50          } catch (URIException e) {
51              // FIXME error handling
52              e.printStackTrace();
53              host = null;
54          }
55          if(host == null) {
56              return "NO-HOST";
57          } else if(host.getIP() == null) {
58              return "NO-IP-".concat(Integer.toString(Math.abs(host.getHostName()
59                  .hashCode())
60                  & DEFAULT_NOIP_BITMASK));
61          } else {
62              return Integer.toString(Math.abs(host.getIP().hashCode())
63                  % DEFAULT_QUEUES_HOSTS_MODULO);
64          }
65      }
66  
67      public int maximumNumberOfKeys() {
68          return DEFAULT_NOIP_BITMASK + DEFAULT_QUEUES_HOSTS_MODULO + 2;
69      }
70  }