View Javadoc

1   /* HostnameQueueAssignmentPolicy
2   *
3   * $Id: HostnameQueueAssignmentPolicy.java 3838 2005-09-21 23:00:47Z gojomo $
4   *
5   * Created on Oct 5, 2004
6   *
7   * Copyright (C) 2004 Internet Archive.
8   *
9   * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24  */ 
25  package org.archive.crawler.frontier;
26  
27  import java.util.logging.Level;
28  import java.util.logging.Logger;
29  
30  import org.apache.commons.httpclient.URIException;
31  import org.archive.crawler.datamodel.CandidateURI;
32  import org.archive.crawler.framework.CrawlController;
33  import org.archive.net.UURI;
34  import org.archive.net.UURIFactory;
35  
36  /***
37   * QueueAssignmentPolicy based on the hostname:port evident in the given
38   * CrawlURI.
39   * 
40   * @author gojomo
41   */
42  public class HostnameQueueAssignmentPolicy extends QueueAssignmentPolicy {
43      private static final Logger logger = Logger
44          .getLogger(HostnameQueueAssignmentPolicy.class.getName());
45      /***
46       * When neat host-based class-key fails us
47       */
48      private static String DEFAULT_CLASS_KEY = "default...";
49      
50      private static final String DNS = "dns";
51  
52      public String getClassKey(CrawlController controller, CandidateURI cauri) {
53          String scheme = cauri.getUURI().getScheme();
54          String candidate = null;
55          try {
56              if (scheme.equals(DNS)){
57                  if (cauri.getVia() != null) {
58                      // Special handling for DNS: treat as being
59                      // of the same class as the triggering URI.
60                      // When a URI includes a port, this ensures 
61                      // the DNS lookup goes atop the host:port
62                      // queue that triggered it, rather than 
63                      // some other host queue
64                  	UURI viaUuri = UURIFactory.getInstance(cauri.flattenVia());
65                      candidate = viaUuri.getAuthorityMinusUserinfo();
66                      // adopt scheme of triggering URI
67                      scheme = viaUuri.getScheme();
68                  } else {
69                      candidate= cauri.getUURI().getReferencedHost();
70                  }
71              } else {
72                  candidate =  cauri.getUURI().getAuthorityMinusUserinfo();
73              }
74              
75              if(candidate == null || candidate.length() == 0) {
76                  candidate = DEFAULT_CLASS_KEY;
77              }
78          } catch (URIException e) {
79              logger.log(Level.INFO,
80                      "unable to extract class key; using default", e);
81              candidate = DEFAULT_CLASS_KEY;
82          }
83          if (scheme != null && scheme.equals(UURIFactory.HTTPS)) {
84              // If https and no port specified, add default https port to
85              // distinguish https from http server without a port.
86              if (!candidate.matches(".+:[0-9]+")) {
87                  candidate += UURIFactory.HTTPS_PORT;
88              }
89          }
90          // Ensure classKeys are safe as filenames on NTFS
91          return candidate.replace(':','#');
92      }
93  
94  }