View Javadoc

1   /* SurtAuthorityQueueAssignmentPolicy
2   *
3   * $Id: SurtAuthorityQueueAssignmentPolicy.java 3889 2005-10-11 23:09:45Z gojomo $
4   *
5   * Created on Oct 5, 2004
6   *
7   * Copyright (C) 2004 Internet Archive.
8   *
9   * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24  */ 
25  package org.archive.crawler.frontier;
26  
27  import java.util.logging.Level;
28  import java.util.logging.Logger;
29  
30  import org.apache.commons.httpclient.URIException;
31  import org.archive.crawler.datamodel.CandidateURI;
32  import org.archive.crawler.framework.CrawlController;
33  import org.archive.net.UURI;
34  import org.archive.net.UURIFactory;
35  
36  /***
37   * SurtAuthorityQueueAssignmentPolicy based on the surt form of hostname.
38   */
39  public class SurtAuthorityQueueAssignmentPolicy
40  extends QueueAssignmentPolicy {
41      private static final Logger logger = Logger
42              .getLogger(SurtAuthorityQueueAssignmentPolicy.class.getName());
43      /***
44       * When neat host-based class-key fails us
45       */
46      private static String DEFAULT_CLASS_KEY = "default...";
47      
48      private static final String DNS = "dns";
49  
50      public String getClassKey(CrawlController controller, CandidateURI cauri) {
51          String scheme = cauri.getUURI().getScheme();
52          String candidate = null;
53          try {
54              if (scheme.equals(DNS)) {
55              	UURI effectiveuuri;
56                  if (cauri.getVia() != null) {
57                      // Special handling for DNS: treat as being
58                      // of the same class as the triggering URI.
59                      // When a URI includes a port, this ensures 
60                      // the DNS lookup goes atop the host:port
61                      // queue that triggered it, rather than 
62                      // some other host queue
63                  	effectiveuuri = UURIFactory.getInstance(cauri.flattenVia());
64                  } else {
65                  	// To get the dns surt form, create a fake http version
66                  	// (Gordon suggestion).
67                      effectiveuuri = UURIFactory.getInstance("http://" +
68                          cauri.getUURI().getPath());
69                  }
70                  candidate = getSurtAuthority(effectiveuuri.getSurtForm());
71              } else {
72                  candidate = getSurtAuthority(cauri.getUURI().getSurtForm());
73              }
74              
75              if(candidate == null || candidate.length() == 0) {
76                  candidate = DEFAULT_CLASS_KEY;
77              }
78          } catch (URIException e) {
79              logger.log(Level.INFO,
80                      "unable to extract class key; using default", e);
81              candidate = DEFAULT_CLASS_KEY;
82          }
83          // Ensure classKeys are safe as filenames on NTFS
84          return candidate.replace(':','#');
85      }
86  
87      protected String getSurtAuthority(String surt) {
88          int indexOfOpen = surt.indexOf("://(");
89          int indexOfClose = surt.indexOf(")");
90          if (indexOfOpen == -1 || indexOfClose == -1
91                  || ((indexOfOpen + 4) >= indexOfClose)) {
92              return DEFAULT_CLASS_KEY;
93          }
94          return surt.substring(indexOfOpen + 4, indexOfClose);
95      }
96  }