View Javadoc

1   /* $Id:  $
2    *
3    * Copyright (C) 2007 Olaf Freyer
4    *
5    * This file is part of the Heritrix web crawler (crawler.archive.org).
6    *
7    * Heritrix is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU Lesser Public License as published by
9    * the Free Software Foundation; either version 2.1 of the License, or
10   * any later version.
11   *
12   * Heritrix is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU Lesser Public License for more details.
16   *
17   * You should have received a copy of the GNU Lesser Public License
18   * along with Heritrix; if not, write to the Free Software
19   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
20   */
21  package org.archive.crawler.deciderules;
22  
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;

import org.archive.crawler.datamodel.CandidateURI;
import org.archive.net.PublicSuffixes;
import org.archive.net.UURI;
28  
29  /***
30   * Applies its decision if the current URI differs in that portion of
31   * its hostname/domain that is assigned/sold by registrars (AKA its
32   * 'topmost assigned SURT' or 'public suffix'.)
33   * 
34   * @author Olaf Freyer
35   */
36  public class IsCrossTopmostAssignedSurtHopDecideRule extends PredicatedDecideRule {
37      private static final long serialVersionUID = 1L;
38      
39      private static final Logger LOGGER = Logger
40              .getLogger(IsCrossTopmostAssignedSurtHopDecideRule.class.getName());
41  
42      public IsCrossTopmostAssignedSurtHopDecideRule(String name) {
43          super(name);
44          setDescription(
45              "Matches if the registrar-assigned portion of a URI's " +
46              "hostname (AKA 'topmost assigned SURT') differs from that " +
47              "of its referrer. ");
48      }
49  
50      protected boolean evaluate(Object object) {
51          UURI via = (object instanceof CandidateURI) ? ((CandidateURI) object).getVia() : null;
52          if (via == null) {
53              return false;
54          }
55          CandidateURI curi = (CandidateURI) object;
56          if (curi == null) {
57              return false;
58          }
59          try {
60              // determine if this hop crosses domain borders
61              String myTopmostAssignedSurt = getTopmostAssignedSurt(curi.getUURI());
62              String viaTopmostAssignetSurt = getTopmostAssignedSurt(via);
63              if (myTopmostAssignedSurt != null && viaTopmostAssignetSurt != null
64                      && !myTopmostAssignedSurt.equals(viaTopmostAssignetSurt)) {
65                  LOGGER.fine("rule matched for \"" + myTopmostAssignedSurt+"\" vs. \""+viaTopmostAssignetSurt+"\"");
66                  return true;
67              }
68          } catch (Exception e) {
69              e.printStackTrace();
70              // Return false since we could not get hostname or something else
71              // went wrong
72          }
73          return false;
74      }
75      
76      private String getTopmostAssignedSurt(UURI uuri){
77          String surt = uuri.getSurtForm().replaceFirst(".*:////((.*?)//).*", "$1");
78          return PublicSuffixes.reduceSurtToTopmostAssigned(surt);
79          
80      }
81  
82  }