1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21 package org.archive.crawler.deciderules;
22
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;

import org.archive.crawler.datamodel.CandidateURI;
import org.archive.net.PublicSuffixes;
import org.archive.net.UURI;
28
29 /***
30 * Applies its decision if the current URI differs in that portion of
31 * its hostname/domain that is assigned/sold by registrars (AKA its
32 * 'topmost assigned SURT' or 'public suffix'.)
33 *
34 * @author Olaf Freyer
35 */
36 public class IsCrossTopmostAssignedSurtHopDecideRule extends PredicatedDecideRule {
37 private static final long serialVersionUID = 1L;
38
39 private static final Logger LOGGER = Logger
40 .getLogger(IsCrossTopmostAssignedSurtHopDecideRule.class.getName());
41
42 public IsCrossTopmostAssignedSurtHopDecideRule(String name) {
43 super(name);
44 setDescription(
45 "Matches if the registrar-assigned portion of a URI's " +
46 "hostname (AKA 'topmost assigned SURT') differs from that " +
47 "of its referrer. ");
48 }
49
50 protected boolean evaluate(Object object) {
51 UURI via = (object instanceof CandidateURI) ? ((CandidateURI) object).getVia() : null;
52 if (via == null) {
53 return false;
54 }
55 CandidateURI curi = (CandidateURI) object;
56 if (curi == null) {
57 return false;
58 }
59 try {
60
61 String myTopmostAssignedSurt = getTopmostAssignedSurt(curi.getUURI());
62 String viaTopmostAssignetSurt = getTopmostAssignedSurt(via);
63 if (myTopmostAssignedSurt != null && viaTopmostAssignetSurt != null
64 && !myTopmostAssignedSurt.equals(viaTopmostAssignetSurt)) {
65 LOGGER.fine("rule matched for \"" + myTopmostAssignedSurt+"\" vs. \""+viaTopmostAssignetSurt+"\"");
66 return true;
67 }
68 } catch (Exception e) {
69 e.printStackTrace();
70
71
72 }
73 return false;
74 }
75
76 private String getTopmostAssignedSurt(UURI uuri){
77 String surt = uuri.getSurtForm().replaceFirst(".*:////((.*?)//).*", "$1");
78 return PublicSuffixes.reduceSurtToTopmostAssigned(surt);
79
80 }
81
82 }