View Javadoc

1   /* ScopePlusOneDecideRule
2   *
3   * Created on Aug 22, 2005
4   *
5   * Copyright 2005 Regents of the University of California, All rights reserved
6   *
7   * This file is part of the Heritrix web crawler (crawler.archive.org).
8   *
9   * Heritrix is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU Lesser Public License as published by
11  * the Free Software Foundation; either version 2.1 of the License, or
12  * any later version.
13  *
14  * Heritrix is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU Lesser Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser Public License
20  * along with Heritrix; if not, write to the Free Software
21  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22  */
23  package org.archive.crawler.deciderules;
24  
25  import java.util.logging.Level;
26  import java.util.logging.Logger;
27  
28  import javax.management.AttributeNotFoundException;
29  
30  import org.archive.crawler.datamodel.CandidateURI;
31  import org.archive.crawler.settings.SimpleType;
32  import org.archive.net.UURI;
33  import org.archive.util.SurtPrefixSet;
34  
35  /***
36   * Rule allows one level of discovery beyond configured scope
37   * (e.g. Domain, plus the first otherwise out-of-scope link from an
38   * in-scope page, but not further hops from that first page)
39   *
40   * @author Shifra Raffel
41   * @version $Date: 2006-09-25 17:16:55 +0000 (Mon, 25 Sep 2006) $ $Revision: 4649 $
42   */
43  public class ScopePlusOneDecideRule extends SurtPrefixedDecideRule {
44  
45      private static final long serialVersionUID = -6344162369024146340L;
46  
47      public static final String ATTR_SCOPE = "host-or-domain-scope";
48      public static final String HOST = "Host";
49      public static final String DOMAIN = "Domain";
50      
51      private static final Logger logger =
52          Logger.getLogger(ScopePlusOneDecideRule.class.getName());
53      
54      /***
55       * Constructor.
56       * @param name
57       */
58      public ScopePlusOneDecideRule(String name) {
59          super(name);
60          setDescription(
61              "ScopePlusOneDecideRule. Rule allows one level of discovery " +
62              "beyond configured scope (e.g. Domain, plus the first " +
63              "otherwise out-of-scope link from an in-scope page, but " +
64              "no further hops from that first otherwise-out-of-scope page). " +
65              "surts-source-file is optional. Use surts-dump-file option " +
66              "when testing.");
67          addElementToDefinition(new SimpleType(ATTR_SCOPE,
68              "Restrict to host, e.g. archive.org excludes audio.archive.org, " +
69              "or expand to domain as well, e.g. archive.org includes all " +
70              "*.archive.org", DOMAIN, new String[] {HOST, DOMAIN}));
71      }
72  
73      /***
74       * Evaluate whether given object comes from a URI which is in scope
75       *
76       * @param object to evaluate
77       * @return true if URI is either in scope or its via is
78       */
79      protected boolean evaluate(Object object) {
80          boolean result = false;
81          if (!(object instanceof CandidateURI)) {
82              // Can't evaluate if not a candidate URI
83              return false; 
84          }
85          SurtPrefixSet set = getPrefixes(object);
86          UURI u = UURI.from(object);
87          // First, is the URI itself in scope?
88          boolean firstResult = isInScope(u, set);
89          if (logger.isLoggable(Level.FINE)) {
90              logger.fine("Tested scope of UURI itself '" + u +
91                          " and result was " + firstResult);
92          }                        
93          if (firstResult == true) {
94              result = true;
95          } else {
96              // This object is not itself within scope, but
97              // see whether its via might be
98              UURI via = getVia(object);
99              if (via == null) {
100                 // If there is no via and the URL doesn't match scope,reject it
101                 return false;
102             }
103             // If the via is within scope, accept it
104             result = isInScope (via, set);
105             if (logger.isLoggable(Level.FINE)) {
106                 logger.fine("Tested via UURI '" + via +
107                         " and result was " + result);
108             }            
109         }
110         return result;
111     }
112     
113     /***
114      * Synchronized get of prefix set to use
115      * 
116      * @return SurtPrefixSet to use for check
117      *@see org.archive.crawler.deciderules.SurtPrefixedDecideRule#getPrefixes()
118      */
119     protected synchronized SurtPrefixSet getPrefixes() {
120         return getPrefixes(null);
121     } 
122     
123     /***
124      * Synchronized get of prefix set to use.
125      * @param o Context object.
126      * 
127      * @return SurtPrefixSet to use for check
128      * @see org.archive.crawler.deciderules.SurtPrefixedDecideRule#getPrefixes()
129      */
130     protected synchronized SurtPrefixSet getPrefixes(Object o) {
131         if (surtPrefixes == null) {
132             readPrefixes(o);
133         }
134         return surtPrefixes;
135     }    
136     
137     /***
138      * Patch the SURT prefix set so that it only includes the appropriate
139      * prefixes.
140      * @param o Context object.
141      * @see org.archive.crawler.deciderules.SurtPrefixedDecideRule#readPrefixes()
142      */
143     protected void readPrefixes(Object o) {
144         buildSurtPrefixSet();
145         // See whether Host or Domain was chosen
146         String scope = this.getScope(o);
147         if (scope.equals(HOST)){
148             surtPrefixes.convertAllPrefixesToHosts();            
149         } else if (scope.equals(DOMAIN)) {
150             surtPrefixes.convertAllPrefixesToDomains();            
151         }
152         dumpSurtPrefixSet();
153     }        
154     
155     private UURI getVia(Object o){
156         return (o instanceof CandidateURI)? ((CandidateURI)o).getVia(): null;
157     }    
158 
159     /***
160      * Decide whether using host or domain scope
161      * @param o Context
162      * @return String Host or domain
163      * 
164      */
165     protected String getScope(Object o) {
166         try {
167             String scope = (String)getAttribute(o, ATTR_SCOPE);
168             if (scope.equals(HOST)) {
169                 return HOST;
170             } else if (scope.equals(DOMAIN)) {
171                 return DOMAIN;
172             } else {
173                 assert false : "Unrecognized scope " + scope
174                         + ". Should never happen!";
175             }
176         } catch (AttributeNotFoundException e) {
177             logger.severe(e.getMessage());
178         }
179         return null; // Basically the rule is inactive if this occurs.
180     }
181     
182     //check that the URI is in scope
183     private boolean isInScope (Object o, SurtPrefixSet set) {
184         boolean iResult = false;
185         UURI u = (UURI)o;
186         if (u == null) {
187             return false;
188         }
189         String candidateSurt = u.getSurtForm();
190         // also want to treat https as http
191         if (candidateSurt.startsWith("https:")) {
192             candidateSurt = "http:" + candidateSurt.substring(6);
193         }
194         if (set.containsPrefixOf(candidateSurt)){
195             iResult = true;          
196         }
197         return iResult;
198     }
199 }