View Javadoc

1   /* AddRedirectFromRootServerToScope
2    * 
3    * Created on May 25, 2005
4    *
5    * Copyright (C) 2005 Internet Archive.
6    * 
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    * 
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   * 
14   * Heritrix is distributed in the hope that it will be useful, 
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   * 
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  
24  package org.archive.crawler.deciderules;
25  
26  import java.util.logging.Logger;
27  import org.apache.commons.httpclient.URIException;
28  import org.archive.crawler.datamodel.CandidateURI;
29  import org.archive.net.UURI;
30  
31  
32  public class AddRedirectFromRootServerToScope extends PredicatedDecideRule {
33  
34      private static final long serialVersionUID = 2644131585813079064L;
35  
36      private static final Logger LOGGER =
37  	        Logger.getLogger(AddRedirectFromRootServerToScope.class.getName());
38  	private static final String SLASH = "/";
39  	public AddRedirectFromRootServerToScope(String name) {
40  		super(name);
41  		setDescription("Allow URI only if it is a redirect and via URI is a " +
42  				"root server (host's slash page) that is within the " +
43  				"scope. Also mark the URI as a seed."); 
44  	}
45  
46  	protected boolean evaluate(Object object) {
47  		UURI via = getVia(object);
48  		if (via == null) {
49  			return false;
50  		}
51  		CandidateURI curi = (CandidateURI) object;
52  		if ( curi == null) {
53  			return false;
54  		}
55  		try {
56  			// Mark URI as seed if via is from different host, URI is not a seed
57  			// already, URI is redirect and via is root server
58  			if (curi.getUURI().getHostBasename() != null &&
59  					via.getHostBasename() != null &&
60  					!curi.getUURI().getHostBasename().equals(via.getHostBasename())
61  				    && curi.isLocation()
62  					&& via.getPath().equals(SLASH)) {
63  				curi.setIsSeed(true);
64  				LOGGER.info("Adding " + object.toString() + " to seeds via "
65  						+ getVia(object).toString());
66  				return true;
67  			}
68  		} catch (URIException e) {
69  			e.printStackTrace();
70  		} catch (Exception e) {
71              e.printStackTrace();
72  			// Return false since we could not get hostname or something else 
73  			// went wrong
74  		}		
75  		return false;
76  	}
77  
78      private UURI getVia(Object o){
79          return (o instanceof CandidateURI)? ((CandidateURI)o).getVia(): null;
80      }    
81  }