1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 package org.archive.crawler.deciderules;
25
26 import java.util.logging.Logger;
27 import org.apache.commons.httpclient.URIException;
28 import org.archive.crawler.datamodel.CandidateURI;
29 import org.archive.net.UURI;
30
31
32 public class AddRedirectFromRootServerToScope extends PredicatedDecideRule {
33
34 private static final long serialVersionUID = 2644131585813079064L;
35
36 private static final Logger LOGGER =
37 Logger.getLogger(AddRedirectFromRootServerToScope.class.getName());
38 private static final String SLASH = "/";
39 public AddRedirectFromRootServerToScope(String name) {
40 super(name);
41 setDescription("Allow URI only if it is a redirect and via URI is a " +
42 "root server (host's slash page) that is within the " +
43 "scope. Also mark the URI as a seed.");
44 }
45
46 protected boolean evaluate(Object object) {
47 UURI via = getVia(object);
48 if (via == null) {
49 return false;
50 }
51 CandidateURI curi = (CandidateURI) object;
52 if ( curi == null) {
53 return false;
54 }
55 try {
56
57
58 if (curi.getUURI().getHostBasename() != null &&
59 via.getHostBasename() != null &&
60 !curi.getUURI().getHostBasename().equals(via.getHostBasename())
61 && curi.isLocation()
62 && via.getPath().equals(SLASH)) {
63 curi.setIsSeed(true);
64 LOGGER.info("Adding " + object.toString() + " to seeds via "
65 + getVia(object).toString());
66 return true;
67 }
68 } catch (URIException e) {
69 e.printStackTrace();
70 } catch (Exception e) {
71 e.printStackTrace();
72
73
74 }
75 return false;
76 }
77
78 private UURI getVia(Object o){
79 return (o instanceof CandidateURI)? ((CandidateURI)o).getVia(): null;
80 }
81 }