/* Robotstxt.java
 *
 * $Id: Robotstxt.java 6824 2010-04-13 22:43:44Z gojomo $
 *
 * Created Sep 1, 2005
 *
 * Copyright (C) 2005 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
package org.archive.crawler.datamodel;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Serializable;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

/**
 * Utility class for parsing and representing 'robots.txt' format
 * directives as a list of named user-agents and a map from each
 * user-agent to its RobotsDirectives.
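 *
 * <p>A minimal usage sketch (the robots.txt content and the agent name
 * below are hypothetical, for illustration only):
 * <pre>{@code
 * BufferedReader reader = new BufferedReader(new java.io.StringReader(
 *     "User-agent: *\nDisallow: /cgi-bin/\n"));
 * Robotstxt robots = new Robotstxt(reader);    // throws IOException
 * boolean allowsAll = robots.allowsAll();      // false: one group parsed
 * RobotsDirectives d = robots.getDirectivesFor("examplebot");
 * // d is the catch-all group's disallows/allows/crawl-delay
 * }</pre>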
 */
public class Robotstxt implements Serializable {
    static final long serialVersionUID = 7025386509301303890L;

    // all user agents contained in this robots.txt
    // may be thinned of irrelevant entries
    LinkedList<String> userAgents = new LinkedList<String>();
    // map user-agents to directives
    Map<String,RobotsDirectives> agentsToDirectives =
        new HashMap<String,RobotsDirectives>();
    // flags a malformed robots.txt: a directive appeared before any 'User-agent' line
    boolean hasErrors = false;

    static RobotsDirectives NO_DIRECTIVES = new RobotsDirectives();

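    /**
     * Parse robots.txt lines from the given reader. Consecutive
     * 'User-agent' lines share one RobotsDirectives group, which is then
     * filled by the 'Disallow'/'Allow'/'Crawl-delay' lines that follow;
     * for example, in
     * <pre>
     * User-agent: botA
     * User-agent: botB
     * Disallow: /private/
     * </pre>
     * botA and botB share a single group. The '*' catch-all agent is
     * stored under the empty string and appended to the agent list last,
     * so more specific agents are matched first. The reader is read to
     * exhaustion and then closed.
     *
     * @param reader source of robots.txt lines (closed by this constructor)
     * @throws IOException if reading from the reader fails
     */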
    public Robotstxt(BufferedReader reader) throws IOException {
        String read;
        // current is the directives-group being built for the preceding
        // 'User-agent' line(s): disallows, allows, and crawl-delay
        RobotsDirectives current = null;
        // whether a non-'User-agent' directive has been encountered
        boolean hasDirectivesYet = false;
        String catchall = null;
        while (reader != null) {
            do {
                read = reader.readLine();
                // Skip comments & blanks
            } while ((read != null) && ((read = read.trim()).startsWith("#") ||
                    read.length() == 0));
            if (read == null) {
                reader.close();
                reader = null;
            } else {
                // remove any html markup
                read = read.replaceAll("<[^>]+>","");
                int commentIndex = read.indexOf("#");
                if (commentIndex > -1) {
                    // Strip trailing comment
                    read = read.substring(0, commentIndex);
                }
                read = read.trim();
                if (read.matches("(?i)^User-agent:.*")) {
                    String ua = read.substring(11).trim().toLowerCase();
                    if (current == null || hasDirectivesYet) {
                        // only create new rules-list if necessary
                        // otherwise share with previous user-agent
                        current = new RobotsDirectives();
                        hasDirectivesYet = false;
                    }
                    if (ua.equals("*")) {
                        // catch-all agent: keyed by the empty string
                        ua = "";
                        catchall = ua;
                    } else {
                        userAgents.addLast(ua);
                    }
                    agentsToDirectives.put(ua, current);
                    continue;
                }
                if (read.matches("(?i)Disallow:.*")) {
                    if (current == null) {
                        // buggy robots.txt
                        hasErrors = true;
                        continue;
                    }
                    String path = read.substring(9).trim();
                    // tolerate common error of ending path with '*' character
                    // (not allowed by original spec; redundant but harmless with
                    // Google's wildcarding extensions -- which we don't yet fully
                    // support).
                    if (path.endsWith("*")) {
                        path = path.substring(0, path.length()-1);
                    }
                    current.addDisallow(path);
                    hasDirectivesYet = true;
                    continue;
                }
                if (read.matches("(?i)Crawl-delay:.*")) {
                    if (current == null) {
                        // buggy robots.txt
                        hasErrors = true;
                        continue;
                    }
                    // consider a crawl-delay, even though we don't
                    // yet understand it, as sufficient to end a
                    // grouping of User-Agent lines
                    hasDirectivesYet = true;
                    String val = read.substring(12).trim();
                    // keep only the leading run of digits and '.' characters
                    val = val.split("[^\\d\\.]+")[0];
                    try {
                        current.setCrawlDelay(Float.parseFloat(val));
                    } catch (NumberFormatException nfe) {
                        // ignore unparseable values
                    }
                    continue;
                }
                if (read.matches("(?i)Allow:.*")) {
                    if (current == null) {
                        // buggy robots.txt
                        hasErrors = true;
                        continue;
                    }
                    String path = read.substring(6).trim();
                    // tolerate common error of ending path with '*' character
                    // (not allowed by original spec; redundant but harmless with
                    // Google's wildcarding extensions -- which we don't yet fully
                    // support).
                    if (path.endsWith("*")) {
                        path = path.substring(0, path.length()-1);
                    }
                    current.addAllow(path);
                    hasDirectivesYet = true;
                    continue;
                }
                // unknown line; do nothing for now
            }
        }

        if (catchall != null) {
            userAgents.addLast(catchall);
        }
    }

    /**
     * Does this policy effectively allow everything? (No
     * disallows or timing (crawl-delay) directives?)
     * @return true if no user-agent/directives groups were parsed at all
     */
    public boolean allowsAll() {
        // TODO: refine so directives that are all empty are also
        // recognized as allowing all
        return agentsToDirectives.isEmpty();
    }

    public List<String> getUserAgents() {
        return userAgents;
    }

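    /**
     * Get the directives that apply to the given user-agent string: the
     * first listed agent name that occurs as a substring of 'ua' wins
     * (agent names are stored lowercase, so matching is effectively against
     * a lowercase list); the catch-all group, keyed by the empty string and
     * listed last, matches any candidate. If nothing matches, a shared
     * empty RobotsDirectives is returned.
     *
     * @param ua user-agent string to look up
     * @return directives for the first matching agent, or empty directives
     */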
    public RobotsDirectives getDirectivesFor(String ua) {
        // find matching ua
        for (String uaListed : userAgents) {
            if (ua.indexOf(uaListed) > -1) {
                return agentsToDirectives.get(uaListed);
            }
        }
        // no applicable user-agents, so empty directives
        return NO_DIRECTIVES;
    }
}