1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26 package org.archive.net;
27
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.IOUtils;
import org.archive.util.TextUtils;
46
/**
 * Utility class for making use of the information about 'public suffixes' at
 * http://publicsuffix.org.
 *
 * The public suffix list (once known as 'effective TLDs') was motivated by the
 * need to decide on which broader domains a subdomain was allowed to set
 * cookies. For example, a server at 'www.example.com' can set cookies for
 * 'www.example.com' or 'example.com' but not 'com'. 'www.example.co.uk' can set
 * cookies for 'www.example.co.uk' or 'example.co.uk' but not 'co.uk' or 'uk'.
 * The set of rules covering all top-level domains and 2nd- or 3rd-level
 * domains has become quite long; essentially the broadest domain a subdomain
 * may assign to is the one that was sold/registered to a specific name
 * registrant.
 *
 * This concept should be useful in other contexts, too. Grouping URIs (or
 * queues of URIs to crawl) together with others sharing the same registered
 * suffix may be useful for applying the same rules to all, such as assigning
 * them to the same queue or crawler in a multi-machine setup.
 *
 * @author Gojomo
 */
public class PublicSuffixes {
    /** Charset used to read the published list and to write a regex dump to a file. */
    private static final Charset UTF8 = Charset.forName("UTF-8");

    /** Lazily compiled form of {@link #topmostAssignedSurtPrefixRegex}. */
    protected static Pattern topmostAssignedSurtPrefixPattern;
    /** Lazily built regex derived from the bundled public suffix list. */
    protected static String topmostAssignedSurtPrefixRegex;

    /**
     * Utility method for dumping a regex String, based on a published public
     * suffix list, which matches any SURT-form hostname up through the broadest
     * 'private' (assigned/sold) domain-segment. That is, for any of the
     * SURT-form hostnames...
     *
     * com,example, com,example,www, com,example,california,www
     *
     * ...the regex will match 'com,example,'.
     *
     * @param args
     *            optional; {@code args[0]} is the path of a public suffix list
     *            file to read (when absent or equal to "=", the list bundled on
     *            the classpath is used); {@code args[1]} is the path of a file
     *            to write the regex to (when absent, it is written to standard
     *            output)
     * @throws IOException
     *             if the output cannot be opened or written
     */
    public static void main(String args[]) throws IOException {
        String regex;
        if (args.length == 0 || "=".equals(args[0])) {
            // no explicit list given: use the one bundled on the classpath
            regex = getTopmostAssignedSurtPrefixRegex();
        } else {
            BufferedReader reader = new BufferedReader(new InputStreamReader(
                    new FileInputStream(args[0]), UTF8));
            try {
                regex = getTopmostAssignedSurtPrefixRegex(reader);
            } finally {
                closeQuietly(reader);
            }
        }

        // only a writer over a file we opened ourselves is closed afterwards;
        // System.out is flushed but left open
        boolean needsClose = args.length >= 2;
        BufferedWriter writer;
        if (needsClose) {
            writer = new BufferedWriter(new OutputStreamWriter(
                    new FileOutputStream(args[1]), UTF8));
        } else {
            writer = new BufferedWriter(new OutputStreamWriter(System.out));
        }
        try {
            writer.append(regex);
            writer.flush();
        } finally {
            if (needsClose) {
                writer.close();
            }
        }
    }

    /**
     * Reads a file of the format promulgated by publicsuffix.org and converts
     * each rule to SURT-ordering (segments reversed, each followed by ',').
     * Blank lines and lines starting with "//" are skipped, anything after the
     * first run of whitespace on a line is dropped, and rules are lower-cased.
     * A '?' inside a segment becomes '_', and the '!' exception marker becomes
     * '+' (which {@link #buildRegex} later turns into a lookahead). Glob-style
     * '*' wildcards are left in place.
     *
     * @param reader
     *            source of the published list; not closed by this method
     * @return sorted list of unique SURT-ordered prefixes
     * @throws IOException
     *             if reading from {@code reader} fails
     */
    public static List<String> readPublishedFileToSurtList(BufferedReader reader)
            throws IOException {
        // a sorted set gives both the ordering and the de-duplication
        SortedSet<String> unique = new TreeSet<String>();
        String line;
        while ((line = reader.readLine()) != null) {
            // discard surrounding whitespace, empty lines and comment lines
            line = line.trim();
            if (line.length() == 0 || line.startsWith("//")) {
                continue;
            }
            // keep only the rule itself, dropping any trailing notation
            line = line.split("\\s+")[0];
            line = line.toLowerCase(Locale.ROOT);

            // SURT-order the domain segments: rightmost segment first
            String[] segs = line.split("\\.");
            StringBuilder surt = new StringBuilder();
            for (int i = segs.length - 1; i >= 0; i--) {
                if (segs[i].length() > 0) {
                    // '?' would act as a quantifier in the generated regex
                    String fixed = segs[i].replaceAll("\\?", "_");
                    // '+' marks an exception rule for buildRegex; in the
                    // descending order used there it comes before '*'
                    fixed = fixed.replaceAll("!", "+");
                    surt.append(fixed).append(',');
                }
            }
            // a line made only of dots yields no segments; it is not a rule
            if (surt.length() > 0) {
                unique.add(surt.toString());
            }
        }
        return new ArrayList<String>(unique);
    }

    /**
     * Converts SURT-ordered list of public prefixes into a Java regex which
     * matches the public-portion "plus one" segment, giving the domain on which
     * cookies can be set or other policy grouping should occur. Also adds to
     * regex a fallback matcher that for any new/unknown TLDs assumes the
     * second-level domain is assignable. (Eg: 'zzz,example,').
     *
     * @param list
     *            SURT-ordered public prefixes, as produced by
     *            {@link #readPublishedFileToSurtList(BufferedReader)}
     * @return regex source using the case-insensitive and comments flags, so
     *         the whitespace emitted for readability is ignored when matching
     */
    private static String surtPrefixRegexFromSurtList(List<String> list) {
        StringBuilder regex = new StringBuilder();
        regex.append("(?ix)^\n");
        // descending order: longer entries come before their own prefixes
        TreeSet<String> prefixes = new TreeSet<String>(Collections
                .reverseOrder());
        prefixes.addAll(list);
        prefixes.add("*,"); // fallback for new/unknown TLDs
        buildRegex("", regex, prefixes);
        // the "plus one" segment following the public suffix
        regex.append("\n([\\-\\w]+,)");
        String rstring = regex.toString();
        // convert glob-stars to runs of word characters and hyphens
        rstring = rstring.replaceAll("\\*", "[\\\\-\\\\w]+");
        return rstring;
    }

    /**
     * Recursively appends to {@code regex} a non-capturing group of
     * alternatives covering every entry of {@code prefixes} beyond
     * {@code stem}, branching one character at a time.
     *
     * @param stem
     *            leading characters shared by the entries and already emitted
     * @param regex
     *            buffer the regex text is appended to
     * @param prefixes
     *            entries to cover; {@link #surtPrefixRegexFromSurtList} passes
     *            a set in descending order, which the duplicate-character skip
     *            and the handling of an entry equal to {@code stem} rely on
     */
    protected static void buildRegex(String stem, StringBuilder regex,
            SortedSet<String> prefixes) {
        if (prefixes.isEmpty()) {
            return;
        }
        if (prefixes.size() == 1 && prefixes.first().equals(stem)) {
            // nothing beyond the stem: avoid emitting an empty "(?:)"
            return;
        }
        regex.append("(?:");
        if (stem.length() == 0) {
            regex.append("\n "); // cosmetic linebreak before first character
        }
        Iterator<String> iter = prefixes.iterator();
        char c = 0; // last branch character emitted at this level
        while (iter.hasNext()) {
            String s = iter.next();
            if (s.length() > stem.length()) {
                char d = s.charAt(stem.length());

                if (d == '+') {
                    // exception rule: zero-width positive lookahead for the
                    // remainder of the entry after the '+'
                    regex.append("(?=" + s.substring(stem.length() + 1) + ")");
                } else {
                    if (d == c) {
                        // this branch was already handled by the recursion
                        continue;
                    }
                    c = d;
                    regex.append(c);
                    String newStem = s.substring(0, stem.length() + 1);
                    // the range for the recursion runs from s up to (not
                    // including) the first entry in the tail of newStem that
                    // is not newStem itself
                    SortedSet<String> tail = prefixes.tailSet(newStem);
                    SortedSet<String> range = null;
                    successor: for (String candidate : tail) {
                        if (!candidate.equals(newStem)) {
                            range = prefixes.subSet(s, candidate);
                            break successor;
                        }
                    }
                    if (range == null) {
                        // no such entry: everything from s onward is used
                        range = prefixes.tailSet(s);
                    }
                    buildRegex(newStem, regex, range);
                }
                regex.append('|');
            } else {
                // entry equals the stem; insert a dummy character to be
                // removed below, which leaves the preceding '|' in place as
                // an empty alternative
                regex.append('@');
            }
        }
        // drop the trailing '|' or the '@' dummy
        regex.deleteCharAt(regex.length() - 1);
        regex.append(')');
        if (stem.length() == 1) {
            regex.append('\n'); // cosmetic linebreak after each top-level branch
        }
    }

    /**
     * Returns the compiled form of {@link #getTopmostAssignedSurtPrefixRegex()},
     * compiling it on first use and reusing it afterwards.
     *
     * @return compiled pattern built from the bundled public suffix list
     */
    public static synchronized Pattern getTopmostAssignedSurtPrefixPattern() {
        if (topmostAssignedSurtPrefixPattern == null) {
            topmostAssignedSurtPrefixPattern = Pattern
                    .compile(getTopmostAssignedSurtPrefixRegex());
        }
        return topmostAssignedSurtPrefixPattern;
    }

    /**
     * Returns the regex built from the classpath resource
     * "effective_tld_names.dat", building it on first use and reusing it
     * afterwards.
     *
     * @return regex source matching the topmost assigned SURT prefix
     * @throws IllegalStateException
     *             if the resource cannot be found on the classpath
     */
    public static synchronized String getTopmostAssignedSurtPrefixRegex() {
        if (topmostAssignedSurtPrefixRegex == null) {
            InputStream stream = PublicSuffixes.class.getClassLoader()
                    .getResourceAsStream("effective_tld_names.dat");
            if (stream == null) {
                throw new IllegalStateException(
                        "resource effective_tld_names.dat not found on classpath");
            }
            BufferedReader reader = new BufferedReader(
                    new InputStreamReader(stream, UTF8));
            try {
                topmostAssignedSurtPrefixRegex =
                        getTopmostAssignedSurtPrefixRegex(reader);
            } finally {
                closeQuietly(reader);
            }
        }
        return topmostAssignedSurtPrefixRegex;
    }

    /**
     * Builds the regex from a public suffix list supplied by the caller.
     *
     * @param reader
     *            source of the published list; not closed by this method
     * @return regex source matching the topmost assigned SURT prefix
     * @throws RuntimeException
     *             wrapping any IOException raised while reading
     */
    public static String getTopmostAssignedSurtPrefixRegex(BufferedReader reader) {
        List<String> list;
        try {
            list = readPublishedFileToSurtList(reader);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return surtPrefixRegexFromSurtList(list);
    }

    /**
     * Truncate SURT to its topmost assigned domain segment; that is,
     * the public suffix plus one segment, but as a SURT-ordered prefix.
     *
     * If the pattern doesn't match, the passed-in SURT is returned.
     *
     * @param surt SURT to truncate
     * @return truncated-to-topmost-assigned SURT prefix
     */
    public static String reduceSurtToTopmostAssigned(String surt) {
        // match through the cached compiled pattern
        Matcher matcher = getTopmostAssignedSurtPrefixPattern().matcher(surt);
        return matcher.find() ? matcher.group() : surt;
    }

    /** Closes the given resource, ignoring any IOException raised by close. */
    private static void closeQuietly(Closeable closeable) {
        try {
            closeable.close();
        } catch (IOException ignored) {
            // best effort: the data of interest has already been read
        }
    }
}