View Javadoc

1   /* PublicSuffixes.java
2    *
3    * $Id: BloomFilter32bitSplit.java 5197 2007-06-06 01:31:46Z gojomo $
4    *
5    * Created on Jun 13, 2007
6    *
7    * Copyright (C) 2007 Internet Archive
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  
26  package org.archive.net;
27  
28  import java.io.BufferedReader;
29  import java.io.BufferedWriter;
30  import java.io.FileOutputStream;
31  import java.io.FileReader;
32  import java.io.IOException;
33  import java.io.InputStreamReader;
34  import java.io.OutputStreamWriter;
35  import java.util.ArrayList;
36  import java.util.Collections;
37  import java.util.Iterator;
38  import java.util.List;
39  import java.util.SortedSet;
40  import java.util.TreeSet;
41  import java.util.regex.Matcher;
42  import java.util.regex.Pattern;
43  
44  import org.apache.commons.io.IOUtils;
45  import org.archive.util.TextUtils;
46  
/**
 * Utility class for making use of the information about 'public suffixes' at
 * http://publicsuffix.org.
 *
 * The public suffix list (once known as 'effective TLDs') was motivated by the
 * need to decide on which broader domains a subdomain was allowed to set
 * cookies. For example, a server at 'www.example.com' can set cookies for
 * 'www.example.com' or 'example.com' but not 'com'. 'www.example.co.uk' can set
 * cookies for 'www.example.co.uk' or 'example.co.uk' but not 'co.uk' or 'uk'.
 * The number of rules for all top-level-domains and 2nd- or 3rd- level domains
 * has become quite long; essentially the broadest domain a subdomain may assign
 * to is the one that was sold/registered to a specific name registrant.
 *
 * This concept should be useful in other contexts, too. Grouping URIs (or
 * queues of URIs to crawl) together with others sharing the same registered
 * suffix may be useful for applying the same rules to all, such as assigning
 * them to the same queue or crawler in a multi-machine setup.
 *
 * @author Gojomo
 */
public class PublicSuffixes {
    /** Charset used to read suffix lists and to write the regex to a file. */
    private static final java.nio.charset.Charset UTF8 =
            java.nio.charset.Charset.forName("UTF-8");

    /** Name of the suffix list looked up through this class's class loader. */
    private static final String BUNDLED_LIST = "effective_tld_names.dat";

    /** Lazily compiled form of {@link #topmostAssignedSurtPrefixRegex}. */
    protected static Pattern topmostAssignedSurtPrefixPattern;

    /** Lazily built regex derived from the bundled suffix list. */
    protected static String topmostAssignedSurtPrefixRegex;

    /**
     * Utility method for dumping a regex String, based on a published public
     * suffix list, which matches any SURT-form hostname up through the broadest
     * 'private' (assigned/sold) domain-segment. That is, for any of the
     * SURT-form hostnames...
     *
     * com,example, com,example,www, com,example,california,www
     *
     * ...the regex will match 'com,example,'.
     *
     * @param args optional; {@code args[0]} is the suffix-list file to read
     *            (absent or {@code "="} means the bundled list), and
     *            {@code args[1]} is the file to write the regex to (absent
     *            means standard output)
     * @throws IOException if the input cannot be opened or the output cannot
     *             be written
     */
    public static void main(String args[]) throws IOException {
        String regex;
        if (args.length == 0 || "=".equals(args[0])) {
            // use bundled list
            regex = getTopmostAssignedSurtPrefixRegex();
        } else {
            // use specified filename
            BufferedReader reader = new BufferedReader(new InputStreamReader(
                    new java.io.FileInputStream(args[0]), UTF8));
            try {
                regex = getTopmostAssignedSurtPrefixRegex(reader);
            } finally {
                // close even when building the regex fails
                closeQuietly(reader);
            }
        }

        if (args.length >= 2) {
            // write to specified file
            BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
                    new FileOutputStream(args[1]), UTF8));
            try {
                writer.append(regex);
                writer.flush();
            } finally {
                writer.close();
            }
        } else {
            // write to stdout; flushed but not closed
            BufferedWriter writer = new BufferedWriter(
                    new OutputStreamWriter(System.out));
            writer.append(regex);
            writer.flush();
        }
    }

    /**
     * Reads a file of the format promulgated by publicsuffix.org, ignoring
     * blank lines and '//' comment lines, and converting domain segments to
     * SURT-ordering. Leaves glob-style '*' wildcarding in place and rewrites a
     * '!' exception marker to '+'. Returns sorted list of unique SURT-ordered
     * prefixes.
     *
     * @param reader source of the suffix list; not closed by this method
     * @return new list of unique SURT prefixes (each ending in ','), in
     *         natural String order
     * @throws IOException if reading from {@code reader} fails
     */
    public static List<String> readPublishedFileToSurtList(BufferedReader reader)
            throws IOException {
        // a sorted set yields the sorted, duplicate-free result directly
        SortedSet<String> unique = new TreeSet<String>();
        String line;
        while ((line = reader.readLine()) != null) {
            // discard whitespace, empty lines, comments
            line = line.trim();
            if (line.length() == 0 || line.startsWith("//")) {
                continue;
            }
            // discard utf8 notation after entry
            line = line.split("\\s+")[0];
            // fixed locale so the result does not vary with the JVM default
            line = line.toLowerCase(java.util.Locale.ENGLISH);

            String surt = toSurtPrefix(line);
            // an entry made only of dots has no segments; skip it
            if (surt.length() > 0) {
                unique.add(surt);
            }
        }
        return new ArrayList<String>(unique);
    }

    /**
     * Reverses the dot-separated segments of one list entry into SURT order,
     * terminating each segment with ','.
     */
    private static String toSurtPrefix(String entry) {
        String[] segs = entry.split("\\.");
        StringBuilder surt = new StringBuilder();
        for (int i = segs.length - 1; i >= 0; i--) {
            if (segs[i].length() > 0) {
                // current list has a stray '?' in a .no domain
                String fixed = segs[i].replace('?', '_');
                // replace '!' with '+' to indicate lookahead-for-exceptions
                // (gets those to sort before '*' at later build-step)
                fixed = fixed.replace('!', '+');
                surt.append(fixed).append(',');
            }
        }
        return surt.toString();
    }

    /**
     * Converts SURT-ordered list of public prefixes into a Java regex which
     * matches the public-portion "plus one" segment, giving the domain on which
     * cookies can be set or other policy grouping should occur. Also adds to
     * regex a fallback matcher that for any new/unknown TLDs assumes the
     * second-level domain is assignable. (Eg: 'zzz,example,').
     *
     * @param list SURT prefixes as produced by
     *            {@link #readPublishedFileToSurtList(BufferedReader)}
     * @return regex source using inline case-insensitive and comments flags
     */
    private static String surtPrefixRegexFromSurtList(List<String> list) {
        StringBuilder regex = new StringBuilder();
        regex.append("(?ix)^\n");
        // reverse order puts longer prefixes ahead of the stems they extend
        TreeSet<String> prefixes = new TreeSet<String>(Collections
                .reverseOrder());
        prefixes.addAll(list);
        prefixes.add("*,"); // for new/unknown TLDs
        buildRegex("", regex, prefixes);
        // the "plus one" segment following the public suffix
        regex.append("\n([\\-\\w]+,)");
        // convert glob-stars to word-char-runs
        return regex.toString().replace("*", "[\\-\\w]+");
    }

    /**
     * Recursively appends to {@code regex} a non-capturing alternation that
     * matches, character by character, the remainder of every prefix in
     * {@code prefixes} beyond {@code stem}. A '+' directly after the stem is
     * emitted as a zero-width positive lookahead for the rest of that entry.
     *
     * @param stem characters already emitted for every entry of
     *            {@code prefixes}
     * @param regex buffer being appended to
     * @param prefixes entries that all start with {@code stem}; within this
     *            class always a reverse-ordered set or a view of one
     */
    protected static void buildRegex(String stem, StringBuilder regex,
            SortedSet<String> prefixes) {
        if (prefixes.isEmpty()) {
            return;
        }
        if (prefixes.size() == 1 && prefixes.first().equals(stem)) {
            // avoid unnecessary "(?:)"
            return;
        }
        regex.append("(?:");
        if (stem.length() == 0) {
            regex.append("\n "); // linebreak-space before first character
        }
        char c = 0; // last branch character emitted at this level
        for (String s : prefixes) {
            if (s.length() > stem.length()) {
                char d = s.charAt(stem.length());

                if (d == '+') {
                    // convert exception to zero-width-positive-lookahead
                    regex.append("(?=" + s.substring(stem.length() + 1) + ")");
                } else {
                    if (d == c) {
                        // already covered by the recursion for this character
                        continue;
                    }
                    c = d;
                    regex.append(c);
                    String newStem = s.substring(0, stem.length() + 1);
                    // entries sharing newStem run from s up to the first
                    // entry sorting after newStem (other than newStem itself)
                    SortedSet<String> range = null;
                    for (String candidate : prefixes.tailSet(newStem)) {
                        if (!candidate.equals(newStem)) {
                            range = prefixes.subSet(s, candidate);
                            break;
                        }
                    }
                    if (range == null) {
                        range = prefixes.tailSet(s);
                    }
                    buildRegex(newStem, regex, range);
                }
                regex.append('|');
            } else {
                // empty suffix; insert dummy to be eaten when loop exits
                regex.append('@');
            }
        }
        // eat the trailing '|' (if no empty '@') or dummy
        regex.deleteCharAt(regex.length() - 1);
        regex.append(')');
        if (stem.length() == 1) {
            regex.append('\n'); // linebreak for TLDs
        }
    }

    /**
     * Returns the compiled form of {@link #getTopmostAssignedSurtPrefixRegex()},
     * compiling and caching it on first use.
     *
     * @return cached compiled pattern
     */
    public static synchronized Pattern getTopmostAssignedSurtPrefixPattern() {
        if (topmostAssignedSurtPrefixPattern == null) {
            topmostAssignedSurtPrefixPattern = Pattern
                    .compile(getTopmostAssignedSurtPrefixRegex());
        }
        return topmostAssignedSurtPrefixPattern;
    }

    /**
     * Returns the regex built from the bundled suffix list, building and
     * caching it on first use.
     *
     * @return cached regex source
     * @throws IllegalStateException if the bundled list cannot be found
     */
    public static synchronized String getTopmostAssignedSurtPrefixRegex() {
        if (topmostAssignedSurtPrefixRegex == null) {
            // use bundled list
            java.io.InputStream in = PublicSuffixes.class.getClassLoader()
                    .getResourceAsStream(BUNDLED_LIST);
            if (in == null) {
                throw new IllegalStateException("bundled public suffix list '"
                        + BUNDLED_LIST + "' not found on classpath");
            }
            BufferedReader reader = new BufferedReader(
                    new InputStreamReader(in, UTF8));
            try {
                topmostAssignedSurtPrefixRegex =
                        getTopmostAssignedSurtPrefixRegex(reader);
            } finally {
                closeQuietly(reader);
            }
        }
        return topmostAssignedSurtPrefixRegex;
    }

    /**
     * Builds the regex from a suffix list supplied by the caller.
     *
     * @param reader source of the suffix list; not closed by this method
     * @return regex source
     * @throws RuntimeException wrapping any IOException raised while reading
     */
    public static String getTopmostAssignedSurtPrefixRegex(BufferedReader reader) {
        List<String> list;
        try {
            list = readPublishedFileToSurtList(reader);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return surtPrefixRegexFromSurtList(list);
    }

    /**
     * Truncate SURT to its topmost assigned domain segment; that is,
     * the public suffix plus one segment, but as a SURT-ordered prefix.
     *
     * If the pattern doesn't match, the passed-in SURT is returned.
     *
     * @param surt SURT to truncate
     * @return truncated-to-topmost-assigned SURT prefix
     */
    public static String reduceSurtToTopmostAssigned(String surt) {
        // reuse the compiled pattern this class already caches
        Matcher matcher = getTopmostAssignedSurtPrefixPattern().matcher(surt);
        return matcher.find() ? matcher.group() : surt;
    }

    /** Closes {@code closeable}, ignoring any failure to do so. */
    private static void closeQuietly(java.io.Closeable closeable) {
        try {
            closeable.close();
        } catch (IOException ignored) {
            // best-effort cleanup; nothing useful can be done here
        }
    }
}