SurtPrefixSet xref

View Javadoc

1   /* SURTPrefixSet
2   *
3   * $Id: SurtPrefixSet.java 6704 2009-11-25 01:38:55Z gojomo $
4   *
5   * Created on Jul 23, 2004
6   *
7   * Copyright (C) 2004 Internet Archive.
8   *
9   * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24  */ 
25  package org.archive.util;
26  
27  import java.io.BufferedInputStream;
28  import java.io.BufferedOutputStream;
29  import java.io.BufferedReader;
30  import java.io.FileInputStream;
31  import java.io.FileOutputStream;
32  import java.io.IOException;
33  import java.io.InputStream;
34  import java.io.InputStreamReader;
35  import java.io.PrintStream;
36  import java.io.Reader;
37  import java.io.Writer;
38  import java.util.Iterator;
39  
40  import org.apache.commons.httpclient.URIException;
41  import org.archive.net.UURI;
42  import org.archive.net.UURIFactory;
43  import org.archive.util.iterator.LineReadingIterator;
44  import org.archive.util.iterator.RegexpLineIterator;
45  
/**
 * Specialized prefix set for keeping a set of SURT-form String prefixes.
 *
 * Redundant prefixes (those that are themselves prefixed
 * by other set entries) are eliminated.
 *
 * @author gojomo
 */
54  public class SurtPrefixSet extends PrefixSet {
55  
56      private static final long serialVersionUID = 2598365040524933110L;
57  
58      private static final String SURT_PREFIX_DIRECTIVE = "+";
59  
60      /***
61       * Read a set of SURT prefixes from a reader source; keep sorted and 
62       * with redundant entries removed.
63       * 
64       * @param r reader over file of SURT_format strings
65       * @throws IOException
66       */
67      public void importFrom(Reader r) {
68          BufferedReader reader = new BufferedReader(r);
69          String s;
70          
71          Iterator iter = 
72              new RegexpLineIterator(
73                      new LineReadingIterator(reader),
74                      RegexpLineIterator.COMMENT_LINE,
75                      RegexpLineIterator.NONWHITESPACE_ENTRY_TRAILING_COMMENT,
76                      RegexpLineIterator.ENTRY);
77  
78          while (iter.hasNext()) {
79              s = (String) iter.next();
80              add(s.toLowerCase());
81          }
82      }
83  
84      /***
85       * @param r Where to read from.
86       */
87      public void importFromUris(Reader r) {
88          BufferedReader reader = new BufferedReader(r);
89          String s;
90          
91          Iterator iter = 
92              new RegexpLineIterator(
93                      new LineReadingIterator(reader),
94                      RegexpLineIterator.COMMENT_LINE,
95                      RegexpLineIterator.NONWHITESPACE_ENTRY_TRAILING_COMMENT,
96                      RegexpLineIterator.ENTRY);
97  
98          while (iter.hasNext()) {
99              s = (String) iter.next();
100             // s is a URI (or even fragmentary hostname), not a SURT
101             addFromPlain(s);
102         }
103     }
104 
105     /***
106      * Import SURT prefixes from a reader with mixed URI and SURT prefix
107      * format. 
108      * 
109      * @param r  the reader to import the prefixes from
110      * @param deduceFromSeeds   true to also import SURT prefixes implied
111      *                          from normal URIs/hostname seeds
112      */
113     public void importFromMixed(Reader r, boolean deduceFromSeeds) {
114         BufferedReader reader = new BufferedReader(r);
115         String s;
116         
117         Iterator iter = 
118             new RegexpLineIterator(
119                     new LineReadingIterator(reader),
120                     RegexpLineIterator.COMMENT_LINE,
121                     RegexpLineIterator.NONWHITESPACE_ENTRY_TRAILING_COMMENT,
122                     RegexpLineIterator.ENTRY);
123 
124         while (iter.hasNext()) {
125             s = (String) iter.next();
126             if(s.startsWith(SURT_PREFIX_DIRECTIVE)) {
127                 // it's specifically a SURT prefix line
128                 String u = s.substring(SURT_PREFIX_DIRECTIVE.length()).trim();
129                 if(u.indexOf("(")>0) {
130                     // formal SURT prefix; toLowerCase just in case
131                     add(u.toLowerCase());
132                 } else {
133                     // hostname/normal form URI from which 
134                     // to deduce SURT prefix
135                     addFromPlain(u);
136                 }
137                 
138                 continue; 
139             } else {
140                 if(deduceFromSeeds) {
141                     // also deducing 'implied' SURT prefixes 
142                     // from normal URIs/hostname seeds
143                     addFromPlain(s);
144                 }
145             }
146         }
147     }
148     
149     /***
150      * Given a plain URI or hostname, deduce an implied SURT prefix from
151      * it and add to active prefixes. 
152      * 
153      * @param u String of URI or hostname
154      */
155     private void addFromPlain(String u) {
156         u = prefixFromPlain(u);
157         add(u);
158     }
159 
160     /***
161      * Given a plain URI or hostname/hostname+path, deduce an implied SURT 
162      * prefix from it. Results may be unpredictable on strings that cannot
163      * be interpreted as URIs. 
164      * 
165      * UURI 'fixup' is applied to the URI that is built. 
166      *
167      * @param u URI or almost-URI to consider
168      * @return implied SURT prefix form
169      */
170     public static String prefixFromPlain(String u) {
171         u = ArchiveUtils.addImpliedHttpIfNecessary(u);
172         u = coerceFromHttpsForComparison(u);
173         boolean trailingSlash = u.endsWith("/");
174         // ensure all typical UURI cleanup (incl. IDN-punycoding) is done
175         try {
176             u = UURIFactory.getInstance(u).toString();
177         } catch (URIException e) {
178             e.printStackTrace();
179             // allow to continue with original string uri
180         }
181         // except: don't let UURI-fixup add a trailing slash
182         // if it wasn't already there (presence or absence of
183         // such slash has special meaning specifying implied
184         // SURT prefixes)
185         if(!trailingSlash && u.endsWith("/")) {
186             u = u.substring(0,u.length()-1);
187         }
188         // convert to full SURT
189         u = SURT.fromURI(u);
190         // truncate to implied prefix
191         u = SurtPrefixSet.asPrefix(u);
192         return u;
193     }
194 
195     /***
196      * For SURT comparisons -- prefixes or candidates being checked against
197      * those prefixes -- we treat https URIs as if they were http.
198      * 
199      * @param u string to coerce if it has https scheme
200      * @return string converted to http scheme, or original if not necessary
201      */
202     private static String coerceFromHttpsForComparison(String u) {
203         if (u.startsWith("https://")) {
204             u = "http" + u.substring("https".length());
205         }
206         return u;
207     }
208 
209     /***
210      * Utility method for truncating a SURT that came from a 
211      * full URI (as a seed, for example) into a prefix
212      * for determining inclusion.
213      * 
214      * This involves: 
215      * <pre>
216      *    (1) removing the last path component, if any
217      *        (anything after the last '/', if there are
218      *        at least 3 '/'s)
219      *    (2) removing a trailing ')', if present, opening
220      *        the possibility of proper subdomains. (This
221      *        means that the presence or absence of a
222      *        trailing '/' after a hostname in a seed list
223      *        is significant for the how the SURT prefix is 
224      *        created, even though it is not signficant for 
225      *        the URI's treatment as a seed.)
226      * </pre>
227      *
228      * @param s String to work on.
229      * @return As prefix.
230      */
231     private static String asPrefix(String s) {
232         // Strip last path-segment, if more than 3 slashes
233         s = s.replaceAll("^(.*//.*/)[^/]*","$1");
234         // Strip trailing ")", if present and NO path (no 3rd slash).
235         if (!s.endsWith("/")) {
236             s = s.replaceAll("^(.*)//)","$1");
237         }
238         return s;
239     }
240 
241     /***
242      * Calculate the SURT form URI to use as a candidate against prefixes
243      * from the given Object (CandidateURI or UURI)
244      * 
245      * @param object CandidateURI or UURI
246      * @return SURT form of URI for evaluation, or null if unavailable
247      */
248     public static String getCandidateSurt(Object object) {
249         UURI u = UURI.from(object);
250         if (u == null) {
251             return null;
252         }
253         String candidateSurt = u.getSurtForm();
254         // also want to treat https as http
255         candidateSurt = coerceFromHttpsForComparison(candidateSurt);
256         return candidateSurt;
257     }
258     /***
259      * @param fw
260      * @throws IOException
261      */
262     public void exportTo(Writer fw) throws IOException {
263         Iterator iter = this.iterator();
264         while(iter.hasNext()) {
265             fw.write((String)iter.next() + "\n");
266         }
267     }
268 
269     /***
270      * Changes all prefixes so that they enforce an exact host. For
271      * prefixes that already include a ')', this means discarding 
272      * anything after ')' (path info). For prefixes that don't include
273      * a ')' -- domain prefixes open to subdomains -- add the closing
274      * ')' (or ",)").  
275      */
276     public void convertAllPrefixesToHosts() {
277         SurtPrefixSet iterCopy = (SurtPrefixSet) this.clone();
278         Iterator iter = iterCopy.iterator();
279         while (iter.hasNext()) {
280             String prefix = (String) iter.next();
281             String convPrefix = convertPrefixToHost(prefix);
282             if(prefix!=convPrefix) {
283             	// if returned value not unchanged, update set
284             	this.remove(prefix);
285             	this.add(convPrefix);
286             }
287         }
288     }
289     
290     public static String convertPrefixToHost(String prefix) {
291         if(prefix.endsWith(")")) {
292             return prefix; // no change necessary
293         }
294         if(prefix.indexOf(')')<0) {
295             // open-ended domain prefix
296             if(!prefix.endsWith(",")) {
297                 prefix += ",";
298             }
299             prefix += ")";
300         } else {
301             // prefix with excess path-info
302             prefix = prefix.substring(0,prefix.indexOf(')')+1);
303         }
304         return prefix;
305     }
306 
307     /***
308      * Changes all prefixes so that they only enforce a general
309      * domain (allowing subdomains).For prefixes that don't include
310      * a ')', no change is necessary. For others, truncate everything
311      * from the ')' onward. Additionally, truncate off "www," if it
312      * appears.
313      */
314     public void convertAllPrefixesToDomains() {
315         SurtPrefixSet iterCopy = (SurtPrefixSet) this.clone();
316         Iterator iter = iterCopy.iterator();
317         while (iter.hasNext()) {
318             String prefix = (String) iter.next();
319             String convPrefix = convertPrefixToDomain(prefix);
320             if(prefix!=convPrefix) {
321             	// if returned value not unchanged, update set
322             	this.remove(prefix);
323             	this.add(convPrefix);
324             }
325         } 
326     }
327     
328     public static String convertPrefixToDomain(String prefix) {
329         if(prefix.indexOf(')')>=0) {
330             prefix = prefix.substring(0,prefix.indexOf(')'));
331         }
332         // strip 'www,' when present
333         if(prefix.endsWith("www,")) {
334             prefix = prefix.substring(0,prefix.length()-4);
335         }
336         return prefix;
337     }
338     
339     /***
340      * Allow class to be used as a command-line tool for converting 
341      * URL lists (or naked host or host/path fragments implied
342      * to be HTTP URLs) to implied SURT prefix form. 
343      * 
344      * Read from stdin or first file argument. Writes to stdout. 
345      *
346      * @param args cmd-line arguments: may include input file
347      * @throws IOException
348      */
349     public static void main(String[] args) throws IOException {
350         InputStream in = args.length > 0 ? new BufferedInputStream(
351                 new FileInputStream(args[0])) : System.in;
352         PrintStream out = args.length > 1 ? new PrintStream(
353                 new BufferedOutputStream(new FileOutputStream(args[1])))
354                 : System.out;
355         BufferedReader br =
356             new BufferedReader(new InputStreamReader(in));
357         String line;
358         while((line = br.readLine())!=null) {
359             if(line.indexOf("#")>0) line=line.substring(0,line.indexOf("#"));
360             line = line.trim();
361             if(line.length()==0) continue;
362             out.println(prefixFromPlain(line));
363         }
364         br.close();
365         out.close();
366     }
367 }