1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.util;
26
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.io.Reader;
import java.io.Writer;
import java.util.Iterator;
import java.util.Locale;

import org.apache.commons.httpclient.URIException;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.util.iterator.LineReadingIterator;
import org.archive.util.iterator.RegexpLineIterator;
45
46 /***
47 * Specialized TreeSet for keeping a set of String prefixes.
48 *
49 * Redundant prefixes (those that are themselves prefixed
50 * by other set entries) are eliminated.
51 *
52 * @author gojomo
53 */
54 public class SurtPrefixSet extends PrefixSet {
55
56 private static final long serialVersionUID = 2598365040524933110L;
57
58 private static final String SURT_PREFIX_DIRECTIVE = "+";
59
60 /***
61 * Read a set of SURT prefixes from a reader source; keep sorted and
62 * with redundant entries removed.
63 *
64 * @param r reader over file of SURT_format strings
65 * @throws IOException
66 */
67 public void importFrom(Reader r) {
68 BufferedReader reader = new BufferedReader(r);
69 String s;
70
71 Iterator iter =
72 new RegexpLineIterator(
73 new LineReadingIterator(reader),
74 RegexpLineIterator.COMMENT_LINE,
75 RegexpLineIterator.NONWHITESPACE_ENTRY_TRAILING_COMMENT,
76 RegexpLineIterator.ENTRY);
77
78 while (iter.hasNext()) {
79 s = (String) iter.next();
80 add(s.toLowerCase());
81 }
82 }
83
84 /***
85 * @param r Where to read from.
86 */
87 public void importFromUris(Reader r) {
88 BufferedReader reader = new BufferedReader(r);
89 String s;
90
91 Iterator iter =
92 new RegexpLineIterator(
93 new LineReadingIterator(reader),
94 RegexpLineIterator.COMMENT_LINE,
95 RegexpLineIterator.NONWHITESPACE_ENTRY_TRAILING_COMMENT,
96 RegexpLineIterator.ENTRY);
97
98 while (iter.hasNext()) {
99 s = (String) iter.next();
100
101 addFromPlain(s);
102 }
103 }
104
105 /***
106 * Import SURT prefixes from a reader with mixed URI and SURT prefix
107 * format.
108 *
109 * @param r the reader to import the prefixes from
110 * @param deduceFromSeeds true to also import SURT prefixes implied
111 * from normal URIs/hostname seeds
112 */
113 public void importFromMixed(Reader r, boolean deduceFromSeeds) {
114 BufferedReader reader = new BufferedReader(r);
115 String s;
116
117 Iterator iter =
118 new RegexpLineIterator(
119 new LineReadingIterator(reader),
120 RegexpLineIterator.COMMENT_LINE,
121 RegexpLineIterator.NONWHITESPACE_ENTRY_TRAILING_COMMENT,
122 RegexpLineIterator.ENTRY);
123
124 while (iter.hasNext()) {
125 s = (String) iter.next();
126 if(s.startsWith(SURT_PREFIX_DIRECTIVE)) {
127
128 String u = s.substring(SURT_PREFIX_DIRECTIVE.length()).trim();
129 if(u.indexOf("(")>0) {
130
131 add(u.toLowerCase());
132 } else {
133
134
135 addFromPlain(u);
136 }
137
138 continue;
139 } else {
140 if(deduceFromSeeds) {
141
142
143 addFromPlain(s);
144 }
145 }
146 }
147 }
148
149 /***
150 * Given a plain URI or hostname, deduce an implied SURT prefix from
151 * it and add to active prefixes.
152 *
153 * @param u String of URI or hostname
154 */
155 private void addFromPlain(String u) {
156 u = prefixFromPlain(u);
157 add(u);
158 }
159
160 /***
161 * Given a plain URI or hostname/hostname+path, deduce an implied SURT
162 * prefix from it. Results may be unpredictable on strings that cannot
163 * be interpreted as URIs.
164 *
165 * UURI 'fixup' is applied to the URI that is built.
166 *
167 * @param u URI or almost-URI to consider
168 * @return implied SURT prefix form
169 */
170 public static String prefixFromPlain(String u) {
171 u = ArchiveUtils.addImpliedHttpIfNecessary(u);
172 u = coerceFromHttpsForComparison(u);
173 boolean trailingSlash = u.endsWith("/");
174
175 try {
176 u = UURIFactory.getInstance(u).toString();
177 } catch (URIException e) {
178 e.printStackTrace();
179
180 }
181
182
183
184
185 if(!trailingSlash && u.endsWith("/")) {
186 u = u.substring(0,u.length()-1);
187 }
188
189 u = SURT.fromURI(u);
190
191 u = SurtPrefixSet.asPrefix(u);
192 return u;
193 }
194
195 /***
196 * For SURT comparisons -- prefixes or candidates being checked against
197 * those prefixes -- we treat https URIs as if they were http.
198 *
199 * @param u string to coerce if it has https scheme
200 * @return string converted to http scheme, or original if not necessary
201 */
202 private static String coerceFromHttpsForComparison(String u) {
203 if (u.startsWith("https://")) {
204 u = "http" + u.substring("https".length());
205 }
206 return u;
207 }
208
209 /***
210 * Utility method for truncating a SURT that came from a
211 * full URI (as a seed, for example) into a prefix
212 * for determining inclusion.
213 *
214 * This involves:
215 * <pre>
216 * (1) removing the last path component, if any
217 * (anything after the last '/', if there are
218 * at least 3 '/'s)
219 * (2) removing a trailing ')', if present, opening
220 * the possibility of proper subdomains. (This
221 * means that the presence or absence of a
222 * trailing '/' after a hostname in a seed list
223 * is significant for the how the SURT prefix is
224 * created, even though it is not signficant for
225 * the URI's treatment as a seed.)
226 * </pre>
227 *
228 * @param s String to work on.
229 * @return As prefix.
230 */
231 private static String asPrefix(String s) {
232
233 s = s.replaceAll("^(.*//.*/)[^/]*","$1");
234
235 if (!s.endsWith("/")) {
236 s = s.replaceAll("^(.*)//)","$1");
237 }
238 return s;
239 }
240
241 /***
242 * Calculate the SURT form URI to use as a candidate against prefixes
243 * from the given Object (CandidateURI or UURI)
244 *
245 * @param object CandidateURI or UURI
246 * @return SURT form of URI for evaluation, or null if unavailable
247 */
248 public static String getCandidateSurt(Object object) {
249 UURI u = UURI.from(object);
250 if (u == null) {
251 return null;
252 }
253 String candidateSurt = u.getSurtForm();
254
255 candidateSurt = coerceFromHttpsForComparison(candidateSurt);
256 return candidateSurt;
257 }
258 /***
259 * @param fw
260 * @throws IOException
261 */
262 public void exportTo(Writer fw) throws IOException {
263 Iterator iter = this.iterator();
264 while(iter.hasNext()) {
265 fw.write((String)iter.next() + "\n");
266 }
267 }
268
269 /***
270 * Changes all prefixes so that they enforce an exact host. For
271 * prefixes that already include a ')', this means discarding
272 * anything after ')' (path info). For prefixes that don't include
273 * a ')' -- domain prefixes open to subdomains -- add the closing
274 * ')' (or ",)").
275 */
276 public void convertAllPrefixesToHosts() {
277 SurtPrefixSet iterCopy = (SurtPrefixSet) this.clone();
278 Iterator iter = iterCopy.iterator();
279 while (iter.hasNext()) {
280 String prefix = (String) iter.next();
281 String convPrefix = convertPrefixToHost(prefix);
282 if(prefix!=convPrefix) {
283
284 this.remove(prefix);
285 this.add(convPrefix);
286 }
287 }
288 }
289
290 public static String convertPrefixToHost(String prefix) {
291 if(prefix.endsWith(")")) {
292 return prefix;
293 }
294 if(prefix.indexOf(')')<0) {
295
296 if(!prefix.endsWith(",")) {
297 prefix += ",";
298 }
299 prefix += ")";
300 } else {
301
302 prefix = prefix.substring(0,prefix.indexOf(')')+1);
303 }
304 return prefix;
305 }
306
307 /***
308 * Changes all prefixes so that they only enforce a general
309 * domain (allowing subdomains).For prefixes that don't include
310 * a ')', no change is necessary. For others, truncate everything
311 * from the ')' onward. Additionally, truncate off "www," if it
312 * appears.
313 */
314 public void convertAllPrefixesToDomains() {
315 SurtPrefixSet iterCopy = (SurtPrefixSet) this.clone();
316 Iterator iter = iterCopy.iterator();
317 while (iter.hasNext()) {
318 String prefix = (String) iter.next();
319 String convPrefix = convertPrefixToDomain(prefix);
320 if(prefix!=convPrefix) {
321
322 this.remove(prefix);
323 this.add(convPrefix);
324 }
325 }
326 }
327
328 public static String convertPrefixToDomain(String prefix) {
329 if(prefix.indexOf(')')>=0) {
330 prefix = prefix.substring(0,prefix.indexOf(')'));
331 }
332
333 if(prefix.endsWith("www,")) {
334 prefix = prefix.substring(0,prefix.length()-4);
335 }
336 return prefix;
337 }
338
339 /***
340 * Allow class to be used as a command-line tool for converting
341 * URL lists (or naked host or host/path fragments implied
342 * to be HTTP URLs) to implied SURT prefix form.
343 *
344 * Read from stdin or first file argument. Writes to stdout.
345 *
346 * @param args cmd-line arguments: may include input file
347 * @throws IOException
348 */
349 public static void main(String[] args) throws IOException {
350 InputStream in = args.length > 0 ? new BufferedInputStream(
351 new FileInputStream(args[0])) : System.in;
352 PrintStream out = args.length > 1 ? new PrintStream(
353 new BufferedOutputStream(new FileOutputStream(args[1])))
354 : System.out;
355 BufferedReader br =
356 new BufferedReader(new InputStreamReader(in));
357 String line;
358 while((line = br.readLine())!=null) {
359 if(line.indexOf("#")>0) line=line.substring(0,line.indexOf("#"));
360 line = line.trim();
361 if(line.length()==0) continue;
362 out.println(prefixFromPlain(line));
363 }
364 br.close();
365 out.close();
366 }
367 }