1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package org.archive.crawler.extractor;
23
import java.io.IOException;
import java.io.InputStream;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;

import javax.management.AttributeNotFoundException;

import org.apache.commons.io.IOUtils;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
import org.archive.net.UURI;
import org.archive.util.TextUtils;
37
38 /***
39 * A last ditch extractor that will look at the raw byte code and try to extract
40 * anything that <i>looks</i> like a link.
41 *
42 * If used, it should always be specified as the last link extractor in the
43 * order file.
44 * <p>
45 * To accomplish this it will scan through the bytecode and try and build up
46 * strings of consecutive bytes that all represent characters that are valid
47 * in a URL (see #isURLableChar(int) for details).
48 * Once it hits the end of such a string (i.e. finds a character that
49 * should not be in a URL) it will try to determine if it has found a URL.
50 * This is done be seeing if the string is an IP address prefixed with
51 * http(s):// or contains a dot followed by a Top Level Domain and end of
52 * string or a slash.
53 *
54 * @author Kristinn Sigurdsson
55 */
56 public class ExtractorUniversal extends Extractor
57 implements CoreAttributeConstants {
58
59 private static final long serialVersionUID = -7593380118857156939L;
60
61
62
63
64 private static String ATTR_MAX_DEPTH_BYTES = "max-depth-bytes";
65
66 /*** Default value for how far into an unknown document we should scan
67 * - 10k. A value of 0 or lower will disable this.
68 */
69 private static long DEFAULT_MAX_DEPTH_BYTES = 10240;
70
71 private static String ATTR_MAX_URL_LENGTH = "max-url-length";
72
73 /*** Maximum length for a URI that we try to match.*/
74 private static long DEFAULT_MAX_URL_LENGTH = UURI.MAX_URL_LENGTH;
75
76 /***
77 * Matches any string that begins with http:// or https:// followed by
78 * something that looks like an ip address (four numbers, none longer then
79 * 3 chars seperated by 3 dots). Does <b>not</b> ensure that the numbers are
80 * each in the range 0-255.
81 */
82 static final String IP_ADDRESS =
83 "((http://)|(https://))(//d(//d)?(//d)?//.//d(//d)?(//d)?//.//d(//d)?(//d)?//.//d(//d)?(//d)?)";
84
85 /***
86 * Matches any string that begins with a TLD (no .) followed by a '/' slash
87 * or end of string. If followed by slash then nothing after the slash is
88 * of consequence.
89 */
90 public static final String TLDs =
91 "(ac(/.*)?)"
92 + "|(ad(/.*)?)"
93 + "|(ae(/.*)?)"
94 + "|(af(/.*)?)"
95 + "|(ag(/.*)?)"
96 + "|(ai(/.*)?)"
97 + "|(al(/.*)?)"
98 + "|(am(/.*)?)"
99 + "|(an(/.*)?)"
100 + "|(ao(/.*)?)"
101 + "|(aero(/.*)?)"
102 + "|(aq(/.*)?)"
103 + "|(ar(/.*)?)"
104 + "|(as(/.*)?)"
105 + "|(at(/.*)?)"
106 + "|(au(/.*)?)"
107 + "|(aw(/.*)?)"
108 + "|(az(/.*)?)"
109 + "|(ba(/.*)?)"
110 + "|(bb(/.*)?)"
111 + "|(bd(/.*)?)"
112 + "|(be(/.*)?)"
113 + "|(bf(/.*)?)"
114 + "|(bg(/.*)?)"
115 + "|(bh(/.*)?)"
116 + "|(bi(/.*)?)"
117 + "|(biz(/.*)?)"
118 + "|(bj(/.*)?)"
119 + "|(bm(/.*)?)"
120 + "|(bn(/.*)?)"
121 + "|(bo(/.*)?)"
122 + "|(br(/.*)?)"
123 + "|(bs(/.*)?)"
124 + "|(bt(/.*)?)"
125 + "|(bv(/.*)?)"
126 + "|(bw(/.*)?)"
127 + "|(by(/.*)?)"
128 + "|(bz(/.*)?)"
129 + "|(ca(/.*)?)"
130 + "|(cc(/.*)?)"
131 + "|(cd(/.*)?)"
132 + "|(cf(/.*)?)"
133 + "|(cg(/.*)?)"
134 + "|(ch(/.*)?)"
135 + "|(ci(/.*)?)"
136 + "|(ck(/.*)?)"
137 + "|(cl(/.*)?)"
138 + "|(cm(/.*)?)"
139 + "|(cn(/.*)?)"
140 + "|(co(/.*)?)"
141 + "|(com(/.*)?)"
142 + "|(coop(/.*)?)"
143 + "|(cr(/.*)?)"
144 + "|(cs(/.*)?)"
145 + "|(cu(/.*)?)"
146 + "|(cv(/.*)?)"
147 + "|(cx(/.*)?)"
148 + "|(cy(/.*)?)"
149 + "|(cz(/.*)?)"
150 + "|(de(/.*)?)"
151 + "|(dj(/.*)?)"
152 + "|(dk(/.*)?)"
153 + "|(dm(/.*)?)"
154 + "|(do(/.*)?)"
155 + "|(dz(/.*)?)"
156 + "|(ec(/.*)?)"
157 + "|(edu(/.*)?)"
158 + "|(ee(/.*)?)"
159 + "|(eg(/.*)?)"
160 + "|(eh(/.*)?)"
161 + "|(er(/.*)?)"
162 + "|(es(/.*)?)"
163 + "|(et(/.*)?)"
164 + "|(fi(/.*)?)"
165 + "|(fj(/.*)?)"
166 + "|(fk(/.*)?)"
167 + "|(fm(/.*)?)"
168 + "|(fo(/.*)?)"
169 + "|(fr(/.*)?)"
170 + "|(ga(/.*)?)"
171 + "|(gd(/.*)?)"
172 + "|(ge(/.*)?)"
173 + "|(gf(/.*)?)"
174 + "|(gg(/.*)?)"
175 + "|(gh(/.*)?)"
176 + "|(gi(/.*)?)"
177 + "|(gl(/.*)?)"
178 + "|(gm(/.*)?)"
179 + "|(gn(/.*)?)"
180 + "|(gov(/.*)?)"
181 + "|(gp(/.*)?)"
182 + "|(gq(/.*)?)"
183 + "|(gr(/.*)?)"
184 + "|(gs(/.*)?)"
185 + "|(gt(/.*)?)"
186 + "|(gu(/.*)?)"
187 + "|(gw(/.*)?)"
188 + "|(gy(/.*)?)"
189 + "|(hk(/.*)?)"
190 + "|(hm(/.*)?)"
191 + "|(hn(/.*)?)"
192 + "|(hr(/.*)?)"
193 + "|(ht(/.*)?)"
194 + "|(hu(/.*)?)"
195 + "|(id(/.*)?)"
196 + "|(ie(/.*)?)"
197 + "|(il(/.*)?)"
198 + "|(im(/.*)?)"
199 + "|(in(/.*)?)"
200 + "|(info(/.*)?)"
201 + "|(int(/.*)?)"
202 + "|(io(/.*)?)"
203 + "|(iq(/.*)?)"
204 + "|(ir(/.*)?)"
205 + "|(is(/.*)?)"
206 + "|(it(/.*)?)"
207 + "|(je(/.*)?)"
208 + "|(jm(/.*)?)"
209 + "|(jo(/.*)?)"
210 + "|(jp(/.*)?)"
211 + "|(ke(/.*)?)"
212 + "|(kg(/.*)?)"
213 + "|(kh(/.*)?)"
214 + "|(ki(/.*)?)"
215 + "|(km(/.*)?)"
216 + "|(kn(/.*)?)"
217 + "|(kp(/.*)?)"
218 + "|(kr(/.*)?)"
219 + "|(kw(/.*)?)"
220 + "|(ky(/.*)?)"
221 + "|(kz(/.*)?)"
222 + "|(la(/.*)?)"
223 + "|(lb(/.*)?)"
224 + "|(lc(/.*)?)"
225 + "|(li(/.*)?)"
226 + "|(lk(/.*)?)"
227 + "|(lr(/.*)?)"
228 + "|(ls(/.*)?)"
229 + "|(lt(/.*)?)"
230 + "|(lu(/.*)?)"
231 + "|(lv(/.*)?)"
232 + "|(ly(/.*)?)"
233 + "|(ma(/.*)?)"
234 + "|(mc(/.*)?)"
235 + "|(md(/.*)?)"
236 + "|(mg(/.*)?)"
237 + "|(mh(/.*)?)"
238 + "|(mil(/.*)?)"
239 + "|(mk(/.*)?)"
240 + "|(ml(/.*)?)"
241 + "|(mm(/.*)?)"
242 + "|(mn(/.*)?)"
243 + "|(mo(/.*)?)"
244 + "|(mp(/.*)?)"
245 + "|(mq(/.*)?)"
246 + "|(mr(/.*)?)"
247 + "|(ms(/.*)?)"
248 + "|(mt(/.*)?)"
249 + "|(mu(/.*)?)"
250 + "|(museum(/.*)?)"
251 + "|(mv(/.*)?)"
252 + "|(mw(/.*)?)"
253 + "|(mx(/.*)?)"
254 + "|(my(/.*)?)"
255 + "|(mz(/.*)?)"
256 + "|(na(/.*)?)"
257 + "|(name(/.*)?)"
258 + "|(nc(/.*)?)"
259 + "|(ne(/.*)?)"
260 + "|(net(/.*)?)"
261 + "|(nf(/.*)?)"
262 + "|(ng(/.*)?)"
263 + "|(ni(/.*)?)"
264 + "|(nl(/.*)?)"
265 + "|(no(/.*)?)"
266 + "|(np(/.*)?)"
267 + "|(nr(/.*)?)"
268 + "|(nt(/.*)?)"
269 + "|(nu(/.*)?)"
270 + "|(nz(/.*)?)"
271 + "|(om(/.*)?)"
272 + "|(org(/.*)?)"
273 + "|(pa(/.*)?)"
274 + "|(pe(/.*)?)"
275 + "|(pf(/.*)?)"
276 + "|(pg(/.*)?)"
277 + "|(ph(/.*)?)"
278 + "|(pk(/.*)?)"
279 + "|(pl(/.*)?)"
280 + "|(pm(/.*)?)"
281 + "|(pn(/.*)?)"
282 + "|(pr(/.*)?)"
283 + "|(pro(/.*)?)"
284 + "|(ps(/.*)?)"
285 + "|(pt(/.*)?)"
286 + "|(pw(/.*)?)"
287 + "|(py(/.*)?)"
288 + "|(qa(/.*)?)"
289 + "|(re(/.*)?)"
290 + "|(ro(/.*)?)"
291 + "|(ru(/.*)?)"
292 + "|(rw(/.*)?)"
293 + "|(sa(/.*)?)"
294 + "|(sb(/.*)?)"
295 + "|(sc(/.*)?)"
296 + "|(sd(/.*)?)"
297 + "|(se(/.*)?)"
298 + "|(sg(/.*)?)"
299 + "|(sh(/.*)?)"
300 + "|(si(/.*)?)"
301 + "|(sj(/.*)?)"
302 + "|(sk(/.*)?)"
303 + "|(sl(/.*)?)"
304 + "|(sm(/.*)?)"
305 + "|(sn(/.*)?)"
306 + "|(so(/.*)?)"
307 + "|(sr(/.*)?)"
308 + "|(sv(/.*)?)"
309 + "|(st(/.*)?)"
310 + "|(sy(/.*)?)"
311 + "|(sz(/.*)?)"
312 + "|(tc(/.*)?)"
313 + "|(td(/.*)?)"
314 + "|(tf(/.*)?)"
315 + "|(tg(/.*)?)"
316 + "|(th(/.*)?)"
317 + "|(tj(/.*)?)"
318 + "|(tk(/.*)?)"
319 + "|(tm(/.*)?)"
320 + "|(tn(/.*)?)"
321 + "|(to(/.*)?)"
322 + "|(tp(/.*)?)"
323 + "|(tr(/.*)?)"
324 + "|(tt(/.*)?)"
325 + "|(tv(/.*)?)"
326 + "|(tw(/.*)?)"
327 + "|(tz(/.*)?)"
328 + "|(ua(/.*)?)"
329 + "|(ug(/.*)?)"
330 + "|(uk(/.*)?)"
331 + "|(um(/.*)?)"
332 + "|(us(/.*)?)"
333 + "|(uy(/.*)?)"
334 + "|(uz(/.*)?)"
335 + "|(va(/.*)?)"
336 + "|(vc(/.*)?)"
337 + "|(ve(/.*)?)"
338 + "|(vg(/.*)?)"
339 + "|(vi(/.*)?)"
340 + "|(vn(/.*)?)"
341 + "|(vu(/.*)?)"
342 + "|(wf(/.*)?)"
343 + "|(ws(/.*)?)"
344 + "|(ye(/.*)?)"
345 + "|(yt(/.*)?)"
346 + "|(yu(/.*)?)"
347 + "|(za(/.*)?)"
348 + "|(zm(/.*)?)"
349 + "|(zw(/.*)?)"
350 ;
351
352 protected long numberOfCURIsHandled = 0;
353 protected long numberOfLinksExtracted= 0;
354
355 /***
356 * Constructor
357 * @param name The name of the module.
358 */
359 public ExtractorUniversal(String name) {
360 super(name, "Link extraction on unknown file types. A best effort" +
361 " extractor that looks at the raw byte code of any file " +
362 "that has not been handled by another extractor and tries" +
363 " to find URIs. Will only match absolute URIs.");
364 Type e;
365 e = addElementToDefinition(new SimpleType(ATTR_MAX_DEPTH_BYTES,
366 "How deep to look into files for URI strings, in bytes",
367 new Long(DEFAULT_MAX_DEPTH_BYTES)));
368 e.setExpertSetting(true);
369 e = addElementToDefinition(new SimpleType(ATTR_MAX_URL_LENGTH,
370 "Max length of URIs in bytes", new Long(DEFAULT_MAX_URL_LENGTH)));
371 e.setExpertSetting(true);
372 }
373
374 protected void extract(CrawlURI curi) {
375 if (!isHttpTransactionContentToProcess(curi)) {
376 return;
377 }
378
379 numberOfCURIsHandled++;
380
381 InputStream instream = null;
382 try {
383 instream = curi.getHttpRecorder().getRecordedInput().
384 getContentReplayInputStream();
385 int ch = instream.read();
386 StringBuffer lookat = new StringBuffer();
387 long counter = 0;
388 long maxdepth = ((Long)getAttribute(ATTR_MAX_DEPTH_BYTES,curi)).
389 longValue();
390 if(maxdepth<=0){
391 maxdepth = Long.MAX_VALUE;
392 }
393 long maxURLLength = ((Long)getAttribute(ATTR_MAX_URL_LENGTH,curi)).
394 longValue();
395 boolean foundDot = false;
396 while(ch != -1 && ++counter <= maxdepth) {
397 if(lookat.length()>maxURLLength){
398
399 lookat = new StringBuffer();
400 foundDot = false;
401 }
402 else if(isURLableChar(ch)){
403
404 if(ch == 46){
405
406 foundDot = true;
407 }
408 lookat.append((char)ch);
409 } else if(lookat.length() > 3 && foundDot) {
410
411
412
413 String newURL = lookat.toString();
414 if(looksLikeAnURL(newURL))
415 {
416
417
418
419
420 if(newURL.toLowerCase().indexOf("http") > 0){
421
422 newURL = newURL.substring(newURL.toLowerCase().
423 indexOf("http"));
424 }
425 while(newURL.substring(newURL.length()-1).equals("."))
426 {
427
428 newURL = newURL.substring(0,newURL.length()-1);
429 }
430
431
432 numberOfLinksExtracted++;
433 curi.createAndAddLink(newURL,Link.SPECULATIVE_MISC,Link.SPECULATIVE_HOP);
434 }
435
436 lookat = new StringBuffer();
437 foundDot = false;
438 } else if(lookat.length()>0) {
439
440 lookat = new StringBuffer();
441 foundDot = false;
442 }
443 ch = instream.read();
444 }
445 } catch(IOException e){
446
447 e.printStackTrace();
448 } catch (AttributeNotFoundException e) {
449
450 e.printStackTrace();
451 } finally {
452 IOUtils.closeQuietly(instream);
453 }
454
455 curi.linkExtractorFinished();
456 }
457
458 /***
459 * This method takes a look at a string and determines if it could be a URL.
460 * To qualify the string must either begin with "http://" (https would also
461 * work) followed by something that looks like an IP address or contain
462 * within the string (possible at the end but not at the beginning) a TLD
463 * (Top Level Domain) preceded by a dot.
464 *
465 * @param lookat The string to examine in an effort to determine if it
466 * could be a URL
467 * @return True if the string matches the above criteria for a URL.
468 */
469 private boolean looksLikeAnURL(String lookat) {
470 if(lookat.indexOf("http://")==0 || lookat.indexOf("https://")==0){
471
472
473 Matcher ip = TextUtils.getMatcher(IP_ADDRESS, lookat);
474 boolean testVal = ip.matches();
475 TextUtils.recycleMatcher(ip);
476 if(testVal){
477 return true;
478 }
479 }
480
481 int dot = lookat.indexOf(".");
482 if(dot!=0){
483 while(dot != -1 && dot < lookat.length()){
484 lookat = lookat.substring(dot+1);
485 if (isTLD(lookat.substring(0, lookat.length() <= 6?
486 lookat.length(): 6)))
487 {
488 return true;
489 }
490 dot = lookat.indexOf(".");
491 }
492 }
493
494 return false;
495 }
496
497 /***
498 * Checks if a string is equal to known Top Level Domain. The string may
499 * contain additional characters <i>after</i> the TLD but not before.
500 * @param potentialTLD The string (usually 2-6 chars) to check if it starts
501 * with a TLD.
502 * @return True if the given string starts with the name of a known TLD
503 *
504 * @see #TLDs
505 */
506 private boolean isTLD(String potentialTLD) {
507 if(potentialTLD.length()<2){
508 return false;
509 }
510
511 potentialTLD.toLowerCase();
512 Matcher uri = TextUtils.getMatcher(TLDs, potentialTLD);
513 boolean ret = uri.matches();
514 TextUtils.recycleMatcher(uri);
515 return ret;
516 }
517
518 /***
519 * Determines if a char (as represented by an int in the range of 0-255) is
520 * a character (in the Ansi character set) that can be present in a URL.
521 * This method takes a <b>strict</b> approach to what characters can be in
522 * a URL.
523 * <p>
524 * The following are considered to be 'URLable'<br>
525 * <ul>
526 * <li> <code># $ % & + , - . /</code> values 35-38,43-47
527 * <li> <code>[0-9]</code> values 48-57
528 * <li> <code>: ; = ? @</code> value 58-59,61,63-64
529 * <li> <code>[A-Z]</code> values 65-90
530 * <li> <code>_</code> value 95
531 * <li> <code>[a-z]</code> values 97-122
532 * <li> <code>~</code> value 126
533 * </ul>
534 * <p>
535 * To summerize, the following ranges are considered URLable:<br>
536 * 35-38,43-59,61,63-90,95,97-122,126
537 *
538 * @param ch The character (represented by an int) to test.
539 * @return True if it is a URLable character, false otherwise.
540 */
541 private boolean isURLableChar(int ch) {
542 return (ch>=35 && ch<=38)
543 || (ch>=43 && ch<=59)
544 || (ch==61)
545 || (ch>=63 && ch<=90)
546 || (ch==95)
547 || (ch>=97 && ch<=122)
548 || (ch==126);
549 }
550
551
552
553
554 public String report() {
555 StringBuffer ret = new StringBuffer();
556 ret.append("Processor: org.archive.crawler.extractor." +
557 "ExtractorUniversal\n");
558 ret.append(" Function: Link extraction on unknown file" +
559 " types.\n");
560 ret.append(" CrawlURIs handled: " + numberOfCURIsHandled + "\n");
561 ret.append(" Links extracted: " + numberOfLinksExtracted + "\n\n");
562
563 return ret.toString();
564 }
565 }