ExtractorUniversal xref

View Javadoc

1   /* Copyright (C) 2003 Internet Archive.
2    *
3    * This file is part of the Heritrix web crawler (crawler.archive.org).
4    *
5    * Heritrix is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU Lesser Public License as published by
7    * the Free Software Foundation; either version 2.1 of the License, or
8    * any later version.
9    *
10   * Heritrix is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU Lesser Public License for more details.
14   *
15   * You should have received a copy of the GNU Lesser Public License
16   * along with Heritrix; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   *
19   * Created on Jan 15, 2004
20   *
21   */
22  package org.archive.crawler.extractor;
23  
import java.io.IOException;
import java.io.InputStream;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;

import javax.management.AttributeNotFoundException;

import org.apache.commons.io.IOUtils;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
import org.archive.net.UURI;
import org.archive.util.TextUtils;
37  
/**
 * A last ditch extractor that will look at the raw bytes of a document and
 * try to extract anything that <i>looks</i> like a link.
 *
 * If used, it should always be specified as the last link extractor in the
 * order file.
 * <p>
 * To accomplish this it will scan through the bytes and try to build up
 * strings of consecutive bytes that all represent characters that are valid
 * in a URL (see #isURLableChar(int) for details).
 * Once it hits the end of such a string (i.e. finds a character that
 * should not be in a URL) it will try to determine if it has found a URL.
 * This is done by seeing if the string is an IP address prefixed with
 * http(s):// or contains a dot followed by a Top Level Domain and end of
 * string or a slash.
 *
 * @author Kristinn Sigurdsson
 */
56  public class ExtractorUniversal extends Extractor
57  implements CoreAttributeConstants {
58  
59      private static final long serialVersionUID = -7593380118857156939L;
60  
61  //    private static final Logger logger =
62  //        Logger.getLogger(ExtractorUniversal.class.getName());
63      
64      private static String ATTR_MAX_DEPTH_BYTES = "max-depth-bytes";
65  
66      /*** Default value for how far into an unknown document we should scan
67       * - 10k. A value of 0 or lower will disable this.
68       */
69      private static long DEFAULT_MAX_DEPTH_BYTES = 10240;
70  
71      private static String ATTR_MAX_URL_LENGTH = "max-url-length";
72  
73      /*** Maximum length for a URI that we try to match.*/
74      private static long DEFAULT_MAX_URL_LENGTH = UURI.MAX_URL_LENGTH;
75  
76      /***
77       * Matches any string that begins with http:// or https:// followed by
78       * something that looks like an ip address (four numbers, none longer then
79       * 3 chars seperated by 3 dots). Does <b>not</b> ensure that the numbers are
80       * each in the range 0-255.
81       */
82      static final String IP_ADDRESS =
83          "((http://)|(https://))(//d(//d)?(//d)?//.//d(//d)?(//d)?//.//d(//d)?(//d)?//.//d(//d)?(//d)?)";
84  
85      /***
86       * Matches any string that begins with a TLD (no .) followed by a '/' slash
87       * or end of string. If followed by slash then nothing after the slash is
88       * of consequence.
89       */
90      public static final String TLDs =
91            "(ac(/.*)?)"  // ac  Ascension Island
92          + "|(ad(/.*)?)" // ad  Andorra
93          + "|(ae(/.*)?)" // ae  United Arab Emirates
94          + "|(af(/.*)?)" // af  Afghanistan
95          + "|(ag(/.*)?)" // ag  Antigua and Barbuda
96          + "|(ai(/.*)?)" // ai  Anguilla
97          + "|(al(/.*)?)" // al  Albania
98          + "|(am(/.*)?)" // am  Armenia
99          + "|(an(/.*)?)" // an  Netherlands Antilles
100         + "|(ao(/.*)?)" // ao  Angola
101         + "|(aero(/.*)?)" // aero Air-transport industry
102         + "|(aq(/.*)?)" // aq  Antarctica
103         + "|(ar(/.*)?)" // ar  Argentina
104         + "|(as(/.*)?)" // as  American Samoa
105         + "|(at(/.*)?)" // at  Austria
106         + "|(au(/.*)?)" // au  Australia
107         + "|(aw(/.*)?)" // aw  Aruba
108         + "|(az(/.*)?)" // az  Azerbaijan
109         + "|(ba(/.*)?)" // ba  Bosnia Hercegovina
110         + "|(bb(/.*)?)" // bb  Barbados
111         + "|(bd(/.*)?)" // bd  Bangladesh
112         + "|(be(/.*)?)" // be  Belgium
113         + "|(bf(/.*)?)" // bf  Burkina Faso
114         + "|(bg(/.*)?)" // bg  Bulgaria
115         + "|(bh(/.*)?)" // bh  Bahrain
116         + "|(bi(/.*)?)" // bi  Burundi
117         + "|(biz(/.*)?)" // biz Businesses
118         + "|(bj(/.*)?)" // bj  Benin
119         + "|(bm(/.*)?)" // bm  Bermuda
120         + "|(bn(/.*)?)" // bn  Brunei Darussalam
121         + "|(bo(/.*)?)" // bo  Bolivia
122         + "|(br(/.*)?)" // br  Brazil
123         + "|(bs(/.*)?)" // bs  Bahamas
124         + "|(bt(/.*)?)" // bt  Bhutan
125         + "|(bv(/.*)?)" // bv  Bouvet Island
126         + "|(bw(/.*)?)" // bw  Botswana
127         + "|(by(/.*)?)" // by  Belarus (Byelorussia)
128         + "|(bz(/.*)?)" // bz  Belize
129         + "|(ca(/.*)?)" // ca  Canada
130         + "|(cc(/.*)?)" // cc  Cocos Islands (Keeling)
131         + "|(cd(/.*)?)" // cd  Congo, Democratic Republic of the
132         + "|(cf(/.*)?)" // cf  Central African Republic
133         + "|(cg(/.*)?)" // cg  Congo, Republic of
134         + "|(ch(/.*)?)" // ch  Switzerland
135         + "|(ci(/.*)?)" // ci  Cote d'Ivoire (Ivory Coast)
136         + "|(ck(/.*)?)" // ck  Cook Islands
137         + "|(cl(/.*)?)" // cl  Chile
138         + "|(cm(/.*)?)" // cm  Cameroon
139         + "|(cn(/.*)?)" // cn  China
140         + "|(co(/.*)?)" // co  Colombia
141         + "|(com(/.*)?)" // com Commercial
142         + "|(coop(/.*)?)" // coop Cooperatives
143         + "|(cr(/.*)?)" // cr  Costa Rica
144         + "|(cs(/.*)?)" // cs  Czechoslovakia
145         + "|(cu(/.*)?)" // cu  Cuba
146         + "|(cv(/.*)?)" // cv  Cap Verde
147         + "|(cx(/.*)?)" // cx  Christmas Island
148         + "|(cy(/.*)?)" // cy  Cyprus
149         + "|(cz(/.*)?)" // cz  Czech Republic
150         + "|(de(/.*)?)" // de  Germany
151         + "|(dj(/.*)?)" // dj  Djibouti
152         + "|(dk(/.*)?)" // dk  Denmark
153         + "|(dm(/.*)?)" // dm  Dominica
154         + "|(do(/.*)?)" // do  Dominican Republic
155         + "|(dz(/.*)?)" // dz  Algeria
156         + "|(ec(/.*)?)" // ec  Ecuador
157         + "|(edu(/.*)?)" // edu Educational Institution
158         + "|(ee(/.*)?)" // ee  Estonia
159         + "|(eg(/.*)?)" // eg  Egypt
160         + "|(eh(/.*)?)" // eh  Western Sahara
161         + "|(er(/.*)?)" // er  Eritrea
162         + "|(es(/.*)?)" // es  Spain
163         + "|(et(/.*)?)" // et  Ethiopia
164         + "|(fi(/.*)?)" // fi  Finland
165         + "|(fj(/.*)?)" // fj  Fiji
166         + "|(fk(/.*)?)" // fk  Falkland Islands
167         + "|(fm(/.*)?)" // fm  Micronesia, Federal State of
168         + "|(fo(/.*)?)" // fo  Faroe Islands
169         + "|(fr(/.*)?)" // fr  France
170         + "|(ga(/.*)?)" // ga  Gabon
171         + "|(gd(/.*)?)" // gd  Grenada
172         + "|(ge(/.*)?)" // ge  Georgia
173         + "|(gf(/.*)?)" // gf  French Guiana
174         + "|(gg(/.*)?)" // gg  Guernsey
175         + "|(gh(/.*)?)" // gh  Ghana
176         + "|(gi(/.*)?)" // gi  Gibraltar
177         + "|(gl(/.*)?)" // gl  Greenland
178         + "|(gm(/.*)?)" // gm  Gambia
179         + "|(gn(/.*)?)" // gn  Guinea
180         + "|(gov(/.*)?)" // gov Government (US)
181         + "|(gp(/.*)?)" // gp  Guadeloupe
182         + "|(gq(/.*)?)" // gq  Equatorial Guinea
183         + "|(gr(/.*)?)" // gr  Greece
184         + "|(gs(/.*)?)" // gs  South Georgia and the South Sandwich Islands
185         + "|(gt(/.*)?)" // gt  Guatemala
186         + "|(gu(/.*)?)" // gu  Guam
187         + "|(gw(/.*)?)" // gw  Guinea-Bissau
188         + "|(gy(/.*)?)" // gy  Guyana
189         + "|(hk(/.*)?)" // hk  Hong Kong
190         + "|(hm(/.*)?)" // hm  Heard and McDonald Islands
191         + "|(hn(/.*)?)" // hn  Honduras
192         + "|(hr(/.*)?)" // hr  Croatia/Hrvatska
193         + "|(ht(/.*)?)" // ht  Haiti
194         + "|(hu(/.*)?)" // hu  Hungary
195         + "|(id(/.*)?)" // id  Indonesia
196         + "|(ie(/.*)?)" // ie  Ireland
197         + "|(il(/.*)?)" // il  Israel
198         + "|(im(/.*)?)" // im  Isle of Man
199         + "|(in(/.*)?)" // in  India
200         + "|(info(/.*)?)" // info
201         + "|(int(/.*)?)" // int Int. Organizations
202         + "|(io(/.*)?)" // io  British Indian Ocean Territory
203         + "|(iq(/.*)?)" // iq  Iraq
204         + "|(ir(/.*)?)" // ir  Iran, Islamic Republic of
205         + "|(is(/.*)?)" // is  Iceland
206         + "|(it(/.*)?)" // it  Italy
207         + "|(je(/.*)?)" // je  Jersey
208         + "|(jm(/.*)?)" // jm  Jamaica
209         + "|(jo(/.*)?)" // jo  Jordan
210         + "|(jp(/.*)?)" // jp  Japan
211         + "|(ke(/.*)?)" // ke  Kenya
212         + "|(kg(/.*)?)" // kg  Kyrgyzstan
213         + "|(kh(/.*)?)" // kh  Cambodia
214         + "|(ki(/.*)?)" // ki  Kiribati
215         + "|(km(/.*)?)" // km  Comoros
216         + "|(kn(/.*)?)" // kn  Saint Kitts and Nevis
217         + "|(kp(/.*)?)" // kp  Korea, Democratic People's Republic
218         + "|(kr(/.*)?)" // kr  Korea, Republic of
219         + "|(kw(/.*)?)" // kw  Kuwait
220         + "|(ky(/.*)?)" // ky  Cayman Islands
221         + "|(kz(/.*)?)" // kz  Kazakhstan
222         + "|(la(/.*)?)" // la  Lao People's Democratic Republic
223         + "|(lb(/.*)?)" // lb  Lebanon
224         + "|(lc(/.*)?)" // lc  Saint Lucia
225         + "|(li(/.*)?)" // li  Liechtenstein
226         + "|(lk(/.*)?)" // lk  Sri Lanka
227         + "|(lr(/.*)?)" // lr  Liberia
228         + "|(ls(/.*)?)" // ls  Lesotho
229         + "|(lt(/.*)?)" // lt  Lithuania
230         + "|(lu(/.*)?)" // lu  Luxembourg
231         + "|(lv(/.*)?)" // lv  Latvia
232         + "|(ly(/.*)?)" // ly  Libyan Arab Jamahiriya
233         + "|(ma(/.*)?)" // ma  Morocco
234         + "|(mc(/.*)?)" // mc  Monaco
235         + "|(md(/.*)?)" // md  Moldova, Republic of
236         + "|(mg(/.*)?)" // mg  Madagascar
237         + "|(mh(/.*)?)" // mh  Marshall Islands
238         + "|(mil(/.*)?)" // mil Military (US Dept of Defense)
239         + "|(mk(/.*)?)" // mk  Macedonia, Former Yugoslav Republic
240         + "|(ml(/.*)?)" // ml  Mali
241         + "|(mm(/.*)?)" // mm  Myanmar
242         + "|(mn(/.*)?)" // mn  Mongolia
243         + "|(mo(/.*)?)" // mo  Macau
244         + "|(mp(/.*)?)" // mp  Northern Mariana Islands
245         + "|(mq(/.*)?)" // mq  Martinique
246         + "|(mr(/.*)?)" // mr  Mauritani
247         + "|(ms(/.*)?)" // ms  Montserrat
248         + "|(mt(/.*)?)" // mt  Malta
249         + "|(mu(/.*)?)" // mu  Mauritius
250         + "|(museum(/.*)?)" // museum Museums
251         + "|(mv(/.*)?)" // mv  Maldives
252         + "|(mw(/.*)?)" // mw  Malawi
253         + "|(mx(/.*)?)" // mx  Mexico
254         + "|(my(/.*)?)" // my  Malaysia
255         + "|(mz(/.*)?)" // mz  Mozambique
256         + "|(na(/.*)?)" // na  Namibia
257         + "|(name(/.*)?)" // name Individuals
258         + "|(nc(/.*)?)" // nc  New Caledonia
259         + "|(ne(/.*)?)" // ne  Niger
260         + "|(net(/.*)?)" // net networks
261         + "|(nf(/.*)?)" // nf  Norfolk Island
262         + "|(ng(/.*)?)" // ng  Nigeria
263         + "|(ni(/.*)?)" // ni  Nicaragua
264         + "|(nl(/.*)?)" // nl  Netherlands
265         + "|(no(/.*)?)" // no  Norway
266         + "|(np(/.*)?)" // np  Nepal
267         + "|(nr(/.*)?)" // nr  Nauru
268         + "|(nt(/.*)?)" // nt  Neutral Zone
269         + "|(nu(/.*)?)" // nu  Niue
270         + "|(nz(/.*)?)" // nz  New Zealand
271         + "|(om(/.*)?)" // om  Oman
272         + "|(org(/.*)?)" // org Organization (non-profit)
273         + "|(pa(/.*)?)" // pa  Panama
274         + "|(pe(/.*)?)" // pe  Peru
275         + "|(pf(/.*)?)" // pf  French Polynesia
276         + "|(pg(/.*)?)" // pg  Papua New Guinea
277         + "|(ph(/.*)?)" // ph  Philippines
278         + "|(pk(/.*)?)" // pk  Pakistan
279         + "|(pl(/.*)?)" // pl  Poland
280         + "|(pm(/.*)?)" // pm  St. Pierre and Miquelon
281         + "|(pn(/.*)?)" // pn  Pitcairn Island
282         + "|(pr(/.*)?)" // pr  Puerto Rico
283         + "|(pro(/.*)?)" // pro Accountants, lawyers, and physicians
284         + "|(ps(/.*)?)" // ps  Palestinian Territories
285         + "|(pt(/.*)?)" // pt  Portugal
286         + "|(pw(/.*)?)" // pw  Palau
287         + "|(py(/.*)?)" // py  Paraguay
288         + "|(qa(/.*)?)" // qa  Qatar
289         + "|(re(/.*)?)" // re  Reunion Island
290         + "|(ro(/.*)?)" // ro  Romania
291         + "|(ru(/.*)?)" // ru  Russian Federation
292         + "|(rw(/.*)?)" // rw  Rwanda
293         + "|(sa(/.*)?)" // sa  Saudi Arabia
294         + "|(sb(/.*)?)" // sb  Solomon Islands
295         + "|(sc(/.*)?)" // sc  Seychelles
296         + "|(sd(/.*)?)" // sd  Sudan
297         + "|(se(/.*)?)" // se  Sweden
298         + "|(sg(/.*)?)" // sg  Singapore
299         + "|(sh(/.*)?)" // sh  St. Helena
300         + "|(si(/.*)?)" // si  Slovenia
301         + "|(sj(/.*)?)" // sj  Svalbard and Jan Mayen Islands
302         + "|(sk(/.*)?)" // sk  Slovak Republic
303         + "|(sl(/.*)?)" // sl  Sierra Leone
304         + "|(sm(/.*)?)" // sm  San Marino
305         + "|(sn(/.*)?)" // sn  Senegal
306         + "|(so(/.*)?)" // so  Somalia
307         + "|(sr(/.*)?)" // sr  Suriname
308         + "|(sv(/.*)?)" // sv  El Salvador
309         + "|(st(/.*)?)" // st  Sao Tome and Principe
310         + "|(sy(/.*)?)" // sy  Syrian Arab Republic
311         + "|(sz(/.*)?)" // sz  Swaziland
312         + "|(tc(/.*)?)" // tc  Turks and Caicos Islands
313         + "|(td(/.*)?)" // td  Chad
314         + "|(tf(/.*)?)" // tf  French Southern Territories
315         + "|(tg(/.*)?)" // tg  Togo
316         + "|(th(/.*)?)" // th  Thailand
317         + "|(tj(/.*)?)" // tj  Tajikistan
318         + "|(tk(/.*)?)" // tk  Tokelau
319         + "|(tm(/.*)?)" // tm  Turkmenistan
320         + "|(tn(/.*)?)" // tn  Tunisia
321         + "|(to(/.*)?)" // to  Tonga
322         + "|(tp(/.*)?)" // tp  East Timor
323         + "|(tr(/.*)?)" // tr  Turkey
324         + "|(tt(/.*)?)" // tt  Trinidad and Tobago
325         + "|(tv(/.*)?)" // tv  Tuvalu
326         + "|(tw(/.*)?)" // tw  Taiwan
327         + "|(tz(/.*)?)" // tz  Tanzania
328         + "|(ua(/.*)?)" // ua  Ukraine
329         + "|(ug(/.*)?)" // ug  Uganda
330         + "|(uk(/.*)?)" // uk  United Kingdom
331         + "|(um(/.*)?)" // um  US Minor Outlying Islands
332         + "|(us(/.*)?)" // us  United States
333         + "|(uy(/.*)?)" // uy  Uruguay
334         + "|(uz(/.*)?)" // uz  Uzbekistan
335         + "|(va(/.*)?)" // va  Holy See (City Vatican State)
336         + "|(vc(/.*)?)" // vc  Saint Vincent and the Grenadines
337         + "|(ve(/.*)?)" // ve  Venezuela
338         + "|(vg(/.*)?)" // vg  Virgin Islands (British)
339         + "|(vi(/.*)?)" // vi  Virgin Islands (USA)
340         + "|(vn(/.*)?)" // vn  Vietnam
341         + "|(vu(/.*)?)" // vu  Vanuatu
342         + "|(wf(/.*)?)" // wf  Wallis and Futuna Islands
343         + "|(ws(/.*)?)" // ws  Western Samoa
344         + "|(ye(/.*)?)" // ye  Yemen
345         + "|(yt(/.*)?)" // yt  Mayotte
346         + "|(yu(/.*)?)" // yu  Yugoslavia
347         + "|(za(/.*)?)" // za  South Africa
348         + "|(zm(/.*)?)" // zm  Zambia
349         + "|(zw(/.*)?)" // zw  Zimbabwe
350         ;
351 
352     protected long numberOfCURIsHandled = 0;
353     protected long numberOfLinksExtracted= 0;
354 
355     /***
356      * Constructor
357      * @param name The name of the module.
358      */
359     public ExtractorUniversal(String name) {
360         super(name, "Link extraction on unknown file types. A best effort" +
361                 " extractor that looks at the raw byte code of any file " +
362                 "that has not been handled by another extractor and tries" +
363                 " to find URIs. Will only match absolute URIs.");
364         Type e;
365         e = addElementToDefinition(new SimpleType(ATTR_MAX_DEPTH_BYTES,
366             "How deep to look into files for URI strings, in bytes",
367             new Long(DEFAULT_MAX_DEPTH_BYTES)));
368         e.setExpertSetting(true);
369         e = addElementToDefinition(new SimpleType(ATTR_MAX_URL_LENGTH,
370             "Max length of URIs in bytes", new Long(DEFAULT_MAX_URL_LENGTH)));
371         e.setExpertSetting(true);
372     }
373 
374     protected void extract(CrawlURI curi) {
375         if (!isHttpTransactionContentToProcess(curi)) {
376             return;
377         }
378 
379         numberOfCURIsHandled++;
380 
381         InputStream instream = null;
382         try {
383             instream = curi.getHttpRecorder().getRecordedInput().
384                 getContentReplayInputStream();
385             int ch = instream.read();
386             StringBuffer lookat = new StringBuffer();
387             long counter = 0;
388             long maxdepth = ((Long)getAttribute(ATTR_MAX_DEPTH_BYTES,curi)).
389                 longValue();
390             if(maxdepth<=0){
391                 maxdepth = Long.MAX_VALUE;
392             }
393             long maxURLLength = ((Long)getAttribute(ATTR_MAX_URL_LENGTH,curi)).
394                 longValue();
395             boolean foundDot = false;
396             while(ch != -1 && ++counter <= maxdepth) {
397                 if(lookat.length()>maxURLLength){
398                     //Exceeded maximum length of a URL. Start fresh.
399                     lookat = new StringBuffer();
400                     foundDot = false;
401                 }
402                 else if(isURLableChar(ch)){
403                     //Add to buffer.
404                     if(ch == 46){
405                         // Current character is a dot '.'
406                         foundDot = true;
407                     }
408                     lookat.append((char)ch);
409                 } else if(lookat.length() > 3 && foundDot) {
410                     // It takes a bare mininum of 4 characters to form a URL
411                     // Since we have at least that many let's try link
412                     // extraction.
413                     String newURL = lookat.toString();
414                     if(looksLikeAnURL(newURL))
415                     {
416                         // Looks like we found something.
417 
418                         // Let's start with a little cleanup as we may have
419                         // junk in front or at the end.
420                         if(newURL.toLowerCase().indexOf("http") > 0){
421                             // Got garbage in front of the protocol. Remove.
422                             newURL = newURL.substring(newURL.toLowerCase().
423                                 indexOf("http"));
424                         }
425                         while(newURL.substring(newURL.length()-1).equals("."))
426                         {
427                             // URLs can't end with a dot. Strip it off.
428                             newURL = newURL.substring(0,newURL.length()-1);
429                         }
430 
431                         // And add the URL to speculative embeds.
432                         numberOfLinksExtracted++;
433                         curi.createAndAddLink(newURL,Link.SPECULATIVE_MISC,Link.SPECULATIVE_HOP);
434                     }
435                     // Reset lookat for next string.
436                     lookat = new StringBuffer();
437                     foundDot = false;
438                 } else if(lookat.length()>0) {
439                     // Didn't get enough chars. Reset lookat for next string.
440                     lookat = new StringBuffer();
441                     foundDot = false;
442                 }
443                 ch = instream.read();
444             }
445         } catch(IOException e){
446             //TODO: Handle this exception.
447             e.printStackTrace();
448         } catch (AttributeNotFoundException e) {
449             // TODO Auto-generated catch block
450             e.printStackTrace();
451         } finally {
452             IOUtils.closeQuietly(instream);
453         }
454         // Set flag to indicate that link extraction is completed.
455         curi.linkExtractorFinished();
456     }
457 
458     /***
459      * This method takes a look at a string and determines if it could be a URL.
460      * To qualify the string must either begin with "http://" (https would also
461      * work) followed by something that looks like an IP address or contain
462      * within the string (possible at the end but not at the beginning) a TLD
463      * (Top Level Domain) preceded by a dot.
464      *
465      * @param lookat The string to examine in an effort to determine if it
466      * could be a URL
467      * @return True if the string matches the above criteria for a URL.
468      */
469     private boolean looksLikeAnURL(String lookat) {
470         if(lookat.indexOf("http://")==0 || lookat.indexOf("https://")==0){
471             //Check if the rest of the string looks like an IP address.
472             //if so return true. Otherwise continue on.
473             Matcher ip = TextUtils.getMatcher(IP_ADDRESS, lookat);
474             boolean testVal = ip.matches();
475             TextUtils.recycleMatcher(ip);
476             if(testVal){
477                 return true;
478             }
479         }
480 
481         int dot = lookat.indexOf(".");
482         if(dot!=0){//An URL can't start with a .tld.
483             while(dot != -1 && dot < lookat.length()){
484                 lookat = lookat.substring(dot+1);
485                 if (isTLD(lookat.substring(0, lookat.length() <= 6?
486                     lookat.length(): 6)))
487                 {
488                     return true;
489                 }
490                 dot = lookat.indexOf(".");
491             }
492         }
493 
494         return false;
495     }
496 
497     /***
498      * Checks if a string is equal to known Top Level Domain. The string may
499      * contain additional characters <i>after</i> the TLD but not before.
500      * @param potentialTLD The string (usually 2-6 chars) to check if it starts
501      * with a TLD.
502      * @return True if the given string starts with the name of a known TLD
503      *
504      * @see #TLDs
505      */
506     private boolean isTLD(String potentialTLD) {
507         if(potentialTLD.length()<2){
508             return false;
509         }
510 
511         potentialTLD.toLowerCase();
512         Matcher uri = TextUtils.getMatcher(TLDs, potentialTLD);
513         boolean ret = uri.matches();
514         TextUtils.recycleMatcher(uri);
515         return ret;
516     }
517 
518     /***
519      * Determines if a char (as represented by an int in the range of 0-255) is
520      * a character (in the Ansi character set) that can be present in a URL.
521      * This method takes a <b>strict</b> approach to what characters can be in
522      * a URL.
523      * <p>
524      * The following are considered to be 'URLable'<br>
525      * <ul>
526      *  <li> <code># $ % & + , - . /</code> values 35-38,43-47
527      *  <li> <code>[0-9]</code> values 48-57
528      *  <li> <code>: ; = ? @</code> value 58-59,61,63-64
529      *  <li> <code>[A-Z]</code> values 65-90
530      *  <li> <code>_</code> value 95
531      *  <li> <code>[a-z]</code> values 97-122
532      *  <li> <code>~</code> value 126
533      * </ul>
534      * <p>
535      * To summerize, the following ranges are considered URLable:<br>
536      * 35-38,43-59,61,63-90,95,97-122,126
537      *
538      * @param ch The character (represented by an int) to test.
539      * @return True if it is a URLable character, false otherwise.
540      */
541     private boolean isURLableChar(int ch) {
542         return (ch>=35 && ch<=38)
543             || (ch>=43 && ch<=59)
544             || (ch==61)
545             || (ch>=63 && ch<=90)
546             || (ch==95)
547             || (ch>=97 && ch<=122)
548             || (ch==126);
549     }
550 
551     /* (non-Javadoc)
552      * @see org.archive.crawler.framework.Processor#report()
553      */
554     public String report() {
555         StringBuffer ret = new StringBuffer();
556         ret.append("Processor: org.archive.crawler.extractor." +
557             "ExtractorUniversal\n");
558         ret.append("  Function:          Link extraction on unknown file" +
559             " types.\n");
560         ret.append("  CrawlURIs handled: " + numberOfCURIsHandled + "\n");
561         ret.append("  Links extracted:   " + numberOfLinksExtracted + "\n\n");
562 
563         return ret.toString();
564     }
565 }