View Javadoc

1   /*
2    * ExtractorURI
3    *
4    * $Id: ExtractorURI.java 4671 2006-09-26 23:47:15Z paul_jack $
5    *
6    * Created on July 20, 2006
7    *
8    * Copyright (C) 2006 Internet Archive.
9    *
10   * This file is part of the Heritrix web crawler (crawler.archive.org).
11   *
12   * Heritrix is free software; you can redistribute it and/or modify
13   * it under the terms of the GNU Lesser Public License as published by
14   * the Free Software Foundation; either version 2.1 of the License, or
15   * any later version.
16   *
17   * Heritrix is distributed in the hope that it will be useful,
18   * but WITHOUT ANY WARRANTY; without even the implied warranty of
19   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20   * GNU Lesser Public License for more details.
21   *
22   * You should have received a copy of the GNU Lesser Public License
23   * along with Heritrix; if not, write to the Free Software
24   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
25   */
26  
27  package org.archive.crawler.extractor;
28  
29  import java.util.ArrayList;
30  import java.util.Collection;
31  import java.util.List;
32  import java.util.logging.Level;
33  import java.util.logging.Logger;
34  import java.util.regex.Matcher;
35  
36  import org.apache.commons.codec.DecoderException;
37  import org.apache.commons.httpclient.URIException;
38  import org.archive.crawler.datamodel.CoreAttributeConstants;
39  import org.archive.crawler.datamodel.CrawlURI;
40  import org.archive.net.LaxURLCodec;
41  import org.archive.net.UURI;
42  import org.archive.util.TextUtils;
43  
44  /***
45   * An extractor for finding URIs inside other URIs. Unlike most other
46   * extractors, this works on URIs discovered by previous extractors. Thus 
47   * it should appear near the end of any set of extractors.
48   *
49   * Initially, only finds absolute HTTP(S) URIs in query-string or its 
50   * parameters.
51   *
52   * TODO: extend to find URIs in path-info
53   *
54   * @author Gordon Mohr
55   *
56   **/
57  
58  public class ExtractorURI extends Extractor implements CoreAttributeConstants {
59  
60      private static final long serialVersionUID = -6273897743240970822L;
61  
62      private static Logger LOGGER =
63          Logger.getLogger(ExtractorURI.class.getName());
64  
65      static final String ABS_HTTP_URI_PATTERN = "^https?://[^//s<>]*$";
66      
67      // FIXME: these counters are not incremented atomically; totals may not
68      // be correct
69      private long numberOfCURIsHandled = 0;
70      private long numberOfLinksExtracted = 0;
71  
72      /***
73       * Constructor
74       * 
75       * @param name
76       */
77      public ExtractorURI(String name) {
78          super(name, "URI Extractor. Extracts links inside other " +
79                  "discovered URIs. Should appear last among extractors.");
80      }
81  
82      /***
83       * Perform usual extraction on a CrawlURI
84       * 
85       * @param curi Crawl URI to process.
86       */
87      public void extract(CrawlURI curi) {
88  
89          this.numberOfCURIsHandled++;
90          // use array copy because discoveriess will add to outlinks
91          Collection<Link> links = curi.getOutLinks();
92          Link[] sourceLinks = links.toArray(new Link[links.size()]);
93          for (Link wref: sourceLinks) {
94              extractLink(curi,wref);
95          }
96      }
97  
98      /***
99       * Consider a single Link for internal URIs
100      * 
101      * @param curi CrawlURI to add discoveries to 
102      * @param wref Link to examine for internal URIs
103      */
104     protected void extractLink(CrawlURI curi, Link wref) {
105         UURI source = UURI.from(wref.getDestination());
106         if(source == null) {
107             // shouldn't happen
108             return; 
109         }
110         List<String> found = extractQueryStringLinks(source);
111         for (String uri : found) {
112             try {
113                 curi.createAndAddLink(
114                         uri, 
115                         Link.SPECULATIVE_MISC,
116                         Link.SPECULATIVE_HOP);
117                 numberOfLinksExtracted++;
118             } catch (URIException e) {
119                 LOGGER.log(Level.FINE, "bad URI", e);
120             }
121         }
122         // TODO: consider path URIs too
123         
124     }
125 
126     /***
127      * Look for URIs inside the supplied UURI.
128      * 
129      * Static for ease of testing or outside use. 
130      * 
131      * @param source UURI to example
132      * @return List of discovered String URIs.
133      */
134     protected static List<String> extractQueryStringLinks(UURI source) {
135         List<String> results = new ArrayList<String>(); 
136         String decodedQuery;
137         try {
138             decodedQuery = source.getQuery();
139         } catch (URIException e1) {
140             // shouldn't happen
141             return results;
142         }
143         if(decodedQuery==null) {
144             return results;
145         }
146         // check if full query-string appears to be http(s) URI
147         Matcher m = TextUtils.getMatcher(ABS_HTTP_URI_PATTERN,decodedQuery);
148         if(m.matches()) {
149             TextUtils.recycleMatcher(m);
150             results.add(decodedQuery);
151         }
152         // split into params, see if any param value is http(s) URI
153         String rawQuery = new String(source.getRawQuery());
154         String[] params = rawQuery.split("&");
155         for (String param : params) {
156             String[] keyVal = param.split("=");
157             if(keyVal.length==2) {
158                 String candidate;
159                 try {
160                     candidate = LaxURLCodec.DEFAULT.decode(keyVal[1]);
161                 } catch (DecoderException e) {
162                     continue;
163                 }
164                 // TODO: use other non-UTF8 codecs when appropriate
165                 m.reset(candidate);
166                 if(m.matches()) {
167                     results.add(candidate);
168                 }
169             }
170         }
171         return results;
172     }
173 
174     public String report() {
175         StringBuffer ret = new StringBuffer();
176         ret.append("Processor: "+ExtractorURI.class.getName()+"\n");
177         ret.append("  Function:          Extracts links inside other URIs\n");
178         ret.append("  CrawlURIs handled: " + numberOfCURIsHandled + "\n");
179         ret.append("  Links extracted:   " + numberOfLinksExtracted + "\n\n");
180 
181         return ret.toString();
182     }
183 }