
1   /* Copyright (C) 2003 Internet Archive.
2    *
3    * This file is part of the Heritrix web crawler (crawler.archive.org).
4    *
5    * Heritrix is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU Lesser Public License as published by
7    * the Free Software Foundation; either version 2.1 of the License, or
8    * any later version.
9    *
10   * Heritrix is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU Lesser Public License for more details.
14   *
15   * You should have received a copy of the GNU Lesser Public License
16   * along with Heritrix; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   *
19   * Created on Nov 17, 2003
20   *
21   * To change the template for this generated file go to
22   * Window>Preferences>Java>Code Generation>Code and Comments
23   */
24  package org.archive.crawler.extractor;
25  
26  import java.io.IOException;
27  import java.util.logging.Logger;
28  import java.util.regex.Matcher;
29  
30  import org.apache.commons.httpclient.URIException;
31  import org.apache.commons.lang.StringEscapeUtils;
32  import org.archive.crawler.datamodel.CoreAttributeConstants;
33  import org.archive.crawler.datamodel.CrawlURI;
34  import org.archive.crawler.framework.CrawlController;
35  import org.archive.io.ReplayCharSequence;
36  import org.archive.net.UURI;
37  import org.archive.util.DevUtils;
38  import org.archive.util.TextUtils;
39  import org.archive.util.UriUtils;
40  
41  /***
42   * Processes Javascript files for strings that are likely to be
43   * crawlable URIs.
44   *
45   * @contributor gojomo
46   * @contributor szznax
47   *
48   */
49  public class ExtractorJS extends Extractor implements CoreAttributeConstants {
50  
51      private static final long serialVersionUID = -2231962381454717720L;
52  
53      private static Logger LOGGER =
54          Logger.getLogger("org.archive.crawler.extractor.ExtractorJS");
55  
56      // finds whitespace-free strings in Javascript
57      // (areas between paired ' or " characters, possibly backslash-quoted
58      // on the ends, but not in the middle)
59      static final String JAVASCRIPT_STRING_EXTRACTOR =
60          "(////{0,8}+(?:\"|\'))(//S{0,"+UURI.MAX_URL_LENGTH+"}?)(?://1)";
61      // GROUPS:
62      // (G1) ' or " with optional leading backslashes
63      // (G2) whitespace-free string delimited on boths ends by G1
64  
65  
66      protected long numberOfCURIsHandled = 0;
67      protected static long numberOfLinksExtracted = 0;
68  
69      
70      // URIs known to produce false-positives with the current JS extractor.
71      // e.g. currently (2.0.3) the JS extractor produces 13 false-positive 
72      // URIs from http://www.google-analytics.com/urchin.js and only 2 
73      // good URIs, which are merely one pixel images.
74      // TODO: remove this blacklist when JS extractor is improved 
75      protected final static String[] EXTRACTOR_URI_EXCEPTIONS = {
76          "http://www.google-analytics.com/urchin.js"
77          };
78      
79      /***
80       * @param name
81       */
82      public ExtractorJS(String name) {
83          super(name, "JavaScript extractor. Link extraction on JavaScript" +
84                  " files (.js).");
85      }
86  
87      /* (non-Javadoc)
88       * @see org.archive.crawler.framework.Processor#process(org.archive.crawler.datamodel.CrawlURI)
89       */
90      public void extract(CrawlURI curi) {
91          // special-cases, for when we know our current JS extractor does poorly.
92          // TODO: remove this test when JS extractor is improved 
93          for (String s: EXTRACTOR_URI_EXCEPTIONS) {
94              if (curi.toString().equals(s))
95                  return;
96          }
97              
98          if (!isHttpTransactionContentToProcess(curi)) {
99              return;
100         }
101         String contentType = curi.getContentType();
102         if ((contentType == null)) {
103             return;
104         }
105         // If content type is not js and if the viaContext
106         // does not begin with 'script', return.
107         if((contentType.indexOf("javascript") < 0) &&
108             (contentType.indexOf("jscript") < 0) &&
109             (contentType.indexOf("ecmascript") < 0) &&
110             (!curi.toString().toLowerCase().endsWith(".js")) &&
111             (curi.getViaContext() == null || !curi.getViaContext().
112                 toString().toLowerCase().startsWith("script"))) {
113             return;
114         }
115 
116         this.numberOfCURIsHandled++;
117 
118         ReplayCharSequence cs = null;
119         try {
120             cs = curi.getHttpRecorder().getReplayCharSequence();
121         } catch (IOException e) {
122             curi.addLocalizedError(this.getName(), e,
123             	"Failed get of replay char sequence.");
124         }
125         if (cs == null) {
126             LOGGER.warning("Failed getting ReplayCharSequence: " +
127                 curi.toString());
128             return;
129         }
130 
131         try {
132             try {
133                 numberOfLinksExtracted += considerStrings(curi, cs,
134                         getController(), true);
135             } catch (StackOverflowError e) {
136                 DevUtils.warnHandle(e, "ExtractorJS StackOverflowError");
137             }
138             // Set flag to indicate that link extraction is completed.
139             curi.linkExtractorFinished();
140         } finally {
141             // Done w/ the ReplayCharSequence. Close it.
142             if (cs != null) {
143                 try {
144                     cs.close();
145                 } catch (IOException ioe) {
146                     LOGGER.warning(TextUtils.exceptionToString(
147                         "Failed close of ReplayCharSequence.", ioe));
148                 }
149             }
150         }
151     }
152 
153     public static long considerStrings(CrawlURI curi, CharSequence cs,
154             CrawlController controller, boolean handlingJSFile) {
155         long foundLinks = 0;
156         Matcher strings =
157             TextUtils.getMatcher(JAVASCRIPT_STRING_EXTRACTOR, cs);
158         while(strings.find()) {
159             CharSequence subsequence =
160                 cs.subSequence(strings.start(2), strings.end(2));
161 
162             if(UriUtils.isLikelyUriJavascriptContextLegacy(subsequence)) {
163                 String string = subsequence.toString();
164                 string = StringEscapeUtils.unescapeJavaScript(string);
165                 string = UriUtils.speculativeFixup(string, curi.getUURI());
166                 foundLinks++;
167                 try {
168                     if (handlingJSFile) {
169                         curi.createAndAddLinkRelativeToVia(string,
170                             Link.JS_MISC, Link.SPECULATIVE_HOP);
171                     } else {
172                         curi.createAndAddLinkRelativeToBase(string,
173                             Link.JS_MISC, Link.SPECULATIVE_HOP);
174                     }
175                 } catch (URIException e) {
176                     // There may not be a controller (e.g. If we're being run
177                     // by the extractor tool).
178                     if (controller != null) {
179                         controller.logUriError(e, curi.getUURI(), string);
180                     } else {
181                         LOGGER.info(curi + ", " + string + ": " +
182                             e.getMessage());
183                     }
184                 }
185             } else {
186                foundLinks += considerStrings(curi, subsequence,
187                    controller, handlingJSFile);
188             }
189         }
190         TextUtils.recycleMatcher(strings);
191         return foundLinks;
192     }
193 
194     /*
195      * (non-Javadoc)
196      * 
197      * @see org.archive.crawler.framework.Processor#report()
198      */
199     public String report() {
200         StringBuffer ret = new StringBuffer();
201         ret.append("Processor: org.archive.crawler.extractor.ExtractorJS\n");
202         ret.append("  Function:          Link extraction on JavaScript code\n");
203         ret.append("  CrawlURIs handled: " + numberOfCURIsHandled + "\n");
204         ret.append("  Links extracted:   " + numberOfLinksExtracted + "\n\n");
205 
206         return ret.toString();
207     }
208 }