package org.archive.crawler.extractor;

import java.io.IOException;
import java.util.logging.Logger;
import java.util.regex.Matcher;

import org.apache.commons.httpclient.URIException;
import org.apache.commons.lang.StringEscapeUtils;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.framework.CrawlController;
import org.archive.io.ReplayCharSequence;
import org.archive.net.UURI;
import org.archive.util.DevUtils;
import org.archive.util.TextUtils;
import org.archive.util.UriUtils;

/**
 * Processes JavaScript files for strings that are likely to be
 * crawlable URIs.
 *
 * @contributor gojomo
 * @contributor szznax
 *
 */
public class ExtractorJS extends Extractor implements CoreAttributeConstants {

    private static final long serialVersionUID = -2231962381454717720L;

    private static Logger LOGGER =
        Logger.getLogger("org.archive.crawler.extractor.ExtractorJS");
    // Finds whitespace-free strings in JavaScript: text between paired
    // ' or " characters, where the opening quote may be preceded by up to
    // eight backslash escapes and the closing delimiter must repeat the
    // same sequence (via the back-reference to group 1).
    static final String JAVASCRIPT_STRING_EXTRACTOR =
        "(\\\\{0,8}+(?:\"|\'))(\\S{0,"+UURI.MAX_URL_LENGTH+"}?)(?:\\1)";
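    // Illustrative example (not from the original source): in JS source such as
    //     var u = "/static/app.js";
    // group 1 matches the opening quote (plus any preceding backslashes),
    // group 2 captures /static/app.js, and the trailing back-reference to
    // group 1 requires the matching closing quote.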

    // Running totals reported by report().
    protected long numberOfCURIsHandled = 0;
    protected static long numberOfLinksExtracted = 0;

    // URIs for which extraction is skipped entirely because the current
    // JS extractor is known to produce false-positive links from them.
    protected final static String[] EXTRACTOR_URI_EXCEPTIONS = {
        "http://www.google-analytics.com/urchin.js"
    };

    /**
     * @param name the name of this processor module
     */
    public ExtractorJS(String name) {
        super(name, "JavaScript extractor. Link extraction on JavaScript" +
            " files (.js).");
    }

    public void extract(CrawlURI curi) {
        // Skip URIs known to yield false-positive links.
        for (String s : EXTRACTOR_URI_EXCEPTIONS) {
            if (curi.toString().equals(s))
                return;
        }

        if (!isHttpTransactionContentToProcess(curi)) {
            return;
        }
        String contentType = curi.getContentType();
        if (contentType == null) {
            return;
        }

        // Only process content that looks like JavaScript: a script-like
        // content type, a URI ending in .js, or content reached via a
        // script context.
        if ((contentType.indexOf("javascript") < 0) &&
            (contentType.indexOf("jscript") < 0) &&
            (contentType.indexOf("ecmascript") < 0) &&
            (!curi.toString().toLowerCase().endsWith(".js")) &&
            (curi.getViaContext() == null || !curi.getViaContext().
                toString().toLowerCase().startsWith("script"))) {
            return;
        }

        this.numberOfCURIsHandled++;

        ReplayCharSequence cs = null;
        try {
            cs = curi.getHttpRecorder().getReplayCharSequence();
        } catch (IOException e) {
            curi.addLocalizedError(this.getName(), e,
                "Failed get of replay char sequence.");
        }
        if (cs == null) {
            LOGGER.warning("Failed getting ReplayCharSequence: " +
                curi.toString());
            return;
        }

        try {
            try {
                numberOfLinksExtracted += considerStrings(curi, cs,
                    getController(), true);
            } catch (StackOverflowError e) {
                DevUtils.warnHandle(e, "ExtractorJS StackOverflowError");
            }
            // Set flag to indicate that link extraction is completed.
            curi.linkExtractorFinished();
        } finally {
            // Done with the ReplayCharSequence; close it.
            if (cs != null) {
                try {
                    cs.close();
                } catch (IOException ioe) {
                    LOGGER.warning(TextUtils.exceptionToString(
                        "Failed close of ReplayCharSequence.", ioe));
                }
            }
        }
    }
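    /**
     * Scans a character sequence for quoted, whitespace-free strings.
     * Candidate strings that look like URIs are added to the CrawlURI as
     * speculative links; the rest are scanned recursively for nested
     * quoted strings.
     *
     * @param curi CrawlURI whose content is being examined
     * @param cs character sequence to scan
     * @param controller crawl controller used for URI-error logging; may be null
     * @param handlingJSFile true if the content is a standalone JavaScript
     *        file, in which case discovered links are resolved relative to
     *        the via URI rather than the base URI
     * @return number of links extracted
     */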
    public static long considerStrings(CrawlURI curi, CharSequence cs,
            CrawlController controller, boolean handlingJSFile) {
        long foundLinks = 0;
        Matcher strings =
            TextUtils.getMatcher(JAVASCRIPT_STRING_EXTRACTOR, cs);
        while (strings.find()) {
            CharSequence subsequence =
                cs.subSequence(strings.start(2), strings.end(2));

            if (UriUtils.isLikelyUriJavascriptContextLegacy(subsequence)) {
                String string = subsequence.toString();
                string = StringEscapeUtils.unescapeJavaScript(string);
                string = UriUtils.speculativeFixup(string, curi.getUURI());
                foundLinks++;
                try {
                    if (handlingJSFile) {
                        curi.createAndAddLinkRelativeToVia(string,
                            Link.JS_MISC, Link.SPECULATIVE_HOP);
                    } else {
                        curi.createAndAddLinkRelativeToBase(string,
                            Link.JS_MISC, Link.SPECULATIVE_HOP);
                    }
                } catch (URIException e) {
                    // There may not be a controller (e.g. during unit testing).
                    if (controller != null) {
                        controller.logUriError(e, curi.getUURI(), string);
                    } else {
                        LOGGER.info(curi + ", " + string + ": " +
                            e.getMessage());
                    }
                }
            } else {
                // Not itself a likely URI: scan it recursively for nested
                // quoted strings.
                foundLinks += considerStrings(curi, subsequence,
                    controller, handlingJSFile);
            }
        }
        TextUtils.recycleMatcher(strings);
        return foundLinks;
    }

    /**
     * @return a human-readable report of this extractor's activity
     */
    public String report() {
        StringBuffer ret = new StringBuffer();
        ret.append("Processor: org.archive.crawler.extractor.ExtractorJS\n");
        ret.append(" Function: Link extraction on JavaScript code\n");
        ret.append(" CrawlURIs handled: " + numberOfCURIsHandled + "\n");
        ret.append(" Links extracted: " + numberOfLinksExtracted + "\n\n");

        return ret.toString();
    }
}