1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 package org.archive.crawler.extractor;
28
29 import java.io.IOException;
30 import java.io.InputStream;
31 import java.util.Vector;
32 import java.util.logging.Logger;
33
34 import org.apache.commons.io.IOUtils;
35 import org.archive.crawler.datamodel.CoreAttributeConstants;
36 import org.archive.crawler.datamodel.CrawlURI;
37 import org.archive.crawler.framework.CrawlController;
38 import org.archive.util.UriUtils;
39
40 import com.anotherbigidea.flash.interfaces.SWFActions;
41 import com.anotherbigidea.flash.interfaces.SWFTagTypes;
42 import com.anotherbigidea.flash.interfaces.SWFTags;
43 import com.anotherbigidea.flash.readers.ActionParser;
44 import com.anotherbigidea.flash.readers.SWFReader;
45 import com.anotherbigidea.flash.readers.TagParser;
46 import com.anotherbigidea.flash.structs.AlphaTransform;
47 import com.anotherbigidea.flash.structs.Matrix;
48 import com.anotherbigidea.flash.writers.SWFActionsImpl;
49 import com.anotherbigidea.flash.writers.SWFTagTypesImpl;
50 import com.anotherbigidea.io.InStream;
51
52 /***
53 * Process SWF (flash/shockwave) files for strings that are likely to be
54 * crawlable URIs.
55 *
56 * @author Igor Ranitovic
57 */
58 public class ExtractorSWF extends Extractor implements CoreAttributeConstants {
59
60 private static final long serialVersionUID = 3627359592408010589L;
61
62 private static Logger logger = Logger.getLogger(ExtractorSWF.class
63 .getName());
64
65 protected long numberOfCURIsHandled = 0;
66
67 protected long numberOfLinksExtracted = 0;
68
69
70
71 private static final int MAX_READ_SIZE = 1024 * 1024;
72
/**
 * Creates the extractor and passes a fixed description of it to the
 * superclass constructor.
 *
 * @param name name for this processor, handed to the superclass.
 */
public ExtractorSWF(String name) {
    super(name,
            "Flash extractor. Extracts URIs from SWF (flash/shockwave) files.");
}
80
81 protected void extract(CrawlURI curi) {
82 if (!isHttpTransactionContentToProcess(curi)) {
83 return;
84 }
85
86 String contentType = curi.getContentType();
87 if (contentType == null) {
88 return;
89 }
90 if ((contentType.toLowerCase().indexOf("x-shockwave-flash") < 0)
91 && (!curi.toString().toLowerCase().endsWith(".swf"))) {
92 return;
93 }
94
95 InputStream documentStream = null;
96 try {
97 documentStream =
98 curi.getHttpRecorder().getRecordedInput().getContentReplayInputStream();
99
100
101 SWFReader reader = getSWFReader(curi, documentStream);
102 if (reader == null) {
103 return;
104 }
105
106 numberOfCURIsHandled++;
107
108 reader.readFile();
109 } catch (IOException e) {
110 curi.addLocalizedError(getName(), e, "failed reading");
111 } catch (NullPointerException e) {
112 curi.addLocalizedError(getName(), e, "bad .swf file");
113 } catch (NegativeArraySizeException e) {
114 curi.addLocalizedError(getName(), e, "bad .swf file");
115 } finally {
116 IOUtils.closeQuietly(documentStream);
117 }
118
119
120 curi.linkExtractorFinished();
121 logger.fine(curi + " has " + numberOfLinksExtracted + " links.");
122
123 }
124
125 /***
126 * Get a link extracting SWFParser.
127 *
128 * A custom SWFReader which parses links from .swf file.
129 *
130 * @param curi A CrawlURI to be processed.
131 * @return An SWFReader.
132 */
133 private SWFReader getSWFReader(CrawlURI curi, InputStream documentStream) {
134 if (documentStream == null) {
135 return null;
136 }
137
138
139
140 ExtractorSWFActions actions = new ExtractorSWFActions(curi,
141 getController());
142
143 ExtractorSWFTags tags = new ExtractorSWFTags(actions);
144
145 SWFReader reader = new ExtractorSWFReader(getTagParser(tags), documentStream);
146 return reader;
147 }
148
149 class ExtractorSWFReader extends SWFReader
150 {
151 public ExtractorSWFReader(SWFTags consumer, InputStream inputstream) {
152 super(consumer, inputstream);
153 }
154
155 public ExtractorSWFReader(SWFTags consumer, InStream instream)
156 {
157 super(consumer, instream);
158 }
159
160 /***
161 * Override because a corrupt SWF file can cause us to try read
162 * lengths that are hundreds of megabytes in size causing us to
163 * OOME.
164 *
165 * Below is copied from SWFReader parent class.
166 */
167 public int readOneTag() throws IOException {
168 int header = mIn.readUI16();
169 int type = header >> 6;
170 int length = header & 0x3F;
171 boolean longTag = (length == 0x3F);
172 if (longTag) {
173 length = (int) mIn.readUI32();
174 }
175
176 if (length > MAX_READ_SIZE) {
177
178
179 mIn.skipBytes(length);
180 logger.info("oversized SWF tag (type=" + type + ";length="
181 + length + ") skipped");
182 } else {
183 byte[] contents = mIn.read(length);
184 mConsumer.tag(type, longTag, contents);
185 }
186 return type;
187 }
188 }
189
190
191 /***
192 * Get a TagParser
193 *
194 * A custom ExtractorTagParser which ignores all the big binary image/
195 * sound/font types which don't carry URLs is used, to avoid the
196 * occasionally fatal (OutOfMemoryError) memory bloat caused by the
197 * all-in-memory SWF library handling.
198 *
199 * @param customTags
200 * A custom tag parser.
201 * @return An SWFReader.
202 */
203 private TagParser getTagParser(SWFTagTypes customTags) {
204 return new ExtractorTagParser(customTags);
205 }
206
/**
 * TagParser customized to ignore SWFTags that will never contain
 * extractable URIs.
 *
 * The parse methods with empty bodies below deliberately do nothing, so
 * the corresponding tag contents are never decoded. Sprite and
 * PlaceObject2 tags are parsed by the two overrides at the end.
 */
protected class ExtractorTagParser extends TagParser {

    /**
     * @param tagtypes receiver of parsed tags, passed to the superclass.
     */
    protected ExtractorTagParser(SWFTagTypes tagtypes) {
        super(tagtypes);
    }

    /** Intentionally empty: tag content is not parsed. */
    protected void parseDefineBits(InStream in) throws IOException {

    }

    /** Intentionally empty: tag content is not parsed. */
    protected void parseDefineBitsJPEG3(InStream in) throws IOException {

    }

    /** Intentionally empty: tag content is not parsed. */
    protected void parseDefineBitsLossless(InStream in, int length,
            boolean hasAlpha) throws IOException {

    }

    /** Intentionally empty: tag content is not parsed. */
    protected void parseDefineButtonSound(InStream in) throws IOException {

    }

    /** Intentionally empty: tag content is not parsed. */
    protected void parseDefineFont(InStream in) throws IOException {

    }

    /** Intentionally empty: tag content is not parsed. */
    protected void parseDefineJPEG2(InStream in, int length)
            throws IOException {

    }

    /** Intentionally empty: tag content is not parsed. */
    protected void parseDefineJPEGTables(InStream in) throws IOException {

    }

    /** Intentionally empty: tag content is not parsed. */
    protected void parseDefineShape(int type, InStream in)
            throws IOException {

    }

    /** Intentionally empty: tag content is not parsed. */
    protected void parseDefineSound(InStream in) throws IOException {

    }

    /** Intentionally empty: tag content is not parsed. */
    protected void parseFontInfo(InStream in, int length, boolean isFI2)
            throws IOException {

    }

    /** Intentionally empty: tag content is not parsed. */
    protected void parseDefineFont2(InStream in) throws IOException {

    }


    /**
     * Parses a sprite definition by reading its nested tags with an
     * ExtractorSWFReader and a fresh ExtractorTagParser, so the nested
     * tags get the same size limit and tag filtering as top-level ones.
     *
     * @param in stream positioned at the start of the sprite tag body.
     * @throws IOException if reading fails.
     */
    @Override
    protected void parseDefineSprite( InStream in ) throws IOException
    {
        int id = in.readUI16();
        // Second 16-bit value is consumed and discarded
        // (presumably the sprite's frame count -- TODO confirm).
        in.readUI16();

        SWFTagTypes sstt = mTagtypes.tagDefineSprite( id );

        // The tag consumer declined this sprite; nothing more to read.
        if( sstt == null ) return;


        TagParser parser = new ExtractorTagParser( sstt );
        SWFReader reader = new ExtractorSWFReader( parser, in );

        reader.readTags();
    }



    /**
     * Parses a PlaceObject2 tag and, when clip actions are present and the
     * tag consumer returns an SWFActions, feeds each clip action record to
     * an ActionParser.
     *
     * Clip event flags are read as an unsigned 16-bit value when
     * mFlashVersion is below 6 and as a signed 32-bit value otherwise.
     * The reads below must stay in exactly this order, since each one
     * consumes the next field from the stream.
     *
     * @param in stream positioned at the start of the tag body.
     * @throws IOException if reading fails.
     */
    @Override
    protected void parsePlaceObject2( InStream in ) throws IOException
    {
        // Eight one-bit presence flags, read in stream order.
        boolean hasClipActions = in.readUBits(1) != 0;
        boolean hasClipDepth = in.readUBits(1) != 0;
        boolean hasName = in.readUBits(1) != 0;
        boolean hasRatio = in.readUBits(1) != 0;
        boolean hasColorTransform = in.readUBits(1) != 0;
        boolean hasMatrix = in.readUBits(1) != 0;
        boolean hasCharacter = in.readUBits(1) != 0;
        boolean isMove = in.readUBits(1) != 0;

        int depth = in.readUI16();

        // Optional fields: each is read only when its flag is set,
        // otherwise the default shown is used.
        int charId = hasCharacter ? in.readUI16() : 0;
        Matrix matrix = hasMatrix ? new Matrix( in ) : null;
        AlphaTransform cxform = hasColorTransform ? new AlphaTransform( in ) : null;
        int ratio = hasRatio ? in.readUI16() : -1;
        String name = hasName ? in.readString( mStringEncoding ) : null;
        int clipDepth = hasClipDepth ? in.readUI16() : 0;

        int clipEventFlags = 0;

        if (hasClipActions) {
            // 16-bit value consumed and discarded
            // (presumably a reserved field -- TODO confirm).
            in.readUI16();

            // Flag width depends on the Flash version of the file.
            clipEventFlags = mFlashVersion < 6 ? in.readUI16() : in.readSI32();
        }

        SWFActions actions = mTagtypes.tagPlaceObject2(isMove, clipDepth,
                depth, charId, matrix, cxform, ratio, name, clipEventFlags);

        if (hasClipActions && actions != null) {
            int flags = 0;

            // A flags value of zero terminates the list of action records.
            while ((flags = mFlashVersion < 6 ? in.readUI16() : in.readSI32()) != 0) {
                // 32-bit value consumed and discarded
                // (presumably the record's byte length -- TODO confirm).
                in.readUI32();

                actions.start(flags);
                ActionParser parser = new ActionParser(actions, mFlashVersion);

                parser.parse(in);
            }

            actions.done();
        }
    }
}
335
336 /***
337 * SWFTagTypes customized to use <code>ExtractorSWFActions</code>, which
338 * parse URI-like strings.
339 */
340 @SuppressWarnings("unchecked")
341 protected class ExtractorSWFTags extends SWFTagTypesImpl {
342
343 private SWFActions actions;
344
345 public ExtractorSWFTags(SWFActions acts) {
346 super(null);
347 actions = acts;
348 }
349
350 public SWFActions tagDefineButton(int id, Vector buttonRecords)
351 throws IOException {
352
353 return actions;
354 }
355
356 public SWFActions tagDefineButton2(int id, boolean trackAsMenu,
357 Vector buttonRecord2s) throws IOException {
358
359 return actions;
360 }
361
362 public SWFActions tagDoAction() throws IOException {
363 return actions;
364 }
365
366 public SWFActions tagDoInActions(int spriteId) throws IOException {
367 return actions;
368 }
369
370 public SWFTagTypes tagDefineSprite(int id) throws IOException {
371 return this;
372 }
373
374 public SWFActions tagPlaceObject2(boolean isMove, int clipDepth,
375 int depth, int charId, Matrix matrix, AlphaTransform cxform,
376 int ratio, String name, int clipActionFlags) throws IOException {
377
378 return actions;
379 }
380 }
381
382 /***
383 * SWFActions that parse URI-like strings. Links discovered using
384 * <code>ExtractorJS</code> are marked as speculative links (hop X). All
385 * other links are marked as embedded links (hop E).
386 *
387 */
388 protected class ExtractorSWFActions extends SWFActionsImpl {
389
390 private CrawlURI curi;
391
392 private CrawlController controller;
393
394 static final String JSSTRING = "javascript:";
395
396 /***
397 * @param curi
398 * SWF URL to handle
399 * @param controller
400 * Crawl controller need for error reporting
401 */
402 public ExtractorSWFActions(CrawlURI curi, CrawlController controller) {
403 assert (curi != null) : "CrawlURI should not be null";
404 this.curi = curi;
405 this.controller = controller;
406 }
407
408 /***
409 * Overwrite handling of discovered URIs.
410 *
411 * @param url
412 * Discovered URL.
413 * @param target
414 * Discovered target (currently not being used.)
415 * @throws IOException
416 */
417 public void getURL(String url, String target) throws IOException {
418 processURIString(url);
419 }
420
421 public void lookupTable(String[] strings) throws IOException {
422 for (String str : strings) {
423 considerStringAsUri(str);
424 }
425 }
426
427 public void push(String value) throws IOException {
428 considerStringAsUri(value);
429 }
430
431 public void considerStringAsUri(String str) throws IOException {
432 if (UriUtils.isLikelyUriJavascriptContextLegacy(str)) {
433 curi.createAndAddLinkRelativeToVia(str,
434 Link.SPECULATIVE_MISC, Link.SPECULATIVE_HOP);
435 incrementLinkCount(1);
436 }
437 }
438
439 public void processURIString(String url) throws IOException {
440 if (url.startsWith(JSSTRING)) {
441 incrementLinkCount(ExtractorJS.considerStrings(
442 curi, url, controller,false));
443 } else {
444 curi.createAndAddLinkRelativeToVia(url, Link.EMBED_MISC,
445 Link.EMBED_HOP);
446 incrementLinkCount(1);
447 }
448 }
449
450 private void incrementLinkCount(long count) {
451 numberOfLinksExtracted += count;
452 }
453 }
454
455 public String report() {
456 StringBuffer ret = new StringBuffer();
457 ret.append("Processor: org.archive.crawler.extractor.ExtractorSWF\n");
458 ret.append(" Function: Link extraction on Shockwave Flash "
459 + "documents (.swf)\n");
460
461 ret.append(" CrawlURIs handled: " + numberOfCURIsHandled + "\n");
462 ret.append(" Links extracted: " + numberOfLinksExtracted + "\n\n");
463 return ret.toString();
464 }
465 }