View Javadoc

1   /*
2    * Heritrix
3    *
4    * $Id: ExtractorSWF.java 6830 2010-04-21 23:39:57Z gojomo $
5    *
6    * Created on March 19, 2004
7    *
8    * Copyright (C) 2003 Internet Archive.
9    *
10   * This file is part of the Heritrix web crawler (crawler.archive.org).
11   *
12   * Heritrix is free software; you can redistribute it and/or modify
13   * it under the terms of the GNU Lesser Public License as published by
14   * the Free Software Foundation; either version 2.1 of the License, or
15   * any later version.
16   *
17   * Heritrix is distributed in the hope that it will be useful,
18   * but WITHOUT ANY WARRANTY; without even the implied warranty of
19   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20   * GNU Lesser Public License for more details.
21   *
22   * You should have received a copy of the GNU Lesser Public License
23   * along with Heritrix; if not, write to the Free Software
24   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
25   */
26  
27  package org.archive.crawler.extractor;
28  
29  import java.io.IOException;
30  import java.io.InputStream;
31  import java.util.Vector;
32  import java.util.logging.Logger;
33  
34  import org.apache.commons.io.IOUtils;
35  import org.archive.crawler.datamodel.CoreAttributeConstants;
36  import org.archive.crawler.datamodel.CrawlURI;
37  import org.archive.crawler.framework.CrawlController;
38  import org.archive.util.UriUtils;
39  
40  import com.anotherbigidea.flash.interfaces.SWFActions;
41  import com.anotherbigidea.flash.interfaces.SWFTagTypes;
42  import com.anotherbigidea.flash.interfaces.SWFTags;
43  import com.anotherbigidea.flash.readers.ActionParser;
44  import com.anotherbigidea.flash.readers.SWFReader;
45  import com.anotherbigidea.flash.readers.TagParser;
46  import com.anotherbigidea.flash.structs.AlphaTransform;
47  import com.anotherbigidea.flash.structs.Matrix;
48  import com.anotherbigidea.flash.writers.SWFActionsImpl;
49  import com.anotherbigidea.flash.writers.SWFTagTypesImpl;
50  import com.anotherbigidea.io.InStream;
51  
52  /***
53   * Process SWF (flash/shockwave) files for strings that are likely to be
54   * crawlable URIs.
55   * 
56   * @author Igor Ranitovic
57   */
58  public class ExtractorSWF extends Extractor implements CoreAttributeConstants {
59  
60  	private static final long serialVersionUID = 3627359592408010589L;
61  
62  	private static Logger logger = Logger.getLogger(ExtractorSWF.class
63  			.getName());
64  
65  	protected long numberOfCURIsHandled = 0;
66  
67  	protected long numberOfLinksExtracted = 0;
68  
69  	// TODO: consider if this should be even smaller, because anything
70  	// containing URLs wouldn't be this big
71  	private static final int MAX_READ_SIZE = 1024 * 1024; // 1MB
72  
73  	/***
74  	 * @param name
75  	 */
76  	public ExtractorSWF(String name) {
77  		super(name, "Flash extractor. Extracts URIs from SWF "
78  				+ "(flash/shockwave) files.");
79  	}
80  
81  	protected void extract(CrawlURI curi) {
82  		if (!isHttpTransactionContentToProcess(curi)) {
83  			return;
84  		}
85  
86  		String contentType = curi.getContentType();
87  		if (contentType == null) {
88  			return;
89  		}
90  		if ((contentType.toLowerCase().indexOf("x-shockwave-flash") < 0)
91  				&& (!curi.toString().toLowerCase().endsWith(".swf"))) {
92  			return;
93  		}
94  
95          InputStream documentStream = null;
96  		try {
97              documentStream = 
98                  curi.getHttpRecorder().getRecordedInput().getContentReplayInputStream();
99              
100             // Get link extracting SWF reader
101             SWFReader reader = getSWFReader(curi, documentStream);
102             if (reader == null) {
103                 return;
104             }
105 
106             numberOfCURIsHandled++;
107 			// Parse file for links
108 			reader.readFile();
109 		} catch (IOException e) {
110 			curi.addLocalizedError(getName(), e, "failed reading");
111 		} catch (NullPointerException e) {
112 			curi.addLocalizedError(getName(), e, "bad .swf file");
113 		} catch (NegativeArraySizeException e) {
114 			curi.addLocalizedError(getName(), e, "bad .swf file");
115 		} finally {
116 		    IOUtils.closeQuietly(documentStream);
117         }
118 
119 		// Set flag to indicate that link extraction is completed.
120 		curi.linkExtractorFinished();
121 		logger.fine(curi + " has " + numberOfLinksExtracted + " links.");
122 
123 	}
124 
125 	/***
126 	 * Get a link extracting SWFParser.
127 	 * 
128 	 * A custom SWFReader which parses links from .swf file.
129 	 * 
130 	 * @param curi A CrawlURI to be processed.
131 	 * @return An SWFReader.
132 	 */
133 	private SWFReader getSWFReader(CrawlURI curi, InputStream documentStream) {
134         if (documentStream == null) {
135             return null;
136         }
137 
138 		// Create SWF actions that will add discoved URIs to CrawlURI
139 		// alist(s).
140 		ExtractorSWFActions actions = new ExtractorSWFActions(curi,
141 				getController());
142 		// Overwrite parsing of specific tags that might have URIs.
143 		ExtractorSWFTags tags = new ExtractorSWFTags(actions);
144 		// Get a SWFReader instance.
145 		SWFReader reader = new ExtractorSWFReader(getTagParser(tags), documentStream);
146 		return reader;
147 	}
148 
149 	class ExtractorSWFReader extends SWFReader
150 	{
151 	    public ExtractorSWFReader(SWFTags consumer, InputStream inputstream) {
152 	        super(consumer, inputstream);
153 	    }
154 	    
155 	    public ExtractorSWFReader(SWFTags consumer, InStream instream)
156 	    {
157 	        super(consumer, instream);
158 	    }    
159 
160 	    /***
161          * Override because a corrupt SWF file can cause us to try read
162          * lengths that are hundreds of megabytes in size causing us to
163          * OOME.
164          * 
165          * Below is copied from SWFReader parent class.
166          */
167         public int readOneTag() throws IOException {
168             int header = mIn.readUI16();
169             int type = header >> 6; // only want the top 10 bits
170             int length = header & 0x3F; // only want the bottom 6 bits
171             boolean longTag = (length == 0x3F);
172             if (longTag) {
173                 length = (int) mIn.readUI32();
174             }
175             // Below test added for Heritrix use.
176             if (length > MAX_READ_SIZE) {
177                 // skip to next, rather than throw IOException ending
178                 // processing
179                 mIn.skipBytes(length);
180                 logger.info("oversized SWF tag (type=" + type + ";length="
181                         + length + ") skipped");
182             } else {
183                 byte[] contents = mIn.read(length);
184                 mConsumer.tag(type, longTag, contents);
185             }
186             return type;
187         }
188     }
189 
190 
191 	/***
192 	 * Get a TagParser
193 	 * 
194 	 * A custom ExtractorTagParser which ignores all the big binary image/
195 	 * sound/font types which don't carry URLs is used, to avoid the
196 	 * occasionally fatal (OutOfMemoryError) memory bloat caused by the
197 	 * all-in-memory SWF library handling.
198 	 * 
199 	 * @param customTags
200 	 *            A custom tag parser.
201 	 * @return An SWFReader.
202 	 */
203 	private TagParser getTagParser(SWFTagTypes customTags) {
204 		return new ExtractorTagParser(customTags);
205 	}
206 
207 	/***
208 	 * TagParser customized to ignore SWFTags that will never contain
209 	 * extractable URIs.
210 	 */
211 	protected class ExtractorTagParser extends TagParser {
212 
213 		protected ExtractorTagParser(SWFTagTypes tagtypes) {
214 			super(tagtypes);
215 		}
216 
217 		protected void parseDefineBits(InStream in) throws IOException {
218 			// DO NOTHING - no URLs to be found in bits
219 		}
220 
221 		protected void parseDefineBitsJPEG3(InStream in) throws IOException {
222 			// DO NOTHING - no URLs to be found in bits
223 		}
224 
225 		protected void parseDefineBitsLossless(InStream in, int length,
226 				boolean hasAlpha) throws IOException {
227 			// DO NOTHING - no URLs to be found in bits
228 		}
229 
230 		protected void parseDefineButtonSound(InStream in) throws IOException {
231 			// DO NOTHING - no URLs to be found in sound
232 		}
233 
234 		protected void parseDefineFont(InStream in) throws IOException {
235 			// DO NOTHING - no URLs to be found in font
236 		}
237 
238 		protected void parseDefineJPEG2(InStream in, int length)
239 				throws IOException {
240 			// DO NOTHING - no URLs to be found in jpeg
241 		}
242 
243 		protected void parseDefineJPEGTables(InStream in) throws IOException {
244 			// DO NOTHING - no URLs to be found in jpeg
245 		}
246 
247 		protected void parseDefineShape(int type, InStream in)
248 				throws IOException {
249 			// DO NOTHING - no URLs to be found in shape
250 		}
251 
252 		protected void parseDefineSound(InStream in) throws IOException {
253 			// DO NOTHING - no URLs to be found in sound
254 		}
255 
256 		protected void parseFontInfo(InStream in, int length, boolean isFI2)
257 				throws IOException {
258 			// DO NOTHING - no URLs to be found in font info
259 		}
260 
261 		protected void parseDefineFont2(InStream in) throws IOException {
262 			// DO NOTHING - no URLs to be found in bits
263 		}
264 		
265 		// heritrix: Overridden to use our TagParser and SWFReader. The rest of the code is the same.
266 		@Override
267 	    protected void parseDefineSprite( InStream in ) throws IOException
268 	    {
269 	        int id         = in.readUI16();
270 	        in.readUI16(); // frame count
271 	        
272 	        SWFTagTypes sstt = mTagtypes.tagDefineSprite( id );
273 	        
274 	        if( sstt == null ) return;
275 	        
276 	        // heritrix: only these two lines differ from super.parseDefineSprite()
277 	        TagParser parser = new ExtractorTagParser( sstt );
278 	        SWFReader reader = new ExtractorSWFReader( parser, in );
279 	        
280 	        reader.readTags();
281 	    }
282 
283 		// Overridden to read 32 bit clip event flags when flash version >= 6.
284         // All the rest of the code is copied directly. Fixes HER-1509.
285 		@Override
286 	    protected void parsePlaceObject2( InStream in ) throws IOException
287 	    {
288 	        boolean hasClipActions    = in.readUBits(1) != 0;
289 	        boolean hasClipDepth      = in.readUBits(1) != 0;
290 	        boolean hasName           = in.readUBits(1) != 0;
291 	        boolean hasRatio          = in.readUBits(1) != 0;
292 	        boolean hasColorTransform = in.readUBits(1) != 0;
293 	        boolean hasMatrix         = in.readUBits(1) != 0;
294 	        boolean hasCharacter      = in.readUBits(1) != 0;
295 	        boolean isMove            = in.readUBits(1) != 0;
296 	    
297 	        int depth = in.readUI16();
298 	        
299 	        int            charId    = hasCharacter      ? in.readUI16()            : 0;
300 	        Matrix         matrix    = hasMatrix         ? new Matrix( in )         : null;
301 	        AlphaTransform cxform    = hasColorTransform ? new AlphaTransform( in ) : null;
302 	        int            ratio     = hasRatio          ? in.readUI16()            : -1;        
303 	        String         name      = hasName           ? in.readString( mStringEncoding )  : null;  
304 	        int            clipDepth = hasClipDepth      ? in.readUI16()            : 0;
305 	        
306 	        int clipEventFlags = 0;
307 	        
308 	        if (hasClipActions) {
309                 in.readUI16(); // reserved
310 
311                 // heritrix: flags size changed in swf version 6
312                 clipEventFlags = mFlashVersion < 6 ? in.readUI16() : in.readSI32();
313             }
314 	        
315 	        SWFActions actions = mTagtypes.tagPlaceObject2(isMove, clipDepth,
316                     depth, charId, matrix, cxform, ratio, name, clipEventFlags);
317 
318             if (hasClipActions && actions != null) {
319                 int flags = 0;
320 
321                 // heritrix: flags size changed in swf version 6
322                 while ((flags = mFlashVersion < 6 ? in.readUI16() : in.readSI32()) != 0) {
323                     in.readUI32(); // length
324 
325                     actions.start(flags);
326                     ActionParser parser = new ActionParser(actions, mFlashVersion);
327 
328                     parser.parse(in);
329                 }
330 
331                 actions.done();
332             }
333         }
334 	}
335 
336 	/***
337 	 * SWFTagTypes customized to use <code>ExtractorSWFActions</code>, which
338 	 * parse URI-like strings.
339 	 */
340     @SuppressWarnings("unchecked")
341 	protected class ExtractorSWFTags extends SWFTagTypesImpl {
342 
343 		private SWFActions actions;
344 
345 		public ExtractorSWFTags(SWFActions acts) {
346 			super(null);
347 			actions = acts;
348 		}
349 
350         public SWFActions tagDefineButton(int id, Vector buttonRecords)
351 				throws IOException {
352 
353 			return actions;
354 		}
355 
356 		public SWFActions tagDefineButton2(int id, boolean trackAsMenu,
357 				Vector buttonRecord2s) throws IOException {
358 
359 			return actions;
360 		}
361 
362 		public SWFActions tagDoAction() throws IOException {
363 			return actions;
364 		}
365 
366 		public SWFActions tagDoInActions(int spriteId) throws IOException {
367 			return actions;
368 		}
369 
370 		public SWFTagTypes tagDefineSprite(int id) throws IOException {
371 			return this;
372 		}
373 
374 		public SWFActions tagPlaceObject2(boolean isMove, int clipDepth,
375 				int depth, int charId, Matrix matrix, AlphaTransform cxform,
376 				int ratio, String name, int clipActionFlags) throws IOException {
377 
378 			return actions;
379 		}
380 	}
381 
382 	/***
383 	 * SWFActions that parse URI-like strings. Links discovered using
384 	 * <code>ExtractorJS</code> are marked as speculative links (hop X). All
385 	 * other links are marked as embedded links (hop E).
386 	 * 
387 	 */
388 	protected class ExtractorSWFActions extends SWFActionsImpl {
389 
390 		private CrawlURI curi;
391 
392 		private CrawlController controller;
393 
394 		static final String JSSTRING = "javascript:";
395 
396 		/***
397 		 * @param curi
398 		 *            SWF URL to handle
399 		 * @param controller
400 		 *            Crawl controller need for error reporting
401 		 */
402 		public ExtractorSWFActions(CrawlURI curi, CrawlController controller) {
403 			assert (curi != null) : "CrawlURI should not be null";
404 			this.curi = curi;
405 			this.controller = controller;
406 		}
407 
408 		/***
409 		 * Overwrite handling of discovered URIs.
410 		 * 
411 		 * @param url
412 		 *            Discovered URL.
413 		 * @param target
414 		 *            Discovered target (currently not being used.)
415 		 * @throws IOException
416 		 */
417 		public void getURL(String url, String target) throws IOException {
418 			processURIString(url);
419 		}
420 
421 		public void lookupTable(String[] strings) throws IOException {
422 			for (String str : strings) {
423 				considerStringAsUri(str);
424 			}
425 		}
426 
427 		public void push(String value) throws IOException {
428 			considerStringAsUri(value);
429 		}
430 
431 		public void considerStringAsUri(String str) throws IOException {
432 			if (UriUtils.isLikelyUriJavascriptContextLegacy(str)) {
433 				curi.createAndAddLinkRelativeToVia(str,
434 						Link.SPECULATIVE_MISC, Link.SPECULATIVE_HOP);
435 				incrementLinkCount(1);
436 			}
437 		}
438 
439 		public void processURIString(String url) throws IOException {
440 			if (url.startsWith(JSSTRING)) {
441 				incrementLinkCount(ExtractorJS.considerStrings(
442 						curi, url, controller,false));
443 			} else {
444 				curi.createAndAddLinkRelativeToVia(url, Link.EMBED_MISC,
445 						Link.EMBED_HOP);
446 				incrementLinkCount(1);
447 			}
448 		}
449 
450 		private void incrementLinkCount(long count) {
451 			numberOfLinksExtracted += count;
452 		}
453 	}
454 
455 	public String report() {
456 		StringBuffer ret = new StringBuffer();
457 		ret.append("Processor: org.archive.crawler.extractor.ExtractorSWF\n");
458 		ret.append("  Function:          Link extraction on Shockwave Flash "
459 				+ "documents (.swf)\n");
460 
461 		ret.append("  CrawlURIs handled: " + numberOfCURIsHandled + "\n");
462 		ret.append("  Links extracted:   " + numberOfLinksExtracted + "\n\n");
463 		return ret.toString();
464 	}
465 }