View Javadoc

1   /*
2    * CrawlUriSWFAction
3    *
4    * $Id: CrawlUriSWFAction.java 6513 2009-09-23 18:18:56Z szznax $
5    *
6    * Created on March 15, 2004
7    *
8    * Copyright (C) 2003 Internet Archive.
9    *
10   * This file is part of the Heritrix web crawler (crawler.archive.org).
11   *
12   * Heritrix is free software; you can redistribute it and/or modify
13   * it under the terms of the GNU Lesser Public License as published by
14   * the Free Software Foundation; either version 2.1 of the License, or
15   * any later version.
16   *
17   * Heritrix is distributed in the hope that it will be useful,
18   * but WITHOUT ANY WARRANTY; without even the implied warranty of
19   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20   * GNU Lesser Public License for more details.
21   *
22   * You should have received a copy of the GNU Lesser Public License
23   * along with Heritrix; if not, write to the Free Software
24   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
25   */
26  
27  package org.archive.crawler.extractor;
28  
29  import java.io.IOException;
30  
31  import org.archive.crawler.datamodel.CrawlURI;
32  import org.archive.crawler.framework.CrawlController;
33  
34  import com.anotherbigidea.flash.writers.SWFActionsImpl;
35  
36  /***
37   * SWF action that handles discovered URIs.
38   *
39   * @author Igor Ranitovic
40   */
41  public class CrawlUriSWFAction
42  extends SWFActionsImpl {
43      CrawlURI curi;
44      CrawlController controller; // for error reporting
45      
46      private long linkCount;
47      static final String JSSTRING = "javascript:";
48  
49      /***
50       *
51       * @param curi
52       */
53      public CrawlUriSWFAction(CrawlURI curi, CrawlController controller) {
54          assert (curi != null) : "CrawlURI should not be null";
55          this.curi = curi;
56          this.controller = controller; 
57          this.linkCount = 0;
58      }
59  
60      /***
61       * Overwrite handling of discovered URIs.
62       *
63       * @param url Discovered URL.
64       * @param target Discovered target (currently not being used.)
65       * @throws IOException
66       */
67      public void getURL(String url, String target)
68      throws IOException {
69          // I have done tests on a few tens of swf files and have not seen a need
70          // to use 'target.' Most of the time 'target' is not set, or it is set
71          // to '_self' or '_blank'.
72          if (url.startsWith(JSSTRING)) {
73              linkCount += ExtractorJS.considerStrings(curi, url, controller, false);
74          } else {
75              curi.createAndAddLinkRelativeToVia(url,Link.EMBED_MISC,Link.EMBED_HOP);
76              linkCount++;
77          }
78      }
79      
80      /***
81       * @return Total number of links extracted from a swf file.
82       */
83      public long getLinkCount() {
84          return linkCount;
85      }
86  }