View Javadoc

1   /*
2    * AggressiveExtractorHTML
3    *
4    * $Id: AggressiveExtractorHTML.java 5928 2008-07-31 21:30:25Z gojomo $
5    *
6    * Created on Jan 6, 2004
7    *
8    * Copyright (C) 2004 Internet Archive.
9    *
10   * This file is part of the Heritrix web crawler (crawler.archive.org).
11   *
12   * Heritrix is free software; you can redistribute it and/or modify
13   * it under the terms of the GNU Lesser Public License as published by
14   * the Free Software Foundation; either version 2.1 of the License, or
15   * any later version.
16   *
17   * Heritrix is distributed in the hope that it will be useful,
18   * but WITHOUT ANY WARRANTY; without even the implied warranty of
19   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20   * GNU Lesser Public License for more details.
21   *
22   * You should have received a copy of the GNU Lesser Public License
23   * along with Heritrix; if not, write to the Free Software
24   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
25   */
26  
27  package org.archive.crawler.extractor;
28  
29  import java.util.logging.Logger;
30  
31  import org.archive.crawler.datamodel.CrawlURI;
32  
33  /***
34   * Extended version of ExtractorHTML with more aggressive javascript link
35   * extraction where javascript code is parsed first with general HTML tags
36   * regexp, and than by javascript speculative link regexp.
37   *
38   * @author Igor Ranitovic
39   *
40   */
41  public class AggressiveExtractorHTML
42  extends ExtractorHTML {
43  
44      private static final long serialVersionUID = 3586060081186247087L;
45  
46      static Logger logger =
47          Logger.getLogger(AggressiveExtractorHTML.class.getName());
48      
49      public AggressiveExtractorHTML(String name) {
50          super(name, "Aggressive HTML extractor. Subclasses ExtractorHTML " +
51                  " so does all that it does, except in regard to javascript " +
52                  " blocks.  Here " +
53                  " it first processes as JS as its parent does, but then it " +
54                  " reruns through the JS treating it as HTML (May cause many " +
55                  " false positives). It finishes by applying heuristics " +
56                  " against script code looking for possible URIs. ");
57      }
58  
59      protected void processScript(CrawlURI curi, CharSequence sequence,
60              int endOfOpenTag) {
61          super.processScript(curi,sequence,endOfOpenTag);
62          // then, process entire javascript code as html code
63          // this may cause a lot of false positves
64          processGeneralTag(curi, sequence.subSequence(0,6),
65              sequence.subSequence(endOfOpenTag, sequence.length()));
66      }
67  
68      /* (non-Javadoc)
69       * @see org.archive.crawler.framework.Processor#report()
70       */
71      public String report() {
72          StringBuffer ret = new StringBuffer(256);
73          ret.append("Processor: org.archive.crawler.extractor.AggressiveExtractorHTML\n");
74          ret.append("  Function:          Link extraction on HTML documents " +
75              "(including embedded CSS)\n");
76          ret.append("  CrawlURIs handled: " + numberOfCURIsHandled + "\n");
77          ret.append("  Links extracted:   " + numberOfLinksExtracted + "\n\n");
78          return ret.toString();
79      }
80  }