1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 package org.archive.crawler.extractor;
28
29 import java.util.logging.Logger;
30
31 import org.archive.crawler.datamodel.CrawlURI;
32
/**
 * Extended version of ExtractorHTML with more aggressive javascript link
 * extraction, where javascript code is parsed first with the general HTML-tag
 * regexp, and then by the javascript speculative-link regexp.
 *
 * @author Igor Ranitovic
 *
 */
41 public class AggressiveExtractorHTML
42 extends ExtractorHTML {
43
44 private static final long serialVersionUID = 3586060081186247087L;
45
46 static Logger logger =
47 Logger.getLogger(AggressiveExtractorHTML.class.getName());
48
49 public AggressiveExtractorHTML(String name) {
50 super(name, "Aggressive HTML extractor. Subclasses ExtractorHTML " +
51 " so does all that it does, except in regard to javascript " +
52 " blocks. Here " +
53 " it first processes as JS as its parent does, but then it " +
54 " reruns through the JS treating it as HTML (May cause many " +
55 " false positives). It finishes by applying heuristics " +
56 " against script code looking for possible URIs. ");
57 }
58
59 protected void processScript(CrawlURI curi, CharSequence sequence,
60 int endOfOpenTag) {
61 super.processScript(curi,sequence,endOfOpenTag);
62
63
64 processGeneralTag(curi, sequence.subSequence(0,6),
65 sequence.subSequence(endOfOpenTag, sequence.length()));
66 }
67
68
69
70
71 public String report() {
72 StringBuffer ret = new StringBuffer(256);
73 ret.append("Processor: org.archive.crawler.extractor.AggressiveExtractorHTML\n");
74 ret.append(" Function: Link extraction on HTML documents " +
75 "(including embedded CSS)\n");
76 ret.append(" CrawlURIs handled: " + numberOfCURIsHandled + "\n");
77 ret.append(" Links extracted: " + numberOfLinksExtracted + "\n\n");
78 return ret.toString();
79 }
80 }