1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.crawler.extractor;
24
25 import java.util.ArrayList;
26 import java.util.Collection;
27 import java.util.Iterator;
28 import java.util.LinkedList;
29 import java.util.List;
30 import java.util.logging.Level;
31 import java.util.logging.Logger;
32
33 import org.apache.commons.httpclient.URIException;
34 import org.apache.commons.lang.StringEscapeUtils;
35 import org.apache.commons.lang.StringUtils;
36 import org.archive.crawler.datamodel.CoreAttributeConstants;
37 import org.archive.crawler.datamodel.CrawlURI;
38 import org.archive.crawler.datamodel.RobotsHonoringPolicy;
39 import org.archive.net.UURI;
40 import org.archive.net.UURIFactory;
41 import org.archive.util.DevUtils;
42 import org.archive.util.TextUtils;
43 import org.archive.util.UriUtils;
44
45 import au.id.jericho.lib.html.Attribute;
46 import au.id.jericho.lib.html.Attributes;
47 import au.id.jericho.lib.html.Element;
48 import au.id.jericho.lib.html.FormControl;
49 import au.id.jericho.lib.html.FormControlType;
50 import au.id.jericho.lib.html.FormField;
51 import au.id.jericho.lib.html.FormFields;
52 import au.id.jericho.lib.html.HTMLElementName;
53 import au.id.jericho.lib.html.Source;
54 import au.id.jericho.lib.html.StartTagType;
55
/**
 * Improved link-extraction from an HTML content-body using jericho-html parser.
 * This extractor extends ExtractorHTML and mimics its workflow - but has some
 * substantial differences when it comes to internal implementation. Instead
 * of relying heavily upon java regular expressions it uses a real html parser
 * library - namely Jericho HTML Parser (http://jerichohtml.sourceforge.net).
 * Using this parser it can better handle broken html (e.g. missing quotes)
 * and also offers improved extraction of HTML form URLs (it extracts not only
 * the action of a form, but also its default values).
 * Unfortunately this parser also has one major drawback - it has to read the
 * whole document into memory for parsing, and thus has an inherent OOME risk.
 * This OOME risk can be reduced/eliminated by limiting the size of documents
 * to be parsed (e.g. using NotExceedsDocumentLengthTresholdDecideRule).
 * Also note that this extractor seems to have a lower overall memory
 * consumption compared to ExtractorHTML (still to be confirmed on a larger
 * scale crawl).
 *
 * @author Olaf Freyer
 * @version $Date: 2010-04-21 23:39:57 +0000 (Wed, 21 Apr 2010) $ $Revision: 6830 $
 */
76 public class JerichoExtractorHTML extends ExtractorHTML implements
77 CoreAttributeConstants {
78
79 private static final long serialVersionUID = 1684681316546343615L;
80
81 private Logger logger = Logger.getLogger(this.getClass().getName());
82
83 protected long numberOfFormsProcessed = 0;
84
/**
 * Creates the extractor under the given name, supplying the built-in
 * description text to the two-argument constructor.
 *
 * @param name name handed on to the two-argument constructor
 */
public JerichoExtractorHTML(String name) {
    this(name,
        "Jericho-HTML extractor. Extracts links from HTML "
        + "documents using Jericho HTML Parser. Offers same "
        + "basic functionality as ExtractorHTML but better "
        + "handles broken HTML and extraction of default "
        + "values from HTML forms. A word of warning: the used "
        + "parser, the Jericho HTML Parser, reads the whole "
        + "document into memory for "
        + "parsing - thus this extractor has an inherent OOME risk. "
        + "This OOME risk can be reduced/eleminated by limiting the "
        + "size of documents to be parsed (i.e. using "
        + "NotExceedsDocumentLengthTresholdDecideRule). ");
}
98
/**
 * Creates the extractor with an explicit name and description; both are
 * passed unchanged to the superclass constructor.
 *
 * @param name        name handed to the superclass
 * @param description description handed to the superclass
 */
public JerichoExtractorHTML(String name, String description) {
    super(name, description);
}
102
103 private static List<Attribute> findOnAttributes(Attributes attributes) {
104 List<Attribute> result = new LinkedList<Attribute>();
105 for (Iterator attrIter = attributes.iterator(); attrIter.hasNext();) {
106 Attribute attr = (Attribute) attrIter.next();
107 if (attr.getKey().startsWith("on"))
108 result.add(attr);
109 }
110 return result;
111 }
112
113 protected void processGeneralTag(CrawlURI curi, Element element,
114 Attributes attributes) {
115 Attribute attr;
116 String attrValue;
117 List attrList;
118 String elementName = element.getName();
119
120
121 String codebase = null;
122 ArrayList<String> resources = null;
123
124 final boolean framesAsEmbeds = ((Boolean) getUncheckedAttribute(curi,
125 ATTR_TREAT_FRAMES_AS_EMBED_LINKS)).booleanValue();
126
127 final boolean ignoreFormActions = ((Boolean) getUncheckedAttribute(
128 curi, ATTR_IGNORE_FORM_ACTION_URLS)).booleanValue();
129
130 final boolean overlyEagerLinkDetection =
131 ((Boolean)getUncheckedAttribute(
132 curi, EXTRACT_VALUE_ATTRIBUTES)).booleanValue();
133
134
135 if (((attr = attributes.get("href")) != null) &&
136 ((attrValue = attr.getValue()) != null)) {
137 CharSequence context = Link.elementContext(elementName, attr
138 .getKey());
139 if ("link".equals(elementName)) {
140
141 processEmbed(curi, attrValue, context);
142 } else {
143
144 processLink(curi, attrValue, context);
145 }
146 if ("base".equals(elementName)) {
147 try {
148 curi.setBaseURI(attrValue);
149 } catch (URIException e) {
150 if (getController() != null) {
151
152
153 getController().logUriError(e, curi.getUURI(),
154 attrValue);
155 } else {
156 logger.info("Failed set base uri: " + curi + ", "
157 + attrValue + ": " + e.getMessage());
158 }
159 }
160 }
161 }
162
163 if (((attr = attributes.get("action")) != null) &&
164 ((attrValue = attr.getValue()) != null)) {
165 if (!ignoreFormActions) {
166 CharSequence context = Link.elementContext(elementName, attr
167 .getKey());
168 processLink(curi, attrValue, context);
169 }
170 }
171
172 if ((attrList = findOnAttributes(attributes)).size() != 0) {
173 for (Iterator attrIter = attrList.iterator(); attrIter.hasNext();) {
174 attr = (Attribute) attrIter.next();
175 CharSequence valueSegment = attr.getValueSegment();
176 if (valueSegment != null)
177 processScriptCode(curi, valueSegment);
178
179 }
180 }
181
182 if ((((attr = attributes.get("src")) != null)
183 || ((attr = attributes.get("lowsrc")) != null)
184 || ((attr = attributes.get("background")) != null)
185 || ((attr = attributes.get("cite")) != null)
186 || ((attr = attributes.get("longdesc")) != null)
187 || ((attr = attributes.get("usemap")) != null)
188 || ((attr = attributes.get("profile")) != null)
189 || ((attr = attributes.get("datasrc")) != null)) &&
190 ((attrValue = attr.getValue()) != null)) {
191
192 final char hopType;
193 CharSequence context = Link.elementContext(elementName, attr
194 .getKey());
195
196 if (!framesAsEmbeds
197 && ("frame".equals(elementName) || "iframe"
198 .equals(elementName)))
199 hopType = Link.NAVLINK_HOP;
200 else
201 hopType = Link.EMBED_HOP;
202
203 processEmbed(curi, attrValue, context, hopType);
204 }
205
206 if (((attr = attributes.get("codebase")) != null) &&
207 ((attrValue = attr.getValue()) != null)) {
208 codebase = StringEscapeUtils.unescapeHtml(attrValue);
209 CharSequence context = Link.elementContext(elementName, attr
210 .getKey());
211 processEmbed(curi, codebase, context);
212 }
213
214 if ((((attr = attributes.get("classid")) != null)
215 || ((attr = attributes.get("data")) != null)) &&
216 ((attrValue = attr.getValue()) != null)) {
217 if (resources == null)
218 resources = new ArrayList<String>();
219 resources.add(attrValue);
220 }
221
222 if (((attr = attributes.get("archive")) != null) &&
223 ((attrValue = attr.getValue()) != null)) {
224 if (resources == null)
225 resources = new ArrayList<String>();
226 String[] multi = TextUtils.split(WHITESPACE, attrValue);
227 for (int i = 0; i < multi.length; i++) {
228 resources.add(multi[i]);
229 }
230 }
231
232 if (((attr = attributes.get("code")) != null) &&
233 ((attrValue = attr.getValue()) != null)) {
234 if (resources == null)
235 resources = new ArrayList<String>();
236
237
238 if (APPLET.equals(elementName) && !attrValue.endsWith(CLASSEXT)) {
239 resources.add(attrValue + CLASSEXT);
240 } else {
241 resources.add(attrValue);
242 }
243 }
244
245 if (((attr = attributes.get("value")) != null) &&
246 ((attrValue = attr.getValue()) != null)) {
247 CharSequence valueContext = Link.elementContext(elementName, attr.getKey());
248 if("PARAM".equalsIgnoreCase(elementName)
249 && "flashvars".equalsIgnoreCase(attributes.get("name").getValue())) {
250
251 String queryStringLike = attrValue.toString();
252
253 considerQueryStringValues(curi, queryStringLike, valueContext,Link.SPECULATIVE_HOP);
254 } else {
255
256 if (overlyEagerLinkDetection) {
257 considerIfLikelyUri(curi,attrValue,valueContext,Link.NAVLINK_HOP);
258 }
259 }
260 }
261
262 if (((attr = attributes.get("style")) != null) &&
263 ((attrValue = attr.getValue()) != null)) {
264
265
266 this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(curi,
267 attrValue, getController());
268 }
269
270
271 if (((attr = attributes.get("flashvars")) != null) &&
272 ((attrValue = attr.getValue()) != null)) {
273
274 CharSequence valueContext = Link.elementContext(elementName, attr.getKey());
275 considerQueryStringValues(curi, attrValue, valueContext,Link.SPECULATIVE_HOP);
276 }
277
278
279 if (resources == null)
280 return;
281
282 Iterator<String> iter = resources.iterator();
283 UURI codebaseURI = null;
284 String res = null;
285 try {
286 if (codebase != null) {
287
288 codebaseURI = UURIFactory.getInstance(curi.getUURI(), codebase);
289 }
290 while (iter.hasNext()) {
291 res = iter.next();
292 res = StringEscapeUtils.unescapeHtml(res);
293 if (codebaseURI != null) {
294 res = codebaseURI.resolve(res).toString();
295 }
296 processEmbed(curi, res, element);
297
298 }
299 } catch (URIException e) {
300 curi.addLocalizedError(getName(), e, "BAD CODEBASE " + codebase);
301 } catch (IllegalArgumentException e) {
302 DevUtils.logger.log(Level.WARNING, "processGeneralTag()\n"
303 + "codebase=" + codebase + " res=" + res + "\n"
304 + DevUtils.extraInfo(), e);
305 }
306 }
307
308 protected boolean processMeta(CrawlURI curi, Element element) {
309 String name = element.getAttributeValue("name");
310 String httpEquiv = element.getAttributeValue("http-equiv");
311 String content = element.getAttributeValue("content");
312
313 if ("robots".equals(name) && content != null) {
314 curi.putString(A_META_ROBOTS, content);
315 RobotsHonoringPolicy policy = getSettingsHandler().getOrder()
316 .getRobotsHonoringPolicy();
317 String contentLower = content.toLowerCase();
318 if ((policy == null || (!policy.isType(curi,
319 RobotsHonoringPolicy.IGNORE) && !policy.isType(curi,
320 RobotsHonoringPolicy.CUSTOM)))
321 && (contentLower.indexOf("nofollow") >= 0 || contentLower
322 .indexOf("none") >= 0)) {
323
324
325 logger.fine("HTML extraction skipped due to robots meta-tag " +
326 "for: " + curi.toString());
327 return true;
328 }
329 }
330 if ("refresh".equals(httpEquiv) && content != null) {
331 String refreshUri = content.substring(content.indexOf("=") + 1);
332 try {
333 curi.createAndAddLinkRelativeToBase(refreshUri, "meta",
334 Link.REFER_HOP);
335 } catch (URIException e) {
336 if (getController() != null) {
337 getController().logUriError(e, curi.getUURI(), refreshUri);
338 } else {
339 logger.info("Failed createAndAddLinkRelativeToBase " + curi
340 + ", " + element.toString() + ", " + refreshUri
341 + ": " + e);
342 }
343 }
344 }
345 return false;
346 }
347
348 protected void processScript(CrawlURI curi, Element element) {
349
350
351 processGeneralTag(curi, element, element.getAttributes());
352
353
354
355 processScriptCode(curi, element.getContent());
356
357 }
358
359 protected void processStyle(CrawlURI curi, Element element) {
360
361 processGeneralTag(curi, element, element.getAttributes());
362
363
364 this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(curi,
365 element.getContent(), getController());
366 }
367
368 protected void processForm(CrawlURI curi, Element element) {
369 String action = element.getAttributeValue("action");
370 String name = element.getAttributeValue("name");
371 String queryURL = "";
372
373 final boolean ignoreFormActions = ((Boolean) getUncheckedAttribute(
374 curi, ATTR_IGNORE_FORM_ACTION_URLS)).booleanValue();
375
376 if (ignoreFormActions) {
377 return;
378 }
379
380
381 String method = StringUtils.defaultIfEmpty(
382 element.getAttributeValue("method"), "GET");
383 if(((Boolean)getUncheckedAttribute(curi,
384 ATTR_EXTRACT_ONLY_FORM_GETS)).booleanValue()
385 && ! "GET".equalsIgnoreCase(method)) {
386 return;
387 }
388
389 numberOfFormsProcessed++;
390
391
392 FormFields formFields = element.findFormFields();
393 for (Iterator fieldsIter = formFields.iterator(); fieldsIter.hasNext();) {
394
395 FormField formField = (FormField) fieldsIter.next();
396
397
398 for (Iterator controlIter = formField.getFormControls().iterator();
399 controlIter.hasNext();) {
400 FormControl formControl = (FormControl) controlIter.next();
401
402
403 String controlName = formControl.getName();
404
405
406 Collection controlValues;
407 if (!(formControl.getFormControlType() ==
408 FormControlType.SUBMIT)) {
409 controlValues = formControl.getValues();
410 } else {
411 controlValues = formControl.getPredefinedValues();
412 }
413
414 if (controlValues.size() > 0) {
415
416 for (Iterator valueIter = controlValues.iterator();
417 valueIter.hasNext();) {
418 String value = (String) valueIter.next();
419 queryURL += "&" + controlName + "=" + value;
420 }
421 } else {
422 queryURL += "&" + controlName + "=";
423 }
424 }
425 }
426
427
428 if (action == null) {
429 queryURL = queryURL.replaceFirst("&", "?");
430 } else {
431 if (!action.contains("?"))
432 queryURL = queryURL.replaceFirst("&", "?");
433 queryURL = action + queryURL;
434 }
435
436 CharSequence context = Link.elementContext(element.getName(),
437 "name=" + name);
438 processLink(curi, queryURL, context);
439
440 }
441
442 /***
443 * Run extractor. This method is package visible to ease testing.
444 *
445 * @param curi
446 * CrawlURI we're processing.
447 * @param cs
448 * Sequence from underlying ReplayCharSequence.
449 */
450 void extract(CrawlURI curi, CharSequence cs) {
451 Source source = new Source(cs);
452 List elements = source.findAllElements(StartTagType.NORMAL);
453 for (Iterator elementIter = elements.iterator();
454 elementIter.hasNext();) {
455 Element element = (Element) elementIter.next();
456 String elementName = element.getName();
457 Attributes attributes;
458 if (elementName.equals(HTMLElementName.META)) {
459 if (processMeta(curi, element)) {
460
461 break;
462 }
463 } else if (elementName.equals(HTMLElementName.SCRIPT)) {
464 processScript(curi, element);
465 } else if (elementName.equals(HTMLElementName.STYLE)) {
466 processStyle(curi, element);
467 } else if (elementName.equals(HTMLElementName.FORM)) {
468 processForm(curi, element);
469 } else if (!(attributes = element.getAttributes()).isEmpty()) {
470 processGeneralTag(curi, element, attributes);
471 }
472 }
473 }
474
475
476
477
478
479
480 public String report() {
481 StringBuffer ret = new StringBuffer();
482 ret.append("Processor: org.archive.crawler.extractor.JerichoExtractorHTML\n");
483 ret.append(" Function: Link extraction on HTML documents\n");
484 ret.append(" CrawlURIs handled: " + this.numberOfCURIsHandled + "\n");
485 ret.append(" Forms processed: " + this.numberOfFormsProcessed + "\n");
486 ret.append(" Links extracted: " + this.numberOfLinksExtracted + "\n\n");
487 return ret.toString();
488 }
489 }