JerichoExtractorHTML xref

View Javadoc

1   /* JerichoExtractorHTML
2    * 
3    * Copyright (C) 2006 Olaf Freyer
4    *
5    * This file is part of the Heritrix web crawler (crawler.archive.org).
6    *
7    * Heritrix is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU Lesser Public License as published by
9    * the Free Software Foundation; either version 2.1 of the License, or
10   * any later version.
11   *
12   * Heritrix is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU Lesser Public License for more details.
16   *
17   * You should have received a copy of the GNU Lesser Public License
18   * along with Heritrix; if not, write to the Free Software
19   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
20   *
21   * $Id: JerichoExtractorHTML.java 6830 2010-04-21 23:39:57Z gojomo $
22   */
23  package org.archive.crawler.extractor;
24  
25  import java.util.ArrayList;
26  import java.util.Collection;
27  import java.util.Iterator;
28  import java.util.LinkedList;
29  import java.util.List;
30  import java.util.logging.Level;
31  import java.util.logging.Logger;
32  
33  import org.apache.commons.httpclient.URIException;
34  import org.apache.commons.lang.StringEscapeUtils;
35  import org.apache.commons.lang.StringUtils;
36  import org.archive.crawler.datamodel.CoreAttributeConstants;
37  import org.archive.crawler.datamodel.CrawlURI;
38  import org.archive.crawler.datamodel.RobotsHonoringPolicy;
39  import org.archive.net.UURI;
40  import org.archive.net.UURIFactory;
41  import org.archive.util.DevUtils;
42  import org.archive.util.TextUtils;
43  import org.archive.util.UriUtils;
44  
45  import au.id.jericho.lib.html.Attribute;
46  import au.id.jericho.lib.html.Attributes;
47  import au.id.jericho.lib.html.Element;
48  import au.id.jericho.lib.html.FormControl;
49  import au.id.jericho.lib.html.FormControlType;
50  import au.id.jericho.lib.html.FormField;
51  import au.id.jericho.lib.html.FormFields;
52  import au.id.jericho.lib.html.HTMLElementName;
53  import au.id.jericho.lib.html.Source;
54  import au.id.jericho.lib.html.StartTagType;
55  
56  /***
57   * Improved link-extraction from an HTML content-body using jericho-html parser.
58   * This extractor extends ExtractorHTML and mimics its workflow - but has some
59   * substantial differences when it comes to internal implementation. Instead
60   * of heavily relying upon java regular expressions it uses a real html parser
61   * library - namely Jericho HTML Parser (http://jerichohtml.sourceforge.net).
62   * Using this parser it can better handle broken html (i.e. missing quotes)
63   * and also offer improved extraction of HTML form URLs (not only extract
64   * the action of a form, but also its default values).
65   * Unfortunately this parser also has one major drawback - it has to read the
66   * whole document into memory for parsing, thus has an inherent OOME risk.
67   * This OOME risk can be reduced/eliminated by limiting the size of documents
68   * to be parsed (i.e. using NotExceedsDocumentLengthTresholdDecideRule).
69   * Also note that this extractor seems to have a lower overall memory 
70   * consumption compared to ExtractorHTML. (still to be confirmed on a larger 
71   * scale crawl) 
72   * 
73   * @author Olaf Freyer
74   * @version $Date: 2010-04-21 23:39:57 +0000 (Wed, 21 Apr 2010) $ $Revision: 6830 $
75   */
76  public class JerichoExtractorHTML extends ExtractorHTML implements
77          CoreAttributeConstants {
78  
79      private static final long serialVersionUID = 1684681316546343615L;
80  
81      private Logger logger = Logger.getLogger(this.getClass().getName());
82  
83      protected long numberOfFormsProcessed = 0;
84  
/**
 * Creates the extractor under the given name, supplying the built-in
 * description text (which warns that the Jericho parser reads the whole
 * document into memory).
 *
 * @param name name passed through to the two-argument constructor
 */
public JerichoExtractorHTML(String name) {
    this(name,
            "Jericho-HTML extractor. "
            + "Extracts links from HTML documents using Jericho HTML Parser. "
            + "Offers same basic functionality as ExtractorHTML but better handles "
            + "broken HTML and extraction of default values from HTML forms. "
            + "A word of warning: the used parser, the Jericho HTML Parser, reads "
            + "the whole document into memory for parsing - thus this extractor "
            + "has an inherent OOME risk. "
            + "This OOME risk can be reduced/eleminated by limiting the size of "
            + "documents to be parsed (i.e. using "
            + "NotExceedsDocumentLengthTresholdDecideRule). ");
}
98  
/**
 * Creates the extractor with an explicit description; both arguments are
 * handed unchanged to the superclass constructor.
 *
 * @param name        name of this extractor
 * @param description description text for this extractor
 */
public JerichoExtractorHTML(String name, String description) {
    super(name, description);
}
102 
103     private static List<Attribute> findOnAttributes(Attributes attributes) {
104         List<Attribute> result = new LinkedList<Attribute>();
105         for (Iterator attrIter = attributes.iterator(); attrIter.hasNext();) {
106             Attribute attr = (Attribute) attrIter.next();
107             if (attr.getKey().startsWith("on"))
108                 result.add(attr);
109         }
110         return result;
111     }
112 
113     protected void processGeneralTag(CrawlURI curi, Element element,
114             Attributes attributes) {
115         Attribute attr;
116         String attrValue;
117         List attrList;
118         String elementName = element.getName();
119 
120         // Just in case it's an OBJECT or APPLET tag
121         String codebase = null;
122         ArrayList<String> resources = null;
123 
124         final boolean framesAsEmbeds = ((Boolean) getUncheckedAttribute(curi,
125                 ATTR_TREAT_FRAMES_AS_EMBED_LINKS)).booleanValue();
126 
127         final boolean ignoreFormActions = ((Boolean) getUncheckedAttribute(
128                 curi, ATTR_IGNORE_FORM_ACTION_URLS)).booleanValue();
129 
130         final boolean overlyEagerLinkDetection =
131             ((Boolean)getUncheckedAttribute(
132                 curi, EXTRACT_VALUE_ATTRIBUTES)).booleanValue();
133 
134         // HREF
135         if (((attr = attributes.get("href")) != null) &&
136             ((attrValue = attr.getValue()) != null)) {
137             CharSequence context = Link.elementContext(elementName, attr
138                     .getKey());
139             if ("link".equals(elementName)) {
140                 // <LINK> elements treated as embeds (css, ico, etc)
141                 processEmbed(curi, attrValue, context);
142             } else {
143                 // other HREFs treated as links
144                 processLink(curi, attrValue, context);
145             }
146             if ("base".equals(elementName)) {
147                 try {
148                     curi.setBaseURI(attrValue);
149                 } catch (URIException e) {
150                     if (getController() != null) {
151                         // Controller can be null: e.g. when running
152                         // ExtractorTool.
153                         getController().logUriError(e, curi.getUURI(),
154                                 attrValue);
155                     } else {
156                         logger.info("Failed set base uri: " + curi + ", "
157                                 + attrValue + ": " + e.getMessage());
158                     }
159                 }
160             }
161         }
162         // ACTION
163         if (((attr = attributes.get("action")) != null) &&
164                  ((attrValue = attr.getValue()) != null)) {
165             if (!ignoreFormActions) {
166                 CharSequence context = Link.elementContext(elementName, attr
167                         .getKey());
168                 processLink(curi, attrValue, context);
169             }
170         }
171         // ON_
172         if ((attrList = findOnAttributes(attributes)).size() != 0) {
173             for (Iterator attrIter = attrList.iterator(); attrIter.hasNext();) {
174                 attr = (Attribute) attrIter.next();
175                 CharSequence valueSegment = attr.getValueSegment();
176                 if (valueSegment != null)
177                     processScriptCode(curi, valueSegment);
178 
179             }
180         }
181         // SRC atc.
182         if ((((attr = attributes.get("src")) != null)
183                 || ((attr = attributes.get("lowsrc")) != null)
184                 || ((attr = attributes.get("background")) != null)
185                 || ((attr = attributes.get("cite")) != null)
186                 || ((attr = attributes.get("longdesc")) != null)
187                 || ((attr = attributes.get("usemap")) != null)
188                 || ((attr = attributes.get("profile")) != null)
189                 || ((attr = attributes.get("datasrc")) != null)) &&
190                    ((attrValue = attr.getValue()) != null)) {
191 
192             final char hopType;
193             CharSequence context = Link.elementContext(elementName, attr
194                     .getKey());
195 
196             if (!framesAsEmbeds
197                     && ("frame".equals(elementName) || "iframe"
198                             .equals(elementName)))
199                 hopType = Link.NAVLINK_HOP;
200             else
201                 hopType = Link.EMBED_HOP;
202 
203             processEmbed(curi, attrValue, context, hopType);
204         }
205         // CODEBASE
206         if (((attr = attributes.get("codebase")) != null) &&
207                  ((attrValue = attr.getValue()) != null)) {
208             codebase = StringEscapeUtils.unescapeHtml(attrValue);
209             CharSequence context = Link.elementContext(elementName, attr
210                     .getKey());
211             processEmbed(curi, codebase, context);
212         }
213         // CLASSID DATA
214         if ((((attr = attributes.get("classid")) != null)
215                 || ((attr = attributes.get("data")) != null)) &&
216                    ((attrValue = attr.getValue()) != null)) {
217             if (resources == null)
218                 resources = new ArrayList<String>();
219             resources.add(attrValue);
220         }
221         // ARCHIVE
222         if (((attr = attributes.get("archive")) != null) &&
223                  ((attrValue = attr.getValue()) != null)) {
224             if (resources == null)
225                 resources = new ArrayList<String>();
226             String[] multi = TextUtils.split(WHITESPACE, attrValue);
227             for (int i = 0; i < multi.length; i++) {
228                 resources.add(multi[i]);
229             }
230         }
231         // CODE
232         if (((attr = attributes.get("code")) != null) &&
233                  ((attrValue = attr.getValue()) != null)) {
234             if (resources == null)
235                 resources = new ArrayList<String>();
236             // If element is applet and code value does not end with
237             // '.class' then append '.class' to the code value.
238             if (APPLET.equals(elementName) && !attrValue.endsWith(CLASSEXT)) {
239                 resources.add(attrValue + CLASSEXT);
240             } else {
241                 resources.add(attrValue);
242             }
243         }
244         // VALUE
245         if (((attr = attributes.get("value")) != null) &&
246                  ((attrValue = attr.getValue()) != null)) {
247             CharSequence valueContext = Link.elementContext(elementName, attr.getKey());
248             if("PARAM".equalsIgnoreCase(elementName) 
249                     && "flashvars".equalsIgnoreCase(attributes.get("name").getValue())) {
250                 // special handling for <PARAM NAME='flashvars" VALUE="">
251                 String queryStringLike = attrValue.toString();
252                 // treat value as query-string-like "key=value[;key=value]*" pairings
253                 considerQueryStringValues(curi, queryStringLike, valueContext,Link.SPECULATIVE_HOP);
254             } else {
255                 // regular VALUE handling
256                 if (overlyEagerLinkDetection) {
257                     considerIfLikelyUri(curi,attrValue,valueContext,Link.NAVLINK_HOP);
258                 }
259             }
260         }
261         // STYLE
262         if (((attr = attributes.get("style")) != null) &&
263                  ((attrValue = attr.getValue()) != null)) {
264             // STYLE inline attribute
265             // then, parse for URIs
266             this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(curi,
267                     attrValue, getController());
268         }
269         
270         // FLASHVARS
271         if (((attr = attributes.get("flashvars")) != null) &&
272                 ((attrValue = attr.getValue()) != null)) {
273             // FLASHVARS inline attribute
274             CharSequence valueContext = Link.elementContext(elementName, attr.getKey());
275             considerQueryStringValues(curi, attrValue, valueContext,Link.SPECULATIVE_HOP);
276        }
277         
278         // handle codebase/resources
279         if (resources == null)
280             return;
281 
282         Iterator<String> iter = resources.iterator();
283         UURI codebaseURI = null;
284         String res = null;
285         try {
286             if (codebase != null) {
287                 // TODO: Pass in the charset.
288                 codebaseURI = UURIFactory.getInstance(curi.getUURI(), codebase);
289             }
290             while (iter.hasNext()) {
291                 res = iter.next();
292                 res = StringEscapeUtils.unescapeHtml(res);
293                 if (codebaseURI != null) {
294                     res = codebaseURI.resolve(res).toString();
295                 }
296                 processEmbed(curi, res, element); // TODO: include attribute
297                                                     // too
298             }
299         } catch (URIException e) {
300             curi.addLocalizedError(getName(), e, "BAD CODEBASE " + codebase);
301         } catch (IllegalArgumentException e) {
302             DevUtils.logger.log(Level.WARNING, "processGeneralTag()\n"
303                     + "codebase=" + codebase + " res=" + res + "\n"
304                     + DevUtils.extraInfo(), e);
305         }
306     }
307 
308     protected boolean processMeta(CrawlURI curi, Element element) {
309         String name = element.getAttributeValue("name");
310         String httpEquiv = element.getAttributeValue("http-equiv");
311         String content = element.getAttributeValue("content");
312 
313         if ("robots".equals(name) && content != null) {
314             curi.putString(A_META_ROBOTS, content);
315             RobotsHonoringPolicy policy = getSettingsHandler().getOrder()
316                     .getRobotsHonoringPolicy();
317             String contentLower = content.toLowerCase();
318             if ((policy == null || (!policy.isType(curi,
319                     RobotsHonoringPolicy.IGNORE) && !policy.isType(curi,
320                     RobotsHonoringPolicy.CUSTOM)))
321                     && (contentLower.indexOf("nofollow") >= 0 || contentLower
322                             .indexOf("none") >= 0)) {
323                 // if 'nofollow' or 'none' is specified and the
324                 // honoring policy is not IGNORE or CUSTOM, end html extraction
325                 logger.fine("HTML extraction skipped due to robots meta-tag " +
326                     "for: " + curi.toString());
327                 return true;
328             }
329         }
330         if ("refresh".equals(httpEquiv) && content != null) {
331             String refreshUri = content.substring(content.indexOf("=") + 1);
332             try {
333                 curi.createAndAddLinkRelativeToBase(refreshUri, "meta",
334                         Link.REFER_HOP);
335             } catch (URIException e) {
336                 if (getController() != null) {
337                     getController().logUriError(e, curi.getUURI(), refreshUri);
338                 } else {
339                     logger.info("Failed createAndAddLinkRelativeToBase " + curi
340                             + ", " + element.toString() + ", " + refreshUri
341                             + ": " + e);
342                 }
343             }
344         }
345         return false;
346     }
347 
348     protected void processScript(CrawlURI curi, Element element) {
349         // first, get attributes of script-open tag
350         // as per any other tag
351         processGeneralTag(curi, element, element.getAttributes());
352 
353         // then, apply best-effort string-analysis heuristics
354         // against any code present (false positives are OK)
355         processScriptCode(curi, element.getContent());
356 
357     }
358 
359     protected void processStyle(CrawlURI curi, Element element) {
360         // First, get attributes of script-open tag as per any other tag.
361         processGeneralTag(curi, element, element.getAttributes());
362 
363         // then, parse for URIs
364         this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(curi,
365                 element.getContent(), getController());
366     }
367 
368     protected void processForm(CrawlURI curi, Element element) {
369         String action = element.getAttributeValue("action");
370         String name = element.getAttributeValue("name");
371         String queryURL = "";
372 
373         final boolean ignoreFormActions = ((Boolean) getUncheckedAttribute(
374                 curi, ATTR_IGNORE_FORM_ACTION_URLS)).booleanValue();
375 
376         if (ignoreFormActions) {
377             return;
378         }
379         
380         // method-sensitive extraction
381         String method = StringUtils.defaultIfEmpty(
382                 element.getAttributeValue("method"), "GET");
383         if(((Boolean)getUncheckedAttribute(curi,
384                  ATTR_EXTRACT_ONLY_FORM_GETS)).booleanValue() 
385                  && ! "GET".equalsIgnoreCase(method)) {
386              return;
387         }
388 
389         numberOfFormsProcessed++;
390 
391         // get all form fields
392         FormFields formFields = element.findFormFields();
393         for (Iterator fieldsIter = formFields.iterator(); fieldsIter.hasNext();) {
394             // for each form field
395             FormField formField = (FormField) fieldsIter.next();
396 
397             // for each form control
398             for (Iterator controlIter = formField.getFormControls().iterator();
399                 controlIter.hasNext();) {
400                 FormControl formControl = (FormControl) controlIter.next();
401 
402                 // get name of control element (and URLEncode it)
403                 String controlName = formControl.getName();
404 
405                 // retrieve list of values - submit needs special handling
406                 Collection controlValues;
407                 if (!(formControl.getFormControlType() ==
408                         FormControlType.SUBMIT)) {
409                     controlValues = formControl.getValues();
410                 } else {
411                     controlValues = formControl.getPredefinedValues();
412                 }
413 
414                 if (controlValues.size() > 0) {
415                     // for each value set
416                     for (Iterator valueIter = controlValues.iterator();
417                             valueIter.hasNext();) {
418                         String value = (String) valueIter.next();
419                         queryURL += "&" + controlName + "=" + value;
420                     }
421                 } else {
422                     queryURL += "&" + controlName + "=";
423                 }
424             }
425         }
426 
427         // clean up url
428         if (action == null) {
429             queryURL = queryURL.replaceFirst("&", "?");
430         } else {
431             if (!action.contains("?"))
432                 queryURL = queryURL.replaceFirst("&", "?");
433             queryURL = action + queryURL;
434         }
435 
436         CharSequence context = Link.elementContext(element.getName(),
437             "name=" + name);
438         processLink(curi, queryURL, context);
439 
440     }
441 
442     /***
443      * Run extractor. This method is package visible to ease testing.
444      * 
445      * @param curi
446      *            CrawlURI we're processing.
447      * @param cs
448      *            Sequence from underlying ReplayCharSequence.
449      */
450     void extract(CrawlURI curi, CharSequence cs) {
451         Source source = new Source(cs);
452         List elements = source.findAllElements(StartTagType.NORMAL);
453         for (Iterator elementIter = elements.iterator();
454                 elementIter.hasNext();) {
455             Element element = (Element) elementIter.next();
456             String elementName = element.getName();
457             Attributes attributes;
458             if (elementName.equals(HTMLElementName.META)) {
459                 if (processMeta(curi, element)) {
460                     // meta tag included NOFOLLOW; abort processing
461                     break;
462                 }
463             } else if (elementName.equals(HTMLElementName.SCRIPT)) {
464                 processScript(curi, element);
465             } else if (elementName.equals(HTMLElementName.STYLE)) {
466                 processStyle(curi, element);
467             } else if (elementName.equals(HTMLElementName.FORM)) {
468                 processForm(curi, element);
469             } else if (!(attributes = element.getAttributes()).isEmpty()) {
470                 processGeneralTag(curi, element, attributes);
471             }
472         }
473     }
474 
475     /*
476      * (non-Javadoc)
477      * 
478      * @see org.archive.crawler.framework.Processor#report()
479      */
480     public String report() {
481         StringBuffer ret = new StringBuffer();
482         ret.append("Processor: org.archive.crawler.extractor.JerichoExtractorHTML\n");
483         ret.append("  Function:          Link extraction on HTML documents\n");
484         ret.append("  CrawlURIs handled: " + this.numberOfCURIsHandled + "\n");
485         ret.append("  Forms processed:   " + this.numberOfFormsProcessed + "\n");
486         ret.append("  Links extracted:   " + this.numberOfLinksExtracted + "\n\n");
487         return ret.toString();
488     }
489 }