ExtractorHTML xref

View Javadoc

1   /* Copyright (C) 2003 Internet Archive.
2    *
3    * This file is part of the Heritrix web crawler (crawler.archive.org).
4    *
5    * Heritrix is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU Lesser Public License as published by
7    * the Free Software Foundation; either version 2.1 of the License, or
8    * any later version.
9    *
10   * Heritrix is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU Lesser Public License for more details.
14   *
15   * You should have received a copy of the GNU Lesser Public License
16   * along with Heritrix; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   *
19   * SimpleHTMLExtractor.java
20   * Created on Jun 5, 2003
21   *
22   * $Header$
23   */
24  package org.archive.crawler.extractor;
25  
26  import java.io.IOException;
27  import java.io.UnsupportedEncodingException;
28  import java.net.URLDecoder;
29  import java.util.ArrayList;
30  import java.util.Iterator;
31  import java.util.logging.Level;
32  import java.util.logging.Logger;
33  import java.util.regex.Matcher;
34  
35  import org.apache.commons.httpclient.URIException;
36  import org.archive.crawler.datamodel.CoreAttributeConstants;
37  import org.archive.crawler.datamodel.CrawlURI;
38  import org.archive.crawler.datamodel.RobotsHonoringPolicy;
39  import org.archive.crawler.settings.SimpleType;
40  import org.archive.crawler.settings.Type;
41  import org.archive.io.ReplayCharSequence;
42  import org.archive.net.UURI;
43  import org.archive.net.UURIFactory;
44  import org.archive.util.DevUtils;
45  import org.archive.util.HttpRecorder;
46  import org.archive.util.TextUtils;
47  import org.archive.util.UriUtils;
48  
/**
 * Basic link-extraction, from an HTML content-body,
 * using regular expressions.
 *
 * @author gojomo
 */
56  public class ExtractorHTML extends Extractor
57  implements CoreAttributeConstants {
58  
59      private static final long serialVersionUID = 5855731422080471017L;
60  
61      private static Logger logger =
62          Logger.getLogger(ExtractorHTML.class.getName());
63  
64      /***
65       * Compiled relevant tag extractor.
66       *
67       * <p>
68       * This pattern extracts either:
69       * <li> (1) whole &lt;script&gt;...&lt;/script&gt; or
70       * <li> (2) &lt;style&gt;...&lt;/style&gt; or
71       * <li> (3) &lt;meta ...&gt; or
72       * <li> (4) any other open-tag with at least one attribute
73       * (eg matches "&lt;a href='boo'&gt;" but not "&lt;/a&gt;" or "&lt;br&gt;")
74       * <p>
75       * groups:
76       * <li> 1: SCRIPT SRC=foo&gt;boo&lt;/SCRIPT
77       * <li> 2: just script open tag
78       * <li> 3: STYLE TYPE=moo&gt;zoo&lt;/STYLE
79       * <li> 4: just style open tag
80       * <li> 5: entire other tag, without '<' '>'
81       * <li> 6: element
82       * <li> 7: META
83       * <li> 8: !-- comment --
84       */
85  // version w/ less unnecessary backtracking
86        private static final int MAX_ELEMENT_LENGTH =
87            Integer.parseInt(System.getProperty(ExtractorHTML.class.getName() +
88                ".maxElementNameLength", "1024"));
89        
90        static final String RELEVANT_TAG_EXTRACTOR =
91            "(?is)<(?:((script[^>]*+)>.*?</script)" + // 1, 2
92            "|((style[^>]*+)>.*?</style)" + // 3, 4
93            "|(((meta)|(?://w{1,"+MAX_ELEMENT_LENGTH+"}))//s+[^>]*+)" + // 5, 6, 7
94            "|(!--.*?--))>"; // 8 
95  
96  //    version w/ problems with unclosed script tags 
97  //    static final String RELEVANT_TAG_EXTRACTOR =
98  //    "(?is)<(?:((script.*?)>.*?</script)|((style.*?)>.*?</style)|(((meta)|(?://w+))//s+.*?)|(!--.*?--))>";
99  
100 
101       
102 //    // this pattern extracts 'href' or 'src' attributes from
103 //    // any open-tag innards matched by the above
104 //    static Pattern RELEVANT_ATTRIBUTE_EXTRACTOR = Pattern.compile(
105 //     "(?is)(//w+)(?://s+|(?://s.*?//s))(?:(href)|(src))//s*=(?:(?://s*\"(.+?)\")|(?://s*'(.+?)')|(//S+))");
106 //
107 //    // this pattern extracts 'robots' attributes
108 //    static Pattern ROBOTS_ATTRIBUTE_EXTRACTOR = Pattern.compile(
109 //     "(?is)(//w+)//s+.*?(?:(robots))//s*=(?:(?://s*\"(.+)\")|(?://s*'(.+)')|(//S+))");
110 
111       private static final int MAX_ATTR_NAME_LENGTH =
112           Integer.parseInt(System.getProperty(ExtractorHTML.class.getName() +
113               ".maxAttributeNameLength", "1024")); // 1K; 
114       
115       static final int MAX_ATTR_VAL_LENGTH = 
116           Integer.parseInt(System.getProperty(ExtractorHTML.class.getName() +
117               ".maxAttributeValueLength", "16384")); // 16K; 
118       
119     // TODO: perhaps cut to near MAX_URI_LENGTH
120     
121     // this pattern extracts attributes from any open-tag innards
122     // matched by the above. attributes known to be URIs of various
123     // sorts are matched specially
124     static final String EACH_ATTRIBUTE_EXTRACTOR =
125       "(?is)//b((href)|(action)|(on//w*)" // 1, 2, 3, 4 
126      +"|((?:src)|(?:lowsrc)|(?:background)|(?:cite)|(?:longdesc)" // ...
127      +"|(?:usemap)|(?:profile)|(?:datasrc))" // 5
128      +"|(codebase)|((?:classid)|(?:data))|(archive)|(code)" // 6, 7, 8, 9
129      +"|(value)|(style)|(method)" // 10, 11, 12
130      +"|([-//w]{1,"+MAX_ATTR_NAME_LENGTH+"}))" // 13
131      +"//s*=//s*"
132      +"(?:(?:\"(.{0,"+MAX_ATTR_VAL_LENGTH+"}?)(?:\"|$))" // 14
133      +"|(?:'(.{0,"+MAX_ATTR_VAL_LENGTH+"}?)(?:'|$))" // 15
134      +"|(//S{1,"+MAX_ATTR_VAL_LENGTH+"}))"; // 16
135     // groups:
136     // 1: attribute name
137     // 2: HREF - single URI relative to doc base, or occasionally javascript:
138     // 3: ACTION - single URI relative to doc base, or occasionally javascript:
139     // 4: ON[WHATEVER] - script handler
140     // 5: SRC,LOWSRC,BACKGROUND,CITE,LONGDESC,USEMAP,PROFILE, or DATASRC
141     //    single URI relative to doc base
142     // 6: CODEBASE - a single URI relative to doc base, affecting other
143     //    attributes
144     // 7: CLASSID, DATA - a single URI relative to CODEBASE (if supplied)
145     // 8: ARCHIVE - one or more space-delimited URIs relative to CODEBASE
146     //    (if supplied)
147     // 9: CODE - a single URI relative to the CODEBASE (is specified).
148     // 10: VALUE - often includes a uri path on forms
149     // 11: STYLE - inline attribute style info
150     // 12: METHOD - form GET/POST
151     // 13: any other attribute
152     // 14: double-quote delimited attr value
153     // 15: single-quote delimited attr value
154     // 16: space-delimited attr value
155 
156     static final String WHITESPACE = "//s";
157     static final String CLASSEXT =".class";
158     static final String APPLET = "applet";
159     static final String BASE = "base";
160     static final String LINK = "link";
161     static final String FRAME = "frame";
162     static final String IFRAME = "iframe";
163 
164     public static final String ATTR_TREAT_FRAMES_AS_EMBED_LINKS =
165         "treat-frames-as-embed-links";
166     
167     public static final String ATTR_IGNORE_FORM_ACTION_URLS =
168         "ignore-form-action-urls";
169 
170     public static final String ATTR_EXTRACT_ONLY_FORM_GETS =
171         "extract-only-form-gets";
172 
173     /*** whether to try finding links in Javscript; default true */
174     public static final String ATTR_EXTRACT_JAVASCRIPT =
175         "extract-javascript";
176 
177     public static final String EXTRACT_VALUE_ATTRIBUTES =
178         "extract-value-attributes";
179     
180     public static final String ATTR_IGNORE_UNEXPECTED_HTML = 
181         "ignore-unexpected-html";
182 
183     
184     protected long numberOfCURIsHandled = 0;
185     protected long numberOfLinksExtracted = 0;
186 
187     public ExtractorHTML(String name) {
188         this(name, "HTML extractor. Extracts links from HTML documents");
189     }
190     
191     public ExtractorHTML(String name, String description) {
192         super(name, description);
193         Type t = addElementToDefinition(
194             new SimpleType(ATTR_EXTRACT_JAVASCRIPT,
195             "If true, in-page Javascript is scanned for strings that " +
196             "appear likely to be URIs. This typically finds both valid " +
197             "and invalid URIs, and attempts to fetch the invalid URIs " +
198             "sometimes generates webmaster concerns over odd crawler " +
199             "behavior. Default is true.",
200             Boolean.TRUE));
201         t.setExpertSetting(true);
202         t = addElementToDefinition(
203             new SimpleType(ATTR_TREAT_FRAMES_AS_EMBED_LINKS,
204             "If true, FRAME/IFRAME SRC-links are treated as embedded " +
205             "resources (like IMG, 'E' hop-type), otherwise they are " +
206             "treated as navigational links. Default is true.", Boolean.TRUE));
207         t.setExpertSetting(true);
208         t = addElementToDefinition(
209             new SimpleType(ATTR_IGNORE_FORM_ACTION_URLS,
210             "If true, URIs appearing as the ACTION attribute in " +
211             "HTML FORMs are ignored. Default is false.", Boolean.FALSE));
212         t.setExpertSetting(true);
213         t = addElementToDefinition(
214                 new SimpleType(ATTR_EXTRACT_ONLY_FORM_GETS,
215                 "If true, only HTML FORM ACTIONs associated with the GET "+ 
216                 "method are extracted. (Form ACTIONs with method POST "+
217                 "will be ignored. Default is true", Boolean.TRUE));
218         t.setExpertSetting(true);
219         t = addElementToDefinition(
220             new SimpleType(EXTRACT_VALUE_ATTRIBUTES,
221             "If true, strings that look like URIs found in element VALUE " +
222             "attributes (which are sometimes used as URIs by in-page " +
223             "Javascript or server-side redirects) will be extracted. " +
224             "This typically finds both valid and invalid URIs, and " +
225             "attempts to fetch the invalid URIs sometimes generate " +
226             "webmaster concerns over odd crawler behavior. Default " +
227             "is true.",
228             Boolean.TRUE));
229         t.setExpertSetting(true);
230         t = addElementToDefinition(
231             new SimpleType(ATTR_IGNORE_UNEXPECTED_HTML,
232             "If true, URIs which end in typical non-HTML extensions " +
233             "(such as .gif) will not be scanned as if it were HTML. " +
234             "Default is true.", Boolean.TRUE));
235         t.setExpertSetting(true);
236     }
237 
238     protected void processGeneralTag(CrawlURI curi, CharSequence element,
239             CharSequence cs) {
240 
241         Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs);
242 
243         // Just in case it's an OBJECT or APPLET tag
244         String codebase = null;
245         ArrayList<String> resources = null;
246         
247         // Just in case it's a FORM
248         CharSequence action = null;
249         CharSequence actionContext = null;
250         CharSequence method = null; 
251         
252         // Just in case it's a VALUE whose interpretation depends on accompanying NAME
253         CharSequence valueVal = null; 
254         CharSequence valueContext = null;
255         CharSequence nameVal = null; 
256         
257         final boolean framesAsEmbeds = ((Boolean)getUncheckedAttribute(curi,
258             ATTR_TREAT_FRAMES_AS_EMBED_LINKS)).booleanValue();
259 
260         final boolean ignoreFormActions = ((Boolean)getUncheckedAttribute(curi,
261                 ATTR_IGNORE_FORM_ACTION_URLS)).booleanValue();
262         
263         final boolean extractValueAttributes = ((Boolean)getUncheckedAttribute
264                 (curi, EXTRACT_VALUE_ATTRIBUTES)).booleanValue();
265         
266         final String elementStr = element.toString();
267         while (attr.find()) {
268             int valueGroup =
269                 (attr.start(14) > -1) ? 14 : (attr.start(15) > -1) ? 15 : 16;
270             int start = attr.start(valueGroup);
271             int end = attr.end(valueGroup);
272             assert start >= 0: "Start is: " + start + ", " + curi;
273             assert end >= 0: "End is :" + end + ", " + curi;
274             CharSequence value = cs.subSequence(start, end);
275             CharSequence attrName = cs.subSequence(attr.start(1),attr.end(1));
276             value = TextUtils.unescapeHtml(value);
277             if (attr.start(2) > -1) {
278                 // HREF
279                 CharSequence context =
280                     Link.elementContext(element, attr.group(2));
281                 if(elementStr.equalsIgnoreCase(LINK)) {
282                     // <LINK> elements treated as embeds (css, ico, etc)
283                     processEmbed(curi, value, context);
284                 } else {
285                     // other HREFs treated as links
286                     processLink(curi, value, context);
287                 }
288                 if (elementStr.equalsIgnoreCase(BASE)) {
289                     try {
290                         curi.setBaseURI(value.toString());
291                     } catch (URIException e) {
292                         if (getController() != null) {
293                             // Controller can be null: e.g. when running
294                             // ExtractorTool.
295                             getController().logUriError(e, curi.getUURI(),
296                                 value.toString());
297                         } else {
298                             logger.info("Failed set base uri: " +
299                                 curi + ", " + value.toString() + ": " +
300                                 e.getMessage());
301                         }
302                     }
303                 }
304             } else if (attr.start(3) > -1) {
305                 // ACTION
306                 if (!ignoreFormActions) {
307                     action = value; 
308                     actionContext = Link.elementContext(element,
309                         attr.group(3));
310                     // handling finished only at end (after METHOD also collected)
311                 }
312             } else if (attr.start(4) > -1) {
313                 // ON____
314                 processScriptCode(curi, value); // TODO: context?
315             } else if (attr.start(5) > -1) {
316                 // SRC etc.
317                 CharSequence context = Link.elementContext(element,
318                     attr.group(5));
319                 
320                 // true, if we expect another HTML page instead of an image etc.
321                 // TODO: add explicit 'F'rame hop type? (it's not really L, and
322                 // different enough from other 'E's)
323                 final char hopType;
324                 
325                 if(!framesAsEmbeds
326                     && (elementStr.equalsIgnoreCase(FRAME) || elementStr
327                         .equalsIgnoreCase(IFRAME))) {
328                     hopType = Link.NAVLINK_HOP;
329                 } else {
330                     hopType = Link.EMBED_HOP;
331                 }
332                 processEmbed(curi, value, context, hopType);
333             } else if (attr.start(6) > -1) {
334                 // CODEBASE
335                 codebase = (value instanceof String)?
336                     (String)value: value.toString();
337                 CharSequence context = Link.elementContext(element,
338                     attr.group(6));
339                 processEmbed(curi, codebase, context);
340             } else if (attr.start(7) > -1) {
341                 // CLASSID, DATA
342                 if (resources == null) {
343                     resources = new ArrayList<String>();
344                 }
345                 resources.add(value.toString());
346             } else if (attr.start(8) > -1) {
347                 // ARCHIVE
348                 if (resources==null) {
349                     resources = new ArrayList<String>();
350                 }
351                 String[] multi = TextUtils.split(WHITESPACE, value);
352                 for(int i = 0; i < multi.length; i++ ) {
353                     resources.add(multi[i]);
354                 }
355             } else if (attr.start(9) > -1) {
356                 // CODE
357                 if (resources==null) {
358                     resources = new ArrayList<String>();
359                 }
360                 // If element is applet and code value does not end with
361                 // '.class' then append '.class' to the code value.
362                 if (elementStr.equalsIgnoreCase(APPLET) &&
363                         !value.toString().toLowerCase().endsWith(CLASSEXT)) {
364                     resources.add(value.toString() + CLASSEXT);
365                 } else {
366                     resources.add(value.toString());
367                 }
368             } else if (attr.start(10) > -1) {
369                 // VALUE, with possibility of URI
370                 // store value, context for handling at end
371                 valueVal = value; 
372                 valueContext = Link.elementContext(element,attr.group(10));
373             } else if (attr.start(11) > -1) {
374                 // STYLE inline attribute
375                 // then, parse for URIs
376                 this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(
377                     curi, value, getController());
378                 
379             } else if (attr.start(12) > -1) {
380                 // METHOD
381                 method = value;
382                 // form processing finished at end (after ACTION also collected)
383             } else if (attr.start(13) > -1) {
384                 if("NAME".equalsIgnoreCase(attrName.toString())) {
385                     // remember 'name' for end-analysis
386                     nameVal = value; 
387                 }
388                 if("FLASHVARS".equalsIgnoreCase(attrName.toString())) {
389                     // consider FLASHVARS attribute immediately
390                     valueContext = Link.elementContext(element,attr.group(13));
391                     considerQueryStringValues(curi, value, valueContext,Link.SPECULATIVE_HOP);
392                 }
393                 // any other attribute
394                 // ignore for now
395                 // could probe for path- or script-looking strings, but
396                 // those should be vanishingly rare in other attributes,
397                 // and/or symptomatic of page bugs
398             }
399         }
400         TextUtils.recycleMatcher(attr);
401 
402         // finish handling codebase/resources now that all available
403         if (resources != null) {
404             Iterator<String> iter = resources.iterator();
405             UURI codebaseURI = null;
406             String res = null;
407             try {
408                 if (codebase != null) {
409                     // TODO: Pass in the charset.
410                     codebaseURI = UURIFactory.
411                         getInstance(curi.getUURI(), codebase);
412                 }
413                 while(iter.hasNext()) {
414                     res = iter.next().toString();
415                     res = (String) TextUtils.unescapeHtml(res);
416                     if (codebaseURI != null) {
417                         res = codebaseURI.resolve(res).toString();
418                     }
419                     processEmbed(curi, res, element); // TODO: include attribute too
420                 }
421             } catch (URIException e) {
422                 curi.addLocalizedError(getName(), e, "BAD CODEBASE " + codebase);
423             } catch (IllegalArgumentException e) {
424                 DevUtils.logger.log(Level.WARNING, "processGeneralTag()\n" +
425                     "codebase=" + codebase + " res=" + res + "\n" +
426                     DevUtils.extraInfo(), e);
427             }
428         }
429         
430         // finish handling form action, now method is available
431         if(action != null) {
432             if(method == null || "GET".equalsIgnoreCase(method.toString()) 
433                     || ! ((Boolean)getUncheckedAttribute(curi,
434                             ATTR_EXTRACT_ONLY_FORM_GETS)).booleanValue()) {
435                 processLink(curi, action, actionContext);
436             }
437         }
438         
439         // finish handling VALUE
440         if (valueVal != null) {
441             if ("PARAM".equalsIgnoreCase(elementStr) && nameVal != null
442                     && "flashvars".equalsIgnoreCase(nameVal.toString())) {
443                 // special handling for <PARAM NAME='flashvars" VALUE="">
444                 String queryStringLike = valueVal.toString();
445                 // treat value as query-string-like "key=value[&key=value]*" pairings
446                 considerQueryStringValues(curi, queryStringLike, valueContext,Link.SPECULATIVE_HOP);
447             } else {
448                 // regular VALUE handling
449                 if (extractValueAttributes) {
450                     considerIfLikelyUri(curi,valueVal,valueContext,Link.NAVLINK_HOP);
451                 }
452             }
453         }
454     }
455 
456     /***
457      * Consider a query-string-like collections of key=value[&key=value]
458      * pairs for URI-like strings in the values. Where URI-like strings are
459      * found, add as discovered outlink. 
460      * 
461      * @param curi origin CrawlURI
462      * @param queryString query-string-like string
463      * @param valueContext page context where found
464      */
465     protected void considerQueryStringValues(CrawlURI curi,
466             CharSequence queryString, CharSequence valueContext, char hopType) {
467         for (String pairString : queryString.toString().split("&")) {
468             String[] encodedKeyVal = pairString.split("=");
469             if (encodedKeyVal.length == 2) try {
470                 String value = URLDecoder.decode(encodedKeyVal[1], "UTF-8");
471                 considerIfLikelyUri(curi, value, valueContext, hopType);
472             } catch (IllegalArgumentException e) {
473                 // still consider values rejected by URLDecoder
474                 considerIfLikelyUri(curi, encodedKeyVal[1], valueContext, hopType);
475             } catch (UnsupportedEncodingException e) {
476                 logger.log(Level.SEVERE,"all jvms must support UTF-8, and yet somehow this happened",e);
477             }
478         }
479     }
480 
481     /***
482      * Consider whether a given string is URI-like. If so, add as discovered 
483      * outlink. 
484      * 
485      * @param curi origin CrawlURI
486      * @param queryString query-string-like string
487      * @param valueContext page context where found
488 
489      */
490     protected void considerIfLikelyUri(CrawlURI curi, CharSequence candidate, 
491             CharSequence valueContext, char hopType) {
492         if(UriUtils.isLikelyUriHtmlContextLegacy(candidate)) {
493             addLinkFromString(curi,candidate,valueContext,hopType);
494         }
495     }
496 
497     /***
498      * Extract the (java)script source in the given CharSequence. 
499      * 
500      * @param curi source CrawlURI
501      * @param cs CharSequence of javascript code
502      */
503     protected void processScriptCode(CrawlURI curi, CharSequence cs) {
504         if((Boolean)getUncheckedAttribute(curi, ATTR_EXTRACT_JAVASCRIPT)) {
505             this.numberOfLinksExtracted +=
506                 ExtractorJS.considerStrings(curi, cs, getController(), false);
507         } // else do nothing
508     }
509 
510     static final String JAVASCRIPT = "(?i)^javascript:.*";
511 
512     /***
513      * Handle generic HREF cases.
514      * 
515      * @param curi
516      * @param value
517      * @param context
518      */
519     protected void processLink(CrawlURI curi, final CharSequence value,
520             CharSequence context) {
521         if (TextUtils.matches(JAVASCRIPT, value)) {
522             processScriptCode(curi, value. subSequence(11, value.length()));
523         } else {    
524             if (logger.isLoggable(Level.FINEST)) {
525                 logger.finest("link: " + value.toString() + " from " + curi);
526             }
527             addLinkFromString(curi, value, context, Link.NAVLINK_HOP);
528             this.numberOfLinksExtracted++;
529         }
530     }
531 
532     protected void addLinkFromString(CrawlURI curi, CharSequence uri,
533             CharSequence context, char hopType) {
534         try {
535             // We do a 'toString' on context because its a sequence from
536             // the underlying ReplayCharSequence and the link its about
537             // to become a part of is expected to outlive the current
538             // ReplayCharSequence.
539             curi.createAndAddLinkRelativeToBase(uri.toString(), context.toString(),
540                 hopType);
541         } catch (URIException e) {
542             if (getController() != null) {
543                 getController().logUriError(e, curi.getUURI(), uri);
544             } else {
545                 logger.info("Failed createAndAddLinkRelativeToBase " +
546                     curi + ", " + uri + ", " + context + ", " + hopType +
547                     ": " + e);
548             }
549         }
550     }
551 
552     protected final void processEmbed(CrawlURI curi, CharSequence value,
553             CharSequence context) {
554         processEmbed(curi, value, context, Link.EMBED_HOP);
555     }
556 
557     protected void processEmbed(CrawlURI curi, final CharSequence value,
558             CharSequence context, char hopType) {
559         if (logger.isLoggable(Level.FINEST)) {
560             logger.finest("embed (" + hopType + "): " + value.toString() +
561                 " from " + curi);
562         }
563         addLinkFromString(curi,
564             (value instanceof String)?
565                 (String)value: value.toString(),
566             context, hopType);
567         this.numberOfLinksExtracted++;
568     }
569 
570     public void extract(CrawlURI curi) {
571         if (!isHttpTransactionContentToProcess(curi) ||
572                 ! (isExpectedMimeType(curi.getContentType(), "text/html")
573                    || isExpectedMimeType(curi.getContentType(), "application/xhtml")
574                    || isExpectedMimeType(curi.getContentType(), "text/vnd.wap.wml")
575                    || isExpectedMimeType(curi.getContentType(), "application/vnd.wap.wml")
576                    || isExpectedMimeType(curi.getContentType(), "application/vnd.wap.xhtml"))) {
577             return;
578         }
579 
580         final boolean ignoreUnexpectedHTML =
581              ((Boolean)getUncheckedAttribute(curi, 
582                  ATTR_IGNORE_UNEXPECTED_HTML)).booleanValue();        
583 
584         if (ignoreUnexpectedHTML) {
585             try {
586                 if(!isHtmlExpectedHere(curi)) {
587                     // HTML was not expected (eg a GIF was expected) so ignore
588                     // (as if a soft 404)
589                     return;
590                 }
591             } catch (URIException e) {
592                 logger.severe("Failed expectedHTML test: " + e.getMessage());
593             }
594         }
595 
596         this.numberOfCURIsHandled++;
597 
598         ReplayCharSequence cs = null;
599         
600         try {
601            HttpRecorder hr = curi.getHttpRecorder();
602            if (hr == null) {
603                throw new IOException("Why is recorder null here?");
604            }
605            cs = hr.getReplayCharSequence();
606         } catch (IOException e) {
607             curi.addLocalizedError(this.getName(), e,
608                 "Failed get of replay char sequence " + curi.toString() +
609                     " " + e.getMessage());
610             logger.log(Level.SEVERE,"Failed get of replay char sequence in " +
611                 Thread.currentThread().getName(), e);
612         }
613         
614         if (cs == null) {
615             return;
616         }
617 
618         // We have a ReplayCharSequence open.  Wrap all in finally so we
619         // for sure close it before we leave.
620         try {
621             // Extract all links from the charsequence
622             extract(curi, cs);
623             // Set flag to indicate that link extraction is completed.
624             curi.linkExtractorFinished();
625         } finally {
626             if (cs != null) {
627                 try {
628                     cs.close();
629                 } catch (IOException ioe) {
630                     logger.warning(TextUtils.exceptionToString(
631                         "Failed close of ReplayCharSequence.", ioe));
632                 }
633             }
634         }
635     }
636 
637     /***
638      * Run extractor.
639      * This method is package visible to ease testing.
640      * @param curi CrawlURI we're processing.
641      * @param cs Sequence from underlying ReplayCharSequence. This
642      * is TRANSIENT data. Make a copy if you want the data to live outside
643      * of this extractors' lifetime.
644      */
645     void extract(CrawlURI curi, CharSequence cs) {
646         Matcher tags = TextUtils.getMatcher(RELEVANT_TAG_EXTRACTOR, cs);
647         while(tags.find()) {
648             if(Thread.interrupted()){
649                 break;
650             }
651             if (tags.start(8) > 0) {
652                 // comment match
653                 // for now do nothing
654             } else if (tags.start(7) > 0) {
655                 // <meta> match
656                 int start = tags.start(5);
657                 int end = tags.end(5);
658                 assert start >= 0: "Start is: " + start + ", " + curi;
659                 assert end >= 0: "End is :" + end + ", " + curi;
660                 if (processMeta(curi,
661                     cs.subSequence(start, end))) {
662 
663                     // meta tag included NOFOLLOW; abort processing
664                     break;
665                 }
666             } else if (tags.start(5) > 0) {
667                 // generic <whatever> match
668                 int start5 = tags.start(5);
669                 int end5 = tags.end(5);
670                 assert start5 >= 0: "Start is: " + start5 + ", " + curi;
671                 assert end5 >= 0: "End is :" + end5 + ", " + curi;
672                 int start6 = tags.start(6);
673                 int end6 = tags.end(6);
674                 assert start6 >= 0: "Start is: " + start6 + ", " + curi;
675                 assert end6 >= 0: "End is :" + end6 + ", " + curi;
676                 processGeneralTag(curi,
677                     cs.subSequence(start6, end6),
678                     cs.subSequence(start5, end5));
679 
680             } else if (tags.start(1) > 0) {
681                 // <script> match
682                 int start = tags.start(1);
683                 int end = tags.end(1);
684                 assert start >= 0: "Start is: " + start + ", " + curi;
685                 assert end >= 0: "End is :" + end + ", " + curi;
686                 assert tags.end(2) >= 0: "Tags.end(2) illegal " + tags.end(2) +
687                     ", " + curi;
688                 processScript(curi, cs.subSequence(start, end),
689                     tags.end(2) - start);
690 
691             } else if (tags.start(3) > 0){
692                 // <style... match
693                 int start = tags.start(3);
694                 int end = tags.end(3);
695                 assert start >= 0: "Start is: " + start + ", " + curi;
696                 assert end >= 0: "End is :" + end + ", " + curi;
697                 assert tags.end(4) >= 0: "Tags.end(4) illegal " + tags.end(4) +
698                     ", " + curi;
699                 processStyle(curi, cs.subSequence(start, end),
700                     tags.end(4) - start);
701             }
702         }
703         TextUtils.recycleMatcher(tags);
704     }
705 
706 
707     static final String NON_HTML_PATH_EXTENSION =
708         "(?i)(gif)|(jp(e)?g)|(png)|(tif(f)?)|(bmp)|(avi)|(mov)|(mp(e)?g)"+
709         "|(mp3)|(mp4)|(swf)|(wav)|(au)|(aiff)|(mid)";
710 
711     /***
712      * Test whether this HTML is so unexpected (eg in place of a GIF URI)
713      * that it shouldn't be scanned for links.
714      *
715      * @param curi CrawlURI to examine.
716      * @return True if HTML is acceptable/expected here
717      * @throws URIException
718      */
719     protected boolean isHtmlExpectedHere(CrawlURI curi) throws URIException {
720         String path = curi.getUURI().getPath();
721         if(path==null) {
722             // no path extension, HTML is fine
723             return true;
724         }
725         int dot = path.lastIndexOf('.');
726         if (dot < 0) {
727             // no path extension, HTML is fine
728             return true;
729         }
730         if(dot<(path.length()-5)) {
731             // extension too long to recognize, HTML is fine
732             return true;
733         }
734         String ext = path.substring(dot+1);
735         return ! TextUtils.matches(NON_HTML_PATH_EXTENSION, ext);
736     }
737 
738     protected void processScript(CrawlURI curi, CharSequence sequence,
739             int endOfOpenTag) {
740         // first, get attributes of script-open tag
741         // as per any other tag
742         processGeneralTag(curi,sequence.subSequence(0,6),
743             sequence.subSequence(0,endOfOpenTag));
744 
745         // then, apply best-effort string-analysis heuristics
746         // against any code present (false positives are OK)
747         processScriptCode(
748             curi, sequence.subSequence(endOfOpenTag, sequence.length()));
749     }
750 
751     /***
752      * Process metadata tags.
753      * @param curi CrawlURI we're processing.
754      * @param cs Sequence from underlying ReplayCharSequence. This
755      * is TRANSIENT data. Make a copy if you want the data to live outside
756      * of this extractors' lifetime.
757      * @return True robots exclusion metatag.
758      */
759     protected boolean processMeta(CrawlURI curi, CharSequence cs) {
760         Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs);
761         String name = null;
762         String httpEquiv = null;
763         String content = null;
764         while (attr.find()) {
765             int valueGroup =
766                 (attr.start(14) > -1) ? 14 : (attr.start(15) > -1) ? 15 : 16;
767             CharSequence value =
768                 cs.subSequence(attr.start(valueGroup), attr.end(valueGroup));
769             value = TextUtils.unescapeHtml(value);
770             if (attr.group(1).equalsIgnoreCase("name")) {
771                 name = value.toString();
772             } else if (attr.group(1).equalsIgnoreCase("http-equiv")) {
773                 httpEquiv = value.toString();
774             } else if (attr.group(1).equalsIgnoreCase("content")) {
775                 content = value.toString();
776             }
777             // TODO: handle other stuff
778         }
779         TextUtils.recycleMatcher(attr);
780 
781         // Look for the 'robots' meta-tag
782         if("robots".equalsIgnoreCase(name) && content != null ) {
783             curi.putString(A_META_ROBOTS, content);
784             RobotsHonoringPolicy policy =
785                 getSettingsHandler().getOrder().getRobotsHonoringPolicy();
786             String contentLower = content.toLowerCase();
787             if ((policy == null
788                 || (!policy.isType(curi, RobotsHonoringPolicy.IGNORE)
789                     && !policy.isType(curi, RobotsHonoringPolicy.CUSTOM)))
790                 && (contentLower.indexOf("nofollow") >= 0
791                     || contentLower.indexOf("none") >= 0)) {
792                 // if 'nofollow' or 'none' is specified and the
793                 // honoring policy is not IGNORE or CUSTOM, end html extraction
794                 logger.fine("HTML extraction skipped due to robots meta-tag for: "
795                                 + curi.toString());
796                 return true;
797             }
798         } else if ("refresh".equalsIgnoreCase(httpEquiv) && content != null) {
799             int urlIndex = content.indexOf("=") + 1;
800             if(urlIndex>0) {
801                 String refreshUri = content.substring(urlIndex);
802                 try {
803                     curi.createAndAddLinkRelativeToBase(refreshUri, "meta",
804                         Link.REFER_HOP);
805                 } catch (URIException e) {
806                     if (getController() != null) {
807                         getController().logUriError(e, curi.getUURI(), refreshUri);
808                     } else {
809                         logger.info("Failed createAndAddLinkRelativeToBase " +
810                             curi + ", " + cs + ", " + refreshUri + ": " + e);
811                     }
812                 }
813             }
814         }
815         return false;
816     }
817 
818     /***
819      * Process style text.
820      * @param curi CrawlURI we're processing.
821      * @param sequence Sequence from underlying ReplayCharSequence. This
822      * is TRANSIENT data. Make a copy if you want the data to live outside
823      * of this extractors' lifetime.
824      * @param endOfOpenTag
825      */
826     protected void processStyle(CrawlURI curi, CharSequence sequence,
827             int endOfOpenTag) {
828         // First, get attributes of script-open tag as per any other tag.
829         processGeneralTag(curi, sequence.subSequence(0,6),
830             sequence.subSequence(0,endOfOpenTag));
831 
832         // then, parse for URIs
833         this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(
834             curi, sequence.subSequence(endOfOpenTag,sequence.length()),
835                 getController());
836     }
837     
838 
839 
840     /* (non-Javadoc)
841      * @see org.archive.crawler.framework.Processor#report()
842      */
843     public String report() {
844         StringBuffer ret = new StringBuffer();
845         ret.append("Processor: org.archive.crawler.extractor.ExtractorHTML\n");
846         ret.append("  Function:          Link extraction on HTML documents\n");
847         ret.append("  CrawlURIs handled: " + this.numberOfCURIsHandled + "\n");
848         ret.append("  Links extracted:   " + this.numberOfLinksExtracted +
849             "\n\n");
850         return ret.toString();
851     }
852 }
853