1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 package org.archive.crawler.extractor;
25
26 import java.io.IOException;
27 import java.io.UnsupportedEncodingException;
28 import java.net.URLDecoder;
29 import java.util.ArrayList;
30 import java.util.Iterator;
31 import java.util.logging.Level;
32 import java.util.logging.Logger;
33 import java.util.regex.Matcher;
34
35 import org.apache.commons.httpclient.URIException;
36 import org.archive.crawler.datamodel.CoreAttributeConstants;
37 import org.archive.crawler.datamodel.CrawlURI;
38 import org.archive.crawler.datamodel.RobotsHonoringPolicy;
39 import org.archive.crawler.settings.SimpleType;
40 import org.archive.crawler.settings.Type;
41 import org.archive.io.ReplayCharSequence;
42 import org.archive.net.UURI;
43 import org.archive.net.UURIFactory;
44 import org.archive.util.DevUtils;
45 import org.archive.util.HttpRecorder;
46 import org.archive.util.TextUtils;
47 import org.archive.util.UriUtils;
48
49 /***
50 * Basic link-extraction, from an HTML content-body,
51 * using regular expressions.
52 *
53 * @author gojomo
54 *
55 */
56 public class ExtractorHTML extends Extractor
57 implements CoreAttributeConstants {
58
59 private static final long serialVersionUID = 5855731422080471017L;
60
61 private static Logger logger =
62 Logger.getLogger(ExtractorHTML.class.getName());
63
64 /***
65 * Compiled relevant tag extractor.
66 *
67 * <p>
68 * This pattern extracts either:
69 * <li> (1) whole <script>...</script> or
70 * <li> (2) <style>...</style> or
71 * <li> (3) <meta ...> or
72 * <li> (4) any other open-tag with at least one attribute
73 * (eg matches "<a href='boo'>" but not "</a>" or "<br>")
74 * <p>
75 * groups:
76 * <li> 1: SCRIPT SRC=foo>boo</SCRIPT
77 * <li> 2: just script open tag
78 * <li> 3: STYLE TYPE=moo>zoo</STYLE
79 * <li> 4: just style open tag
80 * <li> 5: entire other tag, without '<' '>'
81 * <li> 6: element
82 * <li> 7: META
83 * <li> 8: !-- comment --
84 */
85
86 private static final int MAX_ELEMENT_LENGTH =
87 Integer.parseInt(System.getProperty(ExtractorHTML.class.getName() +
88 ".maxElementNameLength", "1024"));
89
90 static final String RELEVANT_TAG_EXTRACTOR =
91 "(?is)<(?:((script[^>]*+)>.*?</script)" +
92 "|((style[^>]*+)>.*?</style)" +
93 "|(((meta)|(?://w{1,"+MAX_ELEMENT_LENGTH+"}))//s+[^>]*+)" +
94 "|(!--.*?--))>";
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111 private static final int MAX_ATTR_NAME_LENGTH =
112 Integer.parseInt(System.getProperty(ExtractorHTML.class.getName() +
113 ".maxAttributeNameLength", "1024"));
114
115 static final int MAX_ATTR_VAL_LENGTH =
116 Integer.parseInt(System.getProperty(ExtractorHTML.class.getName() +
117 ".maxAttributeValueLength", "16384"));
118
119
120
121
122
123
124 static final String EACH_ATTRIBUTE_EXTRACTOR =
125 "(?is)//b((href)|(action)|(on//w*)"
126 +"|((?:src)|(?:lowsrc)|(?:background)|(?:cite)|(?:longdesc)"
127 +"|(?:usemap)|(?:profile)|(?:datasrc))"
128 +"|(codebase)|((?:classid)|(?:data))|(archive)|(code)"
129 +"|(value)|(style)|(method)"
130 +"|([-//w]{1,"+MAX_ATTR_NAME_LENGTH+"}))"
131 +"//s*=//s*"
132 +"(?:(?:\"(.{0,"+MAX_ATTR_VAL_LENGTH+"}?)(?:\"|$))"
133 +"|(?:'(.{0,"+MAX_ATTR_VAL_LENGTH+"}?)(?:'|$))"
134 +"|(//S{1,"+MAX_ATTR_VAL_LENGTH+"}))";
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156 static final String WHITESPACE = "//s";
157 static final String CLASSEXT =".class";
158 static final String APPLET = "applet";
159 static final String BASE = "base";
160 static final String LINK = "link";
161 static final String FRAME = "frame";
162 static final String IFRAME = "iframe";
163
164 public static final String ATTR_TREAT_FRAMES_AS_EMBED_LINKS =
165 "treat-frames-as-embed-links";
166
167 public static final String ATTR_IGNORE_FORM_ACTION_URLS =
168 "ignore-form-action-urls";
169
170 public static final String ATTR_EXTRACT_ONLY_FORM_GETS =
171 "extract-only-form-gets";
172
173 /*** whether to try finding links in Javscript; default true */
174 public static final String ATTR_EXTRACT_JAVASCRIPT =
175 "extract-javascript";
176
177 public static final String EXTRACT_VALUE_ATTRIBUTES =
178 "extract-value-attributes";
179
180 public static final String ATTR_IGNORE_UNEXPECTED_HTML =
181 "ignore-unexpected-html";
182
183
184 protected long numberOfCURIsHandled = 0;
185 protected long numberOfLinksExtracted = 0;
186
/**
 * Creates the extractor with the stock description text.
 *
 * @param name name handed to the two-argument constructor
 */
public ExtractorHTML(String name) {
    this(name, "HTML extractor. Extracts links from HTML documents");
}

/**
 * Creates the extractor and registers its six boolean settings, each of
 * which is flagged as an expert setting.
 *
 * @param name name handed to the superclass
 * @param description description handed to the superclass
 */
public ExtractorHTML(String name, String description) {
    super(name, description);

    // Each setting is registered and immediately marked expert.
    addElementToDefinition(
        new SimpleType(ATTR_EXTRACT_JAVASCRIPT,
            "If true, in-page Javascript is scanned for strings that " +
            "appear likely to be URIs. This typically finds both valid " +
            "and invalid URIs, and attempts to fetch the invalid URIs " +
            "sometimes generates webmaster concerns over odd crawler " +
            "behavior. Default is true.",
            Boolean.TRUE))
        .setExpertSetting(true);

    addElementToDefinition(
        new SimpleType(ATTR_TREAT_FRAMES_AS_EMBED_LINKS,
            "If true, FRAME/IFRAME SRC-links are treated as embedded " +
            "resources (like IMG, 'E' hop-type), otherwise they are " +
            "treated as navigational links. Default is true.",
            Boolean.TRUE))
        .setExpertSetting(true);

    addElementToDefinition(
        new SimpleType(ATTR_IGNORE_FORM_ACTION_URLS,
            "If true, URIs appearing as the ACTION attribute in " +
            "HTML FORMs are ignored. Default is false.",
            Boolean.FALSE))
        .setExpertSetting(true);

    addElementToDefinition(
        new SimpleType(ATTR_EXTRACT_ONLY_FORM_GETS,
            "If true, only HTML FORM ACTIONs associated with the GET "+
            "method are extracted. (Form ACTIONs with method POST "+
            "will be ignored. Default is true",
            Boolean.TRUE))
        .setExpertSetting(true);

    addElementToDefinition(
        new SimpleType(EXTRACT_VALUE_ATTRIBUTES,
            "If true, strings that look like URIs found in element VALUE " +
            "attributes (which are sometimes used as URIs by in-page " +
            "Javascript or server-side redirects) will be extracted. " +
            "This typically finds both valid and invalid URIs, and " +
            "attempts to fetch the invalid URIs sometimes generate " +
            "webmaster concerns over odd crawler behavior. Default " +
            "is true.",
            Boolean.TRUE))
        .setExpertSetting(true);

    addElementToDefinition(
        new SimpleType(ATTR_IGNORE_UNEXPECTED_HTML,
            "If true, URIs which end in typical non-HTML extensions " +
            "(such as .gif) will not be scanned as if it were HTML. " +
            "Default is true.",
            Boolean.TRUE))
        .setExpertSetting(true);
}
237
238 protected void processGeneralTag(CrawlURI curi, CharSequence element,
239 CharSequence cs) {
240
241 Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs);
242
243
244 String codebase = null;
245 ArrayList<String> resources = null;
246
247
248 CharSequence action = null;
249 CharSequence actionContext = null;
250 CharSequence method = null;
251
252
253 CharSequence valueVal = null;
254 CharSequence valueContext = null;
255 CharSequence nameVal = null;
256
257 final boolean framesAsEmbeds = ((Boolean)getUncheckedAttribute(curi,
258 ATTR_TREAT_FRAMES_AS_EMBED_LINKS)).booleanValue();
259
260 final boolean ignoreFormActions = ((Boolean)getUncheckedAttribute(curi,
261 ATTR_IGNORE_FORM_ACTION_URLS)).booleanValue();
262
263 final boolean extractValueAttributes = ((Boolean)getUncheckedAttribute
264 (curi, EXTRACT_VALUE_ATTRIBUTES)).booleanValue();
265
266 final String elementStr = element.toString();
267 while (attr.find()) {
268 int valueGroup =
269 (attr.start(14) > -1) ? 14 : (attr.start(15) > -1) ? 15 : 16;
270 int start = attr.start(valueGroup);
271 int end = attr.end(valueGroup);
272 assert start >= 0: "Start is: " + start + ", " + curi;
273 assert end >= 0: "End is :" + end + ", " + curi;
274 CharSequence value = cs.subSequence(start, end);
275 CharSequence attrName = cs.subSequence(attr.start(1),attr.end(1));
276 value = TextUtils.unescapeHtml(value);
277 if (attr.start(2) > -1) {
278
279 CharSequence context =
280 Link.elementContext(element, attr.group(2));
281 if(elementStr.equalsIgnoreCase(LINK)) {
282
283 processEmbed(curi, value, context);
284 } else {
285
286 processLink(curi, value, context);
287 }
288 if (elementStr.equalsIgnoreCase(BASE)) {
289 try {
290 curi.setBaseURI(value.toString());
291 } catch (URIException e) {
292 if (getController() != null) {
293
294
295 getController().logUriError(e, curi.getUURI(),
296 value.toString());
297 } else {
298 logger.info("Failed set base uri: " +
299 curi + ", " + value.toString() + ": " +
300 e.getMessage());
301 }
302 }
303 }
304 } else if (attr.start(3) > -1) {
305
306 if (!ignoreFormActions) {
307 action = value;
308 actionContext = Link.elementContext(element,
309 attr.group(3));
310
311 }
312 } else if (attr.start(4) > -1) {
313
314 processScriptCode(curi, value);
315 } else if (attr.start(5) > -1) {
316
317 CharSequence context = Link.elementContext(element,
318 attr.group(5));
319
320
321
322
323 final char hopType;
324
325 if(!framesAsEmbeds
326 && (elementStr.equalsIgnoreCase(FRAME) || elementStr
327 .equalsIgnoreCase(IFRAME))) {
328 hopType = Link.NAVLINK_HOP;
329 } else {
330 hopType = Link.EMBED_HOP;
331 }
332 processEmbed(curi, value, context, hopType);
333 } else if (attr.start(6) > -1) {
334
335 codebase = (value instanceof String)?
336 (String)value: value.toString();
337 CharSequence context = Link.elementContext(element,
338 attr.group(6));
339 processEmbed(curi, codebase, context);
340 } else if (attr.start(7) > -1) {
341
342 if (resources == null) {
343 resources = new ArrayList<String>();
344 }
345 resources.add(value.toString());
346 } else if (attr.start(8) > -1) {
347
348 if (resources==null) {
349 resources = new ArrayList<String>();
350 }
351 String[] multi = TextUtils.split(WHITESPACE, value);
352 for(int i = 0; i < multi.length; i++ ) {
353 resources.add(multi[i]);
354 }
355 } else if (attr.start(9) > -1) {
356
357 if (resources==null) {
358 resources = new ArrayList<String>();
359 }
360
361
362 if (elementStr.equalsIgnoreCase(APPLET) &&
363 !value.toString().toLowerCase().endsWith(CLASSEXT)) {
364 resources.add(value.toString() + CLASSEXT);
365 } else {
366 resources.add(value.toString());
367 }
368 } else if (attr.start(10) > -1) {
369
370
371 valueVal = value;
372 valueContext = Link.elementContext(element,attr.group(10));
373 } else if (attr.start(11) > -1) {
374
375
376 this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(
377 curi, value, getController());
378
379 } else if (attr.start(12) > -1) {
380
381 method = value;
382
383 } else if (attr.start(13) > -1) {
384 if("NAME".equalsIgnoreCase(attrName.toString())) {
385
386 nameVal = value;
387 }
388 if("FLASHVARS".equalsIgnoreCase(attrName.toString())) {
389
390 valueContext = Link.elementContext(element,attr.group(13));
391 considerQueryStringValues(curi, value, valueContext,Link.SPECULATIVE_HOP);
392 }
393
394
395
396
397
398 }
399 }
400 TextUtils.recycleMatcher(attr);
401
402
403 if (resources != null) {
404 Iterator<String> iter = resources.iterator();
405 UURI codebaseURI = null;
406 String res = null;
407 try {
408 if (codebase != null) {
409
410 codebaseURI = UURIFactory.
411 getInstance(curi.getUURI(), codebase);
412 }
413 while(iter.hasNext()) {
414 res = iter.next().toString();
415 res = (String) TextUtils.unescapeHtml(res);
416 if (codebaseURI != null) {
417 res = codebaseURI.resolve(res).toString();
418 }
419 processEmbed(curi, res, element);
420 }
421 } catch (URIException e) {
422 curi.addLocalizedError(getName(), e, "BAD CODEBASE " + codebase);
423 } catch (IllegalArgumentException e) {
424 DevUtils.logger.log(Level.WARNING, "processGeneralTag()\n" +
425 "codebase=" + codebase + " res=" + res + "\n" +
426 DevUtils.extraInfo(), e);
427 }
428 }
429
430
431 if(action != null) {
432 if(method == null || "GET".equalsIgnoreCase(method.toString())
433 || ! ((Boolean)getUncheckedAttribute(curi,
434 ATTR_EXTRACT_ONLY_FORM_GETS)).booleanValue()) {
435 processLink(curi, action, actionContext);
436 }
437 }
438
439
440 if (valueVal != null) {
441 if ("PARAM".equalsIgnoreCase(elementStr) && nameVal != null
442 && "flashvars".equalsIgnoreCase(nameVal.toString())) {
443
444 String queryStringLike = valueVal.toString();
445
446 considerQueryStringValues(curi, queryStringLike, valueContext,Link.SPECULATIVE_HOP);
447 } else {
448
449 if (extractValueAttributes) {
450 considerIfLikelyUri(curi,valueVal,valueContext,Link.NAVLINK_HOP);
451 }
452 }
453 }
454 }
455
456 /***
457 * Consider a query-string-like collections of key=value[&key=value]
458 * pairs for URI-like strings in the values. Where URI-like strings are
459 * found, add as discovered outlink.
460 *
461 * @param curi origin CrawlURI
462 * @param queryString query-string-like string
463 * @param valueContext page context where found
464 */
465 protected void considerQueryStringValues(CrawlURI curi,
466 CharSequence queryString, CharSequence valueContext, char hopType) {
467 for (String pairString : queryString.toString().split("&")) {
468 String[] encodedKeyVal = pairString.split("=");
469 if (encodedKeyVal.length == 2) try {
470 String value = URLDecoder.decode(encodedKeyVal[1], "UTF-8");
471 considerIfLikelyUri(curi, value, valueContext, hopType);
472 } catch (IllegalArgumentException e) {
473
474 considerIfLikelyUri(curi, encodedKeyVal[1], valueContext, hopType);
475 } catch (UnsupportedEncodingException e) {
476 logger.log(Level.SEVERE,"all jvms must support UTF-8, and yet somehow this happened",e);
477 }
478 }
479 }
480
481 /***
482 * Consider whether a given string is URI-like. If so, add as discovered
483 * outlink.
484 *
485 * @param curi origin CrawlURI
486 * @param queryString query-string-like string
487 * @param valueContext page context where found
488
489 */
490 protected void considerIfLikelyUri(CrawlURI curi, CharSequence candidate,
491 CharSequence valueContext, char hopType) {
492 if(UriUtils.isLikelyUriHtmlContextLegacy(candidate)) {
493 addLinkFromString(curi,candidate,valueContext,hopType);
494 }
495 }
496
497 /***
498 * Extract the (java)script source in the given CharSequence.
499 *
500 * @param curi source CrawlURI
501 * @param cs CharSequence of javascript code
502 */
503 protected void processScriptCode(CrawlURI curi, CharSequence cs) {
504 if((Boolean)getUncheckedAttribute(curi, ATTR_EXTRACT_JAVASCRIPT)) {
505 this.numberOfLinksExtracted +=
506 ExtractorJS.considerStrings(curi, cs, getController(), false);
507 }
508 }
509
510 static final String JAVASCRIPT = "(?i)^javascript:.*";
511
512 /***
513 * Handle generic HREF cases.
514 *
515 * @param curi
516 * @param value
517 * @param context
518 */
519 protected void processLink(CrawlURI curi, final CharSequence value,
520 CharSequence context) {
521 if (TextUtils.matches(JAVASCRIPT, value)) {
522 processScriptCode(curi, value. subSequence(11, value.length()));
523 } else {
524 if (logger.isLoggable(Level.FINEST)) {
525 logger.finest("link: " + value.toString() + " from " + curi);
526 }
527 addLinkFromString(curi, value, context, Link.NAVLINK_HOP);
528 this.numberOfLinksExtracted++;
529 }
530 }
531
532 protected void addLinkFromString(CrawlURI curi, CharSequence uri,
533 CharSequence context, char hopType) {
534 try {
535
536
537
538
539 curi.createAndAddLinkRelativeToBase(uri.toString(), context.toString(),
540 hopType);
541 } catch (URIException e) {
542 if (getController() != null) {
543 getController().logUriError(e, curi.getUURI(), uri);
544 } else {
545 logger.info("Failed createAndAddLinkRelativeToBase " +
546 curi + ", " + uri + ", " + context + ", " + hopType +
547 ": " + e);
548 }
549 }
550 }
551
552 protected final void processEmbed(CrawlURI curi, CharSequence value,
553 CharSequence context) {
554 processEmbed(curi, value, context, Link.EMBED_HOP);
555 }
556
557 protected void processEmbed(CrawlURI curi, final CharSequence value,
558 CharSequence context, char hopType) {
559 if (logger.isLoggable(Level.FINEST)) {
560 logger.finest("embed (" + hopType + "): " + value.toString() +
561 " from " + curi);
562 }
563 addLinkFromString(curi,
564 (value instanceof String)?
565 (String)value: value.toString(),
566 context, hopType);
567 this.numberOfLinksExtracted++;
568 }
569
570 public void extract(CrawlURI curi) {
571 if (!isHttpTransactionContentToProcess(curi) ||
572 ! (isExpectedMimeType(curi.getContentType(), "text/html")
573 || isExpectedMimeType(curi.getContentType(), "application/xhtml")
574 || isExpectedMimeType(curi.getContentType(), "text/vnd.wap.wml")
575 || isExpectedMimeType(curi.getContentType(), "application/vnd.wap.wml")
576 || isExpectedMimeType(curi.getContentType(), "application/vnd.wap.xhtml"))) {
577 return;
578 }
579
580 final boolean ignoreUnexpectedHTML =
581 ((Boolean)getUncheckedAttribute(curi,
582 ATTR_IGNORE_UNEXPECTED_HTML)).booleanValue();
583
584 if (ignoreUnexpectedHTML) {
585 try {
586 if(!isHtmlExpectedHere(curi)) {
587
588
589 return;
590 }
591 } catch (URIException e) {
592 logger.severe("Failed expectedHTML test: " + e.getMessage());
593 }
594 }
595
596 this.numberOfCURIsHandled++;
597
598 ReplayCharSequence cs = null;
599
600 try {
601 HttpRecorder hr = curi.getHttpRecorder();
602 if (hr == null) {
603 throw new IOException("Why is recorder null here?");
604 }
605 cs = hr.getReplayCharSequence();
606 } catch (IOException e) {
607 curi.addLocalizedError(this.getName(), e,
608 "Failed get of replay char sequence " + curi.toString() +
609 " " + e.getMessage());
610 logger.log(Level.SEVERE,"Failed get of replay char sequence in " +
611 Thread.currentThread().getName(), e);
612 }
613
614 if (cs == null) {
615 return;
616 }
617
618
619
620 try {
621
622 extract(curi, cs);
623
624 curi.linkExtractorFinished();
625 } finally {
626 if (cs != null) {
627 try {
628 cs.close();
629 } catch (IOException ioe) {
630 logger.warning(TextUtils.exceptionToString(
631 "Failed close of ReplayCharSequence.", ioe));
632 }
633 }
634 }
635 }
636
637 /***
638 * Run extractor.
639 * This method is package visible to ease testing.
640 * @param curi CrawlURI we're processing.
641 * @param cs Sequence from underlying ReplayCharSequence. This
642 * is TRANSIENT data. Make a copy if you want the data to live outside
643 * of this extractors' lifetime.
644 */
645 void extract(CrawlURI curi, CharSequence cs) {
646 Matcher tags = TextUtils.getMatcher(RELEVANT_TAG_EXTRACTOR, cs);
647 while(tags.find()) {
648 if(Thread.interrupted()){
649 break;
650 }
651 if (tags.start(8) > 0) {
652
653
654 } else if (tags.start(7) > 0) {
655
656 int start = tags.start(5);
657 int end = tags.end(5);
658 assert start >= 0: "Start is: " + start + ", " + curi;
659 assert end >= 0: "End is :" + end + ", " + curi;
660 if (processMeta(curi,
661 cs.subSequence(start, end))) {
662
663
664 break;
665 }
666 } else if (tags.start(5) > 0) {
667
668 int start5 = tags.start(5);
669 int end5 = tags.end(5);
670 assert start5 >= 0: "Start is: " + start5 + ", " + curi;
671 assert end5 >= 0: "End is :" + end5 + ", " + curi;
672 int start6 = tags.start(6);
673 int end6 = tags.end(6);
674 assert start6 >= 0: "Start is: " + start6 + ", " + curi;
675 assert end6 >= 0: "End is :" + end6 + ", " + curi;
676 processGeneralTag(curi,
677 cs.subSequence(start6, end6),
678 cs.subSequence(start5, end5));
679
680 } else if (tags.start(1) > 0) {
681
682 int start = tags.start(1);
683 int end = tags.end(1);
684 assert start >= 0: "Start is: " + start + ", " + curi;
685 assert end >= 0: "End is :" + end + ", " + curi;
686 assert tags.end(2) >= 0: "Tags.end(2) illegal " + tags.end(2) +
687 ", " + curi;
688 processScript(curi, cs.subSequence(start, end),
689 tags.end(2) - start);
690
691 } else if (tags.start(3) > 0){
692
693 int start = tags.start(3);
694 int end = tags.end(3);
695 assert start >= 0: "Start is: " + start + ", " + curi;
696 assert end >= 0: "End is :" + end + ", " + curi;
697 assert tags.end(4) >= 0: "Tags.end(4) illegal " + tags.end(4) +
698 ", " + curi;
699 processStyle(curi, cs.subSequence(start, end),
700 tags.end(4) - start);
701 }
702 }
703 TextUtils.recycleMatcher(tags);
704 }
705
706
707 static final String NON_HTML_PATH_EXTENSION =
708 "(?i)(gif)|(jp(e)?g)|(png)|(tif(f)?)|(bmp)|(avi)|(mov)|(mp(e)?g)"+
709 "|(mp3)|(mp4)|(swf)|(wav)|(au)|(aiff)|(mid)";
710
711 /***
712 * Test whether this HTML is so unexpected (eg in place of a GIF URI)
713 * that it shouldn't be scanned for links.
714 *
715 * @param curi CrawlURI to examine.
716 * @return True if HTML is acceptable/expected here
717 * @throws URIException
718 */
719 protected boolean isHtmlExpectedHere(CrawlURI curi) throws URIException {
720 String path = curi.getUURI().getPath();
721 if(path==null) {
722
723 return true;
724 }
725 int dot = path.lastIndexOf('.');
726 if (dot < 0) {
727
728 return true;
729 }
730 if(dot<(path.length()-5)) {
731
732 return true;
733 }
734 String ext = path.substring(dot+1);
735 return ! TextUtils.matches(NON_HTML_PATH_EXTENSION, ext);
736 }
737
738 protected void processScript(CrawlURI curi, CharSequence sequence,
739 int endOfOpenTag) {
740
741
742 processGeneralTag(curi,sequence.subSequence(0,6),
743 sequence.subSequence(0,endOfOpenTag));
744
745
746
747 processScriptCode(
748 curi, sequence.subSequence(endOfOpenTag, sequence.length()));
749 }
750
751 /***
752 * Process metadata tags.
753 * @param curi CrawlURI we're processing.
754 * @param cs Sequence from underlying ReplayCharSequence. This
755 * is TRANSIENT data. Make a copy if you want the data to live outside
756 * of this extractors' lifetime.
757 * @return True robots exclusion metatag.
758 */
759 protected boolean processMeta(CrawlURI curi, CharSequence cs) {
760 Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs);
761 String name = null;
762 String httpEquiv = null;
763 String content = null;
764 while (attr.find()) {
765 int valueGroup =
766 (attr.start(14) > -1) ? 14 : (attr.start(15) > -1) ? 15 : 16;
767 CharSequence value =
768 cs.subSequence(attr.start(valueGroup), attr.end(valueGroup));
769 value = TextUtils.unescapeHtml(value);
770 if (attr.group(1).equalsIgnoreCase("name")) {
771 name = value.toString();
772 } else if (attr.group(1).equalsIgnoreCase("http-equiv")) {
773 httpEquiv = value.toString();
774 } else if (attr.group(1).equalsIgnoreCase("content")) {
775 content = value.toString();
776 }
777
778 }
779 TextUtils.recycleMatcher(attr);
780
781
782 if("robots".equalsIgnoreCase(name) && content != null ) {
783 curi.putString(A_META_ROBOTS, content);
784 RobotsHonoringPolicy policy =
785 getSettingsHandler().getOrder().getRobotsHonoringPolicy();
786 String contentLower = content.toLowerCase();
787 if ((policy == null
788 || (!policy.isType(curi, RobotsHonoringPolicy.IGNORE)
789 && !policy.isType(curi, RobotsHonoringPolicy.CUSTOM)))
790 && (contentLower.indexOf("nofollow") >= 0
791 || contentLower.indexOf("none") >= 0)) {
792
793
794 logger.fine("HTML extraction skipped due to robots meta-tag for: "
795 + curi.toString());
796 return true;
797 }
798 } else if ("refresh".equalsIgnoreCase(httpEquiv) && content != null) {
799 int urlIndex = content.indexOf("=") + 1;
800 if(urlIndex>0) {
801 String refreshUri = content.substring(urlIndex);
802 try {
803 curi.createAndAddLinkRelativeToBase(refreshUri, "meta",
804 Link.REFER_HOP);
805 } catch (URIException e) {
806 if (getController() != null) {
807 getController().logUriError(e, curi.getUURI(), refreshUri);
808 } else {
809 logger.info("Failed createAndAddLinkRelativeToBase " +
810 curi + ", " + cs + ", " + refreshUri + ": " + e);
811 }
812 }
813 }
814 }
815 return false;
816 }
817
818 /***
819 * Process style text.
820 * @param curi CrawlURI we're processing.
821 * @param sequence Sequence from underlying ReplayCharSequence. This
822 * is TRANSIENT data. Make a copy if you want the data to live outside
823 * of this extractors' lifetime.
824 * @param endOfOpenTag
825 */
826 protected void processStyle(CrawlURI curi, CharSequence sequence,
827 int endOfOpenTag) {
828
829 processGeneralTag(curi, sequence.subSequence(0,6),
830 sequence.subSequence(0,endOfOpenTag));
831
832
833 this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(
834 curi, sequence.subSequence(endOfOpenTag,sequence.length()),
835 getController());
836 }
837
838
839
840
841
842
843 public String report() {
844 StringBuffer ret = new StringBuffer();
845 ret.append("Processor: org.archive.crawler.extractor.ExtractorHTML\n");
846 ret.append(" Function: Link extraction on HTML documents\n");
847 ret.append(" CrawlURIs handled: " + this.numberOfCURIsHandled + "\n");
848 ret.append(" Links extracted: " + this.numberOfLinksExtracted +
849 "\n\n");
850 return ret.toString();
851 }
852 }
853