1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.crawler.extractor;
24
25 import java.io.File;
26 import java.io.FileOutputStream;
27 import java.io.IOException;
28 import java.net.URL;
29 import java.util.Collection;
30 import java.util.Iterator;
31
32 import javax.management.AttributeNotFoundException;
33 import javax.management.InvalidAttributeValueException;
34 import javax.management.MBeanException;
35 import javax.management.ReflectionException;
36
37 import org.apache.commons.collections.CollectionUtils;
38 import org.apache.commons.collections.Predicate;
39 import org.apache.commons.httpclient.URIException;
40 import org.archive.crawler.datamodel.CoreAttributeConstants;
41 import org.archive.crawler.datamodel.CrawlOrder;
42 import org.archive.crawler.datamodel.CrawlURI;
43 import org.archive.crawler.settings.MapType;
44 import org.archive.crawler.settings.SettingsHandler;
45 import org.archive.crawler.settings.XMLSettingsHandler;
46 import org.archive.net.UURI;
47 import org.archive.net.UURIFactory;
48 import org.archive.util.HttpRecorder;
49 import org.archive.util.TmpDirTestCase;
50
51
52 /***
53 * Test html extractor.
54 *
55 * @author stack
56 * @version $Revision: 6830 $, $Date: 2010-04-21 23:39:57 +0000 (Wed, 21 Apr 2010) $
57 */
58 public class ExtractorHTMLTest
59 extends TmpDirTestCase
60 implements CoreAttributeConstants {
61 private final String ARCHIVE_DOT_ORG = "archive.org";
62 private final String LINK_TO_FIND = "http://www.hewlett.org/";
63 private HttpRecorder recorder = null;
64 private ExtractorHTML extractor = null;
65
66 protected ExtractorHTML createExtractor()
67 throws InvalidAttributeValueException, AttributeNotFoundException,
68 MBeanException, ReflectionException {
69
70
71
72
73
74
75 final String name = this.getClass().getName();
76 SettingsHandler handler = new XMLSettingsHandler(
77 new File(getTmpDir(), name + ".order.xml"));
78 handler.initialize();
79 return (ExtractorHTML)((MapType)handler.getOrder().
80 getAttribute(CrawlOrder.ATTR_RULES)).addElement(handler.
81 getSettingsObject(null), new ExtractorHTML(name));
82 }
83
84 protected void setUp() throws Exception {
85 super.setUp();
86 this.extractor = createExtractor();
87 final boolean USE_NET = false;
88 URL url = null;
89 if (USE_NET) {
90 url = new URL("http://" + this.ARCHIVE_DOT_ORG);
91 } else {
92 File f = new File(getTmpDir(), this.ARCHIVE_DOT_ORG + ".html");
93 url = f.toURI().toURL();
94 FileOutputStream fos = new FileOutputStream(f);
95 fos.write(("<html><head><title>test</title><body>" +
96 "<a href=" + this.LINK_TO_FIND + ">Hewlett Foundation</a>" +
97 "</body></html>").getBytes());
98 fos.flush();
99 fos.close();
100 }
101 this.recorder = HttpRecorder.wrapInputStreamWithHttpRecord(getTmpDir(),
102 this.getClass().getName(), url.openStream(), null);
103 }
104
105
106
107
108 protected void tearDown() throws Exception {
109 super.tearDown();
110 }
111
112 public void testInnerProcess() throws IOException {
113 UURI uuri = UURIFactory.getInstance("http://" + this.ARCHIVE_DOT_ORG);
114 CrawlURI curi = setupCrawlURI(this.recorder, uuri.toString());
115 this.extractor.innerProcess(curi);
116 Collection<Link> links = curi.getOutLinks();
117 boolean foundLinkToHewlettFoundation = false;
118 for (Iterator<Link> i = links.iterator(); i.hasNext();) {
119 Link link = (Link)i.next();
120 if (link.getDestination().toString().equals(this.LINK_TO_FIND)) {
121 foundLinkToHewlettFoundation = true;
122 break;
123 }
124 }
125 assertTrue("Did not find gif url", foundLinkToHewlettFoundation);
126 }
127
128 private CrawlURI setupCrawlURI(HttpRecorder rec, String url)
129 throws URIException {
130 CrawlURI curi = new CrawlURI(UURIFactory.getInstance(url));
131 curi.setContentSize(this.recorder.getRecordedInput().getSize());
132 curi.setContentType("text/html");
133 curi.setFetchStatus(200);
134 curi.setHttpRecorder(rec);
135
136 curi.putObject(CoreAttributeConstants.A_HTTP_TRANSACTION,
137 new Object());
138 return curi;
139 }
140
141 /***
142 * Test single net or local filesystem page parse.
143 * Set the uuri to be a net url or instead put in place a file
144 * named for this class under the unit test directory.
145 * @throws IOException
146 * @throws ReflectionException
147 * @throws MBeanException
148 * @throws AttributeNotFoundException
149 * @throws InvalidAttributeValueException
150 */
151 public void testPageParse()
152 throws InvalidAttributeValueException, AttributeNotFoundException,
153 MBeanException, ReflectionException, IOException {
154 UURI uuri = null;
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171 if (uuri != null) {
172 runExtractor(uuri);
173 }
174 }
175
176 protected UURI getUURI(String url) throws URIException {
177 url = (url.indexOf("://") > 0)? url: "file://" + url;
178 return UURIFactory.getInstance(url);
179 }
180
181 protected void runExtractor(UURI baseUURI)
182 throws InvalidAttributeValueException, AttributeNotFoundException,
183 MBeanException, ReflectionException, IOException {
184 runExtractor(baseUURI, null);
185 }
186
187 protected void runExtractor(UURI baseUURI, String encoding)
188 throws IOException, InvalidAttributeValueException,
189 AttributeNotFoundException, MBeanException, ReflectionException {
190 if (baseUURI == null) {
191 return;
192 }
193 this.extractor = createExtractor();
194 URL url = new URL(baseUURI.toString());
195 this.recorder = HttpRecorder.
196 wrapInputStreamWithHttpRecord(getTmpDir(),
197 this.getClass().getName(), url.openStream(), encoding);
198 CrawlURI curi = setupCrawlURI(this.recorder, url.toString());
199 this.extractor.innerProcess(curi);
200
201 System.out.println("+" + this.extractor.report());
202 int count = 0;
203 Collection<Link> links = curi.getOutLinks();
204 System.out.println("+HTML Links (hopType="+Link.NAVLINK_HOP+"):");
205 if (links != null) {
206 for (Iterator<Link> i = links.iterator(); i.hasNext();) {
207 Link link = (Link)i.next();
208 if (link.getHopType()==Link.NAVLINK_HOP) {
209 count++;
210 System.out.println(link.getDestination());
211 }
212 }
213 }
214 System.out.println("+HTML Embeds (hopType="+Link.EMBED_HOP+"):");
215 if (links != null) {
216 for (Iterator<Link> i = links.iterator(); i.hasNext();) {
217 Link link = (Link)i.next();
218 if (link.getHopType()==Link.EMBED_HOP) {
219 count++;
220 System.out.println(link.getDestination());
221 }
222 }
223 }
224 System.out.
225 println("+HTML Speculative Embeds (hopType="+Link.SPECULATIVE_HOP+"):");
226 if (links != null) {
227 for (Iterator<Link> i = links.iterator(); i.hasNext();) {
228 Link link = (Link)i.next();
229 if (link.getHopType()==Link.SPECULATIVE_HOP) {
230 count++;
231 System.out.println(link.getDestination());
232 }
233 }
234 }
235 System.out.
236 println("+HTML Other (all other hopTypes):");
237 if (links != null) {
238 for (Iterator<Link> i = links.iterator(); i.hasNext();) {
239 Link link = (Link) i.next();
240 if (link.getHopType() != Link.SPECULATIVE_HOP
241 && link.getHopType() != Link.NAVLINK_HOP
242 && link.getHopType() != Link.EMBED_HOP) {
243 count++;
244 System.out.println(link.getHopType() + " "
245 + link.getDestination());
246 }
247 }
248 }
249 System.out.println("TOTAL URIS EXTRACTED: "+count);
250 }
251
252 /***
253 * Test a particular <embed src=...> construct that was suspicious in
254 * the No10GovUk crawl.
255 *
256 * @throws URIException
257 */
258 public void testEmbedSrc() throws URIException {
259 CrawlURI curi=
260 new CrawlURI(UURIFactory.getInstance("http://www.example.org"));
261
262 CharSequence cs = "<embed src=\"/documents/prem/18/1/graphics/qtvr/" +
263 "hall.mov\" width=\"320\" height=\"212\" controller=\"true\" " +
264 "CORRECTION=\"FULL\" pluginspage=\"http://www.apple.com/" +
265 "quicktime/download/\" /> ";
266 this.extractor.extract(curi,cs);
267 assertTrue(CollectionUtils.exists(curi.getOutLinks(), new Predicate() {
268 public boolean evaluate(Object object) {
269 return ((Link) object).getDestination().toString().indexOf(
270 "/documents/prem/18/1/graphics/qtvr/hall.mov")>=0;
271 }
272 }));
273 }
274
275 /***
276 * Test a whitespace issue found in href.
277 *
278 * See [ 963965 ] Either UURI or ExtractHTML should strip whitespace better.
279 * https://sourceforge.net/tracker/?func=detail&atid=539099&aid=963965&group_id=73833
280 *
281 * @throws URIException
282 */
283 public void testHrefWhitespace() throws URIException {
284 CrawlURI curi =
285 new CrawlURI(UURIFactory.getInstance("http://www.carsound.dk"));
286 CharSequence cs = "<a href=\"http://www.carsound.dk\n\n\n" +
287 "\"\ntarget=\"_blank\">C.A.R. Sound\n\n\n\n</a>";
288 this.extractor.extract(curi,cs);
289 curi.getOutLinks();
290 assertTrue("Not stripping new lines", CollectionUtils.exists(curi
291 .getOutLinks(), new Predicate() {
292 public boolean evaluate(Object object) {
293 return ((Link) object).getDestination().toString().indexOf(
294 "http://www.carsound.dk/")>=0;
295 }
296 }));
297 }
298
299 /***
300 * Test a missing whitespace issue found in form
301 *
302 * [HER-1128] ExtractorHTML fails to extract FRAME SRC link without
303 * whitespace before SRC http://webteam.archive.org/jira/browse/HER-1128
304 */
305 public void testNoWhitespaceBeforeValidAttribute() throws URIException {
306 CrawlURI curi = new CrawlURI(UURIFactory
307 .getInstance("http://www.example.com"));
308 CharSequence cs = "<frame name=\"main\"src=\"http://www.example.com/\"> ";
309 this.extractor.extract(curi, cs);
310 Link[] links = curi.getOutLinks().toArray(new Link[0]);
311 assertTrue("no links found",links.length==1);
312 assertTrue("expected link not found",
313 links[0].getDestination().toString().equals("http://www.example.com/"));
314 }
315
316 /***
317 * Test only extract FORM ACTIONS with METHOD GET
318 *
319 * [HER-1280] do not by default GET form action URLs declared as POST,
320 * because it can cause problems/complaints
321 * http://webteam.archive.org/jira/browse/HER-1280
322 */
323 public void testOnlyExtractFormGets() throws URIException {
324 CrawlURI curi = new CrawlURI(UURIFactory
325 .getInstance("http://www.example.com"));
326 CharSequence cs =
327 "<form method=\"get\" action=\"http://www.example.com/ok1\"> "+
328 "<form action=\"http://www.example.com/ok2\" method=\"get\"> "+
329 "<form method=\"post\" action=\"http://www.example.com/notok\"> "+
330 "<form action=\"http://www.example.com/ok3\"> ";
331 this.extractor.extract(curi, cs);
332 Link[] links = curi.getOutLinks().toArray(new Link[0]);
333 assertTrue("incorrect number of links found",links.length==3);
334 }
335
336 /***
337 * Test that relative URIs with late colons aren't misinterpreted
338 * as absolute URIs with long, illegal scheme components.
339 *
340 * See http://webteam.archive.org/jira/browse/HER-1268
341 *
342 * @throws URIException
343 */
344 public void testBadRelativeLinks() throws URIException {
345 CrawlURI curi = new CrawlURI(UURIFactory
346 .getInstance("http://www.example.com"));
347 CharSequence cs = "<a href=\"example.html;jsessionid=deadbeef:deadbeed?parameter=this:value\"/>"
348 + "<a href=\"example.html?parameter=this:value\"/>";
349 this.extractor.extract(curi, cs);
350
351 assertTrue(CollectionUtils.exists(curi.getOutLinks(), new Predicate() {
352 public boolean evaluate(Object object) {
353 return ((Link) object)
354 .getDestination()
355 .toString()
356 .indexOf(
357 "/example.html;jsessionid=deadbeef:deadbeed?parameter=this:value") >= 0;
358 }
359 }));
360
361 assertTrue(CollectionUtils.exists(curi.getOutLinks(), new Predicate() {
362 public boolean evaluate(Object object) {
363 return ((Link) object).getDestination().toString().indexOf(
364 "/example.html?parameter=this:value") >= 0;
365 }
366 }));
367 }
368
369 /***
370 * Test if scheme is maintained by speculative hops onto exact
371 * same host
372 *
373 * [HER-1524] speculativeFixup in ExtractorJS should maintain URL scheme
374 */
375 public void testSpeculativeLinkExtraction() throws URIException {
376 CrawlURI curi = new CrawlURI(UURIFactory
377 .getInstance("https://www.example.com"));
378 CharSequence cs =
379 "<script type=\"text/javascript\">_parameter=\"www.anotherexample.com\";"
380 + "_anotherparameter=\"www.example.com/index.html\""
381 + ";</script>";
382 this.extractor.extract(curi, cs);
383
384 assertTrue(CollectionUtils.exists(curi.getOutLinks(), new Predicate() {
385 public boolean evaluate(Object object) {
386 return ((Link) object).getDestination().toString().equals(
387 "http://www.anotherexample.com/");
388 }
389 }));
390 assertTrue(CollectionUtils.exists(curi.getOutLinks(), new Predicate() {
391 public boolean evaluate(Object object) {
392 return ((Link) object).getDestination().toString().equals(
393 "https://www.example.com/index.html");
394 }
395 }));
396 }
397
398 /***
399 * test to see if embedded <SCRIPT/> which writes script TYPE
400 * creates any outlinks, e.g. "type='text/javascript'".
401 *
402 * [HER-1526] SCRIPT writing script TYPE common trigger of bogus links
403 * (eg. 'text/javascript')
404 *
405 * @throws URIException
406 */
407 public void testScriptTagWritingScriptType() throws URIException {
408 CrawlURI curi = new CrawlURI(UURIFactory
409 .getInstance("http://www.example.com/en/fiche/dossier/322/"));
410 CharSequence cs =
411 "<script type=\"text/javascript\">"
412 + "var gaJsHost = ((\"https:\" == document.location.protocol) "
413 + "? \"https://ssl.\" : \"http://www.\");"
414 + "document.write(unescape(\"%3Cscript src='\" + gaJsHost + "
415 + "\"google-analytics.com/ga.js' "
416 + "type='text/javascript'%3E%3C/script%3E\"));"
417 + "</script>";
418 this.extractor.extract(curi, cs);
419 assertTrue("outlinks should be empty",curi.getOutLinks().isEmpty());
420 }
421
422 protected Predicate destinationContainsPredicate(final String fragment) {
423 return new Predicate() {
424 public boolean evaluate(Object object) {
425 return ((Link) object).getDestination().toString().indexOf(fragment) >= 0;
426 }
427 };
428 }
429
430 protected Predicate destinationsIsPredicate(final String value) {
431 return new Predicate() {
432 public boolean evaluate(Object object) {
433 return ((Link) object).getDestination().toString().equals(value);
434 }
435 };
436 }
437
438 /***
439 * HER-1728
440 * @throws URIException
441 */
442 public void testFlashvarsParamValue() throws URIException {
443 CrawlURI curi = new CrawlURI(UURIFactory.getInstance("http://www.example.com/"));
444 CharSequence cs =
445 "<object classid=\"clsid:D27CDB6E-AE6D-11cf-96B8-444553540000\" codebase=\"http://download.macromedia.com/pub/shockwave/cabs/flash/swflash.cab#version=9,0,28,0\" id=\"ZoomifySlideshowViewer\" height=\"372\" width=\"590\">\n" +
446 " <param name=\"flashvars\" value=\"zoomifyXMLPath=ParamZoomifySlideshowViewer.xml\">\n" +
447 " <param name=\"menu\" value=\"false\">\n" +
448 " <param name=\"bgcolor\" value=\"#000000\">\n" +
449 " <param name=\"src\" value=\"ZoomifySlideshowViewer.swf\">\n" +
450 " <embed flashvars=\"zoomifyXMLPath=EmbedZoomifySlideshowViewer.xml\" src=\"ZoomifySlideshowViewer.swf\" menu=\"false\" bgcolor=\"#000000\" pluginspage=\"http://www.adobe.com/go/getflashplayer\" type=\"application/x-shockwave-flash\" name=\"ZoomifySlideshowViewer\" height=\"372\" width=\"590\">\n" +
451 "</object> ";
452 this.extractor.extract(curi, cs);
453 String expected = "http://www.example.com/ParamZoomifySlideshowViewer.xml";
454 assertTrue("outlinks should contain: "+expected,
455 CollectionUtils.exists(curi.getOutLinks(),destinationsIsPredicate(expected)));
456 }
457
458 /***
459 * HER-1728
460 * @throws URIException
461 */
462 public void testFlashvarsEmbedAttribute() throws URIException {
463 CrawlURI curi = new CrawlURI(UURIFactory.getInstance("http://www.example.com/"));
464 CharSequence cs =
465 "<object classid=\"clsid:D27CDB6E-AE6D-11cf-96B8-444553540000\" codebase=\"http://download.macromedia.com/pub/shockwave/cabs/flash/swflash.cab#version=9,0,28,0\" id=\"ZoomifySlideshowViewer\" height=\"372\" width=\"590\">\n" +
466 " <param name=\"flashvars\" value=\"zoomifyXMLPath=ParamZoomifySlideshowViewer.xml\">\n" +
467 " <param name=\"menu\" value=\"false\">\n" +
468 " <param name=\"bgcolor\" value=\"#000000\">\n" +
469 " <param name=\"src\" value=\"ZoomifySlideshowViewer.swf\">\n" +
470 " <embed flashvars=\"zoomifyXMLPath=EmbedZoomifySlideshowViewer.xml\" src=\"ZoomifySlideshowViewer.swf\" menu=\"false\" bgcolor=\"#000000\" pluginspage=\"http://www.adobe.com/go/getflashplayer\" type=\"application/x-shockwave-flash\" name=\"ZoomifySlideshowViewer\" height=\"372\" width=\"590\">\n" +
471 "</object> ";
472 this.extractor.extract(curi, cs);
473 String expected = "http://www.example.com/EmbedZoomifySlideshowViewer.xml";
474 assertTrue("outlinks should contain: "+expected,
475 CollectionUtils.exists(curi.getOutLinks(),destinationsIsPredicate(expected)));
476 }
477
/**
 * False test: tries to verify that the extractor ignores a 'longDesc'
 * attribute by expecting exactly one outlink from the fragment below.
 * In fact, the HTML spec says longDesc is a URI, so the crawler should
 * find 2 links here.
 *
 * The method is named {@code xest...} rather than {@code test...},
 * presumably so the test runner does not pick it up -- confirm before
 * renaming.
 *
 * See [HER-206]
 * @throws URIException
 */
public void xestAvoidBadSpec() throws URIException {
    CrawlURI curi = new CrawlURI(UURIFactory
        .getInstance("http://www.example.com"));
    // Fixture: an IMG with both a src and a multi-line longDesc value.
    // NOTE(review): the caption row contains replacement characters around
    // "Overlord", apparently left by an earlier encoding conversion; the
    // original characters are unknown, so the literal is kept as found.
    CharSequence cs =
        "<TBODY>\r\n" +
        "<TR>\r\n" +
        "<TD><IMG height=259 alt=\"Operation Overlord Commanders\"\r\n" +
        "src=\"/img/aboutus/history/dday60/commanders.jpg\"\r\n" +
        "width=500 longDesc=\"Overlord Commanders, Back row, left\r\n" +
        "to right:<BR>Lieutenant General Bradley, Admiral\r\n" +
        "Ramsay, Air Chief Marshal Leigh-Mallory, General Bedell\r\n" +
        "Smith.<BR>Front row, left to right: Air Chief Marshal\r\n" +
        "Tedder, General Eisenhower, General Montgomery.\"></TD></TR>\r\n" +
        "<TR>\r\n" +
        "<TD class=caption>�Overlord� Commanders, Back row, left\r\n" +
        "to right:<BR>Lieutenant General Bradley, Admiral\r\n" +
        "Ramsay, Air Chief Marshal Leigh-Mallory, General Bedell\r\n" +
        "Smith.<BR>Front row, left to right: Air Chief Marshal\r\n" +
        "Tedder, General Eisenhower, General\r\n" +
        "Montgomery.</TD></TR></TBODY></TABLE>\r\n" +
        "<P>\r\n" +
        "<TABLE id=imageinset width=\"35%\" align=right\r\n" +
        "summary=\"Key Facts About the Allied Forces Deployed on\r\n" +
        "D-Day\" border=0>\r\n" +
        "<TBODY>";
    this.extractor.extract(curi, cs);
    Link[] links = curi.getOutLinks().toArray(new Link[0]);
    // Expects only the src link; see the method comment for why this is
    // considered wrong.
    assertTrue("incorrect number of links found",links.length==1);
}
514
515 public static void main(String[] args) throws Exception {
516 if (args.length != 1 && args.length != 2) {
517 System.err.println("Usage: " + ExtractorHTMLTest.class.getName() +
518 " URL|PATH [ENCODING]");
519 System.exit(1);
520 }
521 ExtractorHTMLTest testCase = new ExtractorHTMLTest();
522 testCase.setUp();
523 try {
524 testCase.runExtractor(testCase.getUURI(args[0]),
525 (args.length == 2)? args[1]: null);
526 } finally {
527 testCase.tearDown();
528 }
529 }
530 }