ExtractorHTMLTest xref

View Javadoc

1   /* ExtractorHTMLTest
2    *
3    * Created on May 19, 2004
4    *
5    * Copyright (C) 2004 Internet Archive.
6    *
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    *
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   *
14   * Heritrix is distributed in the hope that it will be useful,
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   *
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.crawler.extractor;
24  
25  import java.io.File;
26  import java.io.FileOutputStream;
27  import java.io.IOException;
28  import java.net.URL;
29  import java.util.Collection;
30  import java.util.Iterator;
31  
32  import javax.management.AttributeNotFoundException;
33  import javax.management.InvalidAttributeValueException;
34  import javax.management.MBeanException;
35  import javax.management.ReflectionException;
36  
37  import org.apache.commons.collections.CollectionUtils;
38  import org.apache.commons.collections.Predicate;
39  import org.apache.commons.httpclient.URIException;
40  import org.archive.crawler.datamodel.CoreAttributeConstants;
41  import org.archive.crawler.datamodel.CrawlOrder;
42  import org.archive.crawler.datamodel.CrawlURI;
43  import org.archive.crawler.settings.MapType;
44  import org.archive.crawler.settings.SettingsHandler;
45  import org.archive.crawler.settings.XMLSettingsHandler;
46  import org.archive.net.UURI;
47  import org.archive.net.UURIFactory;
48  import org.archive.util.HttpRecorder;
49  import org.archive.util.TmpDirTestCase;
50  
51  
52  /***
53   * Test html extractor.
54   *
55   * @author stack
56   * @version $Revision: 6830 $, $Date: 2010-04-21 23:39:57 +0000 (Wed, 21 Apr 2010) $
57   */
58  public class ExtractorHTMLTest
59  extends TmpDirTestCase
60  implements CoreAttributeConstants {
61      private final String ARCHIVE_DOT_ORG = "archive.org";
62      private final String LINK_TO_FIND = "http://www.hewlett.org/";
63      private HttpRecorder recorder = null;
64      private ExtractorHTML extractor = null;
65      
66      protected ExtractorHTML createExtractor()
67      throws InvalidAttributeValueException, AttributeNotFoundException,
68      MBeanException, ReflectionException {
69          // Hack in a settings handler.  Do this by adding this extractor
70          // to the order file (I'm adding it to a random MapType; seemingly
71          // can only add to MapTypes post-construction). This takes care
72          // of setting a valid SettingsHandler into the ExtractorHTML (This
73          // shouldn't be so difficult).  Of note, the order file below is
74          // not written to disk.
75          final String name = this.getClass().getName();
76          SettingsHandler handler = new XMLSettingsHandler(
77              new File(getTmpDir(), name + ".order.xml"));
78          handler.initialize();
79          return (ExtractorHTML)((MapType)handler.getOrder().
80              getAttribute(CrawlOrder.ATTR_RULES)).addElement(handler.
81                  getSettingsObject(null), new ExtractorHTML(name));
82      }
83      
84      protected void setUp() throws Exception {
85          super.setUp();
86          this.extractor = createExtractor();
87          final boolean USE_NET = false;
88          URL url = null;
89          if (USE_NET) {
90              url = new URL("http://" + this.ARCHIVE_DOT_ORG);
91          } else {
92              File f = new File(getTmpDir(), this.ARCHIVE_DOT_ORG + ".html");
93              url = f.toURI().toURL();
94              FileOutputStream fos = new FileOutputStream(f);
95              fos.write(("<html><head><title>test</title><body>" +
96                  "<a href=" + this.LINK_TO_FIND + ">Hewlett Foundation</a>" +
97                  "</body></html>").getBytes());
98              fos.flush();
99              fos.close();
100         }
101         this.recorder = HttpRecorder.wrapInputStreamWithHttpRecord(getTmpDir(),
102             this.getClass().getName(), url.openStream(), null);
103     }
104 
105     /*
106      * @see TestCase#tearDown()
107      */
108     protected void tearDown() throws Exception {
109         super.tearDown();
110     }
111 
112     public void testInnerProcess() throws IOException {
113         UURI uuri = UURIFactory.getInstance("http://" + this.ARCHIVE_DOT_ORG);
114         CrawlURI curi = setupCrawlURI(this.recorder, uuri.toString());
115         this.extractor.innerProcess(curi);
116         Collection<Link> links = curi.getOutLinks();
117         boolean foundLinkToHewlettFoundation = false;
118         for (Iterator<Link> i = links.iterator(); i.hasNext();) {
119             Link link = (Link)i.next();
120             if (link.getDestination().toString().equals(this.LINK_TO_FIND)) {
121                 foundLinkToHewlettFoundation = true;
122                 break;
123             }
124         }
125         assertTrue("Did not find gif url", foundLinkToHewlettFoundation);
126     }
127     
128     private CrawlURI setupCrawlURI(HttpRecorder rec, String url)
129     		throws URIException {
130         CrawlURI curi = new CrawlURI(UURIFactory.getInstance(url));
131         curi.setContentSize(this.recorder.getRecordedInput().getSize());
132         curi.setContentType("text/html");
133         curi.setFetchStatus(200);
134         curi.setHttpRecorder(rec);
135         // Fake out the extractor that this is a HTTP transaction.
136         curi.putObject(CoreAttributeConstants.A_HTTP_TRANSACTION,
137             new Object());
138         return curi;
139     }
140     
141     /***
142      * Test single net or local filesystem page parse.
143      * Set the uuri to be a net url or instead put in place a file
144      * named for this class under the unit test directory.
145      * @throws IOException
146      * @throws ReflectionException
147      * @throws MBeanException
148      * @throws AttributeNotFoundException
149      * @throws InvalidAttributeValueException
150      */
151     public void testPageParse()
152     throws InvalidAttributeValueException, AttributeNotFoundException,
153     MBeanException, ReflectionException, IOException {
154         UURI uuri = null;
155         
156 // DO
157 //      uuri = UURIFactory.getInstance("http://www.xjmu.edu.cn/");
158 // OR
159 //        File f = new File(getTmpDir(), this.getClass().getName() +
160 //        ".html");
161 //        if (f.exists()) {
162 //        	uuri = UURIFactory.getInstance("file://" +
163 //        			f.getAbsolutePath());
164 //        }
165 // OR 
166 //      uuri = getUURI(URL or PATH)
167 //
168 // OR 
169 //      Use the main method below and pass this class an argument.
170 //     
171         if (uuri != null) {
172         	runExtractor(uuri);
173         }
174     }
175     
176     protected UURI getUURI(String url) throws URIException {
177         url = (url.indexOf("://") > 0)? url: "file://" + url;
178         return UURIFactory.getInstance(url);
179     }
180     
181     protected void runExtractor(UURI baseUURI)
182     throws InvalidAttributeValueException, AttributeNotFoundException,
183     MBeanException, ReflectionException, IOException {
184         runExtractor(baseUURI, null);
185     }
186     
187     protected void runExtractor(UURI baseUURI, String encoding)
188     throws IOException, InvalidAttributeValueException,
189     AttributeNotFoundException, MBeanException, ReflectionException {
190         if (baseUURI == null) {
191         	return;
192         }
193         this.extractor = createExtractor();
194         URL url = new URL(baseUURI.toString());
195         this.recorder = HttpRecorder.
196             wrapInputStreamWithHttpRecord(getTmpDir(),
197             this.getClass().getName(), url.openStream(), encoding);
198         CrawlURI curi = setupCrawlURI(this.recorder, url.toString());
199         this.extractor.innerProcess(curi);
200         
201         System.out.println("+" + this.extractor.report());
202         int count = 0; 
203         Collection<Link> links = curi.getOutLinks();
204         System.out.println("+HTML Links (hopType="+Link.NAVLINK_HOP+"):");
205         if (links != null) {
206             for (Iterator<Link> i = links.iterator(); i.hasNext();) {
207                 Link link = (Link)i.next();
208                 if (link.getHopType()==Link.NAVLINK_HOP) {
209                     count++;
210                     System.out.println(link.getDestination());
211                 }
212             }
213         }
214         System.out.println("+HTML Embeds (hopType="+Link.EMBED_HOP+"):");
215         if (links != null) {
216             for (Iterator<Link> i = links.iterator(); i.hasNext();) {
217                 Link link = (Link)i.next();
218                 if (link.getHopType()==Link.EMBED_HOP) {
219                     count++;
220                     System.out.println(link.getDestination());
221                 }
222             }
223         }
224         System.out.
225             println("+HTML Speculative Embeds (hopType="+Link.SPECULATIVE_HOP+"):");
226         if (links != null) {
227             for (Iterator<Link> i = links.iterator(); i.hasNext();) {
228                 Link link = (Link)i.next();
229                 if (link.getHopType()==Link.SPECULATIVE_HOP) {
230                     count++;
231                     System.out.println(link.getDestination());
232                 }
233             }
234         }
235         System.out.
236             println("+HTML Other (all other hopTypes):");
237         if (links != null) {
238             for (Iterator<Link> i = links.iterator(); i.hasNext();) {
239                 Link link = (Link) i.next();
240                 if (link.getHopType() != Link.SPECULATIVE_HOP
241                         && link.getHopType() != Link.NAVLINK_HOP
242                         && link.getHopType() != Link.EMBED_HOP) {
243                     count++;
244                     System.out.println(link.getHopType() + " "
245                             + link.getDestination());
246                 }
247             }
248         }
249         System.out.println("TOTAL URIS EXTRACTED: "+count);
250     }
251 
252     /***
253      * Test a particular <embed src=...> construct that was suspicious in
254      * the No10GovUk crawl.
255      *
256      * @throws URIException
257      */
258     public void testEmbedSrc() throws URIException {
259         CrawlURI curi=
260             new CrawlURI(UURIFactory.getInstance("http://www.example.org"));
261         // An example from http://www.records.pro.gov.uk/documents/prem/18/1/default.asp?PageId=62&qt=true
262         CharSequence cs = "<embed src=\"/documents/prem/18/1/graphics/qtvr/" +
263             "hall.mov\" width=\"320\" height=\"212\" controller=\"true\" " +
264             "CORRECTION=\"FULL\" pluginspage=\"http://www.apple.com/" +
265             "quicktime/download/\" /> ";
266         this.extractor.extract(curi,cs);
267         assertTrue(CollectionUtils.exists(curi.getOutLinks(), new Predicate() {
268             public boolean evaluate(Object object) {
269                 return ((Link) object).getDestination().toString().indexOf(
270                         "/documents/prem/18/1/graphics/qtvr/hall.mov")>=0;
271             }
272         }));
273     }
274     
275     /***
276      * Test a whitespace issue found in href.
277      * 
278      * See [ 963965 ] Either UURI or ExtractHTML should strip whitespace better.
279      * https://sourceforge.net/tracker/?func=detail&atid=539099&aid=963965&group_id=73833
280      *
281      * @throws URIException
282      */
283     public void testHrefWhitespace() throws URIException {
284         CrawlURI curi =
285             new CrawlURI(UURIFactory.getInstance("http://www.carsound.dk"));
286         CharSequence cs = "<a href=\"http://www.carsound.dk\n\n\n" +
287         	"\"\ntarget=\"_blank\">C.A.R. Sound\n\n\n\n</a>";   
288         this.extractor.extract(curi,cs);
289         curi.getOutLinks();
290         assertTrue("Not stripping new lines", CollectionUtils.exists(curi
291                 .getOutLinks(), new Predicate() {
292             public boolean evaluate(Object object) {
293                 return ((Link) object).getDestination().toString().indexOf(
294                         "http://www.carsound.dk/")>=0;
295             }
296         }));
297     }
298     
299     /***
300      * Test a missing whitespace issue found in form
301      * 
302      * [HER-1128] ExtractorHTML fails to extract FRAME SRC link without
303      * whitespace before SRC http://webteam.archive.org/jira/browse/HER-1128
304      */
305     public void testNoWhitespaceBeforeValidAttribute() throws URIException {
306         CrawlURI curi = new CrawlURI(UURIFactory
307                 .getInstance("http://www.example.com"));
308         CharSequence cs = "<frame name=\"main\"src=\"http://www.example.com/\"> ";
309         this.extractor.extract(curi, cs);
310         Link[] links = curi.getOutLinks().toArray(new Link[0]);
311         assertTrue("no links found",links.length==1);
312         assertTrue("expected link not found", 
313                 links[0].getDestination().toString().equals("http://www.example.com/"));
314     }
315     
316     /***
317      * Test only extract FORM ACTIONS with METHOD GET 
318      * 
319      * [HER-1280] do not by default GET form action URLs declared as POST, 
320      * because it can cause problems/complaints 
321      * http://webteam.archive.org/jira/browse/HER-1280
322      */
323     public void testOnlyExtractFormGets() throws URIException {
324         CrawlURI curi = new CrawlURI(UURIFactory
325                 .getInstance("http://www.example.com"));
326         CharSequence cs = 
327             "<form method=\"get\" action=\"http://www.example.com/ok1\"> "+
328             "<form action=\"http://www.example.com/ok2\" method=\"get\"> "+
329             "<form method=\"post\" action=\"http://www.example.com/notok\"> "+
330             "<form action=\"http://www.example.com/ok3\"> ";
331         this.extractor.extract(curi, cs);
332         Link[] links = curi.getOutLinks().toArray(new Link[0]);
333         assertTrue("incorrect number of links found",links.length==3);
334     }
335     
336     /***
337      * Test that relative URIs with late colons aren't misinterpreted
338      * as absolute URIs with long, illegal scheme components. 
339      * 
340      * See http://webteam.archive.org/jira/browse/HER-1268
341      * 
342      * @throws URIException
343      */
344     public void testBadRelativeLinks() throws URIException {
345         CrawlURI curi = new CrawlURI(UURIFactory
346                 .getInstance("http://www.example.com"));
347         CharSequence cs = "<a href=\"example.html;jsessionid=deadbeef:deadbeed?parameter=this:value\"/>"
348                 + "<a href=\"example.html?parameter=this:value\"/>";
349         this.extractor.extract(curi, cs);
350 
351         assertTrue(CollectionUtils.exists(curi.getOutLinks(), new Predicate() {
352             public boolean evaluate(Object object) {
353                 return ((Link) object)
354                         .getDestination()
355                         .toString()
356                         .indexOf(
357                                 "/example.html;jsessionid=deadbeef:deadbeed?parameter=this:value") >= 0;
358             }
359         }));
360 
361         assertTrue(CollectionUtils.exists(curi.getOutLinks(), new Predicate() {
362             public boolean evaluate(Object object) {
363                 return ((Link) object).getDestination().toString().indexOf(
364                         "/example.html?parameter=this:value") >= 0;
365             }
366         }));
367     }
368     
369     /***
370      * Test if scheme is maintained by speculative hops onto exact 
371      * same host
372      * 
373      * [HER-1524] speculativeFixup in ExtractorJS should maintain URL scheme
374      */
375     public void testSpeculativeLinkExtraction() throws URIException {
376         CrawlURI curi = new CrawlURI(UURIFactory
377                 .getInstance("https://www.example.com"));
378         CharSequence cs = 
379             "<script type=\"text/javascript\">_parameter=\"www.anotherexample.com\";"
380                 + "_anotherparameter=\"www.example.com/index.html\""
381                 + ";</script>";
382         this.extractor.extract(curi, cs);
383 
384         assertTrue(CollectionUtils.exists(curi.getOutLinks(), new Predicate() {
385             public boolean evaluate(Object object) {
386                 return ((Link) object).getDestination().toString().equals(
387                         "http://www.anotherexample.com/");
388             }
389         }));
390         assertTrue(CollectionUtils.exists(curi.getOutLinks(), new Predicate() {
391             public boolean evaluate(Object object) {
392                 return ((Link) object).getDestination().toString().equals(
393                         "https://www.example.com/index.html");
394             }
395         }));
396     }    
397     
398     /***
399      * test to see if embedded <SCRIPT/> which writes script TYPE
400      * creates any outlinks, e.g. "type='text/javascript'". 
401      * 
402      * [HER-1526] SCRIPT writing script TYPE common trigger of bogus links 
403      *   (eg. 'text/javascript')
404      *   
405      * @throws URIException
406      */
407     public void testScriptTagWritingScriptType() throws URIException {
408         CrawlURI curi = new CrawlURI(UURIFactory
409                 .getInstance("http://www.example.com/en/fiche/dossier/322/"));
410         CharSequence cs = 
411             "<script type=\"text/javascript\">"
412             + "var gaJsHost = ((\"https:\" == document.location.protocol) "
413             + "? \"https://ssl.\" : \"http://www.\");"
414             + "document.write(unescape(\"%3Cscript src='\" + gaJsHost + "
415             + "\"google-analytics.com/ga.js' "
416             + "type='text/javascript'%3E%3C/script%3E\"));"
417             + "</script>";
418         this.extractor.extract(curi, cs);
419         assertTrue("outlinks should be empty",curi.getOutLinks().isEmpty());    
420     }
421     
422     protected Predicate destinationContainsPredicate(final String fragment) {
423         return new Predicate() {
424             public boolean evaluate(Object object) {
425                 return ((Link) object).getDestination().toString().indexOf(fragment) >= 0;
426             }
427         };
428     }
429     
430     protected Predicate destinationsIsPredicate(final String value) {
431         return new Predicate() {
432             public boolean evaluate(Object object) {
433                 return ((Link) object).getDestination().toString().equals(value);
434             }
435         };
436     }
437     
438     /***
439      * HER-1728 
440      * @throws URIException 
441      */
442     public void testFlashvarsParamValue() throws URIException {
443         CrawlURI curi = new CrawlURI(UURIFactory.getInstance("http://www.example.com/"));
444         CharSequence cs = 
445             "<object classid=\"clsid:D27CDB6E-AE6D-11cf-96B8-444553540000\" codebase=\"http://download.macromedia.com/pub/shockwave/cabs/flash/swflash.cab#version=9,0,28,0\" id=\"ZoomifySlideshowViewer\" height=\"372\" width=\"590\">\n" + 
446             "    <param name=\"flashvars\" value=\"zoomifyXMLPath=ParamZoomifySlideshowViewer.xml\">\n" + 
447             "    <param name=\"menu\" value=\"false\">\n" + 
448             "    <param name=\"bgcolor\" value=\"#000000\">\n" + 
449             "    <param name=\"src\" value=\"ZoomifySlideshowViewer.swf\">\n" + 
450             "    <embed flashvars=\"zoomifyXMLPath=EmbedZoomifySlideshowViewer.xml\" src=\"ZoomifySlideshowViewer.swf\" menu=\"false\" bgcolor=\"#000000\" pluginspage=\"http://www.adobe.com/go/getflashplayer\" type=\"application/x-shockwave-flash\" name=\"ZoomifySlideshowViewer\" height=\"372\" width=\"590\">\n" + 
451             "</object> ";
452         this.extractor.extract(curi, cs);
453         String expected = "http://www.example.com/ParamZoomifySlideshowViewer.xml";
454         assertTrue("outlinks should contain: "+expected,
455                 CollectionUtils.exists(curi.getOutLinks(),destinationsIsPredicate(expected)));
456     }
457     
458     /***
459      * HER-1728 
460      * @throws URIException 
461      */
462     public void testFlashvarsEmbedAttribute() throws URIException {
463         CrawlURI curi = new CrawlURI(UURIFactory.getInstance("http://www.example.com/"));
464         CharSequence cs = 
465             "<object classid=\"clsid:D27CDB6E-AE6D-11cf-96B8-444553540000\" codebase=\"http://download.macromedia.com/pub/shockwave/cabs/flash/swflash.cab#version=9,0,28,0\" id=\"ZoomifySlideshowViewer\" height=\"372\" width=\"590\">\n" + 
466             "    <param name=\"flashvars\" value=\"zoomifyXMLPath=ParamZoomifySlideshowViewer.xml\">\n" + 
467             "    <param name=\"menu\" value=\"false\">\n" + 
468             "    <param name=\"bgcolor\" value=\"#000000\">\n" + 
469             "    <param name=\"src\" value=\"ZoomifySlideshowViewer.swf\">\n" + 
470             "    <embed flashvars=\"zoomifyXMLPath=EmbedZoomifySlideshowViewer.xml\" src=\"ZoomifySlideshowViewer.swf\" menu=\"false\" bgcolor=\"#000000\" pluginspage=\"http://www.adobe.com/go/getflashplayer\" type=\"application/x-shockwave-flash\" name=\"ZoomifySlideshowViewer\" height=\"372\" width=\"590\">\n" + 
471             "</object> ";
472         this.extractor.extract(curi, cs);
473         String expected = "http://www.example.com/EmbedZoomifySlideshowViewer.xml";
474         assertTrue("outlinks should contain: "+expected,
475                 CollectionUtils.exists(curi.getOutLinks(),destinationsIsPredicate(expected)));
476     }
477     
478     /***
479      * False test: tries to verify extractor ignores a 'longDesc'
480      * attribute. In fact, HTML spec says longDesc is a URI, so 
481      * crawler should find 2 links here. 
482      * See [HER-206]
483      * @throws URIException
484      */
485     public void xestAvoidBadSpec() throws URIException {
486         CrawlURI curi = new CrawlURI(UURIFactory
487                 .getInstance("http://www.example.com"));
488         CharSequence cs = 
489             "<TBODY>\r\n" + 
490             "<TR>\r\n" + 
491             "<TD><IMG height=259 alt=\"Operation Overlord Commanders\"\r\n" + 
492             "src=\"/img/aboutus/history/dday60/commanders.jpg\"\r\n" + 
493             "width=500 longDesc=\"Overlord Commanders, Back row, left\r\n" + 
494             "to right:<BR>Lieutenant General Bradley, Admiral\r\n" + 
495             "Ramsay, Air Chief Marshal Leigh-Mallory, General Bedell\r\n" + 
496             "Smith.<BR>Front row, left to right: Air Chief Marshal\r\n" + 
497             "Tedder, General Eisenhower, General Montgomery.\"></TD></TR>\r\n" + 
498             "<TR>\r\n" + 
499             "<TD class=caption>�Overlord� Commanders, Back row, left\r\n" + 
500             "to right:<BR>Lieutenant General Bradley, Admiral\r\n" + 
501             "Ramsay, Air Chief Marshal Leigh-Mallory, General Bedell\r\n" + 
502             "Smith.<BR>Front row, left to right: Air Chief Marshal\r\n" + 
503             "Tedder, General Eisenhower, General\r\n" + 
504             "Montgomery.</TD></TR></TBODY></TABLE>\r\n" + 
505             "<P>\r\n" + 
506             "<TABLE id=imageinset width=\"35%\" align=right\r\n" + 
507             "summary=\"Key Facts About the Allied Forces Deployed on\r\n" + 
508             "D-Day\" border=0>\r\n" + 
509             "<TBODY>";
510         this.extractor.extract(curi, cs);
511         Link[] links = curi.getOutLinks().toArray(new Link[0]);
512         assertTrue("incorrect number of links found",links.length==1);
513     }
514 
515     public static void main(String[] args) throws Exception {
516         if (args.length != 1 && args.length != 2) {
517             System.err.println("Usage: " + ExtractorHTMLTest.class.getName() +
518                 " URL|PATH [ENCODING]");
519             System.exit(1);
520         }
521         ExtractorHTMLTest testCase = new ExtractorHTMLTest();
522         testCase.setUp();
523         try {
524             testCase.runExtractor(testCase.getUURI(args[0]),
525                 (args.length == 2)? args[1]: null);
526         } finally {
527             testCase.tearDown();
528         }
529     }
530 }