// "View Javadoc" page-header residue from the HTML source viewer this file was scraped from.

1   package org.archive.crawler.extractor;
2   
3   import java.io.File;
4   import java.io.IOException;
5   import java.net.MalformedURLException;
6   import java.net.URL;
7   import java.util.HashMap;
8   import java.util.logging.Logger;
9   
10  import javax.management.AttributeNotFoundException;
11  import javax.management.InvalidAttributeValueException;
12  import javax.management.MBeanException;
13  import javax.management.ReflectionException;
14  
15  import org.apache.commons.httpclient.URIException;
16  import org.archive.crawler.datamodel.CoreAttributeConstants;
17  import org.archive.crawler.datamodel.CrawlOrder;
18  import org.archive.crawler.datamodel.CrawlURI;
19  import org.archive.crawler.settings.MapType;
20  import org.archive.crawler.settings.SettingsHandler;
21  import org.archive.crawler.settings.XMLSettingsHandler;
22  import org.archive.net.UURIFactory;
23  import org.archive.util.HttpRecorder;
24  import org.archive.util.TmpDirTestCase;
25  
26  /* Note: all of the tests in here grab swf files from the web. But we want
27   * heritrix to build without relying on any external services, so the tests are
28   * named such that they won't run. To run the tests, rename the methods from
29   * xest* to test*.
30   */
31  public class ExtractorSWFTest extends TmpDirTestCase implements
32  		CoreAttributeConstants {
33  
34  	private static Logger logger = Logger.getLogger(ExtractorSWFTest.class
35  			.getName());
36  
37  	private ExtractorSWF extractor;
38  
39  	protected void initializeExtractors()
40  			throws InvalidAttributeValueException, AttributeNotFoundException,
41  			MBeanException, ReflectionException {
42  		// Hack in a settings handler. Do this by adding this extractor
43  		// to the order file (I'm adding it to a random MapType; seemingly
44  		// can only add to MapTypes post-construction). This takes care
45  		// of setting a valid SettingsHandler into the ExtractorHTML (This
46  		// shouldn't be so difficult). Of note, the order file below is
47  		// not written to disk.
48  		SettingsHandler handler = new XMLSettingsHandler(new File(getTmpDir(),
49  				this.getClass().getName() + ".order.xml"));
50  		handler.initialize();
51  
52  		this.extractor = (ExtractorSWF) ((MapType) handler.getOrder()
53  				.getAttribute(CrawlOrder.ATTR_RULES)).addElement(handler
54  				.getSettingsObject(null), new ExtractorSWF(
55  				"ExtractorSWFTest/ExtractorSWF"));
56  	}
57  
58  	protected void setUp() throws Exception {
59  		super.setUp();
60  		this.initializeExtractors();
61  	}
62  
63  	protected void tearDown() throws Exception {
64  		super.tearDown();
65  	}
66  
67  	// junit requires there be at least one test in a test case, but all
68  	// our tests require external resources
69  	public void testNothing() {
70  		logger.info("Not testing SWF extractors. To enable these tests, rename the xest* methods in ExtractorSWFTest.java");
71  	}
72  
73  	public void xestHer1509() throws IOException {
74  		// url -> link to find
75  		HashMap<String, String> testUrls = new HashMap<String, String>();
76  		testUrls.put("http://wayback.archive-it.org/779/20080709003013/http://www.dreamingmethods.com/uploads/lastdream/loader.swf", "project.swf");
77  		testUrls.put("http://wayback.archive-it.org/1094/20080923035716/http://www.dreamingmethods.com/uploads/dm_archive/mainsite/downloads/flash/Dim%20O%20Gauble/loader.swf", "map_3d.swf");
78  		testUrls.put("http://wayback.archive-it.org/1094/20080923040243/http://www.dreamingmethods.com/uploads/dm_archive/mainsite/downloads/flash/clearance/loader.swf", "clearance_intro.swf");
79  
80  		for (String url : testUrls.keySet()) {
81  			HttpRecorder recorder = HttpRecorder.wrapInputStreamWithHttpRecord(
82  					getTmpDir(), this.getClass().getName(), new URL(url)
83  							.openStream(), null);
84  			CrawlURI curi = setupCrawlURI(recorder, url);
85  
86  			long startTime = System.currentTimeMillis();
87  			this.extractor.innerProcess(curi);
88  			long elapsed = System.currentTimeMillis() - startTime;
89  			logger.info(this.extractor.getClass().getSimpleName() + " took "
90  					+ elapsed + "ms to process " + url);
91  
92  			boolean foundIt = false;
93  			for (Link link : curi.getOutLinks()) {
94  				logger.info("found link: " + link);
95  				foundIt = foundIt || link.getDestination().toString().endsWith(testUrls.get(url));
96  			}
97  
98  			assertTrue("failed to extract link \"" + testUrls.get(url)
99  					+ "\" from " + url, foundIt);
100 		}
101 	}
102 
103 	/*
104 	 * Tests for correct encoding of non-ascii url's. 
105 	 *
106 	 * The old javaswf extractor mishandles these. For example:
107 	 *
108 	 * "http://wayback.archive-it.org/1100/20080721212134/http://www.marca.com/futbol/madrid_vs_barca/previa/barca/barcaOK.swf",
109 	 *
110 	 * This one has a link that the new extractor handles correctly but the
111 	 * legacy one handles wrong. The link string is 'barca/delapeƱa.swf'.
112 	 * The legacy extractor incorrectly produces
113 	 * "barca/delape%EF%BF%BDa.swf" while the new one correctly produces
114 	 * "barca/delape%C3%B1a.swf". 
115 	 */
116 	public void xestNonAsciiLink() throws MalformedURLException, IOException {
117 		// url -> link to find
118 		HashMap<String,String> testUrls = new HashMap<String, String>();
119 		testUrls.put("http://wayback.archive-it.org/1100/20080721212134/http://www.marca.com/futbol/madrid_vs_barca/previa/barca/barcaOK.swf", "barca/delape%C3%B1a.swf");
120 		testUrls.put("http://wayback.archive-it.org/176/20080610233230/http://www.contraloriagen.gov.co/html/publicaciones/imagenes/analisis-proyec-ley.swf", "http://www.contraloriagen.gov.co:8081/internet/html/publicaciones/por_dependencia_y_clase.jsp?clases=3&titulo_pagina=An%C3%A1lisis%20a%20Proyectos%20de%20Ley%20y%20Actos%20Legislativos");
121 		testUrls.put("http://wayback.archive-it.org/176/20080610233230/http://www.ine.gov.ve/secciones/modulos/Apure/sApure.swf", "aspectosfisicos.asp?Codigo=Nacimientos&titulo=Nacimientos%20vivos%20registrados%20por%20a%C3%B1o,%20seg%C3%BAn%20municipio%20de%20residencia%20habitual%20de%20la%20madre,%201999-2002&Fuente=Fuente:%20Prefecturas%20y%20Jefaturas%20Civiles&cod_ent=13&nvalor=2_2&seccion=2");
122 		testUrls.put("http://wayback.archive-it.org/176/20080610233230/http://www.ine.gov.ve/secciones/modulos/Lara/sLara.swf", "aspectosfisicos.asp?Codigo=Nacimientos&titulo=Nacimientos%20vivos%20registrados%20por%20a%C3%B1o,%20seg%C3%BAn%20municipio%20de%20residencia%20habitual%20de%20la%20madre,%201999-2002&Fuente=Fuente:%20Prefecturas%20y%20Jefaturas%20Civiles&cod_ent=13&nvalor=2_2&seccion=2");
123 		testUrls.put("http://wayback.archive-it.org/176/20080610233230/http://www.minsa.gob.pe/hnhipolitounanue/text13.swf", "archivos%20cuerpo/APOYO%20A%20LA%20DOCENCIA%20E%20INVESTIG/Registro%20de%20Estudios%20Cl%C3%ADnicos.pdf");
124 		testUrls.put("http://wayback.archive-it.org/176/20080610233230/http://www.nacobre.com.mx/flash/Flash_mercados.swf", "NSMcdoAccesoriosBa%C3%B1o.asp");
125 		testUrls.put("http://wayback.archive-it.org/176/20080610233230/http://www.sagarpa.gob.mx/dlg/nuevoleon/ddr's/Montemorelos/text4.swf", "campa%C3%B1a_abeja.htm");
126 		testUrls.put("http://wayback.archive-it.org/176/20080610233230/http://www.sagarpa.gob.mx/dlg/tabasco/text2.swf", "delegacion/comunicacion/cartel%20reuni%C3%B3n%20forestal%20xviii%20media2.pdf");
127 		testUrls.put("http://wayback.archive-it.org/317/20061129141640/http://www.ine.gov.ve/secciones/modulos/Miranda/sMiranda.swf", "aspectosfisicos.asp?Codigo=Nacimientos&titulo=Nacimientos%20vivos%20registrados%20por%20a%C3%B1o,%20seg%C3%BAn%20municipio%20de%20residencia%20habitual%20de%20la%20madre,%201999-2002&Fuente=Fuente:%20Prefecturas%20y%20Jefaturas%20Civiles&cod_ent=13&nvalor=2_2&seccion=2");
128 		testUrls.put("http://wayback.archive-it.org/317/20061129141640/http://www.ine.gov.ve/secciones/modulos/Tachira/sTachira.swf", "aspectosfisicos.asp?Codigo=Nacimientos&titulo=Nacimientos%20vivos%20registrados%20por%20a%C3%B1o,%20seg%C3%BAn%20municipio%20de%20residencia%20habitual%20de%20la%20madre,%201999-2002&Fuente=Fuente:%20Prefecturas%20y%20Jefaturas%20Civiles&cod_ent=13&nvalor=2_2&seccion=2");
129 
130 		for (String url : testUrls.keySet()) {
131 			HttpRecorder recorder = HttpRecorder.wrapInputStreamWithHttpRecord(
132 					getTmpDir(), this.getClass().getName(), new URL(url)
133 							.openStream(), null);
134 			CrawlURI curi = setupCrawlURI(recorder, url);
135 
136 			long startTime = System.currentTimeMillis();
137 			this.extractor.innerProcess(curi);
138 			long elapsed = System.currentTimeMillis() - startTime;
139 			logger.info(this.extractor.getClass().getSimpleName() + " took "
140 					+ elapsed + "ms to process " + url);
141 
142 			boolean foundIt = false;
143 			for (Link link : curi.getOutLinks()) {
144 				logger.info("found link: " + link);
145 				foundIt = foundIt || link.getDestination().toString().endsWith(testUrls.get(url));
146 			}
147 
148 			if (!foundIt)
149 				logger.severe("failed to extract link \"" + testUrls.get(url)
150 						+ "\" from " + url);
151 			assertTrue("failed to extract link \"" + testUrls.get(url)
152 					+ "\" from " + url, foundIt);
153 		}
154 	}
155 
156 	private CrawlURI setupCrawlURI(HttpRecorder rec, String url)
157 			throws URIException {
158 		CrawlURI curi = new CrawlURI(UURIFactory.getInstance(url));
159 		curi.setContentSize(rec.getRecordedInput().getSize());
160 		curi.setContentType("application/x-shockwave-flash");
161 		curi.setFetchStatus(200);
162 		curi.setHttpRecorder(rec);
163 		// Fake out the extractor that this is a HTTP transaction.
164 		curi.putObject(CoreAttributeConstants.A_HTTP_TRANSACTION, new Object());
165 		return curi;
166 	}
167 }