1 package org.archive.crawler.extractor;
2
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;
import javax.management.InvalidAttributeValueException;
import javax.management.MBeanException;
import javax.management.ReflectionException;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlOrder;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.settings.MapType;
import org.archive.crawler.settings.SettingsHandler;
import org.archive.crawler.settings.XMLSettingsHandler;
import org.archive.net.UURIFactory;
import org.archive.util.HttpRecorder;
import org.archive.util.TmpDirTestCase;
25
26
27
28
29
30
31 public class ExtractorSWFTest extends TmpDirTestCase implements
32 CoreAttributeConstants {
33
34 private static Logger logger = Logger.getLogger(ExtractorSWFTest.class
35 .getName());
36
37 private ExtractorSWF extractor;
38
39 protected void initializeExtractors()
40 throws InvalidAttributeValueException, AttributeNotFoundException,
41 MBeanException, ReflectionException {
42
43
44
45
46
47
48 SettingsHandler handler = new XMLSettingsHandler(new File(getTmpDir(),
49 this.getClass().getName() + ".order.xml"));
50 handler.initialize();
51
52 this.extractor = (ExtractorSWF) ((MapType) handler.getOrder()
53 .getAttribute(CrawlOrder.ATTR_RULES)).addElement(handler
54 .getSettingsObject(null), new ExtractorSWF(
55 "ExtractorSWFTest/ExtractorSWF"));
56 }
57
58 protected void setUp() throws Exception {
59 super.setUp();
60 this.initializeExtractors();
61 }
62
63 protected void tearDown() throws Exception {
64 super.tearDown();
65 }
66
67
68
69 public void testNothing() {
70 logger.info("Not testing SWF extractors. To enable these tests, rename the xest* methods in ExtractorSWFTest.java");
71 }
72
73 public void xestHer1509() throws IOException {
74
75 HashMap<String, String> testUrls = new HashMap<String, String>();
76 testUrls.put("http://wayback.archive-it.org/779/20080709003013/http://www.dreamingmethods.com/uploads/lastdream/loader.swf", "project.swf");
77 testUrls.put("http://wayback.archive-it.org/1094/20080923035716/http://www.dreamingmethods.com/uploads/dm_archive/mainsite/downloads/flash/Dim%20O%20Gauble/loader.swf", "map_3d.swf");
78 testUrls.put("http://wayback.archive-it.org/1094/20080923040243/http://www.dreamingmethods.com/uploads/dm_archive/mainsite/downloads/flash/clearance/loader.swf", "clearance_intro.swf");
79
80 for (String url : testUrls.keySet()) {
81 HttpRecorder recorder = HttpRecorder.wrapInputStreamWithHttpRecord(
82 getTmpDir(), this.getClass().getName(), new URL(url)
83 .openStream(), null);
84 CrawlURI curi = setupCrawlURI(recorder, url);
85
86 long startTime = System.currentTimeMillis();
87 this.extractor.innerProcess(curi);
88 long elapsed = System.currentTimeMillis() - startTime;
89 logger.info(this.extractor.getClass().getSimpleName() + " took "
90 + elapsed + "ms to process " + url);
91
92 boolean foundIt = false;
93 for (Link link : curi.getOutLinks()) {
94 logger.info("found link: " + link);
95 foundIt = foundIt || link.getDestination().toString().endsWith(testUrls.get(url));
96 }
97
98 assertTrue("failed to extract link \"" + testUrls.get(url)
99 + "\" from " + url, foundIt);
100 }
101 }
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116 public void xestNonAsciiLink() throws MalformedURLException, IOException {
117
118 HashMap<String,String> testUrls = new HashMap<String, String>();
119 testUrls.put("http://wayback.archive-it.org/1100/20080721212134/http://www.marca.com/futbol/madrid_vs_barca/previa/barca/barcaOK.swf", "barca/delape%C3%B1a.swf");
120 testUrls.put("http://wayback.archive-it.org/176/20080610233230/http://www.contraloriagen.gov.co/html/publicaciones/imagenes/analisis-proyec-ley.swf", "http://www.contraloriagen.gov.co:8081/internet/html/publicaciones/por_dependencia_y_clase.jsp?clases=3&titulo_pagina=An%C3%A1lisis%20a%20Proyectos%20de%20Ley%20y%20Actos%20Legislativos");
121 testUrls.put("http://wayback.archive-it.org/176/20080610233230/http://www.ine.gov.ve/secciones/modulos/Apure/sApure.swf", "aspectosfisicos.asp?Codigo=Nacimientos&titulo=Nacimientos%20vivos%20registrados%20por%20a%C3%B1o,%20seg%C3%BAn%20municipio%20de%20residencia%20habitual%20de%20la%20madre,%201999-2002&Fuente=Fuente:%20Prefecturas%20y%20Jefaturas%20Civiles&cod_ent=13&nvalor=2_2&seccion=2");
122 testUrls.put("http://wayback.archive-it.org/176/20080610233230/http://www.ine.gov.ve/secciones/modulos/Lara/sLara.swf", "aspectosfisicos.asp?Codigo=Nacimientos&titulo=Nacimientos%20vivos%20registrados%20por%20a%C3%B1o,%20seg%C3%BAn%20municipio%20de%20residencia%20habitual%20de%20la%20madre,%201999-2002&Fuente=Fuente:%20Prefecturas%20y%20Jefaturas%20Civiles&cod_ent=13&nvalor=2_2&seccion=2");
123 testUrls.put("http://wayback.archive-it.org/176/20080610233230/http://www.minsa.gob.pe/hnhipolitounanue/text13.swf", "archivos%20cuerpo/APOYO%20A%20LA%20DOCENCIA%20E%20INVESTIG/Registro%20de%20Estudios%20Cl%C3%ADnicos.pdf");
124 testUrls.put("http://wayback.archive-it.org/176/20080610233230/http://www.nacobre.com.mx/flash/Flash_mercados.swf", "NSMcdoAccesoriosBa%C3%B1o.asp");
125 testUrls.put("http://wayback.archive-it.org/176/20080610233230/http://www.sagarpa.gob.mx/dlg/nuevoleon/ddr's/Montemorelos/text4.swf", "campa%C3%B1a_abeja.htm");
126 testUrls.put("http://wayback.archive-it.org/176/20080610233230/http://www.sagarpa.gob.mx/dlg/tabasco/text2.swf", "delegacion/comunicacion/cartel%20reuni%C3%B3n%20forestal%20xviii%20media2.pdf");
127 testUrls.put("http://wayback.archive-it.org/317/20061129141640/http://www.ine.gov.ve/secciones/modulos/Miranda/sMiranda.swf", "aspectosfisicos.asp?Codigo=Nacimientos&titulo=Nacimientos%20vivos%20registrados%20por%20a%C3%B1o,%20seg%C3%BAn%20municipio%20de%20residencia%20habitual%20de%20la%20madre,%201999-2002&Fuente=Fuente:%20Prefecturas%20y%20Jefaturas%20Civiles&cod_ent=13&nvalor=2_2&seccion=2");
128 testUrls.put("http://wayback.archive-it.org/317/20061129141640/http://www.ine.gov.ve/secciones/modulos/Tachira/sTachira.swf", "aspectosfisicos.asp?Codigo=Nacimientos&titulo=Nacimientos%20vivos%20registrados%20por%20a%C3%B1o,%20seg%C3%BAn%20municipio%20de%20residencia%20habitual%20de%20la%20madre,%201999-2002&Fuente=Fuente:%20Prefecturas%20y%20Jefaturas%20Civiles&cod_ent=13&nvalor=2_2&seccion=2");
129
130 for (String url : testUrls.keySet()) {
131 HttpRecorder recorder = HttpRecorder.wrapInputStreamWithHttpRecord(
132 getTmpDir(), this.getClass().getName(), new URL(url)
133 .openStream(), null);
134 CrawlURI curi = setupCrawlURI(recorder, url);
135
136 long startTime = System.currentTimeMillis();
137 this.extractor.innerProcess(curi);
138 long elapsed = System.currentTimeMillis() - startTime;
139 logger.info(this.extractor.getClass().getSimpleName() + " took "
140 + elapsed + "ms to process " + url);
141
142 boolean foundIt = false;
143 for (Link link : curi.getOutLinks()) {
144 logger.info("found link: " + link);
145 foundIt = foundIt || link.getDestination().toString().endsWith(testUrls.get(url));
146 }
147
148 if (!foundIt)
149 logger.severe("failed to extract link \"" + testUrls.get(url)
150 + "\" from " + url);
151 assertTrue("failed to extract link \"" + testUrls.get(url)
152 + "\" from " + url, foundIt);
153 }
154 }
155
156 private CrawlURI setupCrawlURI(HttpRecorder rec, String url)
157 throws URIException {
158 CrawlURI curi = new CrawlURI(UURIFactory.getInstance(url));
159 curi.setContentSize(rec.getRecordedInput().getSize());
160 curi.setContentType("application/x-shockwave-flash");
161 curi.setFetchStatus(200);
162 curi.setHttpRecorder(rec);
163
164 curi.putObject(CoreAttributeConstants.A_HTTP_TRANSACTION, new Object());
165 return curi;
166 }
167 }