1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21 package org.archive.crawler.extractor;
22
23 import java.io.ByteArrayInputStream;
24 import java.io.File;
25 import java.io.IOException;
26 import java.util.Collection;
27 import java.util.Iterator;
28
29 import javax.management.AttributeNotFoundException;
30 import javax.management.InvalidAttributeValueException;
31 import javax.management.MBeanException;
32 import javax.management.ReflectionException;
33
34 import org.apache.commons.httpclient.URIException;
35 import org.archive.crawler.datamodel.CoreAttributeConstants;
36 import org.archive.crawler.datamodel.CrawlOrder;
37 import org.archive.crawler.datamodel.CrawlURI;
38 import org.archive.crawler.settings.MapType;
39 import org.archive.crawler.settings.SettingsHandler;
40 import org.archive.crawler.settings.XMLSettingsHandler;
41 import org.archive.net.UURI;
42 import org.archive.net.UURIFactory;
43 import org.archive.util.HttpRecorder;
44 import org.archive.util.TmpDirTestCase;
45
46 import com.google.common.base.Charsets;
47
48
49 /***
50 * Test XML extractor.
51 *
52 * @contributor gojomo
53 * @contributor stack
54 * @version $Revision: 6830 $, $Date: 2010-04-21 16:39:57 -0700 (Wed, 21 Apr 2010) $
55 */
56 public class ExtractorXMLTest
57 extends TmpDirTestCase
58 implements CoreAttributeConstants {
59 private final String LINK_TO_FIND = "http://www.example.org/";
60 private HttpRecorder recorder = null;
61 private ExtractorXML extractor = null;
62
63 protected ExtractorXML createExtractor()
64 throws InvalidAttributeValueException, AttributeNotFoundException,
65 MBeanException, ReflectionException {
66
67
68
69
70
71
72 final String name = this.getClass().getName();
73 SettingsHandler handler = new XMLSettingsHandler(
74 new File(getTmpDir(), name + ".order.xml"));
75 handler.initialize();
76 return (ExtractorXML)((MapType)handler.getOrder().
77 getAttribute(CrawlOrder.ATTR_RULES)).addElement(handler.
78 getSettingsObject(null), new ExtractorXML(name));
79 }
80
81 protected void setUp() throws Exception {
82 super.setUp();
83 this.extractor = createExtractor();
84 ByteArrayInputStream bais = new ByteArrayInputStream(
85 "<?xml version=\"1.0\"?><x y='http://www.example.org'>z</x>".getBytes(Charsets.UTF_8));
86 this.recorder = HttpRecorder.wrapInputStreamWithHttpRecord(getTmpDir(),
87 this.getClass().getName(), bais, "UTF-8");
88 }
89
90 protected void tearDown() throws Exception {
91 super.tearDown();
92 }
93
94 public void testNoHintsOtherThanContentPrefix() throws IOException {
95 UURI uuri = UURIFactory.getInstance("http://www.example.com"); // no XML-related suffix
96 CrawlURI curi = setupCrawlURI(this.recorder, uuri.toString());
97 curi.setContentType("image/gif");
98 this.extractor.innerProcess(curi);
99 Collection<Link> links = curi.getOutLinks();
100 boolean foundLink = false;
101 for (Iterator<Link> i = links.iterator(); i.hasNext();) {
102 Link link = (Link)i.next();
103 if (link.getDestination().toString().equals(this.LINK_TO_FIND)) {
104 foundLink = true;
105 }
106 }
107 assertTrue("Did not find url", foundLink);
108 }
109
110 private CrawlURI setupCrawlURI(HttpRecorder rec, String url)
111 throws URIException {
112 CrawlURI curi = new CrawlURI(UURIFactory.getInstance(url));
113 curi.setContentSize(this.recorder.getRecordedInput().getSize());
114 curi.setContentType("text/xml");
115 curi.setFetchStatus(200);
116 curi.setHttpRecorder(rec);
117
118 curi.putObject(CoreAttributeConstants.A_HTTP_TRANSACTION,
119 new Object());
120 return curi;
121 }
122 }