1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.crawler.extractor;
24
25 import java.io.File;
26 import java.io.FileOutputStream;
27 import java.io.IOException;
28 import java.net.URL;
29 import java.util.Collection;
30 import java.util.Iterator;
31
32 import javax.management.Attribute;
33 import javax.management.AttributeNotFoundException;
34 import javax.management.InvalidAttributeValueException;
35 import javax.management.MBeanException;
36 import javax.management.ReflectionException;
37
38 import org.apache.commons.collections.CollectionUtils;
39 import org.apache.commons.collections.Predicate;
40 import org.apache.commons.httpclient.URIException;
41 import org.archive.crawler.datamodel.CoreAttributeConstants;
42 import org.archive.crawler.datamodel.CrawlOrder;
43 import org.archive.crawler.datamodel.CrawlURI;
44 import org.archive.crawler.settings.MapType;
45 import org.archive.crawler.settings.SettingsHandler;
46 import org.archive.crawler.settings.XMLSettingsHandler;
47 import org.archive.net.UURI;
48 import org.archive.net.UURIFactory;
49 import org.archive.util.HttpRecorder;
50
51
52 /***
53 * Test html extractor.
54 *
55 * @author stack
56 * @version $Revision: 5757 $, $Date: 2008-02-06 07:44:20 +0000 (Wed, 06 Feb 2008) $
57 */
58 public class JerichoExtractorHTMLTest
59 extends ExtractorHTMLTest
60 implements CoreAttributeConstants {
61 private final String ARCHIVE_DOT_ORG = "archive.org";
62 private final String LINK_TO_FIND = "http://www.hewlett.org/";
63 private HttpRecorder recorder = null;
64 private JerichoExtractorHTML extractor = null;
65
66 protected JerichoExtractorHTML createExtractor()
67 throws InvalidAttributeValueException, AttributeNotFoundException,
68 MBeanException, ReflectionException {
69
70
71
72
73
74
75 final String name = this.getClass().getName();
76 SettingsHandler handler = new XMLSettingsHandler(
77 new File(getTmpDir(), name + ".order.xml"));
78 handler.initialize();
79 return (JerichoExtractorHTML)((MapType)handler.getOrder().
80 getAttribute(CrawlOrder.ATTR_RULES)).addElement(handler.
81 getSettingsObject(null), new JerichoExtractorHTML(name));
82 }
83
84 protected void setUp() throws Exception {
85 super.setUp();
86 this.extractor = createExtractor();
87 final boolean USE_NET = false;
88 URL url = null;
89 if (USE_NET) {
90 url = new URL("http://" + this.ARCHIVE_DOT_ORG);
91 } else {
92 File f = new File(getTmpDir(), this.ARCHIVE_DOT_ORG + ".html");
93 url = f.toURI().toURL();
94 FileOutputStream fos = new FileOutputStream(f);
95 fos.write(("<html><head><title>test</title><body>" +
96 "<a href=" + this.LINK_TO_FIND + ">Hewlett Foundation</a>" +
97 "</body></html>").getBytes());
98 fos.flush();
99 fos.close();
100 }
101 this.recorder = HttpRecorder.wrapInputStreamWithHttpRecord(getTmpDir(),
102 this.getClass().getName(), url.openStream(), null);
103 }
104
105
106 public void testInnerProcess() throws IOException {
107 UURI uuri = UURIFactory.getInstance("http://" + this.ARCHIVE_DOT_ORG);
108 CrawlURI curi = setupCrawlURI(this.recorder, uuri.toString());
109 this.extractor.innerProcess(curi);
110 Collection links = curi.getOutLinks();
111 boolean foundLinkToHewlettFoundation = false;
112 for (Iterator i = links.iterator(); i.hasNext();) {
113 Link link = (Link)i.next();
114 if (link.getDestination().toString().equals(this.LINK_TO_FIND)) {
115 foundLinkToHewlettFoundation = true;
116 break;
117 }
118 }
119 assertTrue("Did not find gif url", foundLinkToHewlettFoundation);
120 }
121
122 private CrawlURI setupCrawlURI(HttpRecorder rec, String url)
123 throws URIException {
124 CrawlURI curi = new CrawlURI(UURIFactory.getInstance(url));
125 curi.setContentSize(this.recorder.getRecordedInput().getSize());
126 curi.setContentType("text/html");
127 curi.setFetchStatus(200);
128 curi.setHttpRecorder(rec);
129
130 curi.putObject(CoreAttributeConstants.A_HTTP_TRANSACTION,
131 new Object());
132 return curi;
133 }
134
135
136 /***
137 * Test a GET FORM ACTION extraction
138 *
139 * @throws URIException
140 */
141 public void testFormsLinkGet() throws URIException {
142 CrawlURI curi =
143 new CrawlURI(UURIFactory.getInstance("http://www.example.org"));
144 CharSequence cs =
145 "<form name=\"testform\" method=\"GET\" action=\"redirect_me?form=true\"> " +
146 " <INPUT TYPE=CHECKBOX NAME=\"checked[]\" VALUE=\"1\" CHECKED> "+
147 " <INPUT TYPE=CHECKBOX NAME=\"unchecked[]\" VALUE=\"1\"> " +
148 " <select name=\"selectBox\">" +
149 " <option value=\"selectedOption\" selected>option1</option>" +
150 " <option value=\"nonselectedOption\">option2</option>" +
151 " </select>" +
152 " <input type=\"submit\" name=\"test\" value=\"Go\">" +
153 "</form>";
154 this.extractor.extract(curi,cs);
155 curi.getOutLinks();
156 assertTrue(CollectionUtils.exists(curi.getOutLinks(), new Predicate() {
157 public boolean evaluate(Object object) {
158 return ((Link) object).getDestination().toString().indexOf(
159 "/redirect_me?form=true&checked[]=1&unchecked[]=&selectBox=selectedOption&test=Go")>=0;
160 }
161 }));
162 }
163
164 /***
165 * Test a POST FORM ACTION being properly ignored
166 *
167 * @throws URIException
168 */
169 public void testFormsLinkIgnorePost() throws URIException {
170 CrawlURI curi =
171 new CrawlURI(UURIFactory.getInstance("http://www.example.org"));
172 CharSequence cs =
173 "<form name=\"testform\" method=\"POST\" action=\"redirect_me?form=true\"> " +
174 " <INPUT TYPE=CHECKBOX NAME=\"checked[]\" VALUE=\"1\" CHECKED> "+
175 " <INPUT TYPE=CHECKBOX NAME=\"unchecked[]\" VALUE=\"1\"> " +
176 " <select name=\"selectBox\">" +
177 " <option value=\"selectedOption\" selected>option1</option>" +
178 " <option value=\"nonselectedOption\">option2</option>" +
179 " </select>" +
180 " <input type=\"submit\" name=\"test\" value=\"Go\">" +
181 "</form>";
182 this.extractor.extract(curi,cs);
183 curi.getOutLinks();
184 assertTrue(! CollectionUtils.exists(curi.getOutLinks(), new Predicate() {
185 public boolean evaluate(Object object) {
186 return ((Link) object).getDestination().toString().indexOf(
187 "/redirect_me?form=true&checked[]=1&unchecked[]=&selectBox=selectedOption&test=Go")>=0;
188 }
189 }));
190 }
191
192 /***
193 * Test a POST FORM ACTION being found with non-default setting
194 *
195 * @throws URIException
196 * @throws ReflectionException
197 * @throws MBeanException
198 * @throws InvalidAttributeValueException
199 * @throws AttributeNotFoundException
200 */
201 public void testFormsLinkFindPost() throws URIException, AttributeNotFoundException, InvalidAttributeValueException, MBeanException, ReflectionException {
202 CrawlURI curi =
203 new CrawlURI(UURIFactory.getInstance("http://www.example.org"));
204 CharSequence cs =
205 "<form name=\"testform\" method=\"POST\" action=\"redirect_me?form=true\"> " +
206 " <INPUT TYPE=CHECKBOX NAME=\"checked[]\" VALUE=\"1\" CHECKED> "+
207 " <INPUT TYPE=CHECKBOX NAME=\"unchecked[]\" VALUE=\"1\"> " +
208 " <select name=\"selectBox\">" +
209 " <option value=\"selectedOption\" selected>option1</option>" +
210 " <option value=\"nonselectedOption\">option2</option>" +
211 " </select>" +
212 " <input type=\"submit\" name=\"test\" value=\"Go\">" +
213 "</form>";
214 this.extractor.setAttribute(
215 new Attribute(ExtractorHTML.ATTR_EXTRACT_ONLY_FORM_GETS,false));
216 this.extractor.extract(curi,cs);
217 curi.getOutLinks();
218 assertTrue(CollectionUtils.exists(curi.getOutLinks(), new Predicate() {
219 public boolean evaluate(Object object) {
220 return ((Link) object).getDestination().toString().indexOf(
221 "/redirect_me?form=true&checked[]=1&unchecked[]=&selectBox=selectedOption&test=Go")>=0;
222 }
223 }));
224 }
225
226 public void testMultipleAttributesPerElement() throws URIException {
227 CrawlURI curi = new CrawlURI(UURIFactory
228 .getInstance("http://www.example.com"));
229 CharSequence cs = "<a src=\"http://www.example.com/\" href=\"http://www.archive.org/\"> ";
230 this.extractor.extract(curi, cs);
231 Link[] links = curi.getOutLinks().toArray(new Link[0]);
232 assertTrue("not all links found", links.length == 2);
233 }
234 }