View Javadoc

1   /* ExtractorXMLTest
2    * 
3    * Copyright (C) 2011 Internet Archive.
4    *
5    * This file is part of the Heritrix web crawler (crawler.archive.org).
6    *
7    * Heritrix is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU Lesser Public License as published by
9    * the Free Software Foundation; either version 2.1 of the License, or
10   * any later version.
11   *
12   * Heritrix is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU Lesser Public License for more details.
16   *
17   * You should have received a copy of the GNU Lesser Public License
18   * along with Heritrix; if not, write to the Free Software
19   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
20   */
21  package org.archive.crawler.extractor;
22  
23  import java.io.ByteArrayInputStream;
24  import java.io.File;
25  import java.io.IOException;
26  import java.util.Collection;
27  import java.util.Iterator;
28  
29  import javax.management.AttributeNotFoundException;
30  import javax.management.InvalidAttributeValueException;
31  import javax.management.MBeanException;
32  import javax.management.ReflectionException;
33  
34  import org.apache.commons.httpclient.URIException;
35  import org.archive.crawler.datamodel.CoreAttributeConstants;
36  import org.archive.crawler.datamodel.CrawlOrder;
37  import org.archive.crawler.datamodel.CrawlURI;
38  import org.archive.crawler.settings.MapType;
39  import org.archive.crawler.settings.SettingsHandler;
40  import org.archive.crawler.settings.XMLSettingsHandler;
41  import org.archive.net.UURI;
42  import org.archive.net.UURIFactory;
43  import org.archive.util.HttpRecorder;
44  import org.archive.util.TmpDirTestCase;
45  
46  import com.google.common.base.Charsets;
47  
48  
49  /***
50   * Test XML extractor.
51   *
52   * @contributor gojomo
53   * @contributor stack
54   * @version $Revision: 6830 $, $Date: 2010-04-21 16:39:57 -0700 (Wed, 21 Apr 2010) $
55   */
56  public class ExtractorXMLTest
57  extends TmpDirTestCase
58  implements CoreAttributeConstants {
59      private final String LINK_TO_FIND = "http://www.example.org/";
60      private HttpRecorder recorder = null;
61      private ExtractorXML extractor = null;
62      
63      protected ExtractorXML createExtractor()
64      throws InvalidAttributeValueException, AttributeNotFoundException,
65      MBeanException, ReflectionException {
66          // Hack in a settings handler.  Do this by adding this extractor
67          // to the order file (I'm adding it to a random MapType; seemingly
68          // can only add to MapTypes post-construction). This takes care
69          // of setting a valid SettingsHandler into the ExtractorHTML (This
70          // shouldn't be so difficult).  Of note, the order file below is
71          // not written to disk.
72          final String name = this.getClass().getName();
73          SettingsHandler handler = new XMLSettingsHandler(
74              new File(getTmpDir(), name + ".order.xml"));
75          handler.initialize();
76          return (ExtractorXML)((MapType)handler.getOrder().
77              getAttribute(CrawlOrder.ATTR_RULES)).addElement(handler.
78                  getSettingsObject(null), new ExtractorXML(name));
79      }
80      
81      protected void setUp() throws Exception {
82          super.setUp();
83          this.extractor = createExtractor();
84          ByteArrayInputStream bais = new ByteArrayInputStream(
85              "<?xml version=\"1.0\"?><x y='http://www.example.org'>z</x>".getBytes(Charsets.UTF_8));
86          this.recorder = HttpRecorder.wrapInputStreamWithHttpRecord(getTmpDir(),
87              this.getClass().getName(), bais, "UTF-8");
88      }
89  
90      protected void tearDown() throws Exception {
91          super.tearDown();
92      }
93  
94      public void testNoHintsOtherThanContentPrefix() throws IOException {
95          UURI uuri = UURIFactory.getInstance("http://www.example.com"); // no XML-related suffix
96          CrawlURI curi = setupCrawlURI(this.recorder, uuri.toString());
97          curi.setContentType("image/gif"); // [SIC] remove mimetype indicator
98          this.extractor.innerProcess(curi);
99          Collection<Link> links = curi.getOutLinks();
100         boolean foundLink = false;
101         for (Iterator<Link> i = links.iterator(); i.hasNext();) {
102             Link link = (Link)i.next();
103             if (link.getDestination().toString().equals(this.LINK_TO_FIND)) {
104                 foundLink = true;
105             }
106         }
107         assertTrue("Did not find url", foundLink);
108     }
109     
110     private CrawlURI setupCrawlURI(HttpRecorder rec, String url)
111     		throws URIException {
112         CrawlURI curi = new CrawlURI(UURIFactory.getInstance(url));
113         curi.setContentSize(this.recorder.getRecordedInput().getSize());
114         curi.setContentType("text/xml"); // FIXME: try other recommended XML types
115         curi.setFetchStatus(200);
116         curi.setHttpRecorder(rec);
117         // Fake out the extractor that this is a HTTP transaction.
118         curi.putObject(CoreAttributeConstants.A_HTTP_TRANSACTION,
119             new Object());
120         return curi;
121     }
122 }