View Javadoc

1   /*
2    * ExtractorXML
3    *
4    * $Id: ExtractorXML.java 7143 2011-04-21 23:54:22Z nlevitt $
5    *
6    * Created on Sep 27, 2005
7    *
8    * Copyright (C) 2005 Internet Archive.
9    *
10   * This file is part of the Heritrix web crawler (crawler.archive.org).
11   *
12   * Heritrix is free software; you can redistribute it and/or modify
13   * it under the terms of the GNU Lesser Public License as published by
14   * the Free Software Foundation; either version 2.1 of the License, or
15   * any later version.
16   *
17   * Heritrix is distributed in the hope that it will be useful,
18   * but WITHOUT ANY WARRANTY; without even the implied warranty of
19   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20   * GNU Lesser Public License for more details.
21   *
22   * You should have received a copy of the GNU Lesser Public License
23   * along with Heritrix; if not, write to the Free Software
24   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
25   */
26  
27  package org.archive.crawler.extractor;
28  
29  import java.io.IOException;
30  import java.util.logging.Logger;
31  import java.util.regex.Matcher;
32  
33  import org.apache.commons.httpclient.URIException;
34  import org.apache.commons.lang.StringEscapeUtils;
35  import org.archive.crawler.datamodel.CoreAttributeConstants;
36  import org.archive.crawler.datamodel.CrawlURI;
37  import org.archive.crawler.framework.CrawlController;
38  import org.archive.io.ReplayCharSequence;
39  import org.archive.util.TextUtils;
40  
/**
 * A simple extractor which finds HTTP URIs inside XML/RSS files,
 * inside attribute values and simple elements (those with only
 * whitespace + HTTP URI + whitespace as contents).
 *
 * @author gojomo
 */
49  
50  public class ExtractorXML extends Extractor implements CoreAttributeConstants {
51  
52      private static final long serialVersionUID = 3101230586822401584L;
53  
54      private static Logger logger =
55          Logger.getLogger(ExtractorXML.class.getName());
56  
57      static final String XML_URI_EXTRACTOR =    
58      "(?i)[\"\'>]//s*(https?:[^//s\"\'<>]+)//s*[\"\'<]"; 
59      // GROUPS:
60      // (G1) URI
61      
62      private long numberOfCURIsHandled = 0;
63      private long numberOfLinksExtracted = 0;
64  
65      /***
66       * @param name
67       */
68      public ExtractorXML(String name) {
69          super(name, "XML Extractor. Extracts links from XML/RSS.");
70      }
71  
72      /***
73       * @param curi Crawl URI to process.
74       */
75      public void extract(CrawlURI curi) {
76          if (!isHttpTransactionContentToProcess(curi)) {
77              return;
78          }
79          
80          if (!shouldExtract(curi)) {
81          	return;
82          }
83  
84          ReplayCharSequence cs = null;
85          try {
86              cs = curi.getHttpRecorder().getReplayCharSequence();
87          } catch (IOException e) {
88              logger.severe("Failed getting ReplayCharSequence: " + e.getMessage());
89          }
90          if (cs == null) {
91              logger.severe("Failed getting ReplayCharSequence: " +
92                  curi.toString());
93              return;
94          }
95          
96          this.numberOfCURIsHandled++;
97  
98          try {
99              this.numberOfLinksExtracted += processXml(curi, cs,
100                 getController());
101             // Set flag to indicate that link extraction is completed.
102             curi.linkExtractorFinished();
103         } finally {
104             if (cs != null) {
105                 try {
106                     cs.close();
107                 } catch (IOException ioe) {
108                     logger.warning(TextUtils.exceptionToString(
109                             "Failed close of ReplayCharSequence.", ioe));
110                 }
111             }
112         }
113     }
114 
115     protected boolean shouldExtract(CrawlURI curi) {
116     	String mimeType = curi.getContentType();
117 
118     	// first check for xml mimetype or file extension
119     	// application/vnd.openxmlformats.* seem to be zip archives
120 		if (mimeType != null
121 				&& (mimeType.toLowerCase().indexOf("xml") >= 0 && !mimeType
122 						.matches("(?i)application/vnd.openxmlformats.*"))
123 				|| curi.toString().toLowerCase().endsWith(".rss")
124 				|| curi.toString().toLowerCase().endsWith(".xml")) {
125     		return true;
126     	}
127 
128     	// check if content starts with xml preamble "<?xml" and does not
129     	// contain "<!doctype html" or "<html" early in the content
130     	String contentStartingChunk = curi.getHttpRecorder().getContentReplayPrefixString(400); 
131     	if (contentStartingChunk.matches("(?is)[//ufeff]?<//?xml//s.*")
132     			&& !contentStartingChunk.matches("(?is).*(?:<!doctype//s+html|<html[>//s]).*")) {
133     		return true;
134     	}
135 
136     	return false;
137     }
138 
139 	public static long processXml(CrawlURI curi, CharSequence cs,
140             CrawlController controller) {
141         long foundLinks = 0;
142         Matcher uris = null;
143         String xmlUri;
144         uris = TextUtils.getMatcher(XML_URI_EXTRACTOR, cs);
145         while (uris.find()) {
146             xmlUri = StringEscapeUtils.unescapeXml(uris.group(1));
147             foundLinks++;
148             try {
149                 // treat as speculative, as whether context really 
150                 // intends to create a followable/fetchable URI is
151                 // unknown
152                 curi.createAndAddLink(xmlUri,Link.SPECULATIVE_MISC,
153                         Link.SPECULATIVE_HOP);
154             } catch (URIException e) {
155                 // There may not be a controller (e.g. If we're being run
156                 // by the extractor tool).
157                 if (controller != null) {
158                     controller.logUriError(e, curi.getUURI(), xmlUri);
159                 } else {
160                     logger.info(curi + ", " + xmlUri + ": " +
161                         e.getMessage());
162                 }
163             }
164         }
165         TextUtils.recycleMatcher(uris);
166         return foundLinks;
167     }
168 
169     public String report() {
170         StringBuffer ret = new StringBuffer();
171         ret.append("Processor: org.archive.crawler.extractor.ExtractorXML\n");
172         ret.append("  Function:          Link extraction on XML/RSS\n");
173         ret.append("  CrawlURIs handled: " + numberOfCURIsHandled + "\n");
174         ret.append("  Links extracted:   " + numberOfLinksExtracted + "\n\n");
175 
176         return ret.toString();
177     }
178 }