1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 package org.archive.crawler.extractor;
28
29 import java.io.IOException;
30 import java.util.logging.Logger;
31 import java.util.regex.Matcher;
32
33 import org.apache.commons.httpclient.URIException;
34 import org.apache.commons.lang.StringEscapeUtils;
35 import org.archive.crawler.datamodel.CoreAttributeConstants;
36 import org.archive.crawler.datamodel.CrawlURI;
37 import org.archive.crawler.framework.CrawlController;
38 import org.archive.io.ReplayCharSequence;
39 import org.archive.util.TextUtils;
40
41 /***
42 * A simple extractor which finds HTTP URIs inside XML/RSS files,
43 * inside attribute values and simple elements (those with only
44 * whitespace + HTTP URI + whitespace as contents)
45 *
46 * @author gojomo
47 *
48 **/
49
50 public class ExtractorXML extends Extractor implements CoreAttributeConstants {
51
52 private static final long serialVersionUID = 3101230586822401584L;
53
54 private static Logger logger =
55 Logger.getLogger(ExtractorXML.class.getName());
56
57 static final String XML_URI_EXTRACTOR =
58 "(?i)[\"\'>]//s*(https?:[^//s\"\'<>]+)//s*[\"\'<]";
59
60
61
62 private long numberOfCURIsHandled = 0;
63 private long numberOfLinksExtracted = 0;
64
65 /***
66 * @param name
67 */
68 public ExtractorXML(String name) {
69 super(name, "XML Extractor. Extracts links from XML/RSS.");
70 }
71
72 /***
73 * @param curi Crawl URI to process.
74 */
75 public void extract(CrawlURI curi) {
76 if (!isHttpTransactionContentToProcess(curi)) {
77 return;
78 }
79
80 if (!shouldExtract(curi)) {
81 return;
82 }
83
84 ReplayCharSequence cs = null;
85 try {
86 cs = curi.getHttpRecorder().getReplayCharSequence();
87 } catch (IOException e) {
88 logger.severe("Failed getting ReplayCharSequence: " + e.getMessage());
89 }
90 if (cs == null) {
91 logger.severe("Failed getting ReplayCharSequence: " +
92 curi.toString());
93 return;
94 }
95
96 this.numberOfCURIsHandled++;
97
98 try {
99 this.numberOfLinksExtracted += processXml(curi, cs,
100 getController());
101
102 curi.linkExtractorFinished();
103 } finally {
104 if (cs != null) {
105 try {
106 cs.close();
107 } catch (IOException ioe) {
108 logger.warning(TextUtils.exceptionToString(
109 "Failed close of ReplayCharSequence.", ioe));
110 }
111 }
112 }
113 }
114
115 protected boolean shouldExtract(CrawlURI curi) {
116 String mimeType = curi.getContentType();
117
118
119
120 if (mimeType != null
121 && (mimeType.toLowerCase().indexOf("xml") >= 0 && !mimeType
122 .matches("(?i)application/vnd.openxmlformats.*"))
123 || curi.toString().toLowerCase().endsWith(".rss")
124 || curi.toString().toLowerCase().endsWith(".xml")) {
125 return true;
126 }
127
128
129
130 String contentStartingChunk = curi.getHttpRecorder().getContentReplayPrefixString(400);
131 if (contentStartingChunk.matches("(?is)[//ufeff]?<//?xml//s.*")
132 && !contentStartingChunk.matches("(?is).*(?:<!doctype//s+html|<html[>//s]).*")) {
133 return true;
134 }
135
136 return false;
137 }
138
139 public static long processXml(CrawlURI curi, CharSequence cs,
140 CrawlController controller) {
141 long foundLinks = 0;
142 Matcher uris = null;
143 String xmlUri;
144 uris = TextUtils.getMatcher(XML_URI_EXTRACTOR, cs);
145 while (uris.find()) {
146 xmlUri = StringEscapeUtils.unescapeXml(uris.group(1));
147 foundLinks++;
148 try {
149
150
151
152 curi.createAndAddLink(xmlUri,Link.SPECULATIVE_MISC,
153 Link.SPECULATIVE_HOP);
154 } catch (URIException e) {
155
156
157 if (controller != null) {
158 controller.logUriError(e, curi.getUURI(), xmlUri);
159 } else {
160 logger.info(curi + ", " + xmlUri + ": " +
161 e.getMessage());
162 }
163 }
164 }
165 TextUtils.recycleMatcher(uris);
166 return foundLinks;
167 }
168
169 public String report() {
170 StringBuffer ret = new StringBuffer();
171 ret.append("Processor: org.archive.crawler.extractor.ExtractorXML\n");
172 ret.append(" Function: Link extraction on XML/RSS\n");
173 ret.append(" CrawlURIs handled: " + numberOfCURIsHandled + "\n");
174 ret.append(" Links extracted: " + numberOfLinksExtracted + "\n\n");
175
176 return ret.toString();
177 }
178 }