1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 package org.archive.crawler.extractor;
28
29 import java.util.ArrayList;
30 import java.util.Collection;
31 import java.util.List;
32 import java.util.logging.Level;
33 import java.util.logging.Logger;
34 import java.util.regex.Matcher;
35
36 import org.apache.commons.codec.DecoderException;
37 import org.apache.commons.httpclient.URIException;
38 import org.archive.crawler.datamodel.CoreAttributeConstants;
39 import org.archive.crawler.datamodel.CrawlURI;
40 import org.archive.net.LaxURLCodec;
41 import org.archive.net.UURI;
42 import org.archive.util.TextUtils;
43
44 /***
45 * An extractor for finding URIs inside other URIs. Unlike most other
46 * extractors, this works on URIs discovered by previous extractors. Thus
47 * it should appear near the end of any set of extractors.
48 *
49 * Initially, only finds absolute HTTP(S) URIs in query-string or its
50 * parameters.
51 *
52 * TODO: extend to find URIs in path-info
53 *
54 * @author Gordon Mohr
55 *
56 **/
57
58 public class ExtractorURI extends Extractor implements CoreAttributeConstants {
59
60 private static final long serialVersionUID = -6273897743240970822L;
61
62 private static Logger LOGGER =
63 Logger.getLogger(ExtractorURI.class.getName());
64
65 static final String ABS_HTTP_URI_PATTERN = "^https?://[^//s<>]*$";
66
67
68
69 private long numberOfCURIsHandled = 0;
70 private long numberOfLinksExtracted = 0;
71
72 /***
73 * Constructor
74 *
75 * @param name
76 */
77 public ExtractorURI(String name) {
78 super(name, "URI Extractor. Extracts links inside other " +
79 "discovered URIs. Should appear last among extractors.");
80 }
81
82 /***
83 * Perform usual extraction on a CrawlURI
84 *
85 * @param curi Crawl URI to process.
86 */
87 public void extract(CrawlURI curi) {
88
89 this.numberOfCURIsHandled++;
90
91 Collection<Link> links = curi.getOutLinks();
92 Link[] sourceLinks = links.toArray(new Link[links.size()]);
93 for (Link wref: sourceLinks) {
94 extractLink(curi,wref);
95 }
96 }
97
98 /***
99 * Consider a single Link for internal URIs
100 *
101 * @param curi CrawlURI to add discoveries to
102 * @param wref Link to examine for internal URIs
103 */
104 protected void extractLink(CrawlURI curi, Link wref) {
105 UURI source = UURI.from(wref.getDestination());
106 if(source == null) {
107
108 return;
109 }
110 List<String> found = extractQueryStringLinks(source);
111 for (String uri : found) {
112 try {
113 curi.createAndAddLink(
114 uri,
115 Link.SPECULATIVE_MISC,
116 Link.SPECULATIVE_HOP);
117 numberOfLinksExtracted++;
118 } catch (URIException e) {
119 LOGGER.log(Level.FINE, "bad URI", e);
120 }
121 }
122
123
124 }
125
126 /***
127 * Look for URIs inside the supplied UURI.
128 *
129 * Static for ease of testing or outside use.
130 *
131 * @param source UURI to example
132 * @return List of discovered String URIs.
133 */
134 protected static List<String> extractQueryStringLinks(UURI source) {
135 List<String> results = new ArrayList<String>();
136 String decodedQuery;
137 try {
138 decodedQuery = source.getQuery();
139 } catch (URIException e1) {
140
141 return results;
142 }
143 if(decodedQuery==null) {
144 return results;
145 }
146
147 Matcher m = TextUtils.getMatcher(ABS_HTTP_URI_PATTERN,decodedQuery);
148 if(m.matches()) {
149 TextUtils.recycleMatcher(m);
150 results.add(decodedQuery);
151 }
152
153 String rawQuery = new String(source.getRawQuery());
154 String[] params = rawQuery.split("&");
155 for (String param : params) {
156 String[] keyVal = param.split("=");
157 if(keyVal.length==2) {
158 String candidate;
159 try {
160 candidate = LaxURLCodec.DEFAULT.decode(keyVal[1]);
161 } catch (DecoderException e) {
162 continue;
163 }
164
165 m.reset(candidate);
166 if(m.matches()) {
167 results.add(candidate);
168 }
169 }
170 }
171 return results;
172 }
173
174 public String report() {
175 StringBuffer ret = new StringBuffer();
176 ret.append("Processor: "+ExtractorURI.class.getName()+"\n");
177 ret.append(" Function: Extracts links inside other URIs\n");
178 ret.append(" CrawlURIs handled: " + numberOfCURIsHandled + "\n");
179 ret.append(" Links extracted: " + numberOfLinksExtracted + "\n\n");
180
181 return ret.toString();
182 }
183 }