1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 package org.archive.crawler.extractor;
28
29 import java.util.Collection;
30 import java.util.logging.Level;
31 import java.util.logging.Logger;
32 import java.util.regex.Matcher;
33
34 import org.apache.commons.httpclient.URIException;
35 import org.archive.crawler.datamodel.CoreAttributeConstants;
36 import org.archive.crawler.datamodel.CrawlURI;
37 import org.archive.crawler.settings.SimpleType;
38 import org.archive.util.TextUtils;
39
40 /***
41 * An extractor for finding 'implied' URIs inside other URIs. If the
42 * 'trigger' regex is matched, a new URI will be constructed from the
43 * 'build' replacement pattern.
44 *
45 * Unlike most other extractors, this works on URIs discovered by
46 * previous extractors. Thus it should appear near the end of any
47 * set of extractors.
48 *
49 * Initially, only finds absolute HTTP(S) URIs in query-string or its
50 * parameters.
51 *
52 * TODO: extend to find URIs in path-info
53 *
54 * @author Gordon Mohr
55 *
56 **/
57
58 public class ExtractorImpliedURI extends Extractor implements CoreAttributeConstants {
59
60 private static final long serialVersionUID = 8579045413127769497L;
61
62 private static Logger LOGGER =
63 Logger.getLogger(ExtractorImpliedURI.class.getName());
64
65 /*** regex which when matched triggers addition of 'implied' URI */
66 public static final String ATTR_TRIGGER_REGEXP = "trigger-regexp";
67 /*** replacement pattern used to build 'implied' URI */
68 public static final String ATTR_BUILD_PATTERN = "build-pattern";
69
70 /*** whether to remove URIs that trigger addition of 'implied' URI;
71 * default false
72 */
73 public static final String ATTR_REMOVE_TRIGGER_URIS = "remove-trigger-uris";
74
75
76
77 private long numberOfCURIsHandled = 0;
78 private long numberOfLinksExtracted = 0;
79
80 /***
81 * Constructor
82 *
83 * @param name
84 */
85 public ExtractorImpliedURI(String name) {
86 super(name, "Implied URI Extractor. Finds URIs implied by other " +
87 "URIs according to regex/replacement patterns. Should " +
88 "appear after most other extractors.");
89
90 addElementToDefinition(
91 new SimpleType(ATTR_TRIGGER_REGEXP,
92 "Triggering regular expression. When a discovered URI " +
93 "matches this pattern, the 'implied' URI will be " +
94 "built. The capturing groups of this expression are " +
95 "available for the build replacement pattern.", ""));
96 addElementToDefinition(
97 new SimpleType(ATTR_BUILD_PATTERN,
98 "Replacement pattern to build 'implied' URI, using " +
99 "captured groups of trigger expression.", ""));
100 addElementToDefinition(
101 new SimpleType(ATTR_REMOVE_TRIGGER_URIS,
102 "If true, all URIs that match trigger regular expression " +
103 "are removed from the list of extracted URIs. " +
104 "Default is false.", Boolean.FALSE));
105 }
106
107 /***
108 * Perform usual extraction on a CrawlURI
109 *
110 * @param curi Crawl URI to process.
111 */
112 public void extract(CrawlURI curi) {
113
114 this.numberOfCURIsHandled++;
115
116 Collection<Link> links = curi.getOutLinks();
117 Link[] sourceLinks = links.toArray(new Link[links.size()]);
118 for (Link wref: sourceLinks) {
119 String implied = extractImplied(
120 wref.getDestination(),
121 (String)getUncheckedAttribute(curi,ATTR_TRIGGER_REGEXP),
122 (String)getUncheckedAttribute(curi,ATTR_BUILD_PATTERN));
123 if (implied!=null) {
124 try {
125 curi.createAndAddLink(
126 implied,
127 Link.SPECULATIVE_MISC,
128 Link.SPECULATIVE_HOP);
129
130 numberOfLinksExtracted++;
131
132 final boolean removeTriggerURI =
133 ((Boolean)getUncheckedAttribute(curi,
134 ATTR_REMOVE_TRIGGER_URIS)).booleanValue();
135
136
137 if (removeTriggerURI) {
138 if (curi.getOutLinks().remove(wref)) {
139 LOGGER.log(Level.FINE, wref.getDestination() +
140 " has been removed from " +
141 wref.getSource() + " outlinks list.");
142 numberOfLinksExtracted--;
143
144 } else {
145 LOGGER.log(Level.FINE, "Failed to remove " +
146 wref.getDestination() + " from " +
147 wref.getSource()+ " outlinks list.");
148 }
149 }
150
151 } catch (URIException e) {
152 LOGGER.log(Level.FINE, "bad URI", e);
153 }
154 }
155 }
156 }
157
158 /***
159 * Utility method for extracting 'implied' URI given a source uri,
160 * trigger pattern, and build pattern.
161 *
162 * @param uri source to check for implied URI
163 * @param trigger regex pattern which if matched implies another URI
164 * @param build replacement pattern to build the implied URI
165 * @return implied URI, or null if none
166 */
167 protected static String extractImplied(CharSequence uri, String trigger, String build) {
168 if(trigger.length()==0) {
169
170 return null;
171 }
172 Matcher m = TextUtils.getMatcher(trigger, uri);
173 if(m.matches()) {
174 String result = m.replaceFirst(build);
175 TextUtils.recycleMatcher(m);
176 return result;
177 }
178 return null;
179 }
180
181 public String report() {
182 StringBuffer ret = new StringBuffer();
183 ret.append("Processor: "+ExtractorImpliedURI.class.getName()+"\n");
184 ret.append(" Function: Extracts links inside other URIs\n");
185 ret.append(" CrawlURIs handled: " + numberOfCURIsHandled + "\n");
186 ret.append(" Links extracted: " + numberOfLinksExtracted + "\n\n");
187
188 return ret.toString();
189 }
190 }