1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 package org.archive.crawler.extractor;
28
29 import java.io.IOException;
30 import java.util.logging.Logger;
31 import java.util.regex.Matcher;
32
33 import org.apache.commons.httpclient.URIException;
34 import org.archive.crawler.datamodel.CoreAttributeConstants;
35 import org.archive.crawler.datamodel.CrawlURI;
36 import org.archive.crawler.framework.CrawlController;
37 import org.archive.io.ReplayCharSequence;
38 import org.archive.net.UURI;
39 import org.archive.util.DevUtils;
40 import org.archive.util.TextUtils;
41
42 /***
43 * This extractor is parsing URIs from CSS type files.
44 * The format of a CSS URL value is 'url(' followed by optional white space
45 * followed by an optional single quote (') or double quote (") character
46 * followed by the URL itself followed by an optional single quote (') or
47 * double quote (") character followed by optional white space followed by ')'.
48 * Parentheses, commas, white space characters, single quotes (') and double
49 * quotes (") appearing in a URL must be escaped with a backslash:
50 * '\(', '\)', '\,'. Partial URLs are interpreted relative to the source of
51 * the style sheet, not relative to the document. <a href="http://www.w3.org/TR/REC-CSS1#url">
52 * Source: www.w3.org</a>
53 *
54 * @author Igor Ranitovic
55 *
56 **/
57
58 public class ExtractorCSS extends Extractor implements CoreAttributeConstants {
59
60 private static final long serialVersionUID = -1540252885329424902L;
61
62 private static Logger logger =
63 Logger.getLogger("org.archive.crawler.extractor.ExtractorCSS");
64
65 private static String ESCAPED_AMP = "&";
66
67
68
69 static final String CSS_BACKSLASH_ESCAPE = "////([,'\"//(//)//s])";
70
71 /***
72 * CSS URL extractor pattern.
73 *
74 * This pattern extracts URIs for CSS files
75 **/
76
77
78 static final String CSS_URI_EXTRACTOR =
79 "(?i)(?:@import (?:url[(]|)|url[(])//s*([//\"\']?)" +
80 "([^//\"\'].{0,"+UURI.MAX_URL_LENGTH+"}?)//1//s*[);]";
81
82
83
84
85 private long numberOfCURIsHandled = 0;
86 private long numberOfLinksExtracted = 0;
87
88 /***
89 * @param name
90 */
91 public ExtractorCSS(String name) {
92 super(name, "CSS Extractor. Extracts links from Cascading Style" +
93 " Sheets (.css).");
94 }
95
96 /***
97 * @param curi Crawl URI to process.
98 */
99 public void extract(CrawlURI curi) {
100 if (!isHttpTransactionContentToProcess(curi)) {
101 return;
102 }
103 String mimeType = curi.getContentType();
104 if (mimeType == null) {
105 return;
106 }
107 if ((mimeType.toLowerCase().indexOf("css") < 0) &&
108 (!curi.toString().toLowerCase().endsWith(".css"))) {
109 return;
110 }
111 this.numberOfCURIsHandled++;
112
113 ReplayCharSequence cs = null;
114 try {
115 cs = curi.getHttpRecorder().getReplayCharSequence();
116 } catch (IOException e) {
117 logger.severe("Failed getting ReplayCharSequence: " + e.getMessage());
118 }
119 if (cs == null) {
120 logger.warning("Failed getting ReplayCharSequence: " +
121 curi.toString());
122 return;
123 }
124
125
126
127 try {
128 this.numberOfLinksExtracted +=
129 processStyleCode(curi, cs, getController());
130
131 curi.linkExtractorFinished();
132 } finally {
133 if (cs != null) {
134 try {
135 cs.close();
136 } catch (IOException ioe) {
137 logger.warning(TextUtils.exceptionToString(
138 "Failed close of ReplayCharSequence.", ioe));
139 }
140 }
141 }
142 }
143
144 public static long processStyleCode(CrawlURI curi, CharSequence cs,
145 CrawlController controller) {
146 long foundLinks = 0;
147 Matcher uris = null;
148 String cssUri;
149 try {
150 uris = TextUtils.getMatcher(CSS_URI_EXTRACTOR, cs);
151 while (uris.find()) {
152 cssUri = uris.group(2);
153
154 cssUri = TextUtils.replaceAll(ESCAPED_AMP, cssUri, "&");
155
156 cssUri = TextUtils.replaceAll(CSS_BACKSLASH_ESCAPE, cssUri,
157 "$1");
158 foundLinks++;
159 try {
160 curi.createAndAddLinkRelativeToBase(cssUri,Link.EMBED_MISC,
161 Link.EMBED_HOP);
162 } catch (URIException e) {
163
164
165 if (controller != null) {
166 controller.logUriError(e, curi.getUURI(), cssUri);
167 } else {
168 logger.info(curi + ", " + cssUri + ": " +
169 e.getMessage());
170 }
171 }
172 }
173 } catch (StackOverflowError e) {
174 DevUtils.warnHandle(e, "ExtractorCSS StackOverflowError");
175 } finally {
176 TextUtils.recycleMatcher(uris);
177 }
178 return foundLinks;
179 }
180
181 public String report() {
182 StringBuffer ret = new StringBuffer();
183 ret.append("Processor: org.archive.crawler.extractor.ExtractorCSS\n");
184 ret.append(" Function: Link extraction on Cascading Style Sheets (.css)\n");
185 ret.append(" CrawlURIs handled: " + numberOfCURIsHandled + "\n");
186 ret.append(" Links extracted: " + numberOfLinksExtracted + "\n\n");
187
188 return ret.toString();
189 }
190 }