1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 package org.archive.crawler.extractor;
25
26 import java.util.logging.Level;
27 import java.util.logging.Logger;
28
29 import org.apache.commons.httpclient.Header;
30 import org.apache.commons.httpclient.HttpMethod;
31 import org.apache.commons.httpclient.URIException;
32 import org.archive.crawler.datamodel.CoreAttributeConstants;
33 import org.archive.crawler.datamodel.CrawlURI;
34 import org.archive.crawler.framework.Processor;
35
36 /***
37 * Extracts URIs from HTTP response headers.
38 * @author gojomo
39 */
40 public class ExtractorHTTP extends Processor
41 implements CoreAttributeConstants {
42
43 private static final long serialVersionUID = 8499072198570554647L;
44
45 private static final Logger LOGGER =
46 Logger.getLogger(ExtractorHTTP.class.getName());
47 protected long numberOfCURIsHandled = 0;
48 protected long numberOfLinksExtracted = 0;
49
50 public ExtractorHTTP(String name) {
51 super(name,
52 "HTTP extractor. Extracts URIs from HTTP response headers.");
53 }
54
55 public void innerProcess(CrawlURI curi) {
56 if (!curi.isHttpTransaction() || curi.getFetchStatus() <= 0) {
57
58 return;
59 }
60 numberOfCURIsHandled++;
61 HttpMethod method = (HttpMethod)curi.getObject(A_HTTP_TRANSACTION);
62 addHeaderLink(curi, method.getResponseHeader("Location"));
63 addHeaderLink(curi, method.getResponseHeader("Content-Location"));
64
65 try {
66 curi.createAndAddLink("/favicon.ico", Link.EMBED_MISC, Link.EMBED_HOP);
67 } catch (URIException e) {
68
69 LOGGER.log(Level.SEVERE, curi + ", /favicon.ico", e);
70 }
71 }
72
73 protected void addHeaderLink(CrawlURI curi, Header loc) {
74 if (loc == null) {
75
76 return;
77 }
78
79 try {
80 curi.createAndAddLink(loc.getValue(), loc.getName() + ":",
81 Link.REFER_HOP);
82 numberOfLinksExtracted++;
83 } catch (URIException e) {
84
85
86 if (getController() != null) {
87 getController().logUriError(e, curi.getUURI(), loc.getValue());
88 } else {
89 LOGGER.info(curi + ", " + loc.getValue() + ": " +
90 e.getMessage());
91 }
92 }
93
94 }
95
96 public String report() {
97 StringBuffer ret = new StringBuffer();
98 ret.append("Processor: org.archive.crawler.extractor.ExtractorHTTP\n");
99 ret.append(" Function: " +
100 "Extracts URIs from HTTP response headers\n");
101 ret.append(" CrawlURIs handled: " + numberOfCURIsHandled + "\n");
102 ret.append(" Links extracted: " + numberOfLinksExtracted + "\n\n");
103 return ret.toString();
104 }
105 }