View Javadoc

1   /* Copyright (C) 2003 Internet Archive.
2    *
3    * This file is part of the Heritrix web crawler (crawler.archive.org).
4    *
5    * Heritrix is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU Lesser Public License as published by
7    * the Free Software Foundation; either version 2.1 of the License, or
8    * any later version.
9    *
10   * Heritrix is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU Lesser Public License for more details.
14   *
15   * You should have received a copy of the GNU Lesser Public License
16   * along with Heritrix; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   *
19   * SimpleHTTPExtractor.java
20   * Created on Jul 3, 2003
21   *
22   * $Header$
23   */
24  package org.archive.crawler.extractor;
25  
26  import java.util.logging.Level;
27  import java.util.logging.Logger;
28  
29  import org.apache.commons.httpclient.Header;
30  import org.apache.commons.httpclient.HttpMethod;
31  import org.apache.commons.httpclient.URIException;
32  import org.archive.crawler.datamodel.CoreAttributeConstants;
33  import org.archive.crawler.datamodel.CrawlURI;
34  import org.archive.crawler.framework.Processor;
35  
36  /***
37   * Extracts URIs from HTTP response headers.
38   * @author gojomo
39   */
40  public class ExtractorHTTP extends Processor
41  implements CoreAttributeConstants {
42  
43      private static final long serialVersionUID = 8499072198570554647L;
44  
45      private static final Logger LOGGER =
46          Logger.getLogger(ExtractorHTTP.class.getName());
47      protected long numberOfCURIsHandled = 0;
48      protected long numberOfLinksExtracted = 0;
49  
50      public ExtractorHTTP(String name) {
51          super(name,
52              "HTTP extractor. Extracts URIs from HTTP response headers.");
53      }
54  
55      public void innerProcess(CrawlURI curi) {
56          if (!curi.isHttpTransaction() || curi.getFetchStatus() <= 0) {
57              // If not http or if an error status code, skip.
58              return;
59          }
60          numberOfCURIsHandled++;
61          HttpMethod method = (HttpMethod)curi.getObject(A_HTTP_TRANSACTION);
62          addHeaderLink(curi, method.getResponseHeader("Location"));
63          addHeaderLink(curi, method.getResponseHeader("Content-Location"));
64          // try /favicon.ico for every HTTP(S) server
65          try {
66              curi.createAndAddLink("/favicon.ico", Link.EMBED_MISC, Link.EMBED_HOP);
67          } catch (URIException e) {
68              // should be impossible so log SEVERE if happens
69              LOGGER.log(Level.SEVERE, curi + ", /favicon.ico", e);
70          }
71      }
72  
73      protected void addHeaderLink(CrawlURI curi, Header loc) {
74          if (loc == null) {
75              // If null, return without adding anything.
76              return;
77          }
78          // TODO: consider possibility of multiple headers
79          try {
80              curi.createAndAddLink(loc.getValue(), loc.getName() + ":",
81                  Link.REFER_HOP);
82              numberOfLinksExtracted++;
83          } catch (URIException e) {
84              // There may not be a controller (e.g. If we're being run
85              // by the extractor tool).
86              if (getController() != null) {
87                  getController().logUriError(e, curi.getUURI(), loc.getValue());
88              } else {
89                  LOGGER.info(curi + ", " + loc.getValue() + ": " +
90                      e.getMessage());
91              }
92          }
93  
94      }
95  
96      public String report() {
97          StringBuffer ret = new StringBuffer();
98          ret.append("Processor: org.archive.crawler.extractor.ExtractorHTTP\n");
99          ret.append("  Function:          " +
100             "Extracts URIs from HTTP response headers\n");
101         ret.append("  CrawlURIs handled: " + numberOfCURIsHandled + "\n");
102         ret.append("  Links extracted:   " + numberOfLinksExtracted + "\n\n");
103         return ret.toString();
104     }
105 }