1   
2   
3   
4   
5   
6   
7   
8   
9   
10  
11  
12  
13  
14  
15  
16  
17  
18  
19  
20  
21  
22  
23  
24   
25  package org.archive.crawler.extractor;
26  
27  import java.util.logging.Level;
28  import java.util.logging.Logger;
29  
30  import org.archive.crawler.datamodel.CrawlOrder;
31  import org.archive.crawler.datamodel.CrawlURI;
32  import org.archive.crawler.framework.Processor;
33  
34  /***
35   * Convenience shared superclass for Extractor Processors.
36   * 
37   * Currently only wraps Extractor-specific extract() action with
38   * a StackOverflowError catch/log/proceed handler, so that any
39   * extractors that recurse too deep on problematic input will
40   * only suffer a local error, and other normal CrawlURI processing
41   * can continue. See:
42   *  [ 1122836 ] Localize StackOverflowError in Extractors
43   *  http://sourceforge.net/tracker/index.php?func=detail&aid=1122836&group_id=73833&atid=539099
44   * 
45   * This class could also become home to common utility features
46   * of extractors, like a running tally of the URIs examined/discovered,
47   * etc.
48   * 
49   * @author gojomo
50   */
51  public abstract class Extractor extends Processor {
52      private static final Logger logger = Logger
53          .getLogger(Extractor.class.getName());
54  
55      /***
56       * Passthrough constructor.
57       * 
58       * @param name
59       * @param description
60       */
61      public Extractor(String name, String description) {
62          super(name, description);
63          
64      }
65  
66      public void innerProcess(CrawlURI curi) {
67          try {
68              extract(curi);
69          } catch (NullPointerException npe) {
70              
71              curi.addAnnotation("err=" + npe.getClass().getName());
72              curi.addLocalizedError(getName(), npe, "");
73              
74              logger.log(Level.WARNING, getName() + ": NullPointerException",
75                  npe);
76          } catch (StackOverflowError soe) {
77              
78              curi.addAnnotation("err=" + soe.getClass().getName());
79              curi.addLocalizedError(getName(), soe, "");
80              
81              logger.log(Level.WARNING, getName() + ": StackOverflowError", soe);
82          } catch (java.nio.charset.CoderMalfunctionError cme) {
83              
84              
85              curi.addAnnotation("err=" + cme.getClass().getName());
86              curi.addLocalizedError(getName(), cme, ""); 
87              logger.log(Level.WARNING, getName() + ": CoderMalfunctionError",
88                  cme);
89          }
90      }
91  
92      protected boolean isIndependentExtractors() {
93          try {
94              return ((Boolean) getController().getOrder().getAttribute(
95                      CrawlOrder.ATTR_INDEPENDENT_EXTRACTORS)).booleanValue();
96          } catch (Exception e) {
97              return false;
98          }
99      }
100 
101     /***
102      * @return true if the setting
103      *         {@link CrawlOrder#ATTR_INDEPENDENT_EXTRACTORS} is disabled or
104      *         {@link CrawlURI#hasBeenLinkExtracted()} is false, and
105      *         {@link Processor#isHttpTransactionContentToProcess(CrawlURI)} is
106      *         true.
107      */
108     @Override
109     protected boolean isHttpTransactionContentToProcess(CrawlURI curi) {
110         return (isIndependentExtractors() || !curi.hasBeenLinkExtracted())
111                 && super.isHttpTransactionContentToProcess(curi);
112     }
113 
114     protected abstract void extract(CrawlURI curi);
115 }