/* Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * Created on Jul 11, 2003
 *
 */
package org.archive.crawler.extractor;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.logging.Logger;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.framework.ToeThread;
35  
/**
 * Allows the caller to process a CrawlURI representing a PDF
 * for the purpose of extracting URIs.
 *
 * @author Parker Thompson
 */
public class ExtractorPDF extends Extractor implements CoreAttributeConstants {

    private static final long serialVersionUID = -6040669467531928494L;

    private static final Logger LOGGER =
        Logger.getLogger(ExtractorPDF.class.getName());
    private static final int DEFAULT_MAX_SIZE_TO_PARSE = 5*1024*1024; // 5MB

    // TODO: make configurable
    private long maxSizeToParse = DEFAULT_MAX_SIZE_TO_PARSE;

    protected long numberOfCURIsHandled = 0;
    protected long numberOfLinksExtracted = 0;
    /**
     * @param name Name of this processor.
     */
    public ExtractorPDF(String name) {
        super(name, "PDF extractor. Link extraction on PDF documents.");
    }

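    /**
     * Process a CrawlURI whose content appears to be a PDF: copy the
     * recorded response body to a scratch file, run the PDF parser over
     * it, and add any discovered URIs to the CrawlURI as outlinks.
     *
     * @param curi CrawlURI to process.
     */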
    protected void extract(CrawlURI curi) {
        if (!isHttpTransactionContentToProcess(curi) ||
                !isExpectedMimeType(curi.getContentType(),
                    "application/pdf")) {
            return;
        }

        numberOfCURIsHandled++;

        File tempFile;

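        // Don't bother parsing bodies larger than maxSizeToParse
        // (5MB by default; see the TODO above about making it configurable).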
        if (curi.getHttpRecorder().getRecordedInput().getSize() >
                maxSizeToParse) {
            return;
        }

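        // Name the scratch file after this ToeThread's serial number so
        // concurrent worker threads never collide on the same temp file.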
        int sn = ((ToeThread)Thread.currentThread()).getSerialNumber();
        tempFile = new File(getController().getScratchDisk(),
            "tt" + sn + "tmp.pdf");

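        // Copy the recorded response body to disk and hand the file to the
        // PDF parser; failures are logged against the CrawlURI, and the
        // scratch file is always deleted in the finally block.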
        PDFParser parser;
        ArrayList uris;
        try {
            curi.getHttpRecorder().getRecordedInput().
                copyContentBodyTo(tempFile);
            parser = new PDFParser(tempFile.getAbsolutePath());
            uris = parser.extractURIs();
        } catch (IOException e) {
            curi.addLocalizedError(getName(), e, "ExtractorPDF IOException");
            return;
        } catch (RuntimeException e) {
            // Truncated or corrupt PDFs may generate a ClassCastException or
            // other runtime problems inside the parser.
            curi.addLocalizedError(getName(), e,
                "ExtractorPDF RuntimeException");
            return;
        } finally {
            tempFile.delete();
        }

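        // Each extracted string becomes a navigational link on the
        // CrawlURI; malformed URIs are logged and skipped.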
        if (uris != null && uris.size() > 0) {
            Iterator iter = uris.iterator();
            while (iter.hasNext()) {
                String uri = (String)iter.next();
                try {
                    curi.createAndAddLink(uri, Link.NAVLINK_MISC,
                        Link.NAVLINK_HOP);
                } catch (URIException e1) {
                    // There may not be a controller (e.g. if we're being run
                    // by the extractor tool).
                    if (getController() != null) {
                        getController().logUriError(e1, curi.getUURI(), uri);
                    } else {
                        LOGGER.info(curi + ", " + uri + ": " +
                            e1.getMessage());
                    }
                }
            }
            numberOfLinksExtracted += uris.size();
        }

        LOGGER.fine(curi + " has " + (uris == null ? 0 : uris.size()) +
            " links.");
        // Set flag to indicate that link extraction is completed.
        curi.linkExtractorFinished();
    }

    /**
     * Provide a human-readable textual summary of this Processor's state.
     *
     * @see org.archive.crawler.framework.Processor#report()
     */
    public String report() {
        StringBuffer ret = new StringBuffer();
        ret.append("Processor: org.archive.crawler.extractor.ExtractorPDF\n");
        ret.append("  Function:          Link extraction on PDF documents\n");
        ret.append("  CrawlURIs handled: " + numberOfCURIsHandled + "\n");
        ret.append("  Links extracted:   " + numberOfLinksExtracted + "\n\n");

        return ret.toString();
    }
}