package org.archive.crawler.extractor;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.logging.Logger;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.framework.ToeThread;

/**
 * Allows the caller to process a CrawlURI representing a PDF
 * for the purpose of extracting URIs.
 *
 * @author Parker Thompson
 */
public class ExtractorPDF extends Extractor implements CoreAttributeConstants {

    private static final long serialVersionUID = -6040669467531928494L;

    private static final Logger LOGGER =
        Logger.getLogger(ExtractorPDF.class.getName());

    private static final int DEFAULT_MAX_SIZE_TO_PARSE = 5*1024*1024;

    private long maxSizeToParse = DEFAULT_MAX_SIZE_TO_PARSE;

    protected long numberOfCURIsHandled = 0;
    protected long numberOfLinksExtracted = 0;

    /**
     * @param name Name of this extractor.
     */
    public ExtractorPDF(String name) {
        super(name, "PDF extractor. Link extraction on PDF documents.");
    }

    protected void extract(CrawlURI curi) {
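        // Process only HTTP transaction content whose declared mime type is
        // application/pdf.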
        if (!isHttpTransactionContentToProcess(curi) ||
                !isExpectedMimeType(curi.getContentType(),
                    "application/pdf")) {
            return;
        }

        numberOfCURIsHandled++;

        File tempFile;
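
        // Skip documents larger than the maximum size we are willing to parse.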
        if (curi.getHttpRecorder().getRecordedInput().getSize() > maxSizeToParse) {
            return;
        }
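
        // Name a scratch file unique to this worker thread; the PDF parser
        // reads the document from disk.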
        int sn = ((ToeThread)Thread.currentThread()).getSerialNumber();
        tempFile = new File(getController().getScratchDisk(), "tt" + sn + "tmp.pdf");

        PDFParser parser;
        ArrayList uris;
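        // Copy the recorded content body to the scratch file and ask the
        // parser for any URIs it can find.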
        try {
            curi.getHttpRecorder().getRecordedInput().
                copyContentBodyTo(tempFile);
            parser = new PDFParser(tempFile.getAbsolutePath());
            uris = parser.extractURIs();
        } catch (IOException e) {
            curi.addLocalizedError(getName(), e, "ExtractorPDF IOException");
            return;
        } catch (RuntimeException e) {
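            // Treat any runtime failure from the PDF parser as a localized
            // error for this URI rather than letting it propagate.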
            curi.addLocalizedError(getName(), e,
                "ExtractorPDF RuntimeException");
            return;
        } finally {
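            // Remove the scratch file whether or not parsing succeeded.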
            tempFile.delete();
        }

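        // Add each extracted URI back to the CrawlURI as a navigational link.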
        if (uris != null && uris.size() > 0) {
            Iterator iter = uris.iterator();
            while (iter.hasNext()) {
                String uri = (String)iter.next();
                try {
                    curi.createAndAddLink(uri, Link.NAVLINK_MISC, Link.NAVLINK_HOP);
                } catch (URIException e1) {
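                    // Report the malformed URI through the crawl controller
                    // when one is available; otherwise log it locally.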
                    if (getController() != null) {
                        getController().logUriError(e1, curi.getUURI(), uri);
                    } else {
                        LOGGER.info(curi + ", " + uri + ": " +
                            e1.getMessage());
                    }
                }
            }
            numberOfLinksExtracted += uris.size();
        }

        LOGGER.fine(curi + " has " + (uris == null ? 0 : uris.size()) + " links.");

        curi.linkExtractorFinished();
    }

    /**
     * Provide a human-readable textual summary of this Processor's state.
     *
     * @see org.archive.crawler.framework.Processor#report()
     */
    public String report() {
        StringBuffer ret = new StringBuffer();
        ret.append("Processor: org.archive.crawler.extractor.ExtractorPDF\n");
        ret.append("  Function:          Link extraction on PDF documents\n");
        ret.append("  CrawlURIs handled: " + numberOfCURIsHandled + "\n");
        ret.append("  Links extracted:   " + numberOfLinksExtracted + "\n\n");

        return ret.toString();
    }
}