1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package org.archive.crawler.extractor;
23
24 import com.lowagie.text.pdf.PdfReader;
25 import com.lowagie.text.pdf.PdfName;
26 import com.lowagie.text.pdf.PdfObject;
27 import com.lowagie.text.pdf.PdfDictionary;
28 import com.lowagie.text.pdf.PRIndirectReference;
29 import com.lowagie.text.pdf.PdfArray;
30
31 import java.io.*;
32 import java.util.*;
33
34
35 /*** Supports PDF parsing operations. For now this primarily means
36 * extracting URIs, but the logic in extractURIs() could easily be adopted/extended
37 * for a variety of PDF processing tasks.
38 *
39 * @author Parker Thompson
40 *
41 */
42
43
44
45 public class PDFParser {
46
47 ArrayList<String> foundURIs;
48 ArrayList<ArrayList<Integer>> encounteredReferences;
49 PdfReader documentReader;
50 byte[] document;
51 PdfDictionary catalog;
52
53 public PDFParser(String doc) throws IOException {
54 resetState();
55 getInFromFile(doc);
56 initialize();
57 }
58 public PDFParser(byte[] doc) throws IOException{
59 resetState();
60 document = doc;
61 initialize();
62 }
63
64 /*** Reinitialize the object as though a new one were created.
65 */
66 protected void resetState(){
67 foundURIs = new ArrayList<String>();
68 encounteredReferences = new ArrayList<ArrayList<Integer>>();
69 documentReader = null;
70 document = null;
71 catalog = null;
72
73 for(int i=0; i < encounteredReferences.size(); i++){
74 encounteredReferences.add(new ArrayList<Integer>());
75 }
76 }
77
78 /***
79 * Reset the object and initialize it with a new byte array (the document).
80 * @param doc
81 * @throws IOException
82 */
83 public void resetState(byte[] doc) throws IOException{
84 resetState();
85 document = doc;
86 initialize();
87 }
88
89 /*** Reinitialize the object as though a new one were created, complete
90 * with a valid pointer to a document that can be read
91 * @param doc
92 * @throws IOException
93 */
94 public void resetState(String doc) throws IOException{
95 resetState();
96 getInFromFile(doc);
97 initialize();
98 }
99
100 /***
101 * Read a file named 'doc' and store its' bytes for later processing.
102 * @param doc
103 * @throws IOException
104 */
105 protected void getInFromFile(String doc) throws IOException{
106 File documentOnDisk = new File(doc);
107
108 long length = documentOnDisk.length();
109 document = new byte[(int)length];
110
111 FileInputStream inStream = new FileInputStream(documentOnDisk);
112
113 inStream.read(document);
114 }
115
116 /***
117 * Indicates, based on a PDFObject's generation/id pair whether
118 * the parser has already encountered this object (or a reference to it)
119 * so we don't infinitely loop on circuits within the PDF.
120 * @param generation
121 * @param id
122 * @return True if already seen.
123 */
124 protected boolean haveSeen(int generation, int id){
125
126
127 if(generation >= encounteredReferences.size()){
128 for(int i=encounteredReferences.size(); i <= generation; i++){
129 encounteredReferences.add(new ArrayList<Integer>());
130 }
131
132
133 return false;
134 }
135
136 ArrayList<Integer> generationList
137 = encounteredReferences.get(generation);
138
139 for (int i: generationList) {
140 if(i == id){
141 return true;
142 }
143 }
144 return false;
145 }
146
147 /***
148 * Note that an object (id/generation pair) has been seen by this parser
149 * so that it can be handled differently when it is encountered again.
150 * @param generation
151 * @param id
152 */
153 protected void markAsSeen(int generation, int id){
154 ArrayList<Integer> objectIds = encounteredReferences.get(generation);
155 objectIds.add(id);
156 }
157
158 /***
159 * Get a list of URIs retrieved from the Pdf during the
160 * extractURIs operation.
161 * @return A list of URIs retrieved from the Pdf during the
162 * extractURIs operation.
163 */
164 public ArrayList getURIs(){
165 return foundURIs;
166 }
167
168 /***
169 * Initialize opens the document for reading. This is done implicitly
170 * by the constuctor. This should only need to be called directly following
171 * a reset.
172 * @throws IOException
173 */
174 protected void initialize() throws IOException{
175 if(document != null){
176 documentReader = new PdfReader(document);
177 }
178
179 catalog = documentReader.getCatalog();
180 }
181
182 /***
183 * Extract URIs from all objects found in a Pdf document's catalog.
184 * Returns an array list representing all URIs found in the document catalog tree.
185 * @return URIs from all objects found in a Pdf document's catalog.
186 */
187 public ArrayList extractURIs(){
188 extractURIs(catalog);
189 return getURIs();
190 }
191
192 /***
193 * Parse a PdfDictionary, looking for URIs recursively and adding
194 * them to foundURIs
195 * @param entity
196 */
197 protected void extractURIs(PdfObject entity){
198
199
200 if(entity.isDictionary()){
201
202 PdfDictionary dictionary= (PdfDictionary)entity;
203
204 @SuppressWarnings("unchecked")
205 Set<PdfName> allkeys = dictionary.getKeys();
206 for (PdfName key: allkeys) {
207 PdfObject value = dictionary.get(key);
208
209
210 if( key.toString().equals("/URI") ||
211 key.toString().equals("/URL") ) {
212 foundURIs.add(value.toString());
213
214 }else{
215 this.extractURIs(value);
216 }
217
218 }
219
220
221 }else if(entity.isArray()){
222
223 PdfArray array = (PdfArray)entity;
224 ArrayList arrayObjects = array.getArrayList();
225 Iterator objectList = arrayObjects.iterator();
226
227 while(objectList.hasNext()){
228 this.extractURIs( (PdfObject)objectList.next());
229 }
230
231
232 }else if(entity.getClass() == PRIndirectReference.class){
233
234 PRIndirectReference indirect = (PRIndirectReference)entity;
235
236
237 if( haveSeen( indirect.getGeneration(), indirect.getNumber()) ){
238 return;
239
240
241 }else{
242 markAsSeen(indirect.getGeneration(), indirect.getNumber() );
243 }
244
245
246 indirect.getReader();
247 PdfObject direct = PdfReader.getPdfObject(indirect);
248
249 this.extractURIs(direct);
250 }
251 }
252
253 public static void main(String[] argv){
254
255 try{
256 PDFParser parser = new PDFParser("/home/parkert/files/pdfspec.pdf");
257
258 ArrayList uris = parser.extractURIs();
259
260 Iterator i = uris.iterator();
261
262 while(i.hasNext()){
263 String uri = (String)i.next();
264 System.out.println("got uri: " + uri);
265 }
266
267 }catch(IOException e){
268 e.printStackTrace();
269 }
270 }
271 }