View Javadoc

1   /* Copyright (C) 2003 Internet Archive.
2    *
3    * This file is part of the Heritrix web crawler (crawler.archive.org).
4    *
5    * Heritrix is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU Lesser Public License as published by
7    * the Free Software Foundation; either version 2.1 of the License, or
8    * any later version.
9    *
10   * Heritrix is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU Lesser Public License for more details.
14   *
15   * You should have received a copy of the GNU Lesser Public License
16   * along with Heritrix; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   *
19   * Created on Jul 14, 2003
20   *
21   */
22  package org.archive.crawler.extractor;
23  
24  import com.lowagie.text.pdf.PdfReader;
25  import com.lowagie.text.pdf.PdfName;
26  import com.lowagie.text.pdf.PdfObject;
27  import com.lowagie.text.pdf.PdfDictionary;
28  import com.lowagie.text.pdf.PRIndirectReference;
29  import com.lowagie.text.pdf.PdfArray;
30  
31  import java.io.*;
32  import java.util.*;
33  
34  
35  /*** Supports PDF parsing operations.  For now this primarily means
36   *  extracting URIs, but the logic in extractURIs() could easily be adapted/extended
37   * for a variety of PDF processing tasks.
38   *
39   * @author Parker Thompson
40   *
41   */
42  //TODO make this more efficient; it currently has to read the whole file into memory
43  // before processing can begin, and appears to take much longer than it "should"
44  // to parse small, but admittedly complex, documents.
45  public class PDFParser {
46  
47      ArrayList<String> foundURIs;
48      ArrayList<ArrayList<Integer>> encounteredReferences;
49      PdfReader documentReader;
50      byte[] document;
51      PdfDictionary catalog;
52  
53      public PDFParser(String doc) throws IOException {
54          resetState();
55          getInFromFile(doc);
56          initialize();
57      }
58       public PDFParser(byte[] doc) throws IOException{
59          resetState();
60          document = doc;
61          initialize();
62      }
63  
64      /*** Reinitialize the object as though a new one were created.
65       */
66      protected void resetState(){
67          foundURIs = new ArrayList<String>();
68          encounteredReferences = new ArrayList<ArrayList<Integer>>();
69          documentReader = null;
70          document = null;
71          catalog = null;
72  
73          for(int i=0; i < encounteredReferences.size(); i++){
74              encounteredReferences.add(new ArrayList<Integer>());
75          }
76      }
77  
78      /***
79       * Reset the object and initialize it with a new byte array (the document).
80       * @param doc
81       * @throws IOException
82       */
83      public void resetState(byte[] doc) throws IOException{
84          resetState();
85          document = doc;
86          initialize();
87      }
88  
89      /*** Reinitialize the object as though a new one were created, complete
90       * with a valid pointer to a document that can be read
91       * @param doc
92       * @throws IOException
93       */
94      public void resetState(String doc) throws IOException{
95          resetState();
96          getInFromFile(doc);
97          initialize();
98      }
99  
100     /***
101      * Read a file named 'doc' and store its' bytes for later processing.
102      * @param doc
103      * @throws IOException
104      */
105     protected void getInFromFile(String doc) throws IOException{
106         File documentOnDisk = new File(doc);
107 
108         long length = documentOnDisk.length();
109         document = new byte[(int)length];
110 
111         FileInputStream inStream = new FileInputStream(documentOnDisk);
112 
113         inStream.read(document);
114     }
115 
116     /***
117      * Indicates, based on a PDFObject's generation/id pair whether
118      * the parser has already encountered this object (or a reference to it)
119      * so we don't infinitely loop on circuits within the PDF.
120      * @param generation
121      * @param id
122      * @return True if already seen.
123      */
124     protected boolean haveSeen(int generation, int id){
125 
126         // if we can't store this generation grow our list until we can
127         if(generation >= encounteredReferences.size()){
128             for(int i=encounteredReferences.size(); i <= generation; i++){
129                 encounteredReferences.add(new ArrayList<Integer>());
130             }
131 
132             // clearly we haven't seen it
133             return false;
134         }
135 
136         ArrayList<Integer> generationList
137          = encounteredReferences.get(generation);
138         
139         for (int i: generationList) {
140             if(i == id){
141                 return true;
142             }
143         }
144         return false;
145     }
146 
147     /***
148      * Note that an object (id/generation pair) has been seen by this parser
149      * so that it can be handled differently when it is encountered again.
150      * @param generation
151      * @param id
152      */
153     protected void markAsSeen(int generation, int id){
154         ArrayList<Integer> objectIds = encounteredReferences.get(generation);
155         objectIds.add(id);
156     }
157 
158     /***
159      * Get a list of URIs retrieved from the Pdf during the
160      * extractURIs operation.
161      * @return A list of URIs retrieved from the Pdf during the
162      * extractURIs operation.
163      */
164     public ArrayList getURIs(){
165         return foundURIs;
166     }
167 
168     /***
169      * Initialize opens the document for reading.  This is done implicitly
170      * by the constuctor.  This should only need to be called directly following
171      * a reset.
172      * @throws IOException
173      */
174     protected void initialize() throws IOException{
175         if(document != null){
176             documentReader = new PdfReader(document);
177         }
178 
179         catalog = documentReader.getCatalog();
180     }
181 
182     /***
183      * Extract URIs from all objects found in a Pdf document's catalog.
184      * Returns an array list representing all URIs found in the document catalog tree.
185      * @return URIs from all objects found in a Pdf document's catalog.
186      */
187     public ArrayList extractURIs(){
188         extractURIs(catalog);
189         return getURIs();
190     }
191 
192     /***
193      * Parse a PdfDictionary, looking for URIs recursively and adding
194      * them to foundURIs
195      * @param entity
196      */
197     protected void extractURIs(PdfObject entity){
198 
199             // deal with dictionaries
200             if(entity.isDictionary()){
201 
202                 PdfDictionary dictionary= (PdfDictionary)entity;
203 
204                 @SuppressWarnings("unchecked")
205                 Set<PdfName> allkeys = dictionary.getKeys();
206                 for (PdfName key: allkeys) {
207                     PdfObject value = dictionary.get(key);
208 
209                     // see if it's the key is a UR[I,L]
210                     if( key.toString().equals("/URI") ||
211 		            key.toString().equals("/URL") ) {
212                         foundURIs.add(value.toString());
213 
214                     }else{
215                         this.extractURIs(value);
216                     }
217 
218                 }
219 
220             // deal with arrays
221             }else if(entity.isArray()){
222 
223                 PdfArray array = (PdfArray)entity;
224                 ArrayList arrayObjects = array.getArrayList();
225                 Iterator objectList = arrayObjects.iterator();
226 
227                 while(objectList.hasNext()){
228                     this.extractURIs( (PdfObject)objectList.next());
229                 }
230 
231             // deal with indirect references
232             }else if(entity.getClass() == PRIndirectReference.class){
233 
234                     PRIndirectReference indirect = (PRIndirectReference)entity;
235 
236                     // if we've already seen a reference to this object
237                     if( haveSeen( indirect.getGeneration(), indirect.getNumber()) ){
238                         return;
239 
240                     // note that we've seen it if it's new
241                     }else{
242                         markAsSeen(indirect.getGeneration(), indirect.getNumber() );
243                     }
244 
245                     // dereference the "pointer" and process the object
246                     indirect.getReader(); // FIXME: examine side-effects
247                     PdfObject direct = PdfReader.getPdfObject(indirect);
248 
249                     this.extractURIs(direct);
250             }
251     }
252 
253     public static void main(String[] argv){
254 
255         try{
256             PDFParser parser = new PDFParser("/home/parkert/files/pdfspec.pdf");
257 
258             ArrayList uris = parser.extractURIs();
259 
260             Iterator i = uris.iterator();
261 
262             while(i.hasNext()){
263                 String uri = (String)i.next();
264                 System.out.println("got uri: " + uri);
265             }
266 
267         }catch(IOException e){
268             e.printStackTrace();
269         }
270     }
271 }