View Javadoc

1   /* ExtractorTool
2    * 
3    * Created on Mar 14, 2005
4    *
5    * Copyright (C) 2005 Internet Archive.
6    * 
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    * 
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   * 
14   * Heritrix is distributed in the hope that it will be useful, 
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   * 
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.crawler.extractor;
24  
25  import java.io.File;
26  import java.io.IOException;
27  import java.lang.reflect.Constructor;
28  import java.util.ArrayList;
29  import java.util.Iterator;
30  import java.util.List;
31  import java.util.logging.ConsoleHandler;
32  import java.util.logging.Handler;
33  import java.util.logging.Logger;
34  
35  import javax.management.Attribute;
36  
37  import org.apache.commons.cli.CommandLine;
38  import org.apache.commons.cli.HelpFormatter;
39  import org.apache.commons.cli.Option;
40  import org.apache.commons.cli.Options;
41  import org.apache.commons.cli.PosixParser;
42  import org.apache.commons.httpclient.Header;
43  import org.apache.commons.httpclient.HttpMethodBase;
44  import org.apache.commons.httpclient.URIException;
45  import org.archive.crawler.datamodel.CoreAttributeConstants;
46  import org.archive.crawler.datamodel.CrawlOrder;
47  import org.archive.crawler.datamodel.CrawlURI;
48  import org.archive.crawler.framework.Processor;
49  import org.archive.crawler.settings.CrawlerSettings;
50  import org.archive.crawler.settings.MapType;
51  import org.archive.crawler.settings.SettingsHandler;
52  import org.archive.crawler.settings.XMLSettingsHandler;
53  import org.archive.io.arc.ARCReader;
54  import org.archive.io.arc.ARCReaderFactory;
55  import org.archive.io.arc.ARCRecord;
56  import org.archive.net.UURIFactory;
57  import org.archive.util.HttpRecorder;
58  import org.archive.util.OneLineSimpleLogger;
59  
60  /***
61   * Run named extractors against passed ARC file.
62   * This extractor tool runs suboptimally.  It takes each ARC file record,
63   * writes it to a new scratch file, and then it runs each listed
64   * extractor against the scratch.  It works in this manner because
65   * extractors want CharSequence, being able to refer to characters
66   * by absolute position, but ARCs are compressed streams.  The work
67   * to get a CharSequence on an underlying compressed stream has not
68   * been done.  Other issues are need to setup CrawlerSetting environment
69   * so extractors can run.
70   * @author stack
71   * @version $Date: 2006-09-26 23:47:15 +0000 (Tue, 26 Sep 2006) $, $Revision: 4671 $
72   */
73  public class ExtractorTool {
74  //    private static final Logger logger =
75  //        Logger.getLogger(ExtractorTool.class.getName());
76      static {
77          // Setup the oneline logger.
78          Handler [] hs = Logger.getLogger("").getHandlers();
79          for (int i = 0; i < hs.length; i++) {
80              Handler h = hs[0];
81              if (h instanceof ConsoleHandler) {
82                  h.setFormatter(new OneLineSimpleLogger());
83              }
84          }
85      }
86      
87      private static final String [] DEFAULT_EXTRACTORS =
88          {"org.archive.crawler.extractor.ExtractorHTTP",
89              "org.archive.crawler.extractor.ExtractorHTML"};
90      private final List<Processor> extractors;
91      private final File scratchDir;
92      private static final String DEFAULT_SCRATCH = "/tmp";
93      
94      public ExtractorTool()
95      throws Exception {
96          this(DEFAULT_EXTRACTORS, DEFAULT_SCRATCH);
97      }
98      
99      public ExtractorTool(String [] e, String scratch)
100     throws Exception {
101         super();
102         // Setup the scratch directory.
103         this.scratchDir = scratch == null?
104             new File(DEFAULT_SCRATCH): new File(scratch);
105         if (!this.scratchDir.exists()) {
106             this.scratchDir.mkdirs();
107         }
108         // Set up settings system.  Needed by extractors.
109         File orderFile = new File(this.scratchDir.getAbsolutePath(),
110             ExtractorTool.class.getName() + "_order.xml");
111         SettingsHandler settingsHandler = new XMLSettingsHandler(orderFile);
112         settingsHandler.initialize();
113         settingsHandler.getOrder().
114             setAttribute(new Attribute(CrawlOrder.ATTR_SETTINGS_DIRECTORY,
115                 this.scratchDir.getAbsolutePath()));
116         CrawlerSettings globalSettings =
117             settingsHandler.getSettingsObject(null);
118         MapType extractorsSettings = (MapType)settingsHandler.getOrder().
119             getAttribute(CrawlOrder.ATTR_EXTRACT_PROCESSORS);
120         this.extractors = new ArrayList<Processor>();
121         for (int i = 0; i < e.length; i++) {
122             Constructor c = Class.forName(e[i]).
123                 getConstructor(new Class [] {String.class});
124             String name = Integer.toString(i);
125             Processor p  = (Processor)c.newInstance(new Object [] {name});
126             extractorsSettings.addElement(globalSettings, p);
127             p.setAttribute(
128                 new Attribute(Processor.ATTR_ENABLED, Boolean.TRUE));
129             this.extractors.add(p);
130         }
131     }
132     
133     public void extract(String resource) throws IOException,
134     URIException, InterruptedException {
135         ARCReader reader = ARCReaderFactory.get(new File(resource));
136         for (Iterator i = reader.iterator(); i.hasNext();) {
137             ARCRecord ar = (ARCRecord)i.next();
138             HttpRecorder hr = HttpRecorder.
139                 wrapInputStreamWithHttpRecord(this.scratchDir,
140                     this.getClass().getName(), ar, null);
141             CrawlURI curi = getCrawlURI(ar, hr);
142             for (Iterator ii = this.extractors.iterator(); ii.hasNext();) {
143                 ((Processor)ii.next()).process(curi);
144             }
145             outlinks(curi);
146         }
147     }
148     
149     protected void outlinks(CrawlURI curi) {
150         System.out.println(curi.getUURI().toString());
151         for(Link l: curi.getOutLinks()) {
152             System.out.println(" " + l.getDestination() + " " +
153                 l.getHopType() + " " + l.getContext());
154         }
155     }
156     
157     protected CrawlURI getCrawlURI(final ARCRecord record,
158             final HttpRecorder hr)
159     throws URIException {
160         CrawlURI curi = new CrawlURI(UURIFactory.
161             getInstance(record.getMetaData().getUrl()));
162         curi.setContentSize(record.getMetaData().getLength());
163         curi.setContentType(record.getMetaData().getMimetype());
164         curi.setHttpRecorder(hr);
165         // Fake out the extractor that this is a legit HTTP transaction.
166         if (!curi.getUURI().getScheme().equals("filedesc")) {
167             curi.putObject(CoreAttributeConstants.A_HTTP_TRANSACTION,
168                 new HttpMethodBase() {
169                     public String getName() {
170                         return this.getClass().getName() + "_method";
171                     }
172 
173                     public Header getResponseHeader(String headerName) {
174                         String value = (String)record.getMetaData().
175                             getHeaderValue(headerName);
176                         return (value == null || value.length() == 0)?
177                             null: new Header(headerName, value);
178                     }
179             });
180             String statusCode = record.getMetaData().getStatusCode();
181             curi.setFetchStatus(statusCode == null?
182                 200: Integer.parseInt(statusCode));
183         }
184         return curi;
185     }
186     
187     /***
188      * Format usage message.
189      * @param formatter Help formatter instance.
190      * @param options Usage options.
191      * @param exitCode Exit code.
192      */
193     private static void usage(HelpFormatter formatter, Options options,
194             int exitCode) {
195         formatter.printHelp("java " + ExtractorTool.class.getName() +
196             " //\n[--scratch=DIR] [--extractor=EXTRACTOR1,EXTRACTOR2,...] ARC", options);
197         System.exit(exitCode);
198     }
199     
200     public static void main(String[] args)
201     throws Exception {
202         Options options = new Options();
203         options.addOption(new Option("h", "help", false,
204             "Prints this message and exits."));
205         StringBuffer defaultExtractors = new StringBuffer();
206         for (int i = 0; i < DEFAULT_EXTRACTORS.length; i++) {
207             if (i > 0) {
208                 defaultExtractors.append(", ");
209             }
210             defaultExtractors.append(DEFAULT_EXTRACTORS[i]);
211         }
212         options.addOption(new Option("e", "extractor", true,
213             "List of comma-separated extractor class names. " +
214             "Run in order listed. " +
215             "If no extractors listed, runs following: " +
216             defaultExtractors.toString() + "."));
217         options.addOption(new Option("s", "scratch", true,
218             "Directory to write scratch files to. Default: '/tmp'."));
219         PosixParser parser = new PosixParser();
220         CommandLine cmdline = parser.parse(options, args, false);
221         List cmdlineArgs = cmdline.getArgList();
222         Option [] cmdlineOptions = cmdline.getOptions();
223         HelpFormatter formatter = new HelpFormatter();
224         // If no args, print help.
225         if (cmdlineArgs.size() <= 0) {
226             usage(formatter, options, 0);
227         }
228 
229         // Now look at options passed.
230         String [] extractors = DEFAULT_EXTRACTORS;
231         String scratch = null;
232         for (int i = 0; i < cmdlineOptions.length; i++) {
233             switch(cmdlineOptions[i].getId()) {
234                 case 'h':
235                     usage(formatter, options, 0);
236                     break;
237 
238                 case 'e':
239                     String value = cmdlineOptions[i].getValue();
240                     if (value == null || value.length() <= 0) {
241                         // Allow saying NO extractors so we can see
242                         // how much it costs just reading through
243                         // ARCs.
244                         extractors = new String [0];
245                     } else {
246                         extractors = value.split(",");
247                     }
248                     break;
249                     
250                 case 's':
251                     scratch = cmdlineOptions[i].getValue();
252                     break;
253                   
254                 default:
255                     throw new RuntimeException("Unexpected option: " +
256                         + cmdlineOptions[i].getId());
257             }
258         }
259         
260         ExtractorTool tool = new ExtractorTool(extractors, scratch);
261         for (Iterator i = cmdlineArgs.iterator(); i.hasNext();) {
262             tool.extract((String)i.next());
263         }
264     }
265 }