1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.archive.crawler.extractor;
24
25 import java.io.File;
26 import java.io.IOException;
27 import java.lang.reflect.Constructor;
28 import java.util.ArrayList;
29 import java.util.Iterator;
30 import java.util.List;
31 import java.util.logging.ConsoleHandler;
32 import java.util.logging.Handler;
33 import java.util.logging.Logger;
34
35 import javax.management.Attribute;
36
37 import org.apache.commons.cli.CommandLine;
38 import org.apache.commons.cli.HelpFormatter;
39 import org.apache.commons.cli.Option;
40 import org.apache.commons.cli.Options;
41 import org.apache.commons.cli.PosixParser;
42 import org.apache.commons.httpclient.Header;
43 import org.apache.commons.httpclient.HttpMethodBase;
44 import org.apache.commons.httpclient.URIException;
45 import org.archive.crawler.datamodel.CoreAttributeConstants;
46 import org.archive.crawler.datamodel.CrawlOrder;
47 import org.archive.crawler.datamodel.CrawlURI;
48 import org.archive.crawler.framework.Processor;
49 import org.archive.crawler.settings.CrawlerSettings;
50 import org.archive.crawler.settings.MapType;
51 import org.archive.crawler.settings.SettingsHandler;
52 import org.archive.crawler.settings.XMLSettingsHandler;
53 import org.archive.io.arc.ARCReader;
54 import org.archive.io.arc.ARCReaderFactory;
55 import org.archive.io.arc.ARCRecord;
56 import org.archive.net.UURIFactory;
57 import org.archive.util.HttpRecorder;
58 import org.archive.util.OneLineSimpleLogger;
59
60 /***
61 * Run named extractors against passed ARC file.
62 * This extractor tool runs suboptimally. It takes each ARC file record,
63 * writes it to a new scratch file, and then it runs each listed
64 * extractor against the scratch. It works in this manner because
65 * extractors want CharSequence, being able to refer to characters
66 * by absolute position, but ARCs are compressed streams. The work
67 * to get a CharSequence on an underlying compressed stream has not
68 * been done. Other issues are need to setup CrawlerSetting environment
69 * so extractors can run.
70 * @author stack
71 * @version $Date: 2006-09-26 23:47:15 +0000 (Tue, 26 Sep 2006) $, $Revision: 4671 $
72 */
73 public class ExtractorTool {
74
75
76 static {
77
78 Handler [] hs = Logger.getLogger("").getHandlers();
79 for (int i = 0; i < hs.length; i++) {
80 Handler h = hs[0];
81 if (h instanceof ConsoleHandler) {
82 h.setFormatter(new OneLineSimpleLogger());
83 }
84 }
85 }
86
87 private static final String [] DEFAULT_EXTRACTORS =
88 {"org.archive.crawler.extractor.ExtractorHTTP",
89 "org.archive.crawler.extractor.ExtractorHTML"};
90 private final List<Processor> extractors;
91 private final File scratchDir;
92 private static final String DEFAULT_SCRATCH = "/tmp";
93
94 public ExtractorTool()
95 throws Exception {
96 this(DEFAULT_EXTRACTORS, DEFAULT_SCRATCH);
97 }
98
99 public ExtractorTool(String [] e, String scratch)
100 throws Exception {
101 super();
102
103 this.scratchDir = scratch == null?
104 new File(DEFAULT_SCRATCH): new File(scratch);
105 if (!this.scratchDir.exists()) {
106 this.scratchDir.mkdirs();
107 }
108
109 File orderFile = new File(this.scratchDir.getAbsolutePath(),
110 ExtractorTool.class.getName() + "_order.xml");
111 SettingsHandler settingsHandler = new XMLSettingsHandler(orderFile);
112 settingsHandler.initialize();
113 settingsHandler.getOrder().
114 setAttribute(new Attribute(CrawlOrder.ATTR_SETTINGS_DIRECTORY,
115 this.scratchDir.getAbsolutePath()));
116 CrawlerSettings globalSettings =
117 settingsHandler.getSettingsObject(null);
118 MapType extractorsSettings = (MapType)settingsHandler.getOrder().
119 getAttribute(CrawlOrder.ATTR_EXTRACT_PROCESSORS);
120 this.extractors = new ArrayList<Processor>();
121 for (int i = 0; i < e.length; i++) {
122 Constructor c = Class.forName(e[i]).
123 getConstructor(new Class [] {String.class});
124 String name = Integer.toString(i);
125 Processor p = (Processor)c.newInstance(new Object [] {name});
126 extractorsSettings.addElement(globalSettings, p);
127 p.setAttribute(
128 new Attribute(Processor.ATTR_ENABLED, Boolean.TRUE));
129 this.extractors.add(p);
130 }
131 }
132
133 public void extract(String resource) throws IOException,
134 URIException, InterruptedException {
135 ARCReader reader = ARCReaderFactory.get(new File(resource));
136 for (Iterator i = reader.iterator(); i.hasNext();) {
137 ARCRecord ar = (ARCRecord)i.next();
138 HttpRecorder hr = HttpRecorder.
139 wrapInputStreamWithHttpRecord(this.scratchDir,
140 this.getClass().getName(), ar, null);
141 CrawlURI curi = getCrawlURI(ar, hr);
142 for (Iterator ii = this.extractors.iterator(); ii.hasNext();) {
143 ((Processor)ii.next()).process(curi);
144 }
145 outlinks(curi);
146 }
147 }
148
149 protected void outlinks(CrawlURI curi) {
150 System.out.println(curi.getUURI().toString());
151 for(Link l: curi.getOutLinks()) {
152 System.out.println(" " + l.getDestination() + " " +
153 l.getHopType() + " " + l.getContext());
154 }
155 }
156
157 protected CrawlURI getCrawlURI(final ARCRecord record,
158 final HttpRecorder hr)
159 throws URIException {
160 CrawlURI curi = new CrawlURI(UURIFactory.
161 getInstance(record.getMetaData().getUrl()));
162 curi.setContentSize(record.getMetaData().getLength());
163 curi.setContentType(record.getMetaData().getMimetype());
164 curi.setHttpRecorder(hr);
165
166 if (!curi.getUURI().getScheme().equals("filedesc")) {
167 curi.putObject(CoreAttributeConstants.A_HTTP_TRANSACTION,
168 new HttpMethodBase() {
169 public String getName() {
170 return this.getClass().getName() + "_method";
171 }
172
173 public Header getResponseHeader(String headerName) {
174 String value = (String)record.getMetaData().
175 getHeaderValue(headerName);
176 return (value == null || value.length() == 0)?
177 null: new Header(headerName, value);
178 }
179 });
180 String statusCode = record.getMetaData().getStatusCode();
181 curi.setFetchStatus(statusCode == null?
182 200: Integer.parseInt(statusCode));
183 }
184 return curi;
185 }
186
187 /***
188 * Format usage message.
189 * @param formatter Help formatter instance.
190 * @param options Usage options.
191 * @param exitCode Exit code.
192 */
193 private static void usage(HelpFormatter formatter, Options options,
194 int exitCode) {
195 formatter.printHelp("java " + ExtractorTool.class.getName() +
196 " //\n[--scratch=DIR] [--extractor=EXTRACTOR1,EXTRACTOR2,...] ARC", options);
197 System.exit(exitCode);
198 }
199
200 public static void main(String[] args)
201 throws Exception {
202 Options options = new Options();
203 options.addOption(new Option("h", "help", false,
204 "Prints this message and exits."));
205 StringBuffer defaultExtractors = new StringBuffer();
206 for (int i = 0; i < DEFAULT_EXTRACTORS.length; i++) {
207 if (i > 0) {
208 defaultExtractors.append(", ");
209 }
210 defaultExtractors.append(DEFAULT_EXTRACTORS[i]);
211 }
212 options.addOption(new Option("e", "extractor", true,
213 "List of comma-separated extractor class names. " +
214 "Run in order listed. " +
215 "If no extractors listed, runs following: " +
216 defaultExtractors.toString() + "."));
217 options.addOption(new Option("s", "scratch", true,
218 "Directory to write scratch files to. Default: '/tmp'."));
219 PosixParser parser = new PosixParser();
220 CommandLine cmdline = parser.parse(options, args, false);
221 List cmdlineArgs = cmdline.getArgList();
222 Option [] cmdlineOptions = cmdline.getOptions();
223 HelpFormatter formatter = new HelpFormatter();
224
225 if (cmdlineArgs.size() <= 0) {
226 usage(formatter, options, 0);
227 }
228
229
230 String [] extractors = DEFAULT_EXTRACTORS;
231 String scratch = null;
232 for (int i = 0; i < cmdlineOptions.length; i++) {
233 switch(cmdlineOptions[i].getId()) {
234 case 'h':
235 usage(formatter, options, 0);
236 break;
237
238 case 'e':
239 String value = cmdlineOptions[i].getValue();
240 if (value == null || value.length() <= 0) {
241
242
243
244 extractors = new String [0];
245 } else {
246 extractors = value.split(",");
247 }
248 break;
249
250 case 's':
251 scratch = cmdlineOptions[i].getValue();
252 break;
253
254 default:
255 throw new RuntimeException("Unexpected option: " +
256 + cmdlineOptions[i].getId());
257 }
258 }
259
260 ExtractorTool tool = new ExtractorTool(extractors, scratch);
261 for (Iterator i = cmdlineArgs.iterator(); i.hasNext();) {
262 tool.extract((String)i.next());
263 }
264 }
265 }