1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler;
26
27 import java.io.File;
28 import java.io.FileInputStream;
29 import java.io.FileNotFoundException;
30 import java.io.FileOutputStream;
31 import java.io.FilenameFilter;
32 import java.io.IOException;
33 import java.io.InputStream;
34 import java.io.PrintStream;
35 import java.io.PrintWriter;
36 import java.net.HttpURLConnection;
37 import java.net.InetAddress;
38 import java.net.URL;
39 import java.net.URLConnection;
40 import java.net.UnknownHostException;
41 import java.util.ArrayList;
42 import java.util.Arrays;
43 import java.util.Collection;
44 import java.util.Collections;
45 import java.util.Enumeration;
46 import java.util.Hashtable;
47 import java.util.Iterator;
48 import java.util.List;
49 import java.util.Map;
50 import java.util.Properties;
51 import java.util.StringTokenizer;
52 import java.util.TimeZone;
53 import java.util.Vector;
54 import java.util.logging.Level;
55 import java.util.logging.LogManager;
56 import java.util.logging.Logger;
57
58 import javax.management.Attribute;
59 import javax.management.AttributeList;
60 import javax.management.AttributeNotFoundException;
61 import javax.management.DynamicMBean;
62 import javax.management.InstanceAlreadyExistsException;
63 import javax.management.InstanceNotFoundException;
64 import javax.management.InvalidAttributeValueException;
65 import javax.management.MBeanInfo;
66 import javax.management.MBeanNotificationInfo;
67 import javax.management.MBeanOperationInfo;
68 import javax.management.MBeanRegistration;
69 import javax.management.MBeanRegistrationException;
70 import javax.management.MBeanServer;
71 import javax.management.MBeanServerFactory;
72 import javax.management.MalformedObjectNameException;
73 import javax.management.NotCompliantMBeanException;
74 import javax.management.ObjectName;
75 import javax.management.ReflectionException;
76 import javax.management.RuntimeOperationsException;
77 import javax.management.openmbean.CompositeData;
78 import javax.management.openmbean.CompositeDataSupport;
79 import javax.management.openmbean.CompositeType;
80 import javax.management.openmbean.OpenDataException;
81 import javax.management.openmbean.OpenMBeanAttributeInfoSupport;
82 import javax.management.openmbean.OpenMBeanConstructorInfoSupport;
83 import javax.management.openmbean.OpenMBeanInfoSupport;
84 import javax.management.openmbean.OpenMBeanOperationInfoSupport;
85 import javax.management.openmbean.OpenMBeanParameterInfo;
86 import javax.management.openmbean.OpenMBeanParameterInfoSupport;
87 import javax.management.openmbean.OpenType;
88 import javax.management.openmbean.SimpleType;
89 import javax.management.openmbean.TabularData;
90 import javax.management.openmbean.TabularDataSupport;
91 import javax.management.openmbean.TabularType;
92 import javax.naming.CompoundName;
93 import javax.naming.Context;
94 import javax.naming.NameNotFoundException;
95 import javax.naming.NamingException;
96 import javax.naming.NoInitialContextException;
97
98 import org.apache.commons.cli.Option;
99 import org.archive.crawler.admin.CrawlJob;
100 import org.archive.crawler.admin.CrawlJobErrorHandler;
101 import org.archive.crawler.admin.CrawlJobHandler;
102 import org.archive.crawler.datamodel.CredentialStore;
103 import org.archive.crawler.datamodel.credential.Credential;
104 import org.archive.crawler.event.CrawlStatusListener;
105 import org.archive.crawler.framework.AlertManager;
106 import org.archive.crawler.framework.CrawlController;
107 import org.archive.crawler.framework.exceptions.FatalConfigurationException;
108 import org.archive.crawler.framework.exceptions.InitializationException;
109 import org.archive.crawler.selftest.SelfTestCrawlJobHandler;
110 import org.archive.crawler.settings.XMLSettingsHandler;
111 import org.archive.io.SinkHandler;
112 import org.archive.io.SinkHandlerLogRecord;
113 import org.archive.net.UURI;
114 import org.archive.util.FileUtils;
115 import org.archive.util.IoUtils;
116 import org.archive.util.JmxUtils;
117 import org.archive.util.JndiUtils;
118 import org.archive.util.PropertyUtils;
119 import org.archive.util.TextUtils;
120
121 import sun.net.www.protocol.file.FileURLConnection;
122
123
124 /***
125 * Main class for Heritrix crawler.
126 *
127 * Heritrix is usually launched by a shell script that backgrounds heritrix
128 * that redirects all stdout and stderr emitted by heritrix to a log file. So
129 * that startup messages emitted subsequent to the redirection of stdout and
130 * stderr show on the console, this class prints usage or startup output
131 * such as where the web UI can be found, etc., to a STARTLOG that the shell
132 * script is waiting on. As soon as the shell script sees output in this file,
133 * it prints its content and breaks out of its wait.
134 * See ${HERITRIX_HOME}/bin/heritrix.
135 *
136 * <p>Heritrix can also be embedded or launched by webapp initialization or
137 * by JMX bootstrapping. So far I count 4 methods of instantiation:
138 * <ol>
 * <li>From this class's main -- the method usually used;</li>
140 * <li>From the Heritrix UI (The local-instances.jsp) page;</li>
141 * <li>A creation by a JMX agent at the behest of a remote JMX client; and</li>
142 * <li>A container such as tomcat or jboss.</li>
143 * </ol>
144 *
145 * @author gojomo
146 * @author Kristinn Sigurdsson
147 * @author Stack
148 */
149 public class Heritrix implements DynamicMBean, MBeanRegistration {
150 /***
151 * Heritrix logging instance.
152 */
153 private static final Logger logger =
154 Logger.getLogger(Heritrix.class.getName());
155
156 public static final File TMPDIR =
157 new File(System.getProperty("java.io.tmpdir", "/tmp"));
158
159 /***
160 * Name of the heritrix properties file.
161 */
162 public static final String PROPERTIES = "heritrix.properties";
163
164 /***
165 * Name of the key to use specifying alternate heritrix properties on
166 * command line.
167 */
168 public static final String PROPERTIES_KEY = PROPERTIES;
169
170 /***
171 * Prefix used on our properties we'll add to the System.properties list.
172 */
173 public static final String HERITRIX_PROPERTIES_PREFIX = "heritrix.";
174
175 /***
176 * Prefix used on other properties we'll add to the System.properties
177 * list (after stripping this prefix).
178 */
179 public static final String SYSTEM_PREFIX = "system.";
180
181 /***
182 * Instance of web server if one was started.
183 */
184 private static SimpleHttpServer httpServer = null;
185
186 /***
187 * CrawlJob handler. Manages multiple crawl jobs at runtime.
188 */
189 private CrawlJobHandler jobHandler = null;
190
191 /***
192 * Heritrix start log file.
193 *
194 * This file contains standard out produced by this main class for startup
195 * only. Used by heritrix shell script. Name here MUST match that in the
196 * <code>bin/heritrix</code> shell script. This is a DEPENDENCY the shell
197 * wrapper has on this here java heritrix.
198 */
199 public static final String STARTLOG = "heritrix_dmesg.log";
200
201 /***
202 * Default encoding.
203 *
204 * Used for content when fetching if none specified.
205 */
206 public static final String DEFAULT_ENCODING = "ISO-8859-1";
207
208 /***
209 * Heritrix stderr/stdout log file.
210 *
211 * This file should have nothing in it except messages over which we have
212 * no control (JVM stacktrace, 3rd-party lib emissions). The wrapper
213 * startup script directs stderr/stdout here. This is an INTERDEPENDENCY
214 * this program has with the wrapper shell script. Shell can actually
215 * pass us an alternate to use for this file.
216 */
217 public static String DEFAULT_HERITRIX_OUT = "heritrix_out.log";
218
219 /***
     * Where to write this class's startup output.
221 *
222 * This out should only be used if Heritrix is being run from the
223 * command-line.
224 */
225 private static PrintWriter out = null;
226
227 /***
228 * The org.archive package
229 */
230 public static final String ARCHIVE_PACKAGE = "org.archive.";
231
232 /***
233 * The crawler package.
234 */
235 public static final String CRAWLER_PACKAGE = Heritrix.class.getName().
236 substring(0, Heritrix.class.getName().lastIndexOf('.'));
237
238 /***
239 * The root context for a webapp.
240 */
241 public static final String ROOT_CONTEXT = "/";
242
243 /***
244 * Set to true if application is started from command line.
245 */
246 public static boolean commandLine = false;
247
248 /***
249 * True if container initialization has been run.
250 */
251 private static boolean containerInitialized = false;
252
253 /***
254 * True if properties have been loaded.
255 */
256 private static boolean propertiesLoaded = false;
257
258 public static final String JAR_SUFFIX = ".jar";
259
260 private AlertManager alertManager;
261
262 /***
263 * The context of the GUI webapp. Default is root.
264 */
265 private static String adminContext = ROOT_CONTEXT;
266
267 /***
268 * True if we're to put up a GUI.
269 * Cmdline processing can override.
270 */
271 public static boolean gui =
272 !PropertyUtils.getBooleanProperty("heritrix.cmdline.nowui");
273
274 /***
275 * Port to put the GUI up on.
276 * Cmdline processing can override.
277 */
278 public static int guiPort = SimpleHttpServer.DEFAULT_PORT;
279
280
281 /***
282 * A collection containing only localhost. Used as default value
283 * for guiHosts, and passed to SimpleHttpServer when doing selftest.
284 */
285 final private static Collection<String> LOCALHOST_ONLY =
286 Collections.unmodifiableList(Arrays.asList(new String[] { "127.0.0.1" }));
287
288
289 /***
290 * Hosts to bind the GUI webserver to.
     * By default, only contains localhost.
292 * Set to an empty collection to indicate that all available network
293 * interfaces should be used for the webserver.
294 */
295 public static Collection<String> guiHosts = LOCALHOST_ONLY;
296
297
298 /***
299 * Web UI server, realm, context name.
300 */
301 public static String ADMIN = "admin";
302
303
304 /***
305 * The MBean server we're registered with (May be null).
306 */
307 private MBeanServer mbeanServer = null;
308
309 /***
310 * MBean name we were registered as.
311 */
312 private ObjectName mbeanName = null;
313
314 /***
315 * Keep reference to all instances of Heritrix.
316 * Used by the UI to figure which of the local Heritrice it should
317 * be going against and to figure what to shutdown on the way out (If
318 * there was always a JMX Agent, we wouldn't need to keep this list. We
319 * could always ask the JMX Agent for all instances. UPDATE: True we could
320 * always ask the JMX Agent but we might keep around this local reference
321 * because it will allow faster, less awkward -- think of marshalling the args
322 * for JMX invoke operation -- access to local Heritrix instances. A new
323 * usage for this instances Map is in CrawlJob#preRegister to find the hosting
324 * Heritrix instance).
325 */
326 private static Map<String,Heritrix> instances
327 = new Hashtable<String,Heritrix>();
328
329 private OpenMBeanInfoSupport openMBeanInfo;
330 public static final String STATUS_ATTR = "Status";
331 public static final String VERSION_ATTR = "Version";
332 public static final String ISRUNNING_ATTR = "IsRunning";
333 public static final String ISCRAWLING_ATTR = "IsCrawling";
334 public static final String ALERTCOUNT_ATTR = "AlertCount";
335 public static final String NEWALERTCOUNT_ATTR = "NewAlertCount";
336 public static final String CURRENTJOB_ATTR = "CurrentJob";
337 public static final List ATTRIBUTE_LIST;
338 static {
339 ATTRIBUTE_LIST = Arrays.asList(new String [] {STATUS_ATTR,
340 VERSION_ATTR, ISRUNNING_ATTR, ISCRAWLING_ATTR,
341 ALERTCOUNT_ATTR, NEWALERTCOUNT_ATTR, CURRENTJOB_ATTR});
342 }
343
344 public static final String START_OPER = "start";
345 public static final String STOP_OPER = "stop";
346 public static final String DESTROY_OPER = "destroy";
347 public static final String INTERRUPT_OPER = "interrupt";
348 public static final String START_CRAWLING_OPER = "startCrawling";
349 public static final String STOP_CRAWLING_OPER = "stopCrawling";
350 public static final String ADD_CRAWL_JOB_OPER = "addJob";
351 public static final String TERMINATE_CRAWL_JOB_OPER =
352 "terminateCurrentJob";
353 public static final String DELETE_CRAWL_JOB_OPER = "deleteJob";
354 public static final String ALERT_OPER = "alert";
355 public static final String ADD_CRAWL_JOB_BASEDON_OPER = "addJobBasedon";
356 public static final String PENDING_JOBS_OPER = "pendingJobs";
357 public static final String COMPLETED_JOBS_OPER = "completedJobs";
358 public static final String CRAWLEND_REPORT_OPER = "crawlendReport";
359 public static final String SHUTDOWN_OPER = "shutdown";
360 public static final String LOG_OPER = "log";
361 public static final String REBIND_JNDI_OPER = "rebindJNDI";
362 public static final List OPERATION_LIST;
363 static {
364 OPERATION_LIST = Arrays.asList(new String [] {START_OPER, STOP_OPER,
365 INTERRUPT_OPER, START_CRAWLING_OPER, STOP_CRAWLING_OPER,
366 ADD_CRAWL_JOB_OPER, ADD_CRAWL_JOB_BASEDON_OPER,
367 DELETE_CRAWL_JOB_OPER, ALERT_OPER, PENDING_JOBS_OPER,
368 COMPLETED_JOBS_OPER, CRAWLEND_REPORT_OPER, SHUTDOWN_OPER,
369 LOG_OPER, DESTROY_OPER, TERMINATE_CRAWL_JOB_OPER,
370 REBIND_JNDI_OPER});
371 }
372 private CompositeType jobCompositeType = null;
373 private TabularType jobsTabularType = null;
374 public static final String [] JOB_KEYS =
375 new String [] {"uid", "name", "status"};
376
377 private static String adminUsername;
378
379 private static String adminPassword;
380
381 /***
382 * Constructor.
383 * Does not register the created instance with JMX. Assumed this
384 * constructor is used by such as JMX agent creating an instance of
385 * Heritrix at the commmand of a remote client (In this case Heritrix will
386 * be registered by the invoking agent).
387 * @throws IOException
388 */
389 public Heritrix() throws IOException {
390 this(null, false);
391 }
392
393 public Heritrix(final boolean jmxregister) throws IOException {
394 this(null, jmxregister);
395 }
396
397 /***
398 * Constructor.
399 * @param name If null, we bring up the default Heritrix instance.
400 * @param jmxregister True if we are to register this instance with JMX
401 * agent.
402 * @throws IOException
403 */
404 public Heritrix(final String name, final boolean jmxregister)
405 throws IOException {
406 this(name, jmxregister, new CrawlJobHandler(getJobsdir()));
407 }
408
409 /***
410 * Constructor.
411 * @param name If null, we bring up the default Heritrix instance.
412 * @param jmxregister True if we are to register this instance with JMX
413 * agent.
414 * @param cjh CrawlJobHandler to use.
415 * @throws IOException
416 */
417 public Heritrix(final String name, final boolean jmxregister,
418 final CrawlJobHandler cjh)
419 throws IOException {
420 super();
421 containerInitialization();
422 this.jobHandler = cjh;
423 this.openMBeanInfo = buildMBeanInfo();
424
425
426
427 final SinkHandler sinkHandler = SinkHandler.getInstance();
428 if (sinkHandler == null) {
429 throw new NullPointerException("SinkHandler not found.");
430 }
431
432 this.alertManager = new AlertManager() {
433 public void add(SinkHandlerLogRecord record) {
434 sinkHandler.publish(record);
435 }
436
437 public Vector getAll() {
438 return sinkHandler.getAll();
439 }
440
441 public Vector getNewAll() {
442 return sinkHandler.getAllUnread();
443 }
444
445 public SinkHandlerLogRecord get(String alertID) {
446 return sinkHandler.get(Long.parseLong(alertID));
447 }
448
449 public int getCount() {
450 return sinkHandler.getCount();
451 }
452
453 public int getNewCount() {
454 return sinkHandler.getUnreadCount();
455 }
456
457 public void remove(String alertID) {
458 sinkHandler.remove(Long.parseLong(alertID));
459 }
460
461 public void read(String alertID) {
462 sinkHandler.read(Long.parseLong(alertID));
463 }
464 };
465
466 try {
467 Heritrix.registerHeritrix(this, name, jmxregister);
468 } catch (InstanceAlreadyExistsException e) {
469 throw new RuntimeException(e);
470 } catch (MBeanRegistrationException e) {
471 throw new RuntimeException(e);
472 } catch (NotCompliantMBeanException e) {
473 throw new RuntimeException(e);
474 } catch (MalformedObjectNameException e) {
475 throw new RuntimeException(e);
476 }
477 }
478
479 /***
480 * Run setup tasks for this 'container'. Idempotent.
481 *
482 * @throws IOException
483 */
484 protected static void containerInitialization() throws IOException {
485 if (Heritrix.containerInitialized) {
486 return;
487 }
488 Heritrix.containerInitialized = true;
489
490
491
492
493 Heritrix.loadProperties();
494 Heritrix.patchLogging();
495 Heritrix.configureTrustStore();
496
497
498
499 Runtime.getRuntime().addShutdownHook(
500 Heritrix.getShutdownThread(false, 0, "Heritrix shutdown hook"));
501
502
503 try {
504 registerContainerJndi();
505 } catch (Exception e) {
506 logger.log(Level.WARNING, "Failed jndi container registration.", e);
507 }
508 }
509
510 /***
511 * Do inverse of construction. Used by anyone who does a 'new Heritrix' when
512 * they want to cleanup the instance.
513 * Of note, there may be Heritrix threads still hanging around after the
514 * call to destroy completes. They'll eventually go down after they've
515 * finished their cleanup routines. In particular, if you are watching
516 * Heritrix via JMX, you can see the Heritrix instance JMX bean unregister
517 * ahead of the CrawlJob JMX bean that its hosting.
518 */
519 public void destroy() {
520 stop();
521 try {
522 Heritrix.unregisterHeritrix(this);
523 } catch (InstanceNotFoundException e) {
524 e.printStackTrace();
525 } catch (MBeanRegistrationException e) {
526 e.printStackTrace();
527 } catch (NullPointerException e) {
528 e.printStackTrace();
529 }
530 this.jobHandler = null;
531 this.openMBeanInfo = null;
532 }
533
534 /***
535 * Launch program.
536 * Optionally will launch a web server to host UI. Will also register
537 * Heritrix MBean with first found JMX Agent (Usually the 1.5.0 JVM
538 * Agent).
539 *
540 * @param args Command line arguments.
541 * @throws Exception
542 */
543 public static void main(String[] args)
544 throws Exception {
545 Heritrix.commandLine = true;
546
547
548
549 TimeZone.setDefault(TimeZone.getTimeZone("GMT"));
550
551 File startLog = new File(getHeritrixHome(), STARTLOG);
552 Heritrix.out = new PrintWriter(isDevelopment()?
553 System.out: new PrintStream(new FileOutputStream(startLog)));
554
555 try {
556 containerInitialization();
557 String status = doCmdLineArgs(args);
558 if (status != null) {
559 Heritrix.out.println(status);
560 }
561 }
562
563 catch(Exception e) {
564
565 e.printStackTrace(Heritrix.out);
566 throw e;
567 }
568
569 finally {
570
571
572
573 if (!isDevelopment()) {
574 if (Heritrix.out != null) {
575 Heritrix.out.close();
576 }
577 System.out.println("Heritrix version: " +
578 Heritrix.getVersion());
579 } else {
580 if (Heritrix.out != null) {
581 Heritrix.out.flush();
582 }
583 }
584 }
585 }
586
587 protected static String doCmdLineArgs(final String [] args)
588 throws Exception {
589
590 String tmpStr = PropertyUtils.
591 getPropertyOrNull("heritrix.context");
592 if (tmpStr != null) {
593 Heritrix.adminContext = tmpStr;
594 }
595 tmpStr = PropertyUtils.getPropertyOrNull("heritrix.cmdline.port");
596 if (tmpStr != null) {
597 Heritrix.guiPort = Integer.parseInt(tmpStr);
598 }
599 tmpStr = PropertyUtils.getPropertyOrNull("heritrix.cmdline.admin");
600 String adminLoginPassword = (tmpStr == null)? "": tmpStr;
601 String crawlOrderFile =
602 PropertyUtils.getPropertyOrNull("heritrix.cmdline.order");
603 tmpStr = PropertyUtils.getPropertyOrNull("heritrix.cmdline.run");
604 boolean runMode =
605 PropertyUtils.getBooleanProperty("heritrix.cmdline.run");
606 boolean selfTest = false;
607 String selfTestName = null;
608 CommandLineParser clp = new CommandLineParser(args, Heritrix.out,
609 Heritrix.getVersion());
610 List arguments = clp.getCommandLineArguments();
611 Option [] options = clp.getCommandLineOptions();
612
613
614
615 if (arguments.size() > 1) {
616 clp.usage(1);
617 } else if (arguments.size() == 1) {
618 crawlOrderFile = (String)arguments.get(0);
619 if (!(new File(crawlOrderFile).exists())) {
620 clp.usage("ORDER.XML <" + crawlOrderFile +
621 "> specified does not exist.", 1);
622 }
623
624 if (crawlOrderFile.length() > 4 &&
625 !crawlOrderFile.substring(crawlOrderFile.length() - 4).
626 equalsIgnoreCase(".xml")) {
627 clp.usage("ORDER.XML <" + crawlOrderFile +
628 "> does not have required '.xml' suffix.", 1);
629 }
630 }
631
632
633 for (int i = 0; i < options.length; i++) {
634 switch(options[i].getId()) {
635 case 'h':
636 clp.usage();
637 break;
638
639 case 'a':
640 adminLoginPassword = options[i].getValue();
641 break;
642
643 case 'n':
644 if (crawlOrderFile == null) {
645 clp.usage("You must specify an ORDER_FILE with" +
646 " '--nowui' option.", 1);
647 }
648 Heritrix.gui = false;
649 break;
650
651 case 'b':
652 Heritrix.guiHosts = parseHosts(options[i].getValue());
653 break;
654
655 case 'p':
656 try {
657 Heritrix.guiPort =
658 Integer.parseInt(options[i].getValue());
659 } catch (NumberFormatException e) {
660 clp.usage("Failed parse of port number: " +
661 options[i].getValue(), 1);
662 }
663 if (Heritrix.guiPort <= 0) {
664 clp.usage("Nonsensical port number: " +
665 options[i].getValue(), 1);
666 }
667 break;
668
669 case 'r':
670 runMode = true;
671 break;
672
673 case 's':
674 selfTestName = options[i].getValue();
675 selfTest = true;
676 break;
677
678 default:
679 assert false: options[i].getId();
680 }
681 }
682
683
684 String status = null;
685 if (selfTest) {
686
687
688
689 for (int i = 0; i < options.length; i++) {
690 if (options[i].getId() != 'p' && options[i].getId() != 's') {
691 clp.usage(1);
692 }
693 }
694
695 if (arguments.size() > 0) {
696
697 clp.usage(1);
698 }
699 status = selftest(selfTestName, Heritrix.guiPort);
700 } else {
701 if (!Heritrix.gui) {
702 if (options.length > 1) {
703
704
705
706 clp.usage(1);
707 }
708 Heritrix h = new Heritrix(true);
709 status = h.doOneCrawl(crawlOrderFile);
710 } else {
711 if (!isValidLoginPasswordString(adminLoginPassword)) {
712
713 clp.usage("Invalid admin login:password value, or none "
714 + "specified. ", 1);
715 }
716 status = startEmbeddedWebserver(
717 Heritrix.guiHosts, Heritrix.guiPort,
718 adminLoginPassword);
719 Heritrix h = new Heritrix(true);
720
721 String tmp = h.launch(crawlOrderFile, runMode);
722 if (tmp != null) {
723 status += ('\n' + tmp);
724 }
725 }
726 }
727 return status;
728 }
729
730 /***
731 * @return The file we dump stdout and stderr into.
732 */
733 public static String getHeritrixOut() {
734 String tmp = System.getProperty("heritrix.out");
735 if (tmp == null || tmp.length() == 0) {
736 tmp = Heritrix.DEFAULT_HERITRIX_OUT;
737 }
738 return tmp;
739 }
740
741 /***
742 * Exploit <code>-Dheritrix.home</code> if available to us.
743 * Is current working dir if no heritrix.home property supplied.
744 * @return Heritrix home directory.
745 * @throws IOException
746 */
747 protected static File getHeritrixHome()
748 throws IOException {
749 File heritrixHome = null;
750 String home = System.getProperty("heritrix.home");
751 if (home != null && home.length() > 0) {
752 heritrixHome = new File(home);
753 if (!heritrixHome.exists()) {
754 throw new IOException("HERITRIX_HOME <" + home +
755 "> does not exist.");
756 }
757 } else {
758 heritrixHome = new File(new File("").getAbsolutePath());
759 }
760 return heritrixHome;
761 }
762
763 /***
764 * @return The directory into which we put jobs. If the system property
765 * 'heritrix.jobsdir' is set, we will use its value in place of the default
766 * 'jobs' directory in the current working directory.
767 * @throws IOException
768 */
769 public static File getJobsdir() throws IOException {
770 Heritrix.loadProperties();
771 String jobsdirStr = System.getProperty("heritrix.jobsdir", "jobs");
772 File jobsdir = new File(jobsdirStr);
773 return (jobsdir.isAbsolute())?
774 jobsdir:
775 new File(getHeritrixHome(), jobsdirStr);
776 }
777
778 /***
779 * Get and check for existence of expected subdir.
780 *
781 * If development flag set, then look for dir under src dir.
782 *
783 * @param subdirName Dir to look for.
784 * @return The extant subdir. Otherwise null if we're running
785 * in a webapp context where there is no conf directory available.
786 * @throws IOException if unable to find expected subdir.
787 */
788 protected static File getSubDir(String subdirName)
789 throws IOException {
790 return getSubDir(subdirName, true);
791 }
792
793 /***
794 * Get and optionally check for existence of subdir.
795 *
796 * If development flag set, then look for dir under src dir.
797 *
798 * @param subdirName Dir to look for.
799 * @param fail True if we are to fail if directory does not
800 * exist; false if we are to return false if the directory does not exist.
801 * @return The extant subdir. Otherwise null if we're running
802 * in a webapp context where there is no subdir directory available.
803 * @throws IOException if unable to find expected subdir.
804 */
805 protected static File getSubDir(String subdirName, boolean fail)
806 throws IOException {
807 String path = isDevelopment()?
808 "src" + File.separator + subdirName:
809 subdirName;
810 File dir = new File(getHeritrixHome(), path);
811 if (!dir.exists()) {
812 if (fail) {
813 throw new IOException("Cannot find subdir: " + subdirName);
814 }
815 dir = null;
816 }
817 return dir;
818 }
819
820 /***
821 * Test string is valid login/password string.
822 *
823 * A valid login/password string has the login and password compounded
824 * w/ a ':' delimiter.
825 *
826 * @param str String to test.
827 * @return True if valid password/login string.
828 */
829 protected static boolean isValidLoginPasswordString(String str) {
830 boolean isValid = false;
831 StringTokenizer tokenizer = new StringTokenizer(str, ":");
832 if (tokenizer.countTokens() == 2) {
833 String login = ((String)tokenizer.nextElement()).trim();
834 String password = ((String)tokenizer.nextElement()).trim();
835 if (login.length() > 0 && password.length() > 0) {
836 isValid = true;
837 }
838 }
839 return isValid;
840 }
841
842 protected static boolean isDevelopment() {
843 return System.getProperty("heritrix.development") != null;
844 }
845
846 /***
847 * Load the heritrix.properties file.
848 *
849 * Adds any property that starts with
850 * <code>HERITRIX_PROPERTIES_PREFIX</code>
851 * or <code>ARCHIVE_PACKAGE</code>
852 * into system properties (except logging '.level' directives).
853 * @return Loaded properties.
854 * @throws IOException
855 */
856 protected static Properties loadProperties()
857 throws IOException {
858 if (Heritrix.propertiesLoaded) {
859 return System.getProperties();
860 }
861 Heritrix.propertiesLoaded = true;
862
863 Properties properties = new Properties();
864 properties.load(getPropertiesInputStream());
865
866
867
868
869
870 for (Enumeration e = properties.keys(); e.hasMoreElements();) {
871 String key = ((String)e.nextElement()).trim();
872 if (key.startsWith(ARCHIVE_PACKAGE) ||
873 key.startsWith(HERITRIX_PROPERTIES_PREFIX)) {
874
875
876 String value = properties.getProperty(key).trim();
877 if (key.indexOf(".level") < 0) {
878 copyToSystemProperty(key, value);
879 }
880 } else if (key.startsWith(SYSTEM_PREFIX)) {
881 String value = properties.getProperty(key).trim();
882 copyToSystemProperty(key.substring(SYSTEM_PREFIX.length()), value);
883 }
884 }
885 return properties;
886 }
887
888 /***
889 * Copy the given key-value into System properties, as long as there
890 * is no existing value.
891 * @param key property key
892 * @param value property value
893 */
894 protected static void copyToSystemProperty(String key, String value) {
895 if (System.getProperty(key) == null ||
896 System.getProperty(key).length() == 0) {
897 System.setProperty(key, value);
898 }
899 }
900
901 protected static InputStream getPropertiesInputStream()
902 throws IOException {
903 File file = null;
904
905 String alternateProperties = System.getProperty(PROPERTIES_KEY);
906 if (alternateProperties != null && alternateProperties.length() > 0) {
907 file = new File(alternateProperties);
908 }
909
910 if ((file == null || !file.exists()) && getConfdir(false) != null) {
911 file = new File(getConfdir(), PROPERTIES);
912 if (!file.exists()) {
913
914
915 file = null;
916 }
917 }
918
919
920
921 InputStream is = (file != null)?
922 new FileInputStream(file):
923 Heritrix.class.getResourceAsStream("/" + PROPERTIES_KEY);
924 if (is == null) {
925 throw new IOException("Failed to load properties file from" +
926 " filesystem or from classpath.");
927 }
928 return is;
929 }
930
931 /***
932 * If the user hasn't altered the default logging parameters, tighten them
933 * up somewhat: some of our libraries are way too verbose at the INFO or
934 * WARNING levels.
935 *
936 * This might be a problem running inside in someone else's
937 * container. Container's seem to prefer commons logging so we
938 * ain't messing them doing the below.
939 *
940 * @throws IOException
941 * @throws SecurityException
942 */
943 protected static void patchLogging()
944 throws SecurityException, IOException {
945 if (System.getProperty("java.util.logging.config.class") != null) {
946 return;
947 }
948
949 if (System.getProperty("java.util.logging.config.file") != null) {
950 return;
951 }
952
953
954
955 LogManager.getLogManager().
956 readConfiguration(getPropertiesInputStream());
957 }
958
959 /***
960 * Configure our trust store.
961 *
962 * If system property is defined, then use it for our truststore. Otherwise
963 * use the heritrix truststore under conf directory if it exists.
964 *
965 * <p>If we're not launched from the command-line, we will not be able
966 * to find our truststore. The truststore is nor normally used so rare
967 * should this be a problem (In case where we don't use find our trust
968 * store, we'll use the 'default' -- either the JVMs or the containers).
969 */
970 protected static void configureTrustStore() {
971
972 final String TRUSTSTORE_KEY = "javax.net.ssl.trustStore";
973 String value = System.getProperty(TRUSTSTORE_KEY);
974 File confdir = null;
975 try {
976 confdir = getConfdir(false);
977 } catch (IOException e) {
978 logger.log(Level.WARNING, "Failed to get confdir.", e);
979 }
980 if ((value == null || value.length() <= 0) && confdir != null) {
981
982 File heritrixStore = new File(confdir, "heritrix.cacerts");
983 if(heritrixStore.exists()) {
984 value = heritrixStore.getAbsolutePath();
985 }
986 }
987
988 if (value != null && value.length() > 0) {
989 System.setProperty(TRUSTSTORE_KEY, value);
990 }
991 }
992
993 /***
994 * Run the selftest
995 *
996 * @param oneSelfTestName Name of a test if we are to run one only rather
997 * than the default running all tests.
998 * @param port Port number to use for web UI.
999 *
1000 * @exception Exception
1001 * @return Status of how selftest startup went.
1002 */
1003 protected static String selftest(final String oneSelfTestName,
1004 final int port)
1005 throws Exception {
1006
1007 final String SELFTEST = "selftest";
1008 Heritrix.httpServer = new SimpleHttpServer(SELFTEST,
1009 Heritrix.adminContext, LOCALHOST_ONLY, port, true);
1010
1011
1012
1013
1014
1015 Heritrix.httpServer.setAuthentication(SELFTEST, Heritrix.adminContext,
1016 SELFTEST, SELFTEST, SELFTEST);
1017 Heritrix.httpServer.startServer();
1018
1019
1020 File selftestDir = (isDevelopment())?
1021 new File(getConfdir(), SELFTEST):
1022 new File(File.separator + SELFTEST);
1023 File crawlOrderFile = new File(selftestDir, "order.xml");
1024
1025
1026
1027
1028 final String ROOTURI = "127.0.0.1:" + Integer.toString(port);
1029 String selfTestUrl = "http://" + ROOTURI + '/';
1030 if (oneSelfTestName != null && oneSelfTestName.length() > 0) {
1031 selfTestUrl += (oneSelfTestName + '/');
1032 }
1033 CrawlJobHandler cjh = new SelfTestCrawlJobHandler(getJobsdir(),
1034 oneSelfTestName, selfTestUrl);
1035 Heritrix h = new Heritrix("Selftest", true, cjh);
1036 CrawlJob job = createCrawlJob(cjh, crawlOrderFile, "Template");
1037 job = h.getJobHandler().newJob(job, null, SELFTEST,
1038 "Integration self test", selfTestUrl, CrawlJob.PRIORITY_AVERAGE);
1039 h.getJobHandler().addJob(job);
1040
1041 CredentialStore cs = (CredentialStore)job.getSettingsHandler().
1042 getOrder().getAttribute(CredentialStore.ATTR_NAME);
1043 for (Iterator i = cs.iterator(null); i.hasNext();) {
1044 ((Credential)i.next()).setCredentialDomain(null, ROOTURI);
1045 }
1046 h.getJobHandler().startCrawler();
1047 StringBuffer buffer = new StringBuffer();
1048 buffer.append("Heritrix " + Heritrix.getVersion() +
1049 " selftest started.");
1050 buffer.append("\nSelftest first crawls " + selfTestUrl +
1051 " and then runs an analysis.");
1052 buffer.append("\nResult of analysis printed to " +
1053 getHeritrixOut() + " when done.");
1054 buffer.append("\nSelftest job directory for logs and arcs:\n" +
1055 job.getDirectory().getAbsolutePath());
1056 return buffer.toString();
1057 }
1058
1059 /***
1060 * Launch the crawler without a web UI and run the passed crawl only.
1061 *
1062 * Specialized version of {@link #launch()}.
1063 *
1064 * @param crawlOrderFile The crawl order to crawl.
1065 * @throws InitializationException
1066 * @throws InvalidAttributeValueException
1067 * @return Status string.
1068 */
1069 protected String doOneCrawl(String crawlOrderFile)
1070 throws InitializationException, InvalidAttributeValueException {
1071 return doOneCrawl(crawlOrderFile, null);
1072 }
1073
1074 /***
1075 * Launch the crawler without a web UI and run passed crawl only.
1076 *
1077 * Specialized version of {@link #launch()}.
1078 *
1079 * @param crawlOrderFile The crawl order to crawl.
1080 * @param listener Register this crawl status listener before starting
1081 * crawl (You can use this listener to notice end-of-crawl).
1082 * @throws InitializationException
1083 * @throws InvalidAttributeValueException
1084 * @return Status string.
1085 */
1086 protected String doOneCrawl(String crawlOrderFile,
1087 CrawlStatusListener listener)
1088 throws InitializationException, InvalidAttributeValueException {
1089 XMLSettingsHandler handler =
1090 new XMLSettingsHandler(new File(crawlOrderFile));
1091 handler.initialize();
1092 CrawlController controller = new CrawlController();
1093 controller.initialize(handler);
1094 if (listener != null) {
1095 controller.addCrawlStatusListener(listener);
1096 }
1097 controller.requestCrawlStart();
1098 return "Crawl started using " + crawlOrderFile + ".";
1099 }
1100
1101 /***
1102 * Launch the crawler for a web UI.
1103 *
1104 * Crawler hangs around waiting on jobs.
1105 *
1106 * @exception Exception
1107 * @return A status string describing how the launch went.
1108 * @throws Exception
1109 */
1110 public String launch() throws Exception {
1111 return launch(null, false);
1112 }
1113
1114 /***
1115 * Launch the crawler for a web UI.
1116 *
1117 * Crawler hangs around waiting on jobs.
1118 *
1119 * @param crawlOrderFile File to crawl. May be null.
1120 * @param runMode Whether crawler should be set to run mode.
1121 *
1122 * @exception Exception
1123 * @return A status string describing how the launch went.
1124 */
1125 public String launch(String crawlOrderFile, boolean runMode)
1126 throws Exception {
1127 String status = null;
1128 if (crawlOrderFile != null) {
1129 addCrawlJob(crawlOrderFile, "Autolaunched", "", "");
1130 if(runMode) {
1131 this.jobHandler.startCrawler();
1132 status = "Job being crawled: " + crawlOrderFile;
1133 } else {
1134 status = "Crawl job ready and pending: " + crawlOrderFile;
1135 }
1136 } else if(runMode) {
1137
1138
1139
1140 this.jobHandler.startCrawler();
1141 status = "Crawler set to run mode.";
1142 }
1143 return status;
1144 }
1145
1146 /***
1147 * Start up the embedded Jetty webserver instance.
1148 * This is done when we're run from the command-line.
1149 * @param port Port number to use for web UI.
1150 * @param adminLoginPassword Compound of login and password.
1151 * @throws Exception
1152 * @return Status on webserver startup.
1153 * @deprecated Use startEmbeddedWebserver(hosts, port, adminLoginPassword)
1154 */
1155 protected static String startEmbeddedWebserver(final int port,
1156 final boolean lho, final String adminLoginPassword)
1157 throws Exception {
1158 ArrayList<String> hosts = new ArrayList<String>();
1159 if (lho) {
1160 hosts.add("127.0.0.1");
1161 }
1162 return startEmbeddedWebserver(hosts, port, adminLoginPassword);
1163 }
1164
1165
1166 /***
1167 * Parses a list of host names.
1168 *
1169 * <p>If the given string is <code>/</code>, then an empty
1170 * collection is returned. This indicates that all available network
1171 * interfaces should be used.
1172 *
1173 * <p>Otherwise, the string must contain a comma-separated list of
1174 * IP addresses or host names. The parsed list is then returned.
1175 *
1176 * @param hosts the string to parse
1177 * @return the parsed collection of hosts
1178 */
1179 private static Collection<String> parseHosts(String hosts) {
1180 hosts = hosts.trim();
1181 if (hosts.equals("/")) {
1182 return new ArrayList<String>(1);
1183 }
1184 String[] hostArray = hosts.split(",");
1185 for (int i = 0; i < hostArray.length; i++) {
1186 hostArray[i] = hostArray[i].trim();
1187 }
1188 return Arrays.asList(hostArray);
1189 }
1190
1191 /***
1192 * Start up the embedded Jetty webserver instance.
1193 * This is done when we're run from the command-line.
1194 *
1195 * @param hosts a list of IP addresses or hostnames to bind to, or an
1196 * empty collection to bind to all available network
1197 * interfaces
1198 * @param port Port number to use for web UI.
1199 * @param adminLoginPassword Compound of login and password.
1200 * @throws Exception
1201 * @return Status on webserver startup.
1202 */
1203 protected static String startEmbeddedWebserver(Collection<String> hosts,
1204 int port, String adminLoginPassword)
1205 throws Exception {
1206 adminUsername = adminLoginPassword.
1207 substring(0, adminLoginPassword.indexOf(":"));
1208 adminPassword = adminLoginPassword.
1209 substring(adminLoginPassword.indexOf(":") + 1);
1210 Heritrix.httpServer = new SimpleHttpServer("admin",
1211 Heritrix.adminContext, hosts, port, false);
1212
1213 final String DOTWAR = ".war";
1214 final String SELFTEST = "selftest";
1215
1216
1217 File[] wars = getWarsdir().listFiles();
1218 for(int i = 0; i < wars.length; i++) {
1219 if(wars[i].isFile()) {
1220 final String warName = wars[i].getName();
1221 final String warNameNC = warName.toLowerCase();
1222 if(warNameNC.endsWith(DOTWAR) &&
1223 !warNameNC.equals(ADMIN + DOTWAR) &&
1224 !warNameNC.equals(SELFTEST + DOTWAR)) {
1225 int dot = warName.indexOf('.');
1226 Heritrix.httpServer.addWebapp(warName.substring(0, dot),
1227 null, true);
1228 }
1229 }
1230 }
1231
1232
1233
1234 final String ROLE = ADMIN;
1235 Heritrix.httpServer.setAuthentication(ROLE, Heritrix.adminContext,
1236 adminUsername, adminPassword, ROLE);
1237 Heritrix.httpServer.startServer();
1238 StringBuffer buffer = new StringBuffer();
1239 buffer.append("Heritrix " + Heritrix.getVersion() + " is running.");
1240 for (String host: httpServer.getHosts()) {
1241 buffer.append("\nWeb console is at: http://");
1242 buffer.append(host).append(':').append(port);
1243 }
1244 buffer.append("\nWeb console login and password: " +
1245 adminUsername + "/" + adminPassword);
1246 return buffer.toString();
1247 }
1248
1249 /***
1250 * Replace existing administrator login info with new info.
1251 *
1252 * @param newUsername new administrator login username
1253 * @param newPassword new administrator login password
1254 */
1255 public static void resetAuthentication(String newUsername,
1256 String newPassword) {
1257 Heritrix.httpServer.resetAuthentication(ADMIN, adminUsername,
1258 newUsername, newPassword);
1259 adminUsername = newUsername;
1260 adminPassword = newPassword;
1261 logger.info("administrative login changed to "
1262 +newUsername+":"+newPassword);
1263 }
1264
1265 protected static CrawlJob createCrawlJob(CrawlJobHandler handler,
1266 File crawlOrderFile, String name)
1267 throws InvalidAttributeValueException {
1268 XMLSettingsHandler settings = new XMLSettingsHandler(crawlOrderFile);
1269 settings.initialize();
1270 return new CrawlJob(handler.getNextJobUID(), name, settings,
1271 new CrawlJobErrorHandler(Level.SEVERE),
1272 CrawlJob.PRIORITY_HIGH,
1273 crawlOrderFile.getAbsoluteFile().getParentFile());
1274 }
1275
1276 /***
1277 * This method is called when we have an order file to hand that we want
1278 * to base a job on. It leaves the order file in place and just starts up
1279 * a job that uses all the order points to for locations for logs, etc.
1280 * @param orderPathOrUrl Path to an order file or to a seeds file.
1281 * @param name Name to use for this job.
1282 * @param description
1283 * @param seeds
1284 * @return A status string.
1285 * @throws IOException
1286 * @throws FatalConfigurationException
1287 */
1288 public String addCrawlJob(String orderPathOrUrl, String name,
1289 String description, String seeds)
1290 throws IOException, FatalConfigurationException {
1291 if (!UURI.hasScheme(orderPathOrUrl)) {
1292
1293 return addCrawlJob(new File(orderPathOrUrl), name, description,
1294 seeds);
1295 }
1296
1297
1298 URL url = new URL(orderPathOrUrl);
1299
1300
1301
1302
1303 String result = null;
1304 URLConnection connection = url.openConnection();
1305 if (connection instanceof HttpURLConnection) {
1306 result = addCrawlJob(url, (HttpURLConnection)connection, name,
1307 description, seeds);
1308 } else if (connection instanceof FileURLConnection) {
1309 result = addCrawlJob(new File(url.getPath()), name, description,
1310 seeds);
1311 } else {
1312 throw new UnsupportedOperationException("No support for "
1313 + connection);
1314 }
1315
1316 return result;
1317 }
1318
1319 protected String addCrawlJob(final URL url,
1320 final HttpURLConnection connection,
1321 final String name, final String description, final String seeds)
1322 throws IOException, FatalConfigurationException {
1323 connection.connect();
1324
1325 boolean isJar = url.getPath() != null
1326 && url.getPath().toLowerCase().endsWith(JAR_SUFFIX)
1327 || "application/java-archive".equals(connection
1328 .getHeaderField("Content-Type"));
1329
1330 File localFile = File.createTempFile(Heritrix.class.getName(),
1331 isJar? JAR_SUFFIX: null, TMPDIR);
1332 String result = null;
1333 try {
1334 IoUtils.readFullyToFile(connection.getInputStream(), localFile);
1335 result = addCrawlJob(localFile, name, description, seeds);
1336 } catch (IOException ioe) {
1337
1338 localFile.delete();
1339 localFile = null;
1340 } finally {
1341 connection.disconnect();
1342
1343
1344
1345 if (isJar && localFile != null && localFile.exists()) {
1346 localFile.delete();
1347 }
1348 }
1349 return result;
1350 }
1351
1352 protected String addCrawlJob(final File order, final String name,
1353 final String description, final String seeds)
1354 throws FatalConfigurationException, IOException {
1355 CrawlJob addedJob = null;
1356 if (this.jobHandler == null) {
1357 throw new NullPointerException("Heritrix jobhandler is null.");
1358 }
1359 try {
1360 if (order.getName().toLowerCase().endsWith(JAR_SUFFIX)) {
1361 return addCrawlJobBasedonJar(order, name, description, seeds);
1362 }
1363 addedJob = this.jobHandler.
1364 addJob(createCrawlJob(this.jobHandler, order, name));
1365 } catch (InvalidAttributeValueException e) {
1366 FatalConfigurationException fce = new FatalConfigurationException(
1367 "Converted InvalidAttributeValueException on " +
1368 order.getAbsolutePath() + ": " + e.getMessage());
1369 fce.setStackTrace(e.getStackTrace());
1370 }
1371 return addedJob != null? addedJob.getUID(): null;
1372 }
1373
1374 /***
1375 * Undo jar file and use as basis for a new job.
1376 * @param jarFile Pointer to file that holds jar.
1377 * @param name Name to use for new job.
1378 * @param description
1379 * @param seeds
1380 * @return Message.
1381 * @throws IOException
1382 * @throws FatalConfigurationException
1383 */
1384 protected String addCrawlJobBasedonJar(final File jarFile,
1385 final String name, final String description, final String seeds)
1386 throws IOException, FatalConfigurationException {
1387 if (jarFile == null || !jarFile.exists()) {
1388 throw new FileNotFoundException(jarFile.getAbsolutePath());
1389 }
1390
1391
1392
1393
1394 File dir = File.createTempFile(Heritrix.class.getName(), ".expandedjar",
1395 TMPDIR);
1396 dir.delete();
1397 dir.mkdir();
1398 try {
1399 org.archive.crawler.util.IoUtils.unzip(jarFile, dir);
1400
1401 File orderFile = new File(dir, "order.xml");
1402 if (!orderFile.exists()) {
1403 throw new IOException("Missing order: " +
1404 orderFile.getAbsolutePath());
1405 }
1406 CrawlJob job =
1407 createCrawlJobBasedOn(orderFile, name, description, seeds);
1408
1409
1410 FileUtils.copyFiles(dir,
1411 new FilenameFilter() {
1412 public boolean accept(File dir, String name) {
1413 return !name.equals("order.xml") && !name.equals("settings");
1414 }
1415 },
1416 job.getDirectory(), false, true);
1417 addCrawlJob(job);
1418 return job.getUID();
1419 } catch (RuntimeException e) {
1420 logger.severe("problem adding crawl job from order jar " + jarFile + ": " + e);
1421 throw new FatalConfigurationException(e.toString());
1422 } finally {
1423
1424
1425
1426
1427
1428 org.archive.util.FileUtils.deleteDir(dir);
1429 }
1430 }
1431
1432 public String addCrawlJobBasedOn(String jobUidOrProfile,
1433 String name, String description, String seeds) {
1434 try {
1435 CrawlJob cj = getJobHandler().getJob(jobUidOrProfile);
1436 if (cj == null) {
1437 throw new InvalidAttributeValueException(jobUidOrProfile +
1438 " is not a job UID or profile name (Job UIDs are " +
1439 " usually the 14 digit date portion of job name).");
1440 }
1441 CrawlJob job = addCrawlJobBasedOn(
1442 cj.getSettingsHandler().getOrderFile(), name, description,
1443 seeds);
1444 return job.getUID();
1445 } catch (Exception e) {
1446 e.printStackTrace();
1447 return "Exception on " + jobUidOrProfile + ": " + e.getMessage();
1448 }
1449 }
1450
1451 protected CrawlJob addCrawlJobBasedOn(final File orderFile,
1452 final String name, final String description, final String seeds)
1453 throws FatalConfigurationException {
1454 return addCrawlJob(createCrawlJobBasedOn(orderFile, name, description,
1455 seeds));
1456 }
1457
1458 protected CrawlJob createCrawlJobBasedOn(final File orderFile,
1459 final String name, final String description, final String seeds)
1460 throws FatalConfigurationException {
1461 CrawlJob job = getJobHandler().newJob(orderFile, name, description,
1462 seeds);
1463 return CrawlJobHandler.ensureNewJobWritten(job, name, description);
1464 }
1465
1466 protected CrawlJob addCrawlJob(final CrawlJob job) {
1467 return getJobHandler().addJob(job);
1468 }
1469
1470 public void startCrawling() {
1471 if (getJobHandler() == null) {
1472 throw new NullPointerException("Heritrix jobhandler is null.");
1473 }
1474 getJobHandler().startCrawler();
1475 }
1476
1477 public void stopCrawling() {
1478 if (getJobHandler() == null) {
1479 throw new NullPointerException("Heritrix jobhandler is null.");
1480 }
1481 getJobHandler().stopCrawler();
1482 }
1483
1484 /***
1485 * Get the heritrix version.
1486 *
1487 * @return The heritrix version. May be null.
1488 */
1489 public static String getVersion() {
1490 return System.getProperty("heritrix.version");
1491 }
1492
1493 /***
1494 * Get the job handler
1495 *
1496 * @return The CrawlJobHandler being used.
1497 */
1498 public CrawlJobHandler getJobHandler() {
1499 return this.jobHandler;
1500 }
1501
1502 /***
1503 * Get the configuration directory.
1504 * @return The conf directory under HERITRIX_HOME or null if none can
1505 * be found.
1506 * @throws IOException
1507 */
1508 public static File getConfdir()
1509 throws IOException {
1510 return getConfdir(true);
1511 }
1512
1513 /***
1514 * Get the configuration directory.
1515 * @param fail Throw IOE if can't find directory if true, else just
1516 * return null.
1517 * @return The conf directory under HERITRIX_HOME or null (or an IOE) if
1518 * can't be found.
1519 * @throws IOException
1520 */
1521 public static File getConfdir(final boolean fail)
1522 throws IOException {
1523 final String key = "heritrix.conf";
1524
1525 String tmp = System.getProperty(key);
1526
1527 if (tmp == null || tmp.length() == 0) {
1528 return getSubDir("conf", fail);
1529 }
1530 File dir = new File(tmp);
1531 if (!dir.exists()) {
1532 if (fail) {
1533 throw new IOException("Cannot find conf dir: " + tmp);
1534 } else {
1535 logger.log(Level.WARNING, "Specified " + key +
1536 " dir does not exist. Falling back on default");
1537 }
1538 dir = getSubDir("conf", fail);
1539 }
1540 return dir;
1541 }
1542
1543 /***
1544 * @return Returns the httpServer. May be null if one was not started.
1545 */
1546 public static SimpleHttpServer getHttpServer() {
1547 return Heritrix.httpServer;
1548 }
1549
1550 /***
1551 * @throws IOException
1552 * @return Returns the directory under which reside the WAR files
1553 * we're to load into the servlet container.
1554 */
1555 public static File getWarsdir()
1556 throws IOException {
1557 return getSubDir("webapps");
1558 }
1559
1560 /***
1561 * Prepars for program shutdown. This method does it's best to prepare the
1562 * program so that it can exit normally. It will kill the httpServer and
1563 * terminate any running job.<br>
1564 * It is advisible to wait a few (~1000) millisec after calling this method
1565 * and before calling performHeritrixShutDown() to allow as many threads as
1566 * possible to finish what they are doing.
1567 */
1568 public static void prepareHeritrixShutDown() {
1569
1570
1571
1572 final Object [] keys = Heritrix.instances.keySet().toArray();
1573 for (int i = 0; i < keys.length; i++) {
1574 ((Heritrix)Heritrix.instances.get(keys[i])).destroy();
1575 }
1576
1577 try {
1578 deregisterJndi(getJndiContainerName());
1579 } catch (NameNotFoundException e) {
1580
1581 logger.log(Level.WARNING, "deregistration of jndi", e);
1582 } catch (Exception e) {
1583 e.printStackTrace();
1584 }
1585
1586 if(Heritrix.httpServer != null) {
1587
1588 try {
1589 Heritrix.httpServer.stopServer();
1590 } catch (InterruptedException e) {
1591
1592
1593 e.printStackTrace();
1594 } finally {
1595 Heritrix.httpServer = null;
1596 }
1597 }
1598 }
1599
1600 /***
1601 * Exit program. Recommended that prepareHeritrixShutDown() be invoked
1602 * prior to this method.
1603 */
1604 public static void performHeritrixShutDown() {
1605 performHeritrixShutDown(0);
1606 }
1607
1608 /***
1609 * Exit program. Recommended that prepareHeritrixShutDown() be invoked
1610 * prior to this method.
1611 *
1612 * @param exitCode Code to pass System.exit.
1613 *
1614 */
1615 public static void performHeritrixShutDown(int exitCode) {
1616 System.exit(exitCode);
1617 }
1618
1619 /***
1620 * Shutdown all running heritrix instances and the JVM.
1621 * Assumes stop has already been called.
1622 * @param exitCode Exit code to pass system exit.
1623 */
1624 public static void shutdown(final int exitCode) {
1625 getShutdownThread(true, exitCode, "Heritrix shutdown").start();
1626 }
1627
1628 protected static Thread getShutdownThread(final boolean sysexit,
1629 final int exitCode, final String name) {
1630 Thread t = new Thread(name) {
1631 public void run() {
1632 Heritrix.prepareHeritrixShutDown();
1633 if (sysexit) {
1634 Heritrix.performHeritrixShutDown(exitCode);
1635 }
1636 }
1637 };
1638 t.setDaemon(true);
1639 return t;
1640 }
1641
1642 public static void shutdown() {
1643 shutdown(0);
1644 }
1645
1646 /***
1647 * Register Heritrix with JNDI, JMX, and with the static hashtable of all
1648 * Heritrix instances known to this JVM.
1649 *
1650 * If launched from cmdline, register Heritrix MBean if an agent to register
1651 * ourselves with. Usually this method will only have effect if we're
1652 * running in a 1.5.0 JDK and command line options such as
1653 * '-Dcom.sun.management.jmxremote.port=8082
1654 * -Dcom.sun.management.jmxremote.authenticate=false
1655 * -Dcom.sun.management.jmxremote.ssl=false' are supplied.
1656 * See <a href="http://java.sun.com/j2se/1.5.0/docs/guide/management/agent.html">Monitoring
1657 * and Management Using JMX</a>
1658 * for more on the command line options and how to connect to the
1659 * Heritrix bean using the JDK 1.5.0 jconsole tool. We register currently
1660 * with first server we find (TODO: Make configurable).
1661 *
1662 * <p>If we register successfully with a JMX agent, then part of the
1663 * registration will include our registering ourselves with JNDI.
1664 *
1665 * <p>Finally, add the heritrix instance to the hashtable of all the
1666 * Heritrix instances floating in the current VM. This latter registeration
1667 * happens whether or no there is a JMX agent to register with. This is
1668 * a list we keep out of convenience so its easy iterating over all
1669 * all instances calling stop when main application is going down.
1670 *
1671 * @param h Instance of heritrix to register.
1672 * @param name Name to use for this Heritrix instance.
1673 * @param jmxregister True if we are to register this instance with JMX.
1674 * @throws NullPointerException
1675 * @throws MalformedObjectNameException
1676 * @throws NotCompliantMBeanException
1677 * @throws MBeanRegistrationException
1678 * @throws InstanceAlreadyExistsException
1679 */
1680 protected static void registerHeritrix(final Heritrix h,
1681 final String name, final boolean jmxregister)
1682 throws MalformedObjectNameException, InstanceAlreadyExistsException,
1683 MBeanRegistrationException, NotCompliantMBeanException {
1684 MBeanServer server = getMBeanServer();
1685 if (server != null) {
1686
1687
1688
1689 if (jmxregister) {
1690 ObjectName objName = (name == null || name.length() <= 0)?
1691 getJmxObjectName(): getJmxObjectName(name);
1692 registerMBean(server, h, objName);
1693 }
1694 } else {
1695
1696
1697
1698
1699 Heritrix.instances.put(h.getNoJmxName(), h);
1700 }
1701 }
1702
1703 protected static void unregisterHeritrix(final Heritrix h)
1704 throws InstanceNotFoundException, MBeanRegistrationException,
1705 NullPointerException {
1706 MBeanServer server = getMBeanServer();
1707 if (server != null) {
1708 server.unregisterMBean(h.mbeanName);
1709 } else {
1710
1711
1712 Heritrix.instances.remove(h.getNoJmxName());
1713 }
1714 }
1715
1716 /***
1717 * Get MBeanServer.
1718 * Currently uses first MBeanServer found. This will definetly not be whats
1719 * always wanted. TODO: Make which server settable. Also, if none, put up
1720 * our own MBeanServer.
1721 * @return An MBeanServer to register with or null.
1722 */
1723 public static MBeanServer getMBeanServer() {
1724 MBeanServer result = null;
1725 List servers = MBeanServerFactory.findMBeanServer(null);
1726 if (servers == null) {
1727 return result;
1728 }
1729 for (Iterator i = servers.iterator(); i.hasNext();) {
1730 MBeanServer server = (MBeanServer)i.next();
1731 if (server == null) {
1732 continue;
1733 }
1734 result = server;
1735 break;
1736 }
1737 return result;
1738 }
1739
1740 public static MBeanServer registerMBean(final Object objToRegister,
1741 final String name, final String type)
1742 throws InstanceAlreadyExistsException, MBeanRegistrationException,
1743 NotCompliantMBeanException {
1744 MBeanServer server = getMBeanServer();
1745 if (server != null) {
1746 server = registerMBean(server, objToRegister, name, type);
1747 }
1748 return server;
1749 }
1750
1751 public static MBeanServer registerMBean(final MBeanServer server,
1752 final Object objToRegister, final String name, final String type)
1753 throws InstanceAlreadyExistsException, MBeanRegistrationException,
1754 NotCompliantMBeanException {
1755 try {
1756 Hashtable<String,String> ht = new Hashtable<String,String>();
1757 ht.put(JmxUtils.NAME, name);
1758 ht.put(JmxUtils.TYPE, type);
1759 registerMBean(server, objToRegister,
1760 new ObjectName(CRAWLER_PACKAGE, ht));
1761 } catch (MalformedObjectNameException e) {
1762 e.printStackTrace();
1763 }
1764 return server;
1765 }
1766
1767 public static MBeanServer registerMBean(final MBeanServer server,
1768 final Object objToRegister, final ObjectName objName)
1769 throws InstanceAlreadyExistsException, MBeanRegistrationException,
1770 NotCompliantMBeanException {
1771 server.registerMBean(objToRegister, objName);
1772 return server;
1773 }
1774
1775 public static void unregisterMBean(final MBeanServer server,
1776 final String name, final String type) {
1777 if (server == null) {
1778 return;
1779 }
1780 try {
1781 unregisterMBean(server, getJmxObjectName(name, type));
1782 } catch (MalformedObjectNameException e) {
1783 e.printStackTrace();
1784 }
1785 }
1786
1787 public static void unregisterMBean(final MBeanServer server,
1788 final ObjectName name) {
1789 try {
1790 server.unregisterMBean(name);
1791 logger.info("Unregistered bean " + name.getCanonicalName());
1792 } catch (InstanceNotFoundException e) {
1793 e.printStackTrace();
1794 } catch (MBeanRegistrationException e) {
1795 e.printStackTrace();
1796 } catch (NullPointerException e) {
1797 e.printStackTrace();
1798 }
1799 }
1800
1801 /***
1802 * @return Name to use when no JMX agent available.
1803 */
1804 protected String getNoJmxName() {
1805 return this.getClass().getName();
1806 }
1807
1808 public static ObjectName getJmxObjectName()
1809 throws MalformedObjectNameException, NullPointerException {
1810 return getJmxObjectName("Heritrix", JmxUtils.SERVICE);
1811 }
1812
1813 public static ObjectName getJmxObjectName(final String name)
1814 throws MalformedObjectNameException, NullPointerException {
1815 return getJmxObjectName(name, JmxUtils.SERVICE);
1816 }
1817
1818 public static ObjectName getJmxObjectName(final String name,
1819 final String type)
1820 throws MalformedObjectNameException, NullPointerException {
1821 Hashtable<String,String> ht = new Hashtable<String,String>();
1822 ht.put(JmxUtils.NAME, name);
1823 ht.put(JmxUtils.TYPE, type);
1824 return new ObjectName(CRAWLER_PACKAGE, ht);
1825 }
1826
1827 /***
1828 * @return Returns true if Heritrix was launched from the command line.
1829 * (When launched from command line, we do stuff like put up a web server
1830 * to manage our web interface and we register ourselves with the first
1831 * available jmx agent).
1832 */
1833 public static boolean isCommandLine() {
1834 return Heritrix.commandLine;
1835 }
1836
1837 /***
1838 * @return True if heritrix has been started.
1839 */
1840 public boolean isStarted() {
1841 return this.jobHandler != null;
1842 }
1843
1844 public String getStatus() {
1845 StringBuffer buffer = new StringBuffer();
1846 if (this.getJobHandler() != null) {
1847 buffer.append("isRunning=");
1848 buffer.append(this.getJobHandler().isRunning());
1849 buffer.append(" isCrawling=");
1850 buffer.append(this.getJobHandler().isCrawling());
1851 buffer.append(" alertCount=");
1852 buffer.append(getAlertsCount());
1853 buffer.append(" newAlertCount=");
1854 buffer.append(getNewAlertsCount());
1855 if (this.getJobHandler().isCrawling()) {
1856 buffer.append(" currentJob=");
1857 buffer.append(this.getJobHandler().getCurrentJob().
1858 getJmxJobName());
1859 }
1860 }
1861 return buffer.toString();
1862 }
1863
1864
1865 public int getAlertsCount() {
1866 return this.alertManager.getCount();
1867 }
1868
1869 public int getNewAlertsCount() {
1870 return this.alertManager.getNewCount();
1871 }
1872
1873 public Vector getAlerts() {
1874 return this.alertManager.getAll();
1875 }
1876
1877 public Vector getNewAlerts() {
1878 return this.alertManager.getNewAll();
1879 }
1880
1881 public SinkHandlerLogRecord getAlert(final String id) {
1882 return this.alertManager.get(id);
1883 }
1884
1885 public void readAlert(final String id) {
1886 this.alertManager.read(id);
1887 }
1888
1889 public void removeAlert(final String id) {
1890 this.alertManager.remove(id);
1891 }
1892
1893 /***
1894 * Start Heritrix.
1895 *
1896 * Used by JMX and webapp initialization for starting Heritrix.
1897 * Not by the cmdline launched Heritrix. Idempotent.
1898 * If start is called by JMX, then new instance of Heritrix is automatically
1899 * registered w/ JMX Agent. If started by webapp, need to register the new
1900 * Heritrix instance.
1901 */
1902 public void start() {
1903
1904
1905 if (!Heritrix.isCommandLine() && !isStarted()) {
1906 try {
1907 logger.info(launch());
1908 } catch (Exception e) {
1909 e.printStackTrace();
1910 }
1911 }
1912 }
1913
1914 /***
1915 * Stop Heritrix.
1916 *
1917 * Used by JMX and webapp initialization for stopping Heritrix.
1918 */
1919 public void stop() {
1920 if (this.jobHandler != null) {
1921 this.jobHandler.stop();
1922 }
1923 }
1924
1925 public String interrupt(String threadName) {
1926 String result = "Thread " + threadName + " not found";
1927 ThreadGroup group = Thread.currentThread().getThreadGroup();
1928 if (group == null) {
1929 return result;
1930 }
1931
1932
1933 ThreadGroup parent = null;
1934 while((parent = group.getParent()) != null) {
1935 group = parent;
1936 }
1937
1938
1939 final int max = group.activeCount() * 2;
1940 Thread [] threads = new Thread[max];
1941 int threadCount = group.enumerate(threads, true);
1942 if (threadCount >= max) {
1943 logger.info("Some threads not found...array too small: " +
1944 max);
1945 }
1946 for (int j = 0; j < threadCount; j++) {
1947 if (threads[j].getName().equals(threadName)) {
1948 threads[j].interrupt();
1949 result = "Interrupt sent to " + threadName;
1950 break;
1951 }
1952 }
1953 return result;
1954 }
1955
1956
1957
1958 /***
1959 * Build up the MBean info for Heritrix main.
1960 * @return Return created mbean info instance.
1961 */
1962 protected OpenMBeanInfoSupport buildMBeanInfo() {
1963 OpenMBeanAttributeInfoSupport[] attributes =
1964 new OpenMBeanAttributeInfoSupport[Heritrix.ATTRIBUTE_LIST.size()];
1965 OpenMBeanConstructorInfoSupport[] constructors =
1966 new OpenMBeanConstructorInfoSupport[1];
1967 OpenMBeanOperationInfoSupport[] operations =
1968 new OpenMBeanOperationInfoSupport[Heritrix.OPERATION_LIST.size()];
1969 MBeanNotificationInfo[] notifications =
1970 new MBeanNotificationInfo[0];
1971
1972
1973 attributes[0] =
1974 new OpenMBeanAttributeInfoSupport(Heritrix.STATUS_ATTR,
1975 "Short basic status message", SimpleType.STRING, true,
1976 false, false);
1977
1978 attributes[1] =
1979 new OpenMBeanAttributeInfoSupport(Heritrix.VERSION_ATTR,
1980 "Heritrix version", SimpleType.STRING, true, false, false);
1981
1982 attributes[2] =
1983 new OpenMBeanAttributeInfoSupport(Heritrix.ISRUNNING_ATTR,
1984 "Whether the crawler is running", SimpleType.BOOLEAN, true,
1985 false, false);
1986
1987 attributes[3] =
1988 new OpenMBeanAttributeInfoSupport(Heritrix.ISCRAWLING_ATTR,
1989 "Whether the crawler is crawling", SimpleType.BOOLEAN, true,
1990 false, false);
1991
1992 attributes[4] =
1993 new OpenMBeanAttributeInfoSupport(Heritrix.ALERTCOUNT_ATTR,
1994 "The number of alerts", SimpleType.INTEGER, true, false, false);
1995
1996 attributes[5] =
1997 new OpenMBeanAttributeInfoSupport(Heritrix.NEWALERTCOUNT_ATTR,
1998 "The number of new alerts", SimpleType.INTEGER, true, false,
1999 false);
2000
2001 attributes[6] =
2002 new OpenMBeanAttributeInfoSupport(Heritrix.CURRENTJOB_ATTR,
2003 "The name of the job currently being crawled",
2004 SimpleType.STRING, true, false, false);
2005
2006
2007 constructors[0] = new OpenMBeanConstructorInfoSupport(
2008 "HeritrixOpenMBean", "Constructs Heritrix OpenMBean instance ",
2009 new OpenMBeanParameterInfoSupport[0]);
2010
2011
2012 operations[0] = new OpenMBeanOperationInfoSupport(
2013 Heritrix.START_OPER, "Start Heritrix instance", null,
2014 SimpleType.VOID, MBeanOperationInfo.ACTION);
2015
2016 operations[1] = new OpenMBeanOperationInfoSupport(
2017 Heritrix.STOP_OPER, "Stop Heritrix instance", null,
2018 SimpleType.VOID, MBeanOperationInfo.ACTION);
2019
2020 OpenMBeanParameterInfo[] args = new OpenMBeanParameterInfoSupport[1];
2021 args[0] = new OpenMBeanParameterInfoSupport("threadName",
2022 "Name of thread to send interrupt", SimpleType.STRING);
2023 operations[2] = new OpenMBeanOperationInfoSupport(
2024 Heritrix.INTERRUPT_OPER, "Send thread an interrupt " +
2025 "(Used debugging)", args, SimpleType.STRING,
2026 MBeanOperationInfo.ACTION_INFO);
2027
2028 operations[3] = new OpenMBeanOperationInfoSupport(
2029 Heritrix.START_CRAWLING_OPER, "Set Heritrix instance " +
2030 "into crawling mode", null, SimpleType.VOID,
2031 MBeanOperationInfo.ACTION);
2032
2033 operations[4] = new OpenMBeanOperationInfoSupport(
2034 Heritrix.STOP_CRAWLING_OPER, "Unset Heritrix instance " +
2035 " crawling mode", null, SimpleType.VOID,
2036 MBeanOperationInfo.ACTION);
2037
2038 args = new OpenMBeanParameterInfoSupport[4];
2039 args[0] = new OpenMBeanParameterInfoSupport("pathOrURL",
2040 "Path/URL to order or jar of order+seed",
2041 SimpleType.STRING);
2042 args[1] = new OpenMBeanParameterInfoSupport("name",
2043 "Basename for new job", SimpleType.STRING);
2044 args[2] = new OpenMBeanParameterInfoSupport("description",
2045 "Description to save with new job", SimpleType.STRING);
2046 args[3] = new OpenMBeanParameterInfoSupport("seeds",
2047 "Initial seed(s)", SimpleType.STRING);
2048 operations[5] = new OpenMBeanOperationInfoSupport(
2049 Heritrix.ADD_CRAWL_JOB_OPER, "Add new crawl job", args,
2050 SimpleType.STRING, MBeanOperationInfo.ACTION_INFO);
2051
2052 args = new OpenMBeanParameterInfoSupport[4];
2053 args[0] = new OpenMBeanParameterInfoSupport("uidOrName",
2054 "Job UID or profile name", SimpleType.STRING);
2055 args[1] = new OpenMBeanParameterInfoSupport("name",
2056 "Basename for new job", SimpleType.STRING);
2057 args[2] = new OpenMBeanParameterInfoSupport("description",
2058 "Description to save with new job", SimpleType.STRING);
2059 args[3] = new OpenMBeanParameterInfoSupport("seeds",
2060 "Initial seed(s)", SimpleType.STRING);
2061 operations[6] = new OpenMBeanOperationInfoSupport(
2062 Heritrix.ADD_CRAWL_JOB_BASEDON_OPER,
2063 "Add a new crawl job based on passed Job UID or profile",
2064 args, SimpleType.STRING, MBeanOperationInfo.ACTION_INFO);
2065
2066 args = new OpenMBeanParameterInfoSupport[1];
2067 args[0] = new OpenMBeanParameterInfoSupport("UID",
2068 "Job UID", SimpleType.STRING);
2069 operations[7] = new OpenMBeanOperationInfoSupport(DELETE_CRAWL_JOB_OPER,
2070 "Delete/stop this crawl job", args, SimpleType.VOID,
2071 MBeanOperationInfo.ACTION);
2072
2073 args = new OpenMBeanParameterInfoSupport[1];
2074 args[0] = new OpenMBeanParameterInfoSupport("index",
2075 "Zero-based index into array of alerts", SimpleType.INTEGER);
2076 operations[8] = new OpenMBeanOperationInfoSupport(
2077 Heritrix.ALERT_OPER, "Return alert at passed index", args,
2078 SimpleType.STRING, MBeanOperationInfo.ACTION_INFO);
2079
2080 try {
2081 this.jobCompositeType = new CompositeType("job",
2082 "Job attributes", JOB_KEYS,
2083 new String [] {"Job unique ID", "Job name", "Job status"},
2084 new OpenType [] {SimpleType.STRING, SimpleType.STRING,
2085 SimpleType.STRING});
2086 this.jobsTabularType = new TabularType("jobs", "List of jobs",
2087 this.jobCompositeType, new String [] {"uid"});
2088 } catch (OpenDataException e) {
2089
2090 throw new RuntimeException(e);
2091 }
2092 operations[9] = new OpenMBeanOperationInfoSupport(
2093 Heritrix.PENDING_JOBS_OPER,
2094 "List of pending jobs (or null if none)", null,
2095 this.jobsTabularType, MBeanOperationInfo.INFO);
2096 operations[10] = new OpenMBeanOperationInfoSupport(
2097 Heritrix.COMPLETED_JOBS_OPER,
2098 "List of completed jobs (or null if none)", null,
2099 this.jobsTabularType, MBeanOperationInfo.INFO);
2100
2101 args = new OpenMBeanParameterInfoSupport[2];
2102 args[0] = new OpenMBeanParameterInfoSupport("uid",
2103 "Job unique ID", SimpleType.STRING);
2104 args[1] = new OpenMBeanParameterInfoSupport("name",
2105 "Report name (e.g. crawl-report, etc.)",
2106 SimpleType.STRING);
2107 operations[11] = new OpenMBeanOperationInfoSupport(
2108 Heritrix.CRAWLEND_REPORT_OPER, "Return crawl-end report", args,
2109 SimpleType.STRING, MBeanOperationInfo.ACTION_INFO);
2110
2111 operations[12] = new OpenMBeanOperationInfoSupport(
2112 Heritrix.SHUTDOWN_OPER, "Shutdown container", null,
2113 SimpleType.VOID, MBeanOperationInfo.ACTION);
2114
2115 args = new OpenMBeanParameterInfoSupport[2];
2116 args[0] = new OpenMBeanParameterInfoSupport("level",
2117 "Log level: e.g. SEVERE, WARNING, etc.", SimpleType.STRING);
2118 args[1] = new OpenMBeanParameterInfoSupport("message",
2119 "Log message", SimpleType.STRING);
2120 operations[13] = new OpenMBeanOperationInfoSupport(Heritrix.LOG_OPER,
2121 "Add a log message", args, SimpleType.VOID,
2122 MBeanOperationInfo.ACTION);
2123
2124 operations[14] = new OpenMBeanOperationInfoSupport(
2125 Heritrix.DESTROY_OPER, "Destroy Heritrix instance", null,
2126 SimpleType.VOID, MBeanOperationInfo.ACTION);
2127
2128 operations[15] = new OpenMBeanOperationInfoSupport(
2129 Heritrix.TERMINATE_CRAWL_JOB_OPER,
2130 "Returns false if no current job", null, SimpleType.BOOLEAN,
2131 MBeanOperationInfo.ACTION);
2132
2133 operations[16] = new OpenMBeanOperationInfoSupport(
2134 Heritrix.REBIND_JNDI_OPER,
2135 "Rebinds this Heritrix with JNDI.", null,
2136 SimpleType.VOID, MBeanOperationInfo.ACTION);
2137
2138
2139 return new OpenMBeanInfoSupport(this.getClass().getName(),
2140 "Heritrix Main OpenMBean", attributes, constructors, operations,
2141 notifications);
2142 }
2143
2144 public Object getAttribute(String attribute_name)
2145 throws AttributeNotFoundException {
2146 if (attribute_name == null) {
2147 throw new RuntimeOperationsException(
2148 new IllegalArgumentException("Attribute name cannot be null"),
2149 "Cannot call getAttribute with null attribute name");
2150 }
2151 if (!Heritrix.ATTRIBUTE_LIST.contains(attribute_name)) {
2152 throw new AttributeNotFoundException("Attribute " +
2153 attribute_name + " is unimplemented.");
2154 }
2155
2156
2157
2158
2159 if (attribute_name.equals(STATUS_ATTR)) {
2160 return getStatus();
2161 }
2162 if (attribute_name.equals(VERSION_ATTR)) {
2163 return getVersion();
2164 }
2165
2166 if (attribute_name.equals(ISRUNNING_ATTR)) {
2167 return new Boolean(this.getJobHandler().isRunning());
2168 }
2169 if (attribute_name.equals(ISCRAWLING_ATTR)) {
2170 return new Boolean(this.getJobHandler().isCrawling());
2171 }
2172 if (attribute_name.equals(ALERTCOUNT_ATTR)) {
2173 return new Integer(getAlertsCount());
2174 }
2175 if (attribute_name.equals(NEWALERTCOUNT_ATTR)) {
2176 return new Integer(getNewAlertsCount());
2177 }
2178 if (attribute_name.equals(CURRENTJOB_ATTR)) {
2179 if (this.getJobHandler().isCrawling()) {
2180 return this.getJobHandler().getCurrentJob().getJmxJobName();
2181 }
2182 return null;
2183 }
2184 throw new AttributeNotFoundException("Attribute " +
2185 attribute_name + " not found.");
2186 }
2187
2188 public void setAttribute(Attribute attribute)
2189 throws AttributeNotFoundException {
2190 throw new AttributeNotFoundException("No attribute can be set in " +
2191 "this MBean");
2192 }
2193
2194 public AttributeList getAttributes(String [] attributeNames) {
2195 if (attributeNames == null) {
2196 throw new RuntimeOperationsException(
2197 new IllegalArgumentException("attributeNames[] cannot be " +
2198 "null"), "Cannot call getAttributes with null attribute " +
2199 "names");
2200 }
2201 AttributeList resultList = new AttributeList();
2202 if (attributeNames.length == 0) {
2203 return resultList;
2204 }
2205 for (int i = 0; i < attributeNames.length; i++) {
2206 try {
2207 Object value = getAttribute(attributeNames[i]);
2208 resultList.add(new Attribute(attributeNames[i], value));
2209 } catch (Exception e) {
2210 e.printStackTrace();
2211 }
2212 }
2213 return(resultList);
2214 }
2215
2216 public AttributeList setAttributes(AttributeList attributes) {
2217 return new AttributeList();
2218 }
2219
2220 public Object invoke(final String operationName, final Object[] params,
2221 final String[] signature)
2222 throws ReflectionException {
2223 if (operationName == null) {
2224 throw new RuntimeOperationsException(
2225 new IllegalArgumentException("Operation name cannot be null"),
2226 "Cannot call invoke with null operation name");
2227 }
2228
2229 if (logger.isLoggable(Level.INFO)) {
2230
2231 StringBuilder buf = new StringBuilder();
2232 for (Object o : params) {
2233 buf.append("\"" + o + "\", ");
2234 }
2235 logger.info("JMX invoke: " + operationName + "(" + buf + ")");
2236 }
2237
2238
2239
2240
2241 if (operationName.equals(START_OPER)) {
2242 JmxUtils.checkParamsCount(START_OPER, params, 0);
2243 start();
2244 return null;
2245 }
2246 if (operationName.equals(STOP_OPER)) {
2247 JmxUtils.checkParamsCount(STOP_OPER, params, 0);
2248 stop();
2249 return null;
2250 }
2251 if (operationName.equals(DESTROY_OPER)) {
2252 JmxUtils.checkParamsCount(DESTROY_OPER, params, 0);
2253 destroy();
2254 return null;
2255 }
2256 if (operationName.equals(TERMINATE_CRAWL_JOB_OPER)) {
2257 JmxUtils.checkParamsCount(TERMINATE_CRAWL_JOB_OPER, params, 0);
2258 return new Boolean(this.jobHandler.terminateCurrentJob());
2259 }
2260 if (operationName.equals(REBIND_JNDI_OPER)) {
2261 JmxUtils.checkParamsCount(REBIND_JNDI_OPER, params, 0);
2262 try {
2263 registerContainerJndi();
2264 } catch (MalformedObjectNameException e) {
2265 throw new RuntimeOperationsException(new RuntimeException(e));
2266 } catch (UnknownHostException e) {
2267 throw new RuntimeOperationsException(new RuntimeException(e));
2268 } catch (NamingException e) {
2269 throw new RuntimeOperationsException(new RuntimeException(e));
2270 }
2271 return null;
2272 }
2273 if (operationName.equals(SHUTDOWN_OPER)) {
2274 JmxUtils.checkParamsCount(SHUTDOWN_OPER, params, 0);
2275 Heritrix.shutdown();
2276 return null;
2277 }
2278 if (operationName.equals(LOG_OPER)) {
2279 JmxUtils.checkParamsCount(LOG_OPER, params, 2);
2280 logger.log(Level.parse((String)params[0]), (String)params[1]);
2281 return null;
2282 }
2283 if (operationName.equals(INTERRUPT_OPER)) {
2284 JmxUtils.checkParamsCount(INTERRUPT_OPER, params, 1);
2285 return interrupt((String)params[0]);
2286 }
2287 if (operationName.equals(START_CRAWLING_OPER)) {
2288 JmxUtils.checkParamsCount(START_CRAWLING_OPER, params, 0);
2289 startCrawling();
2290 return null;
2291 }
2292 if (operationName.equals(STOP_CRAWLING_OPER)) {
2293 JmxUtils.checkParamsCount(STOP_CRAWLING_OPER, params, 0);
2294 stopCrawling();
2295 return null;
2296 }
2297 if (operationName.equals(ADD_CRAWL_JOB_OPER)) {
2298 JmxUtils.checkParamsCount(ADD_CRAWL_JOB_OPER, params, 4);
2299 try {
2300 return addCrawlJob((String)params[0], (String)params[1],
2301 checkForEmptyPlaceHolder((String)params[2]),
2302 checkForEmptyPlaceHolder((String)params[3]));
2303 } catch (IOException e) {
2304 throw new RuntimeOperationsException(new RuntimeException(e));
2305 } catch (FatalConfigurationException e) {
2306 throw new RuntimeOperationsException(new RuntimeException(e));
2307 }
2308 }
2309 if (operationName.equals(DELETE_CRAWL_JOB_OPER)) {
2310 JmxUtils.checkParamsCount(DELETE_CRAWL_JOB_OPER, params, 1);
2311 this.jobHandler.deleteJob((String)params[0]);
2312 return null;
2313 }
2314
2315 if (operationName.equals(ADD_CRAWL_JOB_BASEDON_OPER)) {
2316 JmxUtils.checkParamsCount(ADD_CRAWL_JOB_BASEDON_OPER, params, 4);
2317 return addCrawlJobBasedOn((String)params[0], (String)params[1],
2318 checkForEmptyPlaceHolder((String)params[2]),
2319 checkForEmptyPlaceHolder((String)params[3]));
2320 }
2321 if (operationName.equals(ALERT_OPER)) {
2322 JmxUtils.checkParamsCount(ALERT_OPER, params, 1);
2323 SinkHandlerLogRecord slr = null;
2324 if (this.alertManager.getCount() > 0) {
2325
2326
2327
2328 slr = (SinkHandlerLogRecord)this.alertManager.getAll().
2329 get(((Integer)params[0]).intValue());
2330 }
2331 return (slr != null)? slr.toString(): null;
2332 }
2333
2334 if (operationName.equals(PENDING_JOBS_OPER)) {
2335 JmxUtils.checkParamsCount(PENDING_JOBS_OPER, params, 0);
2336 try {
2337 return makeJobsTabularData(getJobHandler().getPendingJobs());
2338 } catch (OpenDataException e) {
2339 throw new RuntimeOperationsException(new RuntimeException(e));
2340 }
2341 }
2342
2343 if (operationName.equals(COMPLETED_JOBS_OPER)) {
2344 JmxUtils.checkParamsCount(COMPLETED_JOBS_OPER, params, 0);
2345 try {
2346 return makeJobsTabularData(getJobHandler().getCompletedJobs());
2347 } catch (OpenDataException e) {
2348 throw new RuntimeOperationsException(new RuntimeException(e));
2349 }
2350 }
2351
2352 if (operationName.equals(CRAWLEND_REPORT_OPER)) {
2353 JmxUtils.checkParamsCount(CRAWLEND_REPORT_OPER, params, 2);
2354 try {
2355 return getCrawlendReport((String)params[0], (String) params[1]);
2356 } catch (IOException e) {
2357 throw new RuntimeOperationsException(new RuntimeException(e));
2358 }
2359 }
2360
2361 throw new ReflectionException(
2362 new NoSuchMethodException(operationName),
2363 "Cannot find the operation " + operationName);
2364 }
2365
2366 /***
2367 * Return named crawl end report for job with passed uid.
2368 * Crawler makes reports when its finished its crawl. Use this method
2369 * to get a String version of one of these files.
2370 * @param jobUid The unique ID for the job whose reports you want to see
2371 * (Must be a completed job).
2372 * @param reportName Name of report minus '.txt' (e.g. crawl-report).
2373 * @return String version of the on-disk report.
2374 * @throws IOException
2375 */
2376 protected String getCrawlendReport(String jobUid, String reportName)
2377 throws IOException {
2378 CrawlJob job = getJobHandler().getJob(jobUid);
2379 if (job == null) {
2380 throw new IOException("No such job: " + jobUid);
2381 }
2382 File report = new File(job.getDirectory(), reportName + ".txt");
2383 if (!report.exists()) {
2384 throw new FileNotFoundException(report.getAbsolutePath());
2385 }
2386 return FileUtils.readFileAsString(report);
2387 }
2388
2389 protected TabularData makeJobsTabularData(List jobs)
2390 throws OpenDataException {
2391 if (jobs == null || jobs.size() == 0) {
2392 return null;
2393 }
2394 TabularData td = new TabularDataSupport(this.jobsTabularType);
2395 for (Iterator i = jobs.iterator(); i.hasNext();) {
2396 CrawlJob job = (CrawlJob)i.next();
2397 CompositeData cd = new CompositeDataSupport(this.jobCompositeType,
2398 JOB_KEYS,
2399 new String [] {job.getUID(), job.getJobName(), job.getStatus()});
2400 td.put(cd);
2401 }
2402 return td;
2403 }
2404
2405 /***
2406 * If passed str has placeholder for the empty string, return the empty
2407 * string else return orginal.
2408 * Dumb jmx clients can't pass empty string so they'll pass a representation
2409 * of empty string such as ' ' or '-'. Convert such strings to empty
2410 * string.
2411 * @param str String to check.
2412 * @return Original <code>str</code> or empty string if <code>str</code>
2413 * contains a placeholder for the empty-string (e.g. '-', or ' ').
2414 */
2415 protected String checkForEmptyPlaceHolder(String str) {
2416 return TextUtils.matches("-| +", str)? "": str;
2417 }
2418
2419 public MBeanInfo getMBeanInfo() {
2420 return this.openMBeanInfo;
2421 }
2422
2423 /***
2424 * @return Name this instance registered in JMX (Only available after JMX
2425 * registration).
2426 */
2427 public ObjectName getMBeanName() {
2428 return this.mbeanName;
2429 }
2430
2431 public ObjectName preRegister(MBeanServer server, ObjectName name)
2432 throws Exception {
2433 this.mbeanServer = server;
2434 @SuppressWarnings("unchecked")
2435 Hashtable<String,String> ht = name.getKeyPropertyList();
2436 if (!ht.containsKey(JmxUtils.NAME)) {
2437 throw new IllegalArgumentException("Name property required" +
2438 name.getCanonicalName());
2439 }
2440 if (!ht.containsKey(JmxUtils.TYPE)) {
2441 ht.put(JmxUtils.TYPE, JmxUtils.SERVICE);
2442 name = new ObjectName(name.getDomain(), ht);
2443 }
2444 this.mbeanName = addGuiPort(addVitals(name));
2445 Heritrix.instances.put(this.mbeanName.
2446 getCanonicalKeyPropertyListString(), this);
2447 return this.mbeanName;
2448 }
2449
2450 /***
2451 * Add vital stats to passed in ObjectName.
2452 * @param name ObjectName to add to.
2453 * @return name with host, guiport, and jmxport added.
2454 * @throws UnknownHostException
2455 * @throws MalformedObjectNameException
2456 * @throws NullPointerException
2457 */
2458 protected static ObjectName addVitals(ObjectName name)
2459 throws UnknownHostException, MalformedObjectNameException,
2460 NullPointerException {
2461 @SuppressWarnings("unchecked")
2462 Hashtable<String,String> ht = name.getKeyPropertyList();
2463 if (!ht.containsKey(JmxUtils.HOST)) {
2464 ht.put(JmxUtils.HOST, InetAddress.getLocalHost().getCanonicalHostName());
2465 name = new ObjectName(name.getDomain(), ht);
2466 }
2467 if (!ht.containsKey(JmxUtils.JMX_PORT)) {
2468
2469
2470
2471
2472 String p = System.getProperty("com.sun.management.jmxremote.port");
2473 if (p != null && p.length() > 0) {
2474 ht.put(JmxUtils.JMX_PORT, p);
2475 name = new ObjectName(name.getDomain(), ht);
2476 }
2477 }
2478 return name;
2479 }
2480
2481 protected static ObjectName addGuiPort(ObjectName name)
2482 throws MalformedObjectNameException, NullPointerException {
2483 @SuppressWarnings("unchecked")
2484 Hashtable<String,String> ht = name.getKeyPropertyList();
2485 if (!ht.containsKey(JmxUtils.GUI_PORT)) {
2486
2487 if (Heritrix.gui) {
2488 ht.put(JmxUtils.GUI_PORT, Integer.toString(Heritrix.guiPort));
2489 name = new ObjectName(name.getDomain(), ht);
2490 }
2491 }
2492 return name;
2493 }
2494
2495 public void postRegister(Boolean registrationDone) {
2496 if (logger.isLoggable(Level.INFO)) {
2497 logger.info(
2498 JmxUtils.getLogRegistrationMsg(this.mbeanName.getCanonicalName(),
2499 this.mbeanServer, registrationDone.booleanValue()));
2500 }
2501 try {
2502 registerJndi(this.mbeanName);
2503 } catch (Exception e) {
2504 logger.log(Level.SEVERE, "Failed jndi registration", e);
2505 }
2506 }
2507
2508 public void preDeregister() throws Exception {
2509 deregisterJndi(this.mbeanName);
2510 }
2511
2512 public void postDeregister() {
2513 Heritrix.instances.
2514 remove(this.mbeanName.getCanonicalKeyPropertyListString());
2515 if (logger.isLoggable(Level.INFO)) {
2516 logger.info(JmxUtils.getLogUnregistrationMsg(
2517 this.mbeanName.getCanonicalName(), this.mbeanServer));
2518 }
2519 }
2520
2521 protected static void registerContainerJndi()
2522 throws MalformedObjectNameException, NullPointerException,
2523 UnknownHostException, NamingException {
2524 registerJndi(getJndiContainerName());
2525 }
2526
2527 protected static void registerJndi(final ObjectName name)
2528 throws NullPointerException, NamingException {
2529 Context c = getJndiContext();
2530 if (c == null) {
2531 return;
2532 }
2533 CompoundName key = JndiUtils.bindObjectName(c, name);
2534 if (logger.isLoggable(Level.FINE)) {
2535 logger.fine("Bound '" + key + "' to '" + JndiUtils.
2536 getCompoundName(c.getNameInNamespace()).toString()
2537 + "' jndi context");
2538 }
2539 }
2540
2541 protected static void deregisterJndi(final ObjectName name)
2542 throws NullPointerException, NamingException {
2543 Context c = getJndiContext();
2544 if (c == null) {
2545 return;
2546 }
2547 CompoundName key = JndiUtils.unbindObjectName(c, name);
2548 if (logger.isLoggable(Level.FINE)) {
2549 logger.fine("Unbound '" + key + "' from '" +
2550 JndiUtils.getCompoundName(c.getNameInNamespace()).toString() +
2551 "' jndi context");
2552 }
2553 }
2554
2555 /***
2556 * @return Jndi context for the crawler or null if none found.
2557 * @throws NamingException
2558 */
2559 protected static Context getJndiContext() throws NamingException {
2560 Context c = null;
2561 try {
2562 c = JndiUtils.getSubContext(CRAWLER_PACKAGE);
2563 } catch (NoInitialContextException e) {
2564 logger.fine("No JNDI Context: " + e.toString());
2565 }
2566 return c;
2567 }
2568
2569 /***
2570 * @return Jndi container name -- the name to use for the 'container' that
2571 * can host zero or more heritrix instances (Return a JMX ObjectName. We
2572 * use ObjectName because then we're sync'd with JMX naming and ObjectName
2573 * has nice parsing).
2574 * @throws NullPointerException
2575 * @throws MalformedObjectNameException
2576 * @throws UnknownHostException
2577 */
2578 protected static ObjectName getJndiContainerName()
2579 throws MalformedObjectNameException, NullPointerException,
2580 UnknownHostException {
2581 ObjectName objName = new ObjectName(CRAWLER_PACKAGE, "type",
2582 "container");
2583 return addVitals(objName);
2584 }
2585
2586 /***
2587 * @return Return all registered instances of Heritrix (Rare are there
2588 * more than one).
2589 */
2590 public static Map getInstances() {
2591 return Heritrix.instances;
2592 }
2593
2594 /***
2595 * @return True if only one instance of Heritrix.
2596 */
2597 public static boolean isSingleInstance() {
2598 return Heritrix.instances != null && Heritrix.instances.size() == 1;
2599 }
2600
2601 /***
2602 * @return Returns single instance or null if no instance or multiple.
2603 */
2604 public static Heritrix getSingleInstance() {
2605 return !isSingleInstance()?
2606 null:
2607 (Heritrix)Heritrix.instances.
2608 get(Heritrix.instances.keySet().iterator().next());
2609 }
2610 }