View Javadoc

1   /* CrawlJob
2    *
3    * Copyright (C) 2003 Internet Archive.
4    *
5    * This file is part of the Heritrix web crawler (crawler.archive.org).
6    *
7    * Heritrix is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU Lesser Public License as published by
9    * the Free Software Foundation; either version 2.1 of the License, or
10   * any later version.
11   *
12   * Heritrix is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU Lesser Public License for more details.
16   *
17   * You should have received a copy of the GNU Lesser Public License
18   * along with Heritrix; if not, write to the Free Software
19   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
20   */
21  package org.archive.crawler.admin;
22  
23  import java.io.BufferedReader;
24  import java.io.File;
25  import java.io.FileNotFoundException;
26  import java.io.FileOutputStream;
27  import java.io.FileReader;
28  import java.io.IOException;
29  import java.io.InputStream;
30  import java.io.InputStreamReader;
31  import java.io.OutputStreamWriter;
32  import java.io.PrintWriter;
33  import java.io.Serializable;
34  import java.io.StringWriter;
35  import java.util.ArrayList;
36  import java.util.Arrays;
37  import java.util.Collection;
38  import java.util.EventObject;
39  import java.util.Hashtable;
40  import java.util.Iterator;
41  import java.util.List;
42  import java.util.Map;
43  import java.util.logging.Level;
44  import java.util.logging.Logger;
45  
46  import javax.management.Attribute;
47  import javax.management.AttributeList;
48  import javax.management.AttributeNotFoundException;
49  import javax.management.DynamicMBean;
50  import javax.management.InstanceAlreadyExistsException;
51  import javax.management.InvalidAttributeValueException;
52  import javax.management.MBeanAttributeInfo;
53  import javax.management.MBeanException;
54  import javax.management.MBeanInfo;
55  import javax.management.MBeanNotificationInfo;
56  import javax.management.MBeanOperationInfo;
57  import javax.management.MBeanParameterInfo;
58  import javax.management.MBeanRegistration;
59  import javax.management.MBeanRegistrationException;
60  import javax.management.MBeanServer;
61  import javax.management.NotCompliantMBeanException;
62  import javax.management.Notification;
63  import javax.management.NotificationBroadcasterSupport;
64  import javax.management.ObjectName;
65  import javax.management.ReflectionException;
66  import javax.management.RuntimeOperationsException;
67  import javax.management.openmbean.CompositeData;
68  import javax.management.openmbean.CompositeDataSupport;
69  import javax.management.openmbean.CompositeType;
70  import javax.management.openmbean.OpenDataException;
71  import javax.management.openmbean.OpenMBeanAttributeInfo;
72  import javax.management.openmbean.OpenMBeanAttributeInfoSupport;
73  import javax.management.openmbean.OpenMBeanConstructorInfoSupport;
74  import javax.management.openmbean.OpenMBeanInfoSupport;
75  import javax.management.openmbean.OpenMBeanOperationInfo;
76  import javax.management.openmbean.OpenMBeanOperationInfoSupport;
77  import javax.management.openmbean.OpenMBeanParameterInfo;
78  import javax.management.openmbean.OpenMBeanParameterInfoSupport;
79  import javax.management.openmbean.SimpleType;
80  
81  import org.apache.commons.httpclient.URIException;
82  import org.apache.commons.io.IOUtils;
83  import org.archive.crawler.Heritrix;
84  import org.archive.crawler.datamodel.CandidateURI;
85  import org.archive.crawler.datamodel.Checkpoint;
86  import org.archive.crawler.datamodel.CrawlOrder;
87  import org.archive.crawler.event.CrawlStatusListener;
88  import org.archive.crawler.framework.CrawlController;
89  import org.archive.crawler.framework.FrontierMarker;
90  import org.archive.crawler.framework.StatisticsTracking;
91  import org.archive.crawler.framework.exceptions.InitializationException;
92  import org.archive.crawler.framework.exceptions.InvalidFrontierMarkerException;
93  import org.archive.crawler.frontier.AbstractFrontier;
94  import org.archive.crawler.settings.ComplexType;
95  import org.archive.crawler.settings.ModuleAttributeInfo;
96  import org.archive.crawler.settings.TextField;
97  import org.archive.crawler.settings.XMLSettingsHandler;
98  import org.archive.crawler.util.CheckpointUtils;
99  import org.archive.crawler.util.IoUtils;
100 import org.archive.util.ArchiveUtils;
101 import org.archive.util.FileUtils;
102 import org.archive.util.JEMBeanHelper;
103 import org.archive.util.JmxUtils;
104 import org.archive.util.iterator.LineReadingIterator;
105 import org.archive.util.iterator.RegexpLineIterator;
106 
107 import com.sleepycat.je.DatabaseException;
108 import com.sleepycat.je.Environment;
109 
110 /***
111  * A CrawlJob encapsulates a 'crawl order' with any and all information and
112  * methods needed by a CrawlJobHandler to accept and execute them.
113  *
114  * <p>A given crawl job may also be a 'profile' for a crawl. In that case it
115  * should not be executed as a crawl but can be edited and used as a template
116  * for creating new CrawlJobs.
117  *
118  * <p>All of its constructors are protected since only a CrawlJobHandler
119  * should construct new CrawlJobs.
120  *
121  * @author Kristinn Sigurdsson
122  *
123  * @see org.archive.crawler.admin.CrawlJobHandler#newJob(CrawlJob, String,
124  * String, String, String, int)
125  * @see org.archive.crawler.admin.CrawlJobHandler#newProfile(CrawlJob,
126  *  String, String, String)
127  */
128 
129 public class CrawlJob extends NotificationBroadcasterSupport
130 implements DynamicMBean, MBeanRegistration, CrawlStatusListener, Serializable {
    /**
     * Eclipse generated serial number.
     */
    private static final long serialVersionUID = 3411161000452525856L;
    
    private static final Logger logger =
        Logger.getLogger(CrawlJob.class.getName());
    /*
     * Possible values for Priority
     */
    /** Lowest priority. */
    public static final int PRIORITY_MINIMAL = 0;
    /** Low priority. */
    public static final int PRIORITY_LOW = 1;
    /** Average priority. */
    public static final int PRIORITY_AVERAGE = 2;
    /** High priority. */
    public static final int PRIORITY_HIGH = 3;
    /** Highest priority. */
    public static final int PRIORITY_CRITICAL = 4;

    /*
     * Possible states for a Job.
     */
    /** Initial value. May not be ready to run/incomplete. */
    public static final String STATUS_CREATED = "Created";
    /** Job has been successfully submitted to a CrawlJobHandler. */
    public static final String STATUS_PENDING = "Pending";
    /** Job is being crawled. */
    public static final String STATUS_RUNNING = "Running";
    /** Job was deleted by user, will not be displayed in UI. */
    public static final String STATUS_DELETED = "Deleted";
    /** Job was terminated by user input while crawling. */
    public static final String STATUS_ABORTED = "Finished - Ended by operator";
    /** Something went very wrong. */
    public static final String STATUS_FINISHED_ABNORMAL =
        "Finished - Abnormal exit from crawling";
    /** Job finished normally having completed its crawl. */
    public static final String STATUS_FINISHED = "Finished";
    /** Job finished normally when the specified timelimit was hit. */
    public static final String STATUS_FINISHED_TIME_LIMIT =
        "Finished - Timelimit hit";
    /** Job finished normally when the specified amount of
     * data (MB) had been downloaded. */
    public static final String STATUS_FINISHED_DATA_LIMIT =
        "Finished - Maximum amount of data limit hit";
    /** Job finished normally when the specified number of documents had been
     * fetched.
     */
    public static final String STATUS_FINISHED_DOCUMENT_LIMIT =
        "Finished - Maximum number of documents limit hit";
    /** Job is going to be temporarily stopped after active threads are finished. */
    public static final String STATUS_WAITING_FOR_PAUSE = "Pausing - " +
        "Waiting for threads to finish";
    /** Job was temporarily stopped. State is kept so it can be resumed. */
    public static final String STATUS_PAUSED = "Paused";
    /**
     * Job is being checkpointed.  When finished checkpointing, job is set
     * back to STATUS_PAUSED (Job must be first paused before checkpointing
     * will run).
     */
    public static final String STATUS_CHECKPOINTING = "Checkpointing";
    /** Job could not be launched due to an InitializationException. */
    public static final String STATUS_MISCONFIGURED = "Could not launch job " +
        "- Fatal InitializationException";
    /** Job is actually a profile. */
    public static final String STATUS_PROFILE = "Profile";
    
    /** Job is preparing to run. */
    public static final String STATUS_PREPARING = "Preparing";

    // Class variables
    private String UID;       // A UID issued by the CrawlJobHandler.
    private String name;      // Job name (from settings' meta section).
    private String status;    // One of the STATUS_* constants above.
    private boolean isReadOnly = false;
    private boolean isNew = true;
    private boolean isProfile = false;
    private boolean isRunning = false;
    private int priority;     // One of the PRIORITY_* constants above.
    private int numberOfJournalEntries = 0;
    
    // Persisted in state.job as the statistics-tracker file slot
    // (line 9 of the job file) -- see writeJobFile().
    private String statisticsFileSave = "";

    // Last error message, or null when none; may span multiple lines
    // when persisted to disk.
    private String errorMessage = null;

    // This job's working directory (null for profiles).
    private File jobDir = null;

    // Settings-error handler; null means none is set.
    private transient CrawlJobErrorHandler errorHandler = null;

    // The settings (crawl order) backing this job.
    protected transient XMLSettingsHandler settingsHandler;
    
    // Controller while the job is crawling; null otherwise.
    private transient CrawlController controller = null;
    
    public static final String RECOVERY_JOURNAL_STYLE = "recoveryJournal";
    public static final String CRAWL_LOG_STYLE = "crawlLog";
    
    // OpenMBean support.

    /**
     * Server we registered with. May be null.
     */
    private transient MBeanServer mbeanServer = null;
    private transient ObjectName mbeanName = null;
    public static final String CRAWLJOB_JMXMBEAN_TYPE =
        JmxUtils.SERVICE + ".Job";
    private transient JEMBeanHelper bdbjeMBeanHelper = null;
    private transient List<String> bdbjeAttributeNameList = null;
    private transient List<String> bdbjeOperationsNameList = null;
    
    
    /**
     * Open MBean metadata describing this job (may be null
     * throughout life of Heritrix).
     */
    private transient OpenMBeanInfoSupport openMBeanInfo;
    
    // JMX attribute names.
    public static final String NAME_ATTR = "Name";
    public static final String UID_ATTR = "UID";
    public static final String STATUS_ATTR = "Status";
    public static final String FRONTIER_SHORT_REPORT_ATTR =
        "FrontierShortReport";
    public static final String THREADS_SHORT_REPORT_ATTR =
        "ThreadsShortReport";
    public static final String TOTAL_DATA_ATTR = "TotalData";
    public static final String CRAWL_TIME_ATTR = "CrawlTime";
    public static final String DOC_RATE_ATTR = "DocRate";
    public static final String CURRENT_DOC_RATE_ATTR = "CurrentDocRate";
    public static final String KB_RATE_ATTR = "KbRate";
    public static final String CURRENT_KB_RATE_ATTR = "CurrentKbRate";
    public static final String THREAD_COUNT_ATTR = "ThreadCount";
    public static final String DOWNLOAD_COUNT_ATTR = "DownloadedCount";
    public static final String DISCOVERED_COUNT_ATTR = "DiscoveredCount";
    public static final String [] ATTRIBUTE_ARRAY = {NAME_ATTR, UID_ATTR,
        STATUS_ATTR, FRONTIER_SHORT_REPORT_ATTR, THREADS_SHORT_REPORT_ATTR,
        TOTAL_DATA_ATTR, CRAWL_TIME_ATTR, DOC_RATE_ATTR,
        CURRENT_DOC_RATE_ATTR, KB_RATE_ATTR, CURRENT_KB_RATE_ATTR,
        THREAD_COUNT_ATTR, DOWNLOAD_COUNT_ATTR, DISCOVERED_COUNT_ATTR};
    public static final List ATTRIBUTE_LIST = Arrays.asList(ATTRIBUTE_ARRAY);
    
    // JMX operation names.
    public static final String IMPORT_URI_OPER = "importUri";
    public static final String IMPORT_URIS_OPER = "importUris";
    public static final String DUMP_URIS_OPER = "dumpUris";
    public static final String PAUSE_OPER = "pause";
    public static final String RESUME_OPER = "resume";
    public static final String FRONTIER_REPORT_OPER = "frontierReport";
    public static final String THREADS_REPORT_OPER = "threadsReport";
    public static final String SEEDS_REPORT_OPER = "seedsReport";
    public static final String CHECKPOINT_OPER = "startCheckpoint";
    public static final String PROGRESS_STATISTICS_OPER =
        "progressStatistics";
    public static final String PROGRESS_STATISTICS_LEGEND_OPER =
        "progressStatisticsLegend";
    
    public static final String PROG_STATS = "progressStatistics";
    
    // Same as JEMBeanHelper.OP_DB_STAT
    public static final String OP_DB_STAT = "getDatabaseStats";
    
    /**
     * Don't add the following crawl-order items.
     */
    public static final List ORDER_EXCLUDE;
    static {
        ORDER_EXCLUDE = Arrays.asList(new String [] {"bdb-cache-percent",
            "extract-processors", "DNS", "uri-included-structure"});
    }
    
    /**
     * Sequence number for jmx notifications.
     */
    private static int notificationsSequenceNumber = 1;
302     
    /**
     * A shutdown constructor.
     * Creates an empty CrawlJob; every field keeps its declared default.
     */
    protected CrawlJob() {
        super();
    }
309 
    /**
     * A constructor for jobs.
     *
     * <p>Creates a ready-to-crawl job by delegating to the fully-specified
     * constructor with no initial status, not-a-profile, and marked new.
     * @param UID A unique ID for this job. Typically emitted by the
     *            CrawlJobHandler.
     * @param name The name of the job
     * @param settingsHandler The associated settings
     * @param errorHandler The crawl jobs settings error handler.
     *           <tt>null</tt> means none is set
     * @param priority job priority (see the PRIORITY_* constants).
     * @param dir The directory that is considered this jobs working directory.
     */
    public CrawlJob(final String UID,
            final String name, final XMLSettingsHandler settingsHandler,
            final CrawlJobErrorHandler errorHandler, final int priority,
            final File dir) {
        this(UID, name, settingsHandler, errorHandler,
                priority, dir, null, false, true);
    }
330 
    /**
     * A constructor for profiles.
     *
     * <p> Any job created with this constructor will be
     * considered a profile. Profiles are not stored on disk (only their
     * settings files are stored on disk). This is because their data is
     * predictable given any settings files.
     * @param UIDandName A unique ID for this job. For profiles this is the same
     *           as name
     * @param settingsHandler The associated settings
     * @param errorHandler The crawl jobs settings error handler.
     *           <tt>null</tt> means none is set
     */
    protected CrawlJob(final String UIDandName,
            final XMLSettingsHandler settingsHandler,
            final CrawlJobErrorHandler errorHandler) {
        this(UIDandName, UIDandName, settingsHandler, errorHandler,
            PRIORITY_AVERAGE, null, STATUS_PROFILE, true, false);
    }
350     
351     public CrawlJob(final String UID,
352             final String name, final XMLSettingsHandler settingsHandler,
353             final CrawlJobErrorHandler errorHandler, final int priority,
354             final File dir, final String status, final boolean isProfile,
355             final boolean isNew) {
356         super();
357         this.UID = UID;
358         this.name = name;
359         this.settingsHandler = settingsHandler;
360         this.errorHandler = errorHandler;
361         this.status = status;
362         this.isProfile = isProfile;
363         this.isNew = isNew;
364         this.jobDir = dir;
365         this.priority = priority;
366     }
367 
368     /***
369      * A constructor for reloading jobs from disk. Jobs (not profiles) have
370      * their data written to persistent storage in the file system. This method
371      * is used to load the job from such storage. This is done by the
372      * <code>CrawlJobHandler</code>.
373      * <p>
374      * Proper structure of a job file (TODO: Maybe one day make this an XML file)
375      * Line 1. UID <br>
376      * Line 2. Job name (string) <br>
377      * Line 3. Job status (string) <br>
378      * Line 4. is job read only (true/false) <br>
379      * Line 5. is job running (true/false) <br>
380      * Line 6. job priority (int) <br>
381      * Line 7. number of journal entries <br>
382      * Line 8. setting file (with path) <br>
383      * Line 9. statistics tracker file (with path) <br>
384      * Line 10-?. error message (String, empty for null), can be many lines <br>
385      * @param jobFile
386      *            a file containing information about the job to load.
387      * @param errorHandler The crawl jobs settings error handler.
388      *            null means none is set
389      * @throws InvalidJobFileException
390      *            if the specified file does not refer to a valid job file.
391      * @throws IOException
392      *            if io operations fail
393      */
394     protected CrawlJob(final File jobFile,
395             final CrawlJobErrorHandler errorHandler)
396             throws InvalidJobFileException, IOException {
397         this(null, null, null, errorHandler,
398                 PRIORITY_AVERAGE, null, null, false, true);
399         this.jobDir = jobFile.getParentFile();
400         
401         // Check for corrupt job.state files (can be corrupt if we crash).
402         if (jobFile.length() == 0) {
403             throw new InvalidJobFileException(jobFile.getCanonicalPath() +
404                 " is corrupt (length is zero)");
405         }
406         
407         // Open file. Read data and set up class variables accordingly...
408         BufferedReader jobReader =
409             new BufferedReader(new FileReader(jobFile), 4096);
410         // UID
411         this.UID = jobReader.readLine();
412         // name
413         this.name = jobReader.readLine();
414         // status
415         this.status = jobReader.readLine();
416         if(status.equals(STATUS_ABORTED)==false
417                 && status.equals(STATUS_CREATED)==false
418                 && status.equals(STATUS_DELETED)==false
419                 && status.equals(STATUS_FINISHED)==false
420                 && status.equals(STATUS_FINISHED_ABNORMAL)==false
421                 && status.equals(STATUS_FINISHED_DATA_LIMIT)==false
422                 && status.equals(STATUS_FINISHED_DOCUMENT_LIMIT)==false
423                 && status.equals(STATUS_FINISHED_TIME_LIMIT)==false
424                 && status.equals(STATUS_MISCONFIGURED)==false
425                 && status.equals(STATUS_PAUSED)==false
426                 && status.equals(STATUS_CHECKPOINTING)==false
427                 && status.equals(STATUS_PENDING)==false
428                 && status.equals(STATUS_RUNNING)==false
429                 && status.equals(STATUS_WAITING_FOR_PAUSE)==false
430                 && status.equals(STATUS_PREPARING)==false){
431             // status is invalid. Must be one of the above
432             throw new InvalidJobFileException("Status (line 3) in job file " +
433                     "is not valid: '" + status + "'");
434         }
435         // isReadOnly
436         String tmp = jobReader.readLine();
437         if(tmp.equals("true")){
438             isReadOnly = true;
439         } else if(tmp.equals("false")){
440             isReadOnly = false;
441         } else {
442             throw new InvalidJobFileException("isReadOnly (line 4) in job" +
443                     " file '" + jobFile.getAbsolutePath() + "' is not " +
444                     "valid: '" + tmp + "'");
445         }
446         // isRunning
447         tmp = jobReader.readLine();
448         if(tmp.equals("true")){
449             this.isRunning = true;
450         } else if(tmp.equals("false")){
451             this.isRunning = false;
452         } else {
453             throw new InvalidJobFileException("isRunning (line 5) in job " +
454                     "file '" + jobFile.getAbsolutePath() + "' is not valid: " +
455                     "'" + tmp + "'");
456         }
457         // priority
458         tmp = jobReader.readLine();
459         try{
460             this.priority = Integer.parseInt(tmp);
461         } catch(NumberFormatException e){
462             throw new InvalidJobFileException("priority (line 5) in job " +
463                     "file '" + jobFile.getAbsolutePath() + "' is not valid: " +
464                     "'" + tmp + "'");
465         }
466         // numberOfJournalEntries
467         tmp = jobReader.readLine();
468         try{
469             this.numberOfJournalEntries = Integer.parseInt(tmp);
470         } catch(NumberFormatException e){
471             throw new InvalidJobFileException("numberOfJournalEntries " +
472                     "(line 5) in job file '" + jobFile.getAbsolutePath() +
473                     "' is not valid: " + "'" + tmp + "'");
474         }
475         // settingsHandler
476         tmp = jobReader.readLine();
477         try {
478             File f = new File(tmp);
479             this.settingsHandler = new XMLSettingsHandler((f.isAbsolute())?
480                 f: new File(jobDir, f.getName()));
481             if(this.errorHandler != null){
482                 this.settingsHandler.registerValueErrorHandler(errorHandler);
483             }
484             this.settingsHandler.initialize();
485         } catch (InvalidAttributeValueException e1) {
486             throw new InvalidJobFileException("Problem reading from settings " +
487                     "file (" + tmp + ") specified in job file '" +
488                     jobFile.getAbsolutePath() + "'\n" + e1.getMessage());
489         }
490         // Statistics tracker.
491         jobReader.readLine();
492         // errorMessage
493         // TODO: Multilines
494         tmp = jobReader.readLine();
495         errorMessage = "";
496         while(tmp!=null){
497             errorMessage+=tmp+'\n';
498             tmp = jobReader.readLine();
499         }
500         if(errorMessage.length()==0){
501             // Empty error message should be null
502             errorMessage = null;
503         }
504         // TODO: Load stattrack if needed.
505 
506         // TODO: This should be inside a finally block.
507         jobReader.close();
508     }
509 
510     /***
511      * Cause the job to be written to persistent storage.
512      * This will also save the statistics tracker if it is not null and the
513      * job status is finished (regardless of how it's finished)
514      */
515     private void writeJobFile() {
516         if (isProfile) {
517             return;
518         }
519         
520         final String jobDirAbsolute = jobDir.getAbsolutePath();
521         if (!jobDir.exists() || !jobDir.canWrite()) {
522             logger.warning("Can't update status on " +
523                 jobDirAbsolute + " because file does not" +
524                 " exist (or is unwriteable)");
525             return;
526         }
527         File f = new File(jobDirAbsolute, "state.job");
528 
529         String settingsFile = getSettingsDirectory();
530         // Make settingsFile's path relative if order.xml is somewhere in the
531         // job's directory tree
532         if(settingsFile.startsWith(jobDirAbsolute.concat(File.separator))) {
533             settingsFile = settingsFile.substring(jobDirAbsolute.length()+1);
534         }
535         try {
536             OutputStreamWriter jobWriter = 
537                 new OutputStreamWriter(
538                     new FileOutputStream(f, false),
539                     "UTF-8");
540             try {
541                 jobWriter.write(UID + "\n");
542                 jobWriter.write(name + "\n");
543                 jobWriter.write(status + "\n");
544                 jobWriter.write(isReadOnly + "\n");
545                 jobWriter.write(isRunning + "\n");
546                 jobWriter.write(priority + "\n");
547                 jobWriter.write(numberOfJournalEntries + "\n");
548                 jobWriter.write(settingsFile + "\n");
549                 jobWriter.write(statisticsFileSave + "\n");// TODO: Is this
550                                                             // right?
551                 // Can be multiple lines so we keep it last
552                 if (errorMessage != null) {
553                     jobWriter.write(errorMessage + "\n");
554                 }
555             } finally {
556                 if (jobWriter != null) {
557                     jobWriter.close();
558                 }
559             }
560         } catch (IOException e) {
561             logger.log(Level.WARNING, "An IOException occured saving job " +
562                     name + " (" + UID + ")", e);
563         }
564     }
565   
566     /***
567      * Returns this jobs unique ID (UID) that was issued by the
568      * CrawlJobHandler() when this job was first created.
569      * 
570      * @return Job This jobs UID.
571      * @see CrawlJobHandler#getNextJobUID()
572      */
573     public String getUID(){
574         return UID;
575     }
576 
577     /***
578      * Returns this job's 'name'. The name comes from the settings for this job,
579      * need not be unique and may change. For a unique identifier use
580      * {@link #getUID() getUID()}.
581      * <p>
582      * The name corrisponds to the value of the 'name' tag in the 'meta' section
583      * of the settings file.
584      *
585      * @return This job's 'name'
586      */
587     public String getJobName(){
588         return name;
589     }
590 
591     /***
592      * Return the combination of given name and UID most commonly
593      * used in administrative interface.
594      *
595      * @return Job's name with UID notation
596      */
597     public String getDisplayName() {
598         return getJobName()+" ["+getUID()+"]";
599     }
600 
601     /***
602      * Set this job's level of priority.
603      *
604      * @param priority The level of priority
605      *
606      * @see #getJobPriority()
607      * @see #PRIORITY_MINIMAL
608      * @see #PRIORITY_LOW
609      * @see #PRIORITY_AVERAGE
610      * @see #PRIORITY_HIGH
611      * @see #PRIORITY_CRITICAL
612      */
613     public void setJobPriority(int priority) {
614         this.priority = priority;
615     }
616 
617     /***
618      * Get this job's level of priority.
619      *
620      * @return this job's priority
621      * @see #setJobPriority(int)
622      * @see #PRIORITY_MINIMAL
623      * @see #PRIORITY_LOW
624      * @see #PRIORITY_AVERAGE
625      * @see #PRIORITY_HIGH
626      * @see #PRIORITY_CRITICAL
627      */
628     public int getJobPriority() {
629         return priority;
630     }
631 
632     /***
633      * Once called no changes can be made to the settings for this job.
634      * Typically this is done once a crawl is completed and further changes
635      * to the crawl order are therefor meaningless.
636      */
637     public void setReadOnly() {
638         isReadOnly = true;
639         writeJobFile(); //Save changes
640     }
641 
642     /***
643      * Is job read only?
644      * @return false until setReadOnly has been invoked, after that it returns true.
645      */
646     public boolean isReadOnly(){
647         return isReadOnly;
648     }
649 
650     /***
651      * Set the status of this CrawlJob.
652      *
653      * @param status Current status of CrawlJob
654      *         (see constants defined here beginning with STATUS)
655      */
656     public void setStatus(String status) {
657         this.status = status;
658         writeJobFile(); //Save changes
659         // TODO: If job finished, save StatisticsTracker!
660     }
661 
662     /***
663      * @return Status of the crawler (Used by JMX).
664      */
665     public String getCrawlStatus() {
666         return this.controller != null?
667             this.controller.getState().toString(): "Illegal State";
668     }
669     
670     /***
671      * Get the current status of this CrawlJob
672      *
673      * @return The current status of this CrawlJob
674      *         (see constants defined here beginning with STATUS)
675      */
676     public String getStatus() {
677         return this.status;
678     }
679 
680     /***
681      * Returns the settings handler for this job. It will have been initialized.
682      * @return the settings handler for this job.
683      */
684     public XMLSettingsHandler getSettingsHandler() {
685         return this.settingsHandler;
686     }
687     /***
688      * Is this a new job?
689      * @return True if is new.
690      */
691     public boolean isNew() {
692         return isNew;
693     }
694 
695     /***
696      * Set if the job is considered to be a profile
697      * @return True if is a profile.
698      */
699     public boolean isProfile() {
700         return isProfile;
701     }
702 
703     /***
704      * Set if the job is considered a new job or not.
705      * @param b Is the job considered to be new.
706      */
707     public void setNew(boolean b) {
708         isNew = b;
709         writeJobFile(); //Save changes
710     }
711 
712     /***
713      * Returns true if the job is being crawled.
714      * @return true if the job is being crawled
715      */
716     public boolean isRunning() {
717         return isRunning;
718     }
719 
720     /***
721      * Set if job is being crawled.
722      * @param b Is job being crawled.
723      */
724     protected void setRunning(boolean b) {
725         isRunning = b;
726         writeJobFile(); // Save changes
727         //TODO: Job ending -> Save statistics tracker.
728         //TODO: This is likely to happen as the CrawlEnding event occurs,
729         // need to ensure that the StatisticsTracker is saved to disk on
730         // CrawlEnded. Maybe move responsibility for this into the
731         // StatisticsTracker?
732     }
733     
734     protected void unregisterMBean() {
735         // Unregister current job from JMX agent, if there one.
736         if (this.mbeanServer == null) {
737             return;
738         }
739         try {
740             this.mbeanServer.unregisterMBean(this.mbeanName);
741             this.mbeanServer = null;
742         } catch (Exception e) {
743             logger.log(Level.SEVERE, "Failed with " + this.mbeanName, e);
744         }
745     }
746     
747     /***
748      * Subclass of crawlcontroller that unregisters beans when stopped.
749      * Done as subclass so CrawlController doesn't get any JMX (or 'CrawlJob')
750      * pollution, so for sure CrawlJob is unregistered with JMX and so any
751      * listeners on the CrawlJob get a chance to get crawl ended message
752      * (These latter notifications may not actually be getting through -- TBD).
753      * <p>TODO: This override dirtys the data model since CC knows about CJs.
754      * The facility provided by this class emitting events and statistics so
755      * they can be read by JMX needs to go back into CC.  Probably best to
756      * registering in JMX the CC, rather than CJ.  Lets do this in Heritrix 2.0
757      * since means changing the JMX API some.
758      */
759     public class MBeanCrawlController extends CrawlController
760     implements Serializable {
761         private static final long serialVersionUID = -4608537998168407222L;
762         private CrawlJob cj = null;
763         private CompositeType ct =  null;
764         
765         public CrawlJob getCrawlJob() {
766             return this.cj;
767         }
768 
769         public void setCrawlJob(CrawlJob cj) {
770             this.cj = cj;
771         }
772         
773         @SuppressWarnings("unchecked")
774         public void progressStatisticsEvent(final EventObject e) {
775             super.progressStatisticsEvent(e);
776             if (this.cj.getMbeanName() == null) {
777                 // Can be null around job startup.  Return w/o doing anything.
778                 return;
779             }
780                 
781             Map s = ((StatisticsTracking)e.getSource()).getProgressStatistics();
782             // Convert the statistics to OpenType CompositeData and add as
783             // user data to Notification.
784             CompositeData cd = null;
785             try {
786                 if (this.ct == null) {
787                     this.ct = JmxUtils.createCompositeType(s, PROG_STATS,
788                         PROG_STATS + " for " + this.cj.getMbeanName());
789                 }
790                 cd = new CompositeDataSupport(this.ct, s);
791             } catch (OpenDataException ode) {
792                 ode.printStackTrace();
793             }
794             if (cd != null) {
795                 Notification n = new Notification(PROG_STATS,
796                     this.cj.getMbeanName(), getNotificationsSequenceNumber(),
797                     ((StatisticsTracking)e.getSource()).
798                         getProgressStatisticsLine());
799                 n.setUserData(cd);
800                 this.cj.sendNotification(n);
801             }
802         }
803         
804         protected void completeStop() {
805             try {
806                 super.completeStop();
807             } finally {
808                 if (this.cj != null) {
809                     this.cj.unregisterMBean();
810                 }
811                 this.cj = null;
812             }
813         }
814     }
815     
816     protected CrawlController setupCrawlController()
817     throws InitializationException {
818         CrawlController controller = null;
819         
820         // Check if we're to do a checkpoint recover.  If so, deserialize
821         // the checkpoint's CrawlController and use that in place of a new
822         // CrawlController instance.
823         Checkpoint cp = CrawlController.
824             getCheckpointRecover(getSettingsHandler().getOrder());
825         if (cp != null) {
826             try {
827             	controller = (MBeanCrawlController)CheckpointUtils.
828                     readObjectFromFile(MBeanCrawlController.class,
829                         cp.getDirectory());
830             } catch (FileNotFoundException e) {
831                 throw new InitializationException(e);
832             } catch (IOException e) {
833                 throw new InitializationException(e);
834             } catch (ClassNotFoundException e) {
835                 throw new InitializationException(e);
836             }
837         } else {
838         	controller = new MBeanCrawlController();
839         }
840         return controller;
841     }
842     
843     protected CrawlController createCrawlController() {
844     	return new MBeanCrawlController();
845     }
846     
847     public void setupForCrawlStart()
848     throws InitializationException {
849         try {
850         	this.controller = setupCrawlController();
851             // Register as listener to get job finished notice.
852             this.controller.addCrawlStatusListener(this);
853             this.controller.initialize(getSettingsHandler());
854             // Set the crawl job this MBeanCrawlController needs to worry about.
855             ((MBeanCrawlController)this.controller).setCrawlJob(this);
856             // Create our mbean description and register our crawljob.
857             this.openMBeanInfo = buildMBeanInfo();
858             try {
859                 Heritrix.registerMBean(this, getJmxJobName(),
860                     CRAWLJOB_JMXMBEAN_TYPE);
861             } catch (InstanceAlreadyExistsException e) {
862                 throw new InitializationException(e);
863             } catch (MBeanRegistrationException e) {
864                 throw new InitializationException(e);
865             } catch (NotCompliantMBeanException e) {
866                 throw new InitializationException(e);
867             }
868         } catch (InitializationException e) {
869             // Can't load current job since it is misconfigured.
870             setStatus(CrawlJob.STATUS_MISCONFIGURED);
871             setErrorMessage("A fatal InitializationException occured when "
872                     + "loading job:\n" + e.getMessage());
873             // Log to stdout so its seen in logs as well as in UI.
874             e.printStackTrace();
875             this.controller = null;
876             throw e;
877         }
878         setStatus(CrawlJob.STATUS_RUNNING);
879         setRunning(true);
880     }
881     
882     public void stopCrawling() {
883         if(this.controller != null) {
884             this.controller.requestCrawlStop();
885         }
886     }
887 
888     /***
889      * @return One-line Frontier report.
890      */
891     public String getFrontierOneLine() {
892         if (this.controller == null || this.controller.getFrontier() == null) {
893             return "Crawler not running";
894         }
895         return this.controller.getFrontier().singleLineReport();
896     }
897     
898     /***
899      * @param reportName Name of report to write.
900      * @return A report of the frontier's status.
901      */
902     public String getFrontierReport(final String reportName) {
903         if (this.controller == null || this.controller.getFrontier() == null) {
904             return "Crawler not running";
905         }
906         return ArchiveUtils.writeReportToString(this.controller.getFrontier(),
907                 reportName);
908     }
909     
910     /***
911      * Write the requested frontier report to the given PrintWriter
912      * @param reportName Name of report to write.
913      * @param writer Where to write to.
914      */
915     public void writeFrontierReport(String reportName, PrintWriter writer) {
916         if (this.controller == null || this.controller.getFrontier() == null) {
917             writer.println("Crawler not running.");
918             return;
919         }
920         this.controller.getFrontier().reportTo(reportName,writer);
921     }
922 
923     /***
924      * @return One-line threads report.
925      */
926     public String getThreadOneLine() {
927         if (this.controller == null) {
928             return "Crawler not running";
929         }
930         return this.controller.oneLineReportThreads();
931     }
932     
933     /***
934      * Get the CrawlControllers ToeThreads report for the running crawl.
935      * @return The CrawlControllers ToeThreads report
936      */
937     public String getThreadsReport() {
938         if (this.controller == null) {
939             return "Crawler not running";
940         }
941         return ArchiveUtils.writeReportToString(this.controller.getToePool(),
942                 null);
943     }
944     
945     /***
946      * Write the requested threads report to the given PrintWriter
947      * @param reportName Name of report to write.
948      * @param writer Where to write to.
949      */
950     public void writeThreadsReport(String reportName, PrintWriter writer) {
951         if (this.controller == null || this.controller.getFrontier() == null) {
952             writer.println("Crawler not running.");
953             return;
954         }
955         this.controller.getToePool().reportTo(reportName, writer);
956     }
957     
958     /***
959      * Kills a thread. For details see
960      * {@link org.archive.crawler.framework.ToePool#killThread(int, boolean)
961      * ToePool.killThread(int, boolean)}.
962      * @param threadNumber Thread to kill.
963      * @param replace Should thread be replaced.
964      * @see org.archive.crawler.framework.ToePool#killThread(int, boolean)
965      */
966     public void killThread(int threadNumber, boolean replace) {
967         if (this.controller ==  null) {
968             return;
969         }
970         this.controller.killThread(threadNumber, replace);
971     }
972 
973     /***
974      * Get the Processors report for the running crawl.
975      * @return The Processors report for the running crawl.
976      */
977     public String getProcessorsReport() {
978         if (this.controller == null) {
979             return "Crawler not running";
980         }
981         return ArchiveUtils.writeReportToString(this.controller,
982                 CrawlController.PROCESSORS_REPORT);
983     }
984     
985     /***
986      * Returns the directory where the configuration files for this job are
987      * located.
988      *
989      * @return the directory where the configuration files for this job are
990      *         located
991      */
992     public String getSettingsDirectory() {
993         return settingsHandler.getOrderFile().getPath();
994     }
995 
996     /***
997      * Returns the path of the job's base directory. For profiles this is always
998      * equal to <code>new File(getSettingsDirectory())</code>.
999      * @return the path of the job's base directory.
1000      */
1001     public File getDirectory(){
1002         return isProfile? new File(getSettingsDirectory()): jobDir;
1003     }
1004 
1005     /***
1006      * Get the error message associated with this job. Will return null if there
1007      * is no error message.
1008      * @return the error message associated with this job
1009      */
1010     public String getErrorMessage() {
1011         return errorMessage;
1012     }
1013 
1014     /***
1015      * Set an error message for this job. Generally this only occurs if the job
1016      * is misconfigured.
1017      * @param string the error message associated with this job
1018      */
1019     public void setErrorMessage(String string) {
1020         errorMessage = string;
1021         writeJobFile(); //Save changes
1022     }
1023 
1024     /***
1025      * @return Returns the number of journal entries.
1026      */
1027     public int getNumberOfJournalEntries() {
1028         return numberOfJournalEntries;
1029     }
1030 
1031     /***
1032      * @param numberOfJournalEntries The number of journal entries to set.
1033      */
1034     public void setNumberOfJournalEntries(int numberOfJournalEntries) {
1035         this.numberOfJournalEntries = numberOfJournalEntries;
1036         writeJobFile();
1037     }
1038 
1039     /***
1040      * @return Returns the error handler for this crawl job
1041      */
1042     public CrawlJobErrorHandler getErrorHandler() {
1043         return errorHandler;
1044     }
1045 
1046     /***
1047      * Read all the checkpoints found in the job's checkpoints
1048      * directory into Checkpoint instances
1049      * @return Collection containing list of all checkpoints.
1050      */
1051     public Collection scanCheckpoints() {
1052         File checkpointsDirectory =
1053             settingsHandler.getOrder().getCheckpointsDirectory();
1054         File[] perCheckpointDirs = checkpointsDirectory.listFiles();
1055         Collection<Checkpoint> checkpoints = new ArrayList<Checkpoint>();
1056         if (perCheckpointDirs != null) {
1057             for (int i = 0; i < perCheckpointDirs.length; i++) {
1058                 Checkpoint cp = new Checkpoint(perCheckpointDirs[i]);
1059                 checkpoints.add(cp);
1060             }
1061         }
1062         return checkpoints;
1063     }
1064 
1065     /***
1066      * Returns the absolute path of the specified log.
1067      * Note: If crawl has not begun, this file may not exist.
1068      * @param log
1069      * @return the absolute path for the specified log.
1070      * @throws AttributeNotFoundException
1071      * @throws ReflectionException
1072      * @throws MBeanException
1073      */
1074     public String getLogPath(String log) 
1075     throws AttributeNotFoundException, MBeanException, ReflectionException {
1076         String logsPath = (String)settingsHandler.getOrder().
1077             getAttribute(CrawlOrder.ATTR_LOGS_PATH);
1078         CrawlOrder order = settingsHandler.getOrder();
1079         String diskPath = (String) order.getAttribute(null,
1080             CrawlOrder.ATTR_DISK_PATH);
1081         File disk = settingsHandler.
1082             getPathRelativeToWorkingDirectory(diskPath);
1083         File f = new File(logsPath, log);
1084         if (!f.isAbsolute()) {
1085             f = new File(disk.getPath(), f.getPath());
1086         }
1087         return f.getAbsolutePath();
1088     }
1089 
1090     // OpenMBean implementation.
1091     
1092     protected void pause() {
1093         if (this.controller != null && this.controller.isPaused() == false) {
1094             this.controller.requestCrawlPause();
1095         }
1096     }
1097     
1098     protected void resume() {
1099         if (this.controller != null) {
1100             this.controller.requestCrawlResume();
1101         }
1102     }
1103 
1104     /***
1105      * @throws IllegalStateException Thrown if crawl is not paused.
1106      */
1107     protected void checkpoint() throws IllegalStateException {
1108         if (this.controller != null) {
1109             this.controller.requestCrawlCheckpoint();
1110         }
1111     }
1112     
1113     /***
1114      * @return True if checkpointing.
1115      */
1116     public boolean isCheckpointing() {
1117         return this.controller != null? this.controller.isCheckpointing(): false;
1118     }
1119     
1120     /***
1121      * If its a HostQueuesFrontier, needs to be flushed for the queued.
1122      */
1123     protected void flush() {
1124         // Nothing to do.
1125     }
1126 
1127     /***
1128      * Delete any URI from the frontier of the current (paused) job that match
1129      * the specified regular expression. If the current job is not paused (or
1130      * there is no current job) nothing will be done.
1131      * @param regexpr Regular expression to delete URIs by.
1132      * @return the number of URIs deleted
1133      */
1134     public long deleteURIsFromPending(String regexpr){
1135         return deleteURIsFromPending(regexpr,null);
1136     }
1137     
1138     /***
1139      * Delete any URI from the frontier of the current (paused) job that match
1140      * the specified regular expression. If the current job is not paused (or
1141      * there is no current job) nothing will be done.
1142      * @param regexpr Regular expression to delete URIs by.
1143      * @return the number of URIs deleted
1144      */
1145     public long deleteURIsFromPending(String uriPattern, String queuePattern){
1146         return (this.controller != null &&
1147                 this.controller.getFrontier() != null &&
1148                 this.controller.isPaused())?
1149             this.controller.getFrontier().deleteURIs(uriPattern,queuePattern): 0;
1150     }
1151     
1152     public String importUris(String file, String style, String force) {
1153         return importUris(file, style, "true".equals(force));
1154     }
1155     
1156     public String importUris(final String fileOrUrl, final String style,
1157             final boolean forceRevisit) {
1158         return importUris(fileOrUrl, style, forceRevisit, false);
1159     }
1160 
1161     /***
1162      * @param fileOrUrl Name of file w/ seeds.
1163      * @param style What style of seeds -- crawl log, recovery journal, or
1164      * seeds file.
1165      * @param forceRevisit Should we revisit even if seen before?
1166      * @param areSeeds Is the file exclusively seeds?
1167      * @return A display string that has a count of all added.
1168      */
1169     public String importUris(final String fileOrUrl, final String style,
1170             final boolean forceRevisit, final boolean areSeeds) {
1171         InputStream is =
1172             IoUtils.getInputStream(this.controller.getDisk(), fileOrUrl);
1173         String message = null;
1174         // Do we have an inputstream?
1175         if (is == null) {
1176             message = "Failed to get inputstream from " + fileOrUrl;
1177             logger.severe(message);
1178         } else {
1179             int addedCount = importUris(is, style, forceRevisit, areSeeds);
1180             message = Integer.toString(addedCount) + " URIs added from " +
1181                 fileOrUrl;
1182         }
1183         return message;
1184     }
1185     
1186     protected int importUris(InputStream is, String style,
1187             boolean forceRevisit) {
1188         return importUris(is, style, forceRevisit, false);
1189     }
1190     
1191     /***
1192      * Import URIs.
1193      * @param is Stream to use as URI source.
1194      * @param style Style in which URIs are rendored.  Currently support for
1195      * <code>recoveryJournal</code>, <code>crawlLog</code>, and seeds file
1196      * format (i.e <code>default</code>) where <code>default</code> style is
1197      * a UURI per line (comments allowed).
1198      * @param forceRevisit Whether we should revisit this URI even if we've
1199      * visited it previously.
1200      * @param areSeeds Are the imported URIs seeds?
1201      * @return Count of added URIs.
1202      */
1203     protected int importUris(InputStream is, String style,
1204             boolean forceRevisit, final boolean areSeeds) {
1205         // Figure the regex to use parsing each line of input stream.
1206         String extractor;
1207         String output;
1208         if(CRAWL_LOG_STYLE.equals(style)) {
1209             // Skip first 3 fields
1210             extractor = "//S+//s+//S+//s+//S+//s+(//S+//s+//S+//s+//S+//s+).*";
1211             output = "$1";
1212         } else if (RECOVERY_JOURNAL_STYLE.equals(style)) {
1213             // Skip the begin-of-line directive
1214             extractor = "//S+//s+((//S+)(?://s+//S+//s+//S+)?)//s*";
1215             output = "$1";
1216         } else {
1217             extractor =
1218                 RegexpLineIterator.NONWHITESPACE_ENTRY_TRAILING_COMMENT;
1219             output = RegexpLineIterator.ENTRY;
1220         }
1221         
1222         controller.installThreadContextSettingsHandler();
1223         
1224         // Read the input stream.
1225         BufferedReader br = null;
1226         int addedCount = 0;
1227         try {
1228             br = new BufferedReader(new InputStreamReader(is));
1229             Iterator iter = new RegexpLineIterator(new LineReadingIterator(br),
1230                 RegexpLineIterator.COMMENT_LINE, extractor, output);
1231             while(iter.hasNext()) {
1232                 try {
1233                     importUri((String)iter.next(), forceRevisit, areSeeds,
1234                         false);
1235                     addedCount++;
1236                 } catch (URIException e) {
1237                     e.printStackTrace();
1238                 }
1239             }
1240             br.close();
1241             flush();
1242         } catch (IOException e) {
1243             e.printStackTrace();
1244         }
1245         return addedCount;
1246     }
1247     
1248     /***
1249      * Schedule a uri.
1250      * @param uri Uri to schedule.
1251      * @param forceFetch Should it be forcefetched.
1252      * @param isSeed True if seed.
1253      * @throws URIException
1254      */
1255     public void importUri(final String uri, final boolean forceFetch,
1256             final boolean isSeed)
1257     throws URIException {
1258         importUri(uri, forceFetch, isSeed, true);
1259     }
1260     
1261     /***
1262      * Schedule a uri.
1263      * @param str String that can be: 1. a UURI, 2. a snippet of the
1264      * crawl.log line, or 3. a snippet from recover log.  See
1265      * {@link #importUris(InputStream, String, boolean)} for how it subparses
1266      * the lines from crawl.log and recover.log.
1267      * @param forceFetch Should it be forcefetched.
1268      * @param isSeed True if seed.
1269      * @param isFlush If true, flush the frontier IF it implements
1270      * flushing.
1271      * @throws URIException
1272      */
1273     public void importUri(final String str, final boolean forceFetch,
1274             final boolean isSeed, final boolean isFlush)
1275     throws URIException {
1276         CandidateURI caUri = CandidateURI.fromString(str);
1277         caUri.setForceFetch(forceFetch);
1278         if (isSeed) {
1279             caUri.setIsSeed(isSeed);
1280             if (caUri.getVia() == null || caUri.getVia().length() <= 0) {
1281                 // Danger of double-add of seeds because of this code here.
1282                 // Only call addSeed if no via.  If a via, the schedule will
1283                 // take care of updating scope.
1284                 this.controller.getScope().addSeed(caUri);
1285             }
1286         }
1287         this.controller.getFrontier().schedule(caUri);
1288         if (isFlush) {
1289             flush();
1290         }
1291     }
1292     
1293     
1294     /***
1295      * @return Our mbean info (Needed for CrawlJob to qualify as a
1296      * DynamicMBean).
1297      */
1298     public MBeanInfo getMBeanInfo() {
1299         return this.openMBeanInfo;
1300     }
1301     
1302     /***
1303      * Build up the MBean info for Heritrix main.
1304      * @return Return created mbean info instance.
1305      * @throws InitializationException 
1306      */
1307     protected OpenMBeanInfoSupport buildMBeanInfo()
1308     throws InitializationException {
1309         // Start adding my attributes.
1310         List<OpenMBeanAttributeInfo> attributes
1311          = new ArrayList<OpenMBeanAttributeInfo>();
1312 
1313         // Attributes.
1314         attributes.add(new OpenMBeanAttributeInfoSupport(NAME_ATTR,
1315             "Crawl job name", SimpleType.STRING, true, false, false));
1316         attributes.add(new OpenMBeanAttributeInfoSupport(STATUS_ATTR,
1317             "Short basic status message", SimpleType.STRING, true, false,
1318             false));
1319         attributes.add(
1320                 new OpenMBeanAttributeInfoSupport(FRONTIER_SHORT_REPORT_ATTR,
1321                 "Short frontier report", SimpleType.STRING, true,
1322                 false, false));
1323         attributes.add(
1324                 new OpenMBeanAttributeInfoSupport(THREADS_SHORT_REPORT_ATTR,
1325                 "Short threads report", SimpleType.STRING, true,
1326                 false, false));
1327         attributes.add(new OpenMBeanAttributeInfoSupport(UID_ATTR,
1328             "Crawl job UID", SimpleType.STRING, true, false, false));  
1329         attributes.add(new OpenMBeanAttributeInfoSupport(TOTAL_DATA_ATTR,
1330             "Total data received", SimpleType.LONG, true, false, false));
1331         attributes.add(new OpenMBeanAttributeInfoSupport(CRAWL_TIME_ATTR,
1332             "Crawl time", SimpleType.LONG, true, false, false));
1333         attributes.add(new OpenMBeanAttributeInfoSupport(CURRENT_DOC_RATE_ATTR,
1334             "Current crawling rate (Docs/sec)", SimpleType.DOUBLE,
1335             true, false, false));
1336         attributes.add(new OpenMBeanAttributeInfoSupport(CURRENT_KB_RATE_ATTR,
1337             "Current crawling rate (Kb/sec)", SimpleType.LONG,
1338             true, false, false));
1339         attributes.add(new OpenMBeanAttributeInfoSupport(THREAD_COUNT_ATTR,
1340             "Active thread count", SimpleType.INTEGER, true, false, false));
1341         attributes.add(new OpenMBeanAttributeInfoSupport(DOC_RATE_ATTR,
1342             "Crawling rate (Docs/sec)", SimpleType.DOUBLE,
1343             true, false, false));
1344         attributes.add(new OpenMBeanAttributeInfoSupport(KB_RATE_ATTR,
1345             "Current crawling rate (Kb/sec)", SimpleType.LONG,
1346             true, false, false));
1347         attributes.add(new OpenMBeanAttributeInfoSupport(DOWNLOAD_COUNT_ATTR,
1348             "Count of downloaded documents", SimpleType.LONG,
1349             true, false, false));
1350         attributes.add(new OpenMBeanAttributeInfoSupport(DISCOVERED_COUNT_ATTR,
1351             "Count of discovered documents", SimpleType.LONG,
1352             true, false, false));
1353         
1354         // Add in the crawl order attributes.
1355         addCrawlOrderAttributes(this.getController().getOrder(), attributes);
1356         
1357         // Add the bdbje attributes.  Convert to open mbean attributes.
1358         // First do bdbeje setup.  Then add a subset of the bdbje attributes.
1359         // Keep around the list of names as a convenience for when it comes
1360         // time to test if attribute is supported.
1361         Environment env = this.controller.getBdbEnvironment();
1362         try {
1363             this.bdbjeMBeanHelper =
1364                 new JEMBeanHelper(env.getConfig(), env.getHome(), true);
1365         } catch (DatabaseException e) {
1366             e.printStackTrace();
1367             InitializationException ie =
1368                 new InitializationException(e.getMessage());
1369             ie.setStackTrace(e.getStackTrace());
1370             throw ie;
1371         }
1372         this.bdbjeAttributeNameList = Arrays.asList(new String [] {
1373                 JEMBeanHelper.ATT_ENV_HOME,
1374                 JEMBeanHelper.ATT_OPEN,
1375                 JEMBeanHelper.ATT_IS_READ_ONLY,
1376                 JEMBeanHelper.ATT_IS_TRANSACTIONAL,
1377                 JEMBeanHelper.ATT_CACHE_SIZE,
1378                 JEMBeanHelper.ATT_CACHE_PERCENT,
1379                 JEMBeanHelper.ATT_LOCK_TIMEOUT,
1380                 JEMBeanHelper.ATT_IS_SERIALIZABLE,
1381                 JEMBeanHelper.ATT_SET_READ_ONLY,
1382         });
1383         addBdbjeAttributes(attributes,
1384                 this.bdbjeMBeanHelper.getAttributeList(env),
1385                 this.bdbjeAttributeNameList);
1386 
1387         // Operations.
1388         List<OpenMBeanOperationInfo> operations
1389          = new ArrayList<OpenMBeanOperationInfo>();
1390         OpenMBeanParameterInfo[] args = new OpenMBeanParameterInfoSupport[3];
1391         args[0] = new OpenMBeanParameterInfoSupport("url",
1392             "URL to add to the frontier", SimpleType.STRING);
1393         args[1] = new OpenMBeanParameterInfoSupport("forceFetch",
1394             "True if URL is to be force fetched", SimpleType.BOOLEAN);
1395         args[2] = new OpenMBeanParameterInfoSupport("seed",
1396             "True if URL is a seed", SimpleType.BOOLEAN);
1397         operations.add(new OpenMBeanOperationInfoSupport(IMPORT_URI_OPER,
1398             "Add passed URL to the frontier", args, SimpleType.VOID,
1399                 MBeanOperationInfo.ACTION));
1400         
1401         args = new OpenMBeanParameterInfoSupport[4];
1402         args[0] = new OpenMBeanParameterInfoSupport("pathOrUrl",
1403             "Path or URL to file of URLs", SimpleType.STRING);
1404         args[1] = new OpenMBeanParameterInfoSupport("style",
1405             "Format format:default|crawlLog|recoveryJournal",
1406             SimpleType.STRING);
1407         args[2] = new OpenMBeanParameterInfoSupport("forceFetch",
1408             "True if URLs are to be force fetched", SimpleType.BOOLEAN);
1409         args[3] = new OpenMBeanParameterInfoSupport("seed",
1410             "True if all content are seeds.", SimpleType.BOOLEAN);
1411         operations.add(new OpenMBeanOperationInfoSupport(IMPORT_URIS_OPER,
1412             "Add file of passed URLs to the frontier", args, SimpleType.STRING,
1413                 MBeanOperationInfo.ACTION));
1414         
1415         
1416         args = new OpenMBeanParameterInfoSupport[4];
1417         args[0] = new OpenMBeanParameterInfoSupport("filename",
1418                 "File to print to", SimpleType.STRING);
1419         args[1] = new OpenMBeanParameterInfoSupport("regexp",
1420                 "Regular expression URLs must match", SimpleType.STRING);
1421         args[2] = new OpenMBeanParameterInfoSupport("numberOfMatches",
1422                 "Maximum number of matches to return", SimpleType.INTEGER);
1423         args[3] = new OpenMBeanParameterInfoSupport("verbose",
1424                 "Should they be verbose descriptions", SimpleType.BOOLEAN);
1425         operations.add(new OpenMBeanOperationInfoSupport(DUMP_URIS_OPER,
1426                 "Dump pending URIs from frontier to a file", args,
1427                 SimpleType.VOID, MBeanOperationInfo.ACTION));
1428         
1429         operations.add(new OpenMBeanOperationInfoSupport(PAUSE_OPER,
1430             "Pause crawling (noop if already paused)", null, SimpleType.VOID,
1431             MBeanOperationInfo.ACTION));
1432         
1433         operations.add(new OpenMBeanOperationInfoSupport(RESUME_OPER,
1434             "Resume crawling (noop if already resumed)", null,
1435             SimpleType.VOID, MBeanOperationInfo.ACTION));
1436         
1437         args = new OpenMBeanParameterInfoSupport[1];
1438         args[0] = new OpenMBeanParameterInfoSupport("name",
1439             "Name of report ('all', 'standard', etc.).", SimpleType.STRING);
1440         operations.add(new OpenMBeanOperationInfoSupport(FRONTIER_REPORT_OPER,
1441              "Full frontier report", args, SimpleType.STRING,
1442              MBeanOperationInfo.INFO));
1443         
1444         operations.add(new OpenMBeanOperationInfoSupport(THREADS_REPORT_OPER,
1445              "Full thread report", null, SimpleType.STRING,
1446              MBeanOperationInfo.INFO));
1447         
1448         operations.add(new OpenMBeanOperationInfoSupport(SEEDS_REPORT_OPER,
1449              "Seeds report", null, SimpleType.STRING, MBeanOperationInfo.INFO));  
1450  
1451         operations.add(
1452                 new OpenMBeanOperationInfoSupport(PROGRESS_STATISTICS_OPER,
1453                 "Progress statistics at time of invocation", null,
1454                 SimpleType.STRING, MBeanOperationInfo.INFO)); 
1455         
1456         operations.add(new OpenMBeanOperationInfoSupport(
1457             PROGRESS_STATISTICS_LEGEND_OPER,
1458                 "Progress statistics legend", null,
1459                 SimpleType.STRING, MBeanOperationInfo.INFO));  
1460         
1461         operations.add(new OpenMBeanOperationInfoSupport(CHECKPOINT_OPER,
1462                 "Start a checkpoint", null, SimpleType.VOID,
1463                 MBeanOperationInfo.ACTION));
1464                 
1465         // Add bdbje operations. Add subset only. Keep around the list so have
1466         // it to hand when figuring what operations are supported. Usual actual
1467         // Strings because not accessible from JEMBeanHelper.
1468         this.bdbjeOperationsNameList = Arrays.asList(new String[] { "cleanLog",
1469                 "evictMemory", "checkpoint", "sync",
1470                 "getEnvironmentStatsToString", "getLockStatsToString",
1471                 "getDatabaseNames", OP_DB_STAT
1472         });
1473         addBdbjeOperations(operations,
1474                 this.bdbjeMBeanHelper.getOperationList(env),
1475                 this.bdbjeOperationsNameList);
1476         
1477         // Register notifications
1478         List<MBeanNotificationInfo> notifications
1479          = new ArrayList<MBeanNotificationInfo>();
1480         notifications.add(
1481             new MBeanNotificationInfo(new String [] {"crawlStarted",
1482                     "crawlEnding", "crawlPaused", "crawlResuming", PROG_STATS},
1483                 this.getClass().getName() + ".notifications",
1484                 "CrawlStatusListener events and progress statistics as " +
1485                     "notifications"));
1486         MBeanNotificationInfo [] notificationsArray =
1487             new MBeanNotificationInfo[notifications.size()];
1488         notifications.toArray(notificationsArray);
1489         
1490         // Build the info object.
1491         OpenMBeanAttributeInfoSupport[] attributesArray =
1492             new OpenMBeanAttributeInfoSupport[attributes.size()];
1493         attributes.toArray(attributesArray);
1494         OpenMBeanOperationInfoSupport[] operationsArray =
1495             new OpenMBeanOperationInfoSupport[operations.size()];
1496         operations.toArray(operationsArray);
1497         return new OpenMBeanInfoSupport(this.getClass().getName(),
1498             "Current Crawl Job as OpenMBean",
1499             attributesArray,
1500             new OpenMBeanConstructorInfoSupport [] {},
1501             operationsArray,
1502             notificationsArray);
1503     }
1504     
1505     protected void addBdbjeAttributes(
1506             final List<OpenMBeanAttributeInfo> attributes,
1507             final List<MBeanAttributeInfo> bdbjeAttributes, 
1508             final List<String> bdbjeNamesToAdd) {
1509         for (MBeanAttributeInfo info: bdbjeAttributes) {
1510             if (bdbjeNamesToAdd.contains(info.getName())) {
1511                 attributes.add(JmxUtils.convertToOpenMBeanAttribute(info));
1512             }
1513         }   
1514     }
1515     
    /**
     * Copies the subset of bdbje operation metadata whose names appear in
     * the passed allow-list into the OpenMBean operation list.
     * <p>
     * The {@code OP_DB_STAT} operation gets special treatment: its published
     * signature lacks a database-name argument and its native return type is
     * awkward over JMX, so the conversion coerces the return type to STRING
     * and appends a "name" (database name) parameter to the signature.
     *
     * @param operations List to which converted operation infos are added.
     * @param bdbjeOperations All operations published by the bdbje helper.
     * @param bdbjeNamesToAdd Names of the operations to expose.
     */
    protected void addBdbjeOperations(
            final List<OpenMBeanOperationInfo> operations,
            final List<MBeanOperationInfo> bdbjeOperations, 
            final List<String> bdbjeNamesToAdd) {
        for (MBeanOperationInfo info: bdbjeOperations) {
            if (bdbjeNamesToAdd.contains(info.getName())) {
                OpenMBeanOperationInfo omboi = null;
                if (info.getName().equals(OP_DB_STAT)) {
                    // Db stats needs special handling. The published
                    // signature is wrong and its return type is awkward.
                    // Handle it.
                    omboi = JmxUtils.convertToOpenMBeanOperation(info, null,
                        SimpleType.STRING);
                    // Copy the converted signature, then widen it by one slot
                    // for the extra database-name parameter.
                    MBeanParameterInfo[] params = omboi.getSignature();
                    OpenMBeanParameterInfo[] args =
                        new OpenMBeanParameterInfoSupport[params.length + 1];
                    for (int ii = 0; ii < params.length; ii++) {
                        args[ii] = (OpenMBeanParameterInfo) params[ii];
                    }
                    args[params.length] = new OpenMBeanParameterInfoSupport(
                            "name", "Database name", SimpleType.STRING);
                    // Rebuild the operation info around the amended signature.
                    omboi = new OpenMBeanOperationInfoSupport(omboi.getName(),
                        omboi.getDescription(), args, omboi.getReturnOpenType(),
                        omboi.getImpact());
                } else {
                    omboi = JmxUtils.convertToOpenMBeanOperation(info);
                }
                operations.add(omboi);
            }
        }
    }
1547     
    /**
     * Recursively adds OpenMBean attribute metadata for each setting of the
     * passed crawl-order (sub)type, skipping names listed in ORDER_EXCLUDE.
     * Attributes are named by their absolute settings path so they can be
     * addressed unambiguously (e.g. "/crawl-order/...").
     *
     * @param type ComplexType whose settings are to be exposed; complex
     * children are descended into recursively.
     * @param attributes List to which generated attribute infos are added.
     */
    protected void addCrawlOrderAttributes(final ComplexType type,
            final List<OpenMBeanAttributeInfo> attributes) {
        for (final Iterator i = type.getAttributeInfoIterator(null);
                i.hasNext();) {
            ModuleAttributeInfo info = (ModuleAttributeInfo)i.next();
            if (ORDER_EXCLUDE.contains(info.getName())) {
                // Skip.
                continue;
            }
            String absoluteName = type.getAbsoluteName() + "/" + info.getName();
            if (JmxUtils.isOpenType(info.getType())) {
                String description = info.getDescription();
                if (description == null || description.length() <= 0) {
                    // Description can't be empty.
                    description = info.getName();
                }
                // Simple setting: expose directly as a read/write attribute.
                attributes.add(new OpenMBeanAttributeInfoSupport(
                    absoluteName, description,
                    JmxUtils.getOpenType(info.getType()), true, true, false));
            } else if(info.isComplexType()) {
                try {
                    ComplexType c =
                        (ComplexType)type.getAttribute(info.getName());
                    addCrawlOrderAttributes(c, attributes);
                } catch (AttributeNotFoundException e) {
                    logger.log(Level.SEVERE, "Failed get of attribute", e);
                } catch (MBeanException e) {
                    logger.log(Level.SEVERE, "Failed get of attribute", e);
                } catch (ReflectionException e) {
                    logger.log(Level.SEVERE, "Failed get of attribute", e);
                }
            } else if (info.getType().equals(TextField.class.getName())) {
                // Special handling for TextField.  Use the STRING OpenType.
                attributes.add(new OpenMBeanAttributeInfoSupport(
                        absoluteName, info.getDescription(),
                        SimpleType.STRING, true, true, false));
            } else {
                // Looks like only type we don't currently handle is StringList.
                // Figure how to do it.  Add as AttributeList?
                logger.fine(info.getType());
            }
        }
    }
1591     
1592     public Object getAttribute(String attribute_name)
1593     throws AttributeNotFoundException {
1594         if (attribute_name == null) {
1595             throw new RuntimeOperationsException(
1596                  new IllegalArgumentException("Attribute name cannot be null"),
1597                  "Cannot call getAttribute with null attribute name");
1598         }
1599         
1600         // If no controller, we can't do any work in here.
1601         if (this.controller == null) {
1602             throw new RuntimeOperationsException(
1603                  new NullPointerException("Controller is null"),
1604                  "Controller is null");
1605         }
1606         
1607         // Is it a bdbje attribute?
1608         if (this.bdbjeAttributeNameList.contains(attribute_name)) {
1609             try {
1610                 return this.bdbjeMBeanHelper.getAttribute(
1611                         this.controller.getBdbEnvironment(), attribute_name);
1612             } catch (MBeanException e) {
1613                 throw new RuntimeOperationsException(new RuntimeException(e));
1614             }
1615         }
1616         
1617         // Is it a crawl-order attribute?
1618         if (attribute_name.
1619                 startsWith(this.controller.getOrder().getAbsoluteName())) {
1620             return getCrawlOrderAttribute(attribute_name);
1621         }
1622         
1623         if (!ATTRIBUTE_LIST.contains(attribute_name)) {
1624             throw new AttributeNotFoundException("Attribute " +
1625                     attribute_name + " is unimplemented.");
1626         }
1627 
1628         // The pattern in the below is to match an attribute and when found
1629         // do a return out of if clause.  Doing it this way, I can fall
1630         // on to the AttributeNotFoundException for case where we've an
1631         // attribute but no handler.
1632         if (attribute_name.equals(STATUS_ATTR)) {
1633             return getCrawlStatus();
1634         }
1635         if (attribute_name.equals(NAME_ATTR)) {
1636             return getJobName();
1637         }
1638         if (attribute_name.equals(UID_ATTR)) {
1639             return getUID();
1640         }
1641         if (attribute_name.equals(TOTAL_DATA_ATTR)) {
1642             return new Long(this.controller == null &&
1643                     this.controller.getStatistics() != null? 0:
1644                 this.controller.getStatistics().totalBytesCrawled());
1645         }
1646         if (attribute_name.equals(CRAWL_TIME_ATTR)) {
1647             return new Long(this.controller == null &&
1648                     this.controller.getStatistics() != null? 0:
1649                 this.controller.getStatistics().getCrawlerTotalElapsedTime() /
1650                     1000);
1651         }
1652         if (attribute_name.equals(CURRENT_DOC_RATE_ATTR)) {
1653             return new Double(this.controller == null &&
1654                     this.controller.getStatistics() != null? 0:
1655                 this.controller.getStatistics().currentProcessedDocsPerSec());
1656         }
1657         if (attribute_name.equals(DOC_RATE_ATTR)) {
1658             return new Double(this.controller == null &&
1659                     this.controller.getStatistics() != null? 0:
1660                 this.controller.getStatistics().processedDocsPerSec());
1661         }
1662         if (attribute_name.equals(KB_RATE_ATTR)) {
1663             return new Long(this.controller == null &&
1664                     this.controller.getStatistics() != null? 0:
1665                 this.controller.getStatistics().currentProcessedKBPerSec());
1666         }
1667         if (attribute_name.equals(CURRENT_KB_RATE_ATTR)) {
1668             return new Long(this.controller == null &&
1669                     this.controller.getStatistics() != null? 0:
1670                 this.controller.getStatistics().processedKBPerSec());
1671         }
1672         if (attribute_name.equals(THREAD_COUNT_ATTR)) {
1673             return new Integer(this.controller == null &&
1674                     this.controller.getStatistics() != null? 0:
1675                 this.controller.getStatistics().activeThreadCount());
1676         }       
1677         if (attribute_name.equals(FRONTIER_SHORT_REPORT_ATTR)) {
1678             return getFrontierOneLine();
1679         }
1680         if (attribute_name.equals(THREADS_SHORT_REPORT_ATTR)) {
1681             return getThreadOneLine();
1682         }
1683         if (attribute_name.equals(DISCOVERED_COUNT_ATTR)) {
1684             return new Long(this.controller == null &&
1685                     this.controller.getStatistics() != null? 0:
1686                 this.controller.getStatistics().totalCount());
1687         }
1688         if (attribute_name.equals(DOWNLOAD_COUNT_ATTR)) {
1689             return new Long(this.controller == null &&
1690                     this.controller.getStatistics() != null? 0:
1691                 this.controller.getStatistics().successfullyFetchedCount());
1692         }
1693         
1694         throw new AttributeNotFoundException("Attribute " +
1695             attribute_name + " not found.");
1696     }
1697     
1698     protected Object getCrawlOrderAttribute(final String attribute_name) {
1699         CrawlOrder order = this.getController().getOrder();
1700         Object result = null;
1701         try {
1702             result = getCrawlOrderAttribute(attribute_name.substring(order
1703                     .getAbsoluteName().length()), order);
1704         } catch (NullPointerException e) {
1705             logger.log(Level.SEVERE, "Failed get of " + attribute_name, e);
1706         } catch (AttributeNotFoundException e) {
1707             logger.log(Level.SEVERE, "Failed get of " + attribute_name, e);
1708         } catch (MBeanException e) {
1709             logger.log(Level.SEVERE, "Failed get of " + attribute_name, e);
1710         } catch (ReflectionException e) {
1711             logger.log(Level.SEVERE, "Failed get of " + attribute_name, e);
1712         }
1713         return result;
1714     }
1715 
1716     protected Object getCrawlOrderAttribute(final String attribute_name,
1717             final ComplexType ct)
1718     throws AttributeNotFoundException, MBeanException, ReflectionException {
1719         String subName = attribute_name.startsWith("/") ? attribute_name
1720                 .substring(1) : attribute_name;
1721         int index = subName.indexOf("/");
1722         if (index <= 0) {
1723             MBeanAttributeInfo info = ct.getAttributeInfo(subName);
1724             // Special handling for TextField.
1725             return info.getType().equals(TextField.class.getName()) ? ct
1726                     .getAttribute(subName).toString() : ct
1727                     .getAttribute(subName);
1728         }
1729         return getCrawlOrderAttribute(subName.substring(index + 1),
1730                 (ComplexType) ct.getAttribute(subName.substring(0, index)));
1731     }
1732     
1733     public AttributeList getAttributes(String [] attributeNames) {
1734         if (attributeNames == null) {
1735             throw new RuntimeOperationsException(
1736                 new IllegalArgumentException("attributeNames[] cannot be " +
1737                 "null"), "Cannot call getAttributes with null attribute " +
1738                 "names");
1739         }
1740         
1741         // If no controller, we can't do any work in here.
1742         if (this.controller == null) {
1743             throw new RuntimeOperationsException(
1744                  new NullPointerException("Controller is null"),
1745                  "Controller is null");
1746         }
1747         
1748         AttributeList resultList = new AttributeList();
1749         if (attributeNames.length == 0) {
1750             return resultList;
1751         }
1752         for (int i = 0; i < attributeNames.length; i++) {
1753             try {
1754                 Object value = getAttribute(attributeNames[i]);
1755                 resultList.add(new Attribute(attributeNames[i], value));
1756             } catch (Exception e) {
1757                 e.printStackTrace();
1758             }
1759         }
1760         return(resultList);
1761     }
1762 
    /**
     * DynamicMBean single-attribute setter.  Delegates to
     * {@link #setAttributeInternal(Attribute)} and then kicks the
     * controller so settings-sensitive components pick up the change.
     *
     * @param attribute Attribute to set.
     * @throws AttributeNotFoundException if the attribute is not settable.
     */
    public void setAttribute(Attribute attribute)
            throws AttributeNotFoundException {
        setAttributeInternal(attribute);
        // prompt updating of settings-sensitive components
        kickUpdate();
    }
1769 
1770 	protected void setAttributeInternal(Attribute attribute)
1771 			throws AttributeNotFoundException {
1772 		// Is it a crawl order attribute?
1773         CrawlOrder order = this.getController().getOrder();
1774         String attName = attribute.getName();
1775         if (attName.startsWith(order.getAbsoluteName())) {
1776             try {
1777                 setCrawlOrderAttribute(attribute.getName().substring(
1778                         order.getAbsoluteName().length()), order, attribute);
1779             } catch (NullPointerException e) {
1780                 logger.log(Level.SEVERE, "Failed set of " + attName, e);
1781             } catch (AttributeNotFoundException e) {
1782                 logger.log(Level.SEVERE, "Failed set of " + attName, e);
1783             } catch (MBeanException e) {
1784                 logger.log(Level.SEVERE, "Failed set of " + attName, e);
1785             } catch (ReflectionException e) {
1786                 logger.log(Level.SEVERE, "Failed set of " + attName, e);
1787             } catch (InvalidAttributeValueException e) {
1788                 logger.log(Level.SEVERE, "Failed set of " + attName, e);
1789             }
1790             return;
1791         }
1792         
1793         // Is it a bdbje attribute?
1794         if (this.bdbjeAttributeNameList.contains(attName)) {
1795             try {
1796                 this.bdbjeMBeanHelper.setAttribute(this.controller
1797                         .getBdbEnvironment(), attribute);
1798             } catch (AttributeNotFoundException e) {
1799                 throw new RuntimeOperationsException(new RuntimeException(e));
1800             } catch (InvalidAttributeValueException e) {
1801                 throw new RuntimeOperationsException(new RuntimeException(e));
1802             }
1803             return;
1804         }
1805         
1806         // Else, we don't know how to handle this attribute.
1807         throw new AttributeNotFoundException("Attribute " + attName +
1808             " can not be set.");
1809 	}
1810     
1811     protected void setCrawlOrderAttribute(final String attribute_name,
1812             final ComplexType ct, final Attribute attribute)
1813     throws AttributeNotFoundException, InvalidAttributeValueException,
1814             MBeanException, ReflectionException {
1815         String subName = attribute_name.startsWith("/") ? attribute_name
1816                 .substring(1) : attribute_name;
1817         int index = subName.indexOf("/");
1818         if (index <= 0) {
1819             ct.setAttribute(new Attribute(subName, attribute.getValue()));
1820             return;
1821         }
1822         setCrawlOrderAttribute(subName.substring(index + 1), (ComplexType) ct
1823                 .getAttribute(subName.substring(0, index)), attribute);
1824     }
1825 
1826     public AttributeList setAttributes(AttributeList attributes) {
1827         if (attributes == null) {
1828             throw new RuntimeOperationsException(
1829                 new IllegalArgumentException("attributeNames[] cannot be " +
1830                 "null"), "Cannot call getAttributes with null attribute " +
1831                 "names");
1832         }
1833         
1834         AttributeList resultList = new AttributeList();
1835         if (attributes.size() == 0) {
1836             return resultList;
1837         }
1838         for (int i = 0; i < attributes.size(); i++) {
1839             try {
1840                 Attribute attr = (Attribute)attributes.get(i);
1841                 setAttributeInternal(attr);
1842                 String an = attr.getName();
1843                 Object newValue = getAttribute(an);
1844                 resultList.add(new Attribute(an, newValue));
1845             } catch (Exception e) {
1846                 e.printStackTrace();
1847             }
1848         }
1849         // prompt updating of settings-sensitive components
1850         kickUpdate();
1851         return resultList;
1852     }
1853 
1854     public Object invoke(String operationName, Object[] params,
1855         String[] signature)
1856     throws ReflectionException {
1857         if (operationName == null) {
1858             throw new RuntimeOperationsException(
1859                 new IllegalArgumentException("Operation name cannot be null"),
1860                 "Cannot call invoke with null operation name");
1861         }
1862         
1863         controller.installThreadContextSettingsHandler();
1864         
1865         if (this.bdbjeOperationsNameList.contains(operationName)) {
1866             try {
1867                 Object o = this.bdbjeMBeanHelper.invoke(
1868                         this.controller.getBdbEnvironment(),
1869                         operationName, params, signature);
1870                 // If OP_DB_ST, return String version of result.
1871                 if (operationName.equals(OP_DB_STAT)) {
1872                     return o.toString();
1873                 }
1874                 return o;
1875             } catch (MBeanException e) {
1876                 throw new RuntimeOperationsException(new RuntimeException(e));
1877             }
1878         }
1879         
1880         // TODO: Exploit passed signature.
1881         
1882         // The pattern in the below is to match an operation and when found
1883         // do a return out of if clause.  Doing it this way, I can fall
1884         // on to the MethodNotFoundException for case where we've an
1885         // attribute but no handler.
1886         if (operationName.equals(IMPORT_URI_OPER)) {
1887             JmxUtils.checkParamsCount(IMPORT_URI_OPER, params, 3);
1888             mustBeCrawling();
1889             try {
1890                 importUri((String)params[0],
1891                     ((Boolean)params[1]).booleanValue(),
1892                     ((Boolean)params[2]).booleanValue());
1893             } catch (URIException e) {
1894                 throw new RuntimeOperationsException(new RuntimeException(e));
1895             }
1896             return null;
1897         }
1898         
1899         if (operationName.equals(IMPORT_URIS_OPER)) {
1900             JmxUtils.checkParamsCount(IMPORT_URIS_OPER, params, 4);
1901             mustBeCrawling();
1902             return importUris((String)params[0],
1903                 ((String)params[1]).toString(),
1904                 ((Boolean)params[2]).booleanValue(),
1905                 ((Boolean)params[3]).booleanValue());
1906         }
1907         
1908         if (operationName.equals(DUMP_URIS_OPER)) {
1909             JmxUtils.checkParamsCount(DUMP_URIS_OPER, params, 4);
1910             mustBeCrawling();
1911             if (!this.controller.isPaused()) {
1912                 throw new RuntimeOperationsException(
1913                         new IllegalArgumentException("Must " + "be paused"),
1914                         "Cannot dump URI's from running job.");
1915             }
1916             dumpUris((String) params[0], (String) params[1],
1917                     ((Integer) params[2]).intValue(), ((Boolean) params[3])
1918                             .booleanValue());
1919         }
1920         
1921         if (operationName.equals(PAUSE_OPER)) {
1922             JmxUtils.checkParamsCount(PAUSE_OPER, params, 0);
1923             mustBeCrawling();
1924             pause();
1925             return null;
1926         }
1927         
1928         if (operationName.equals(RESUME_OPER)) {
1929             JmxUtils.checkParamsCount(RESUME_OPER, params, 0);
1930             mustBeCrawling();
1931             resume();
1932             return null;
1933         }
1934         
1935         if (operationName.equals(FRONTIER_REPORT_OPER)) {
1936             JmxUtils.checkParamsCount(FRONTIER_REPORT_OPER, params, 1);
1937             mustBeCrawling();
1938             return getFrontierReport((String)params[0]);
1939         }
1940         
1941         if (operationName.equals(THREADS_REPORT_OPER)) {
1942             JmxUtils.checkParamsCount(THREADS_REPORT_OPER, params, 0);
1943             mustBeCrawling();
1944             return getThreadsReport();
1945         }
1946         
1947         if (operationName.equals(SEEDS_REPORT_OPER)) {
1948             JmxUtils.checkParamsCount(SEEDS_REPORT_OPER, params, 0);
1949             mustBeCrawling();
1950             StringWriter sw = new StringWriter();
1951             if (getStatisticsTracking() != null &&
1952                     getStatisticsTracking() instanceof StatisticsTracker) {
1953                 ((StatisticsTracker)getStatisticsTracking()).
1954                     writeSeedsReportTo(new PrintWriter(sw));
1955             } else {
1956                 sw.write("Unsupported");
1957             }
1958             return sw.toString();
1959         }       
1960         
1961         if (operationName.equals(CHECKPOINT_OPER)) {
1962             JmxUtils.checkParamsCount(CHECKPOINT_OPER, params, 0);
1963             mustBeCrawling();
1964             try {
1965                 checkpoint();
1966             } catch (IllegalStateException e) {
1967                 throw new RuntimeOperationsException(e);
1968             }
1969             return null;
1970         }
1971         
1972         if (operationName.equals(PROGRESS_STATISTICS_OPER)) {
1973             JmxUtils.checkParamsCount(PROGRESS_STATISTICS_OPER, params, 0);
1974             mustBeCrawling();
1975             return getStatisticsTracking().getProgressStatisticsLine();
1976         }
1977         
1978         if (operationName.equals(PROGRESS_STATISTICS_LEGEND_OPER)) {
1979             JmxUtils.checkParamsCount(PROGRESS_STATISTICS_LEGEND_OPER,
1980                     params, 0);
1981             return getStatisticsTracking().progressStatisticsLegend();
1982         }
1983         
1984         throw new ReflectionException(
1985             new NoSuchMethodException(operationName),
1986                 "Cannot find the operation " + operationName);
1987     }
1988     
    /**
     * Asserts this job is the one currently crawling (i.e. has a live
     * controller attached).
     *
     * @throws RuntimeOperationsException if not currently crawling.
     */
    public void mustBeCrawling() {
        if (!isCrawling()) {
            throw new RuntimeOperationsException(
                new IllegalArgumentException("Not " +
                "crawling (Shouldn't ever be the case)"),
                "Not current crawling job?");
        }
    }
1997     
    /**
     * @return True if this job has a live controller attached, i.e. it is
     * the currently-running crawl.
     */
    public boolean isCrawling() {
        return this.controller != null;
    }
2001     
2002     /***
2003      * Utility method to get the stored list of ignored seed items (if any),
2004      * from the last time the seeds were imported to the frontier.
2005      * 
2006      * @return String of all ignored seed items, or null if none
2007      */
2008     public String getIgnoredSeeds() {
2009         File ignoredFile = new File(getDirectory(),
2010                 AbstractFrontier.IGNORED_SEEDS_FILENAME);
2011         if(!ignoredFile.exists()) {
2012             return null;
2013         }
2014         try {
2015             return FileUtils.readFileAsString(ignoredFile);
2016         } catch (IOException e) {
2017             // TODO Auto-generated catch block
2018             e.printStackTrace();
2019             return null;
2020         }
2021     }
2022     
2023     /***
2024      * Forward a 'kick' update to current controller if any.
2025      * @see CrawlController#kickUpdate()
2026      */
2027     public void kickUpdate(){
2028         if (this.controller != null){
2029             this.controller.kickUpdate();
2030         }
2031     }
2032     
2033     /***
2034      * Returns a URIFrontierMarker for the current, paused, job. If there is no
2035      * current job or it is not paused null will be returned.
2036      *
2037      * @param regexpr A regular expression that each URI must match in order to
2038      * be considered 'within' the marker.
2039      * @param inCacheOnly Limit marker scope to 'cached' URIs.
2040      * @return a URIFrontierMarker for the current job.
2041      * @see #getPendingURIsList(FrontierMarker, int, boolean)
2042      * @see org.archive.crawler.framework.Frontier#getInitialMarker(String,
2043      *      boolean)
2044      * @see org.archive.crawler.framework.FrontierMarker
2045      */
2046     public FrontierMarker getInitialMarker(String regexpr,
2047             boolean inCacheOnly) {
2048         return (this.controller != null && this.controller.isPaused())?
2049            this.controller.getFrontier().getInitialMarker(regexpr, inCacheOnly):
2050                null;
2051     }
2052     
2053     /***
2054      * Returns the frontiers URI list based on the provided marker. This method
2055      * will return null if there is not current job or if the current job is
2056      * not paused. Only when there is a paused current job will this method
2057      * return a URI list.
2058      *
2059      * @param marker URIFrontier marker
2060      * @param numberOfMatches Maximum number of matches to return
2061      * @param verbose Should detailed info be provided on each URI?
2062      * @return the frontiers URI list based on the provided marker
2063      * @throws InvalidFrontierMarkerException
2064      *             When marker is inconsistent with the current state of the
2065      *             frontier.
2066      * @see #getInitialMarker(String, boolean)
2067      * @see org.archive.crawler.framework.FrontierMarker
2068      */
2069     public ArrayList<String> getPendingURIsList(FrontierMarker marker,
2070             int numberOfMatches, boolean verbose)
2071     throws InvalidFrontierMarkerException {
2072         return  (this.controller != null && this.controller.isPaused())?
2073             this.controller.getFrontier().getURIsList(marker, numberOfMatches,
2074                     verbose):
2075             null;
2076     }
2077 
2078     public void dumpUris(String filename, String regexp, int numberOfMatches,
2079             boolean verbose) {
2080         try {
2081             PrintWriter out = new PrintWriter(filename); 
2082             FrontierMarker marker = 
2083                 controller.getFrontier().getInitialMarker(regexp, false);
2084             int matchesDumped = 0;
2085             
2086             while(matchesDumped<numberOfMatches) {
2087                 int batchMatches = Math.min(100, numberOfMatches-matchesDumped);
2088                 
2089                 ArrayList<String> batchOfUris = 
2090                     getPendingURIsList(marker,batchMatches,false);
2091                 for(String uriLine : batchOfUris) {
2092                     out.write(uriLine);
2093                     out.write("\n");
2094                     matchesDumped++;
2095                 }
2096                 if (batchOfUris.size()<batchMatches) {
2097                     // must be exhausted; we're finished
2098                     break; 
2099                 }
2100             }
2101             IOUtils.closeQuietly(out); 
2102         } catch (FileNotFoundException e) {
2103             logger.log(Level.SEVERE, "Failed dumpUris write", e);
2104         } catch (InvalidFrontierMarkerException e) {
2105             logger.log(Level.SEVERE, "Failed dumpUris", e);
2106         }
2107     }
2108     
2109     public void crawlStarted(String message) {
2110         if (this.mbeanName != null) {
2111             // Can be null around job startup.
2112             sendNotification(new Notification("crawlStarted",
2113                 this.mbeanName,  getNotificationsSequenceNumber(), message)); 
2114         }
2115     }
2116 
2117     public void crawlEnding(String sExitMessage) {
2118         setRunning(false);
2119         setStatus(sExitMessage);
2120         setReadOnly();
2121         if (this.mbeanName != null) {
2122             sendNotification(new Notification("crawlEnding", this.mbeanName,
2123                 getNotificationsSequenceNumber(), sExitMessage));
2124         }
2125     }
2126 
    /**
     * CrawlStatusListener callback: crawl has ended.  Intentionally a no-op;
     * see comments below for why no cleanup happens here.
     *
     * @param sExitMessage Exit message (unused).
     */
    public void crawlEnded(String sExitMessage) {
        // Let the settings handler be cleaned up by the crawl controller
        // completeStop. Just let go of our reference in here.
        // if (this.settingsHandler != null) {
        //    this.settingsHandler.cleanup();
        // }
        
        // We used to zero-out datamembers but no longer needed now CrawlJobs
        // no longer persist after completion (They used to be kept around in
        // a list so operator could view CrawlJob finish state and reports --
        // but we now dump actual job and create a new uninitialized CrawlJob
        // that points at old CrawlJob data. 
    }
2140 
    /**
     * CrawlStatusListener callback: crawl is pausing.  Only records status;
     * the notification is sent from {@code crawlPaused} once actually paused.
     *
     * @param statusMessage Status message to record.
     */
    public void crawlPausing(String statusMessage) {
        setStatus(statusMessage);
    }
2144 
2145     public void crawlPaused(String statusMessage) {
2146         setStatus(statusMessage);
2147         if (this.mbeanName != null) {
2148             // Can be null around job startup.
2149             sendNotification(new Notification("crawlPaused", this.mbeanName,
2150                 getNotificationsSequenceNumber(), statusMessage));
2151         }
2152     }
2153 
2154     public void crawlResuming(String statusMessage) {
2155         setStatus(statusMessage);
2156         if (this.mbeanName != null) {
2157             // Can be null around job startup.
2158             sendNotification(new Notification("crawlResuming", this.mbeanName,
2159                 getNotificationsSequenceNumber(), statusMessage));
2160         }
2161     }
2162 
    /**
     * Invoked when a checkpoint of the crawl begins.
     * Just flips this job's status to checkpointing.
     * @param checkpointDir Directory the checkpoint is written to
     * (unused by this implementation).
     * @throws Exception Never thrown by this implementation.
     */
    public void crawlCheckpoint(File checkpointDir) throws Exception {
        setStatus(CrawlJob.STATUS_CHECKPOINTING);
    }
2166 
    /**
     * @return The CrawlController running this job (may be null, e.g.
     * before the job is started).
     */
    public CrawlController getController() {
        return this.controller;
    }
2170     
2171     public ObjectName preRegister(final MBeanServer server, ObjectName on)
2172     throws Exception {
2173         this.mbeanServer = server;
2174         @SuppressWarnings("unchecked")
2175         Hashtable<String,String> ht = on.getKeyPropertyList();
2176         if (!ht.containsKey(JmxUtils.NAME)) {
2177             throw new IllegalArgumentException("Name property required" +
2178                 on.getCanonicalName());
2179         }
2180         // Now append key/values from hosting heritrix JMX ObjectName so it can be
2181         // found just by examination of the CrawlJob JMX ObjectName.  Add heritrix
2182         // name attribute as 'mother' attribute.
2183         Heritrix h = getHostingHeritrix();
2184         if (h == null || h.getMBeanName() == null) {
2185             throw new IllegalArgumentException("Hosting heritrix not found " +
2186                 "or not registered with JMX: " + on.getCanonicalName());
2187         }
2188         @SuppressWarnings("unchecked")
2189         Map<String,String> hht = h.getMBeanName().getKeyPropertyList();
2190         ht.put(JmxUtils.MOTHER, hht.get(JmxUtils.NAME));
2191         String port = hht.get(JmxUtils.JMX_PORT);
2192         if (port != null) {
2193         	ht.put(JmxUtils.JMX_PORT, port);
2194         }
2195         ht.put(JmxUtils.HOST, hht.get(JmxUtils.HOST));
2196         if (!ht.containsKey(JmxUtils.TYPE)) {
2197             ht.put(JmxUtils.TYPE, CRAWLJOB_JMXMBEAN_TYPE);
2198         }
2199         this.mbeanName = new ObjectName(on.getDomain(), ht);
2200         return this.mbeanName;
2201     }
2202 
2203     public void postRegister(Boolean registrationDone) {
2204         if (logger.isLoggable(Level.INFO)) {
2205             logger.info(
2206                 JmxUtils.getLogRegistrationMsg(this.mbeanName.getCanonicalName(),
2207                 this.mbeanServer, registrationDone.booleanValue()));
2208         }
2209     }
2210 
    /**
     * Invoked before JMX deregistration.  Intentionally a no-op.
     * @throws Exception Never thrown by this implementation.
     */
    public void preDeregister() throws Exception {
        // Nothing to do.
    }
2214 
2215     public void postDeregister() {
2216         if (mbeanName ==  null) {
2217             return;
2218         }
2219         if (logger.isLoggable(Level.INFO)) {
2220             logger.info(JmxUtils.getLogUnregistrationMsg(
2221                     this.mbeanName.getCanonicalName(), this.mbeanServer));
2222         }
2223         this.mbeanName = null;
2224     }
2225     
2226     /***
2227      * @return Heritrix that is hosting this job.
2228      */
2229     protected Heritrix getHostingHeritrix() {
2230         Heritrix hostingHeritrix = null;
2231         Map heritrice = Heritrix.getInstances();
2232         for (final Iterator i = heritrice.keySet().iterator(); i.hasNext();) {
2233             Heritrix h = (Heritrix)heritrice.get(i.next());
2234             if (h.getJobHandler().getCurrentJob() == this) {
2235                 hostingHeritrix = h;
2236                 break;
2237             }
2238         }
2239         return hostingHeritrix;
2240     }
2241     
2242     /***
2243      * @return Unique name for job that is safe to use in jmx (Like display
2244      * name but without spaces).
2245      */
2246     public String getJmxJobName() {
2247         return getJobName() + "-" + getUID();
2248     }
2249 
2250     /***
2251      * @return Notification sequence number (Does increment after each access).
2252      */
2253     protected static int getNotificationsSequenceNumber() {
2254         return notificationsSequenceNumber++;
2255     }
2256 
    /**
     * @return This job's JMX ObjectName (null when not currently
     * registered with JMX; cleared again on postDeregister).
     */
    protected ObjectName getMbeanName() {
        return this.mbeanName;
    }
2260     
2261     /***
2262      * @return the statistics tracking instance (of null if none yet available).
2263      */
2264     public StatisticsTracking getStatisticsTracking() {
2265         return this.controller == null ||
2266             this.controller.getStatistics() == null? null:
2267                 this.controller.getStatistics();
2268     }
2269 }