1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21 package org.archive.crawler.admin;
22
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Serializable;
import java.io.StringWriter;
35 import java.util.ArrayList;
36 import java.util.Arrays;
37 import java.util.Collection;
38 import java.util.EventObject;
39 import java.util.Hashtable;
40 import java.util.Iterator;
41 import java.util.List;
42 import java.util.Map;
43 import java.util.logging.Level;
44 import java.util.logging.Logger;
45
46 import javax.management.Attribute;
47 import javax.management.AttributeList;
48 import javax.management.AttributeNotFoundException;
49 import javax.management.DynamicMBean;
50 import javax.management.InstanceAlreadyExistsException;
51 import javax.management.InvalidAttributeValueException;
52 import javax.management.MBeanAttributeInfo;
53 import javax.management.MBeanException;
54 import javax.management.MBeanInfo;
55 import javax.management.MBeanNotificationInfo;
56 import javax.management.MBeanOperationInfo;
57 import javax.management.MBeanParameterInfo;
58 import javax.management.MBeanRegistration;
59 import javax.management.MBeanRegistrationException;
60 import javax.management.MBeanServer;
61 import javax.management.NotCompliantMBeanException;
62 import javax.management.Notification;
63 import javax.management.NotificationBroadcasterSupport;
64 import javax.management.ObjectName;
65 import javax.management.ReflectionException;
66 import javax.management.RuntimeOperationsException;
67 import javax.management.openmbean.CompositeData;
68 import javax.management.openmbean.CompositeDataSupport;
69 import javax.management.openmbean.CompositeType;
70 import javax.management.openmbean.OpenDataException;
71 import javax.management.openmbean.OpenMBeanAttributeInfo;
72 import javax.management.openmbean.OpenMBeanAttributeInfoSupport;
73 import javax.management.openmbean.OpenMBeanConstructorInfoSupport;
74 import javax.management.openmbean.OpenMBeanInfoSupport;
75 import javax.management.openmbean.OpenMBeanOperationInfo;
76 import javax.management.openmbean.OpenMBeanOperationInfoSupport;
77 import javax.management.openmbean.OpenMBeanParameterInfo;
78 import javax.management.openmbean.OpenMBeanParameterInfoSupport;
79 import javax.management.openmbean.SimpleType;
80
81 import org.apache.commons.httpclient.URIException;
82 import org.apache.commons.io.IOUtils;
83 import org.archive.crawler.Heritrix;
84 import org.archive.crawler.datamodel.CandidateURI;
85 import org.archive.crawler.datamodel.Checkpoint;
86 import org.archive.crawler.datamodel.CrawlOrder;
87 import org.archive.crawler.event.CrawlStatusListener;
88 import org.archive.crawler.framework.CrawlController;
89 import org.archive.crawler.framework.FrontierMarker;
90 import org.archive.crawler.framework.StatisticsTracking;
91 import org.archive.crawler.framework.exceptions.InitializationException;
92 import org.archive.crawler.framework.exceptions.InvalidFrontierMarkerException;
93 import org.archive.crawler.frontier.AbstractFrontier;
94 import org.archive.crawler.settings.ComplexType;
95 import org.archive.crawler.settings.ModuleAttributeInfo;
96 import org.archive.crawler.settings.TextField;
97 import org.archive.crawler.settings.XMLSettingsHandler;
98 import org.archive.crawler.util.CheckpointUtils;
99 import org.archive.crawler.util.IoUtils;
100 import org.archive.util.ArchiveUtils;
101 import org.archive.util.FileUtils;
102 import org.archive.util.JEMBeanHelper;
103 import org.archive.util.JmxUtils;
104 import org.archive.util.iterator.LineReadingIterator;
105 import org.archive.util.iterator.RegexpLineIterator;
106
107 import com.sleepycat.je.DatabaseException;
108 import com.sleepycat.je.Environment;
109
110 /***
111 * A CrawlJob encapsulates a 'crawl order' with any and all information and
112 * methods needed by a CrawlJobHandler to accept and execute them.
113 *
114 * <p>A given crawl job may also be a 'profile' for a crawl. In that case it
115 * should not be executed as a crawl but can be edited and used as a template
116 * for creating new CrawlJobs.
117 *
 * <p>All of its constructors are protected since only a CrawlJobHandler
 * should construct new CrawlJobs.
120 *
121 * @author Kristinn Sigurdsson
122 *
123 * @see org.archive.crawler.admin.CrawlJobHandler#newJob(CrawlJob, String,
124 * String, String, String, int)
125 * @see org.archive.crawler.admin.CrawlJobHandler#newProfile(CrawlJob,
126 * String, String, String)
127 */
128
129 public class CrawlJob extends NotificationBroadcasterSupport
130 implements DynamicMBean, MBeanRegistration, CrawlStatusListener, Serializable {
/***
 * Eclipse generated serial number.
 */
private static final long serialVersionUID = 3411161000452525856L;

private static final Logger logger =
    Logger.getLogger(CrawlJob.class.getName());

/*
 * Job priority levels, lowest to highest.
 */
/*** lowest */
public static final int PRIORITY_MINIMAL = 0;
/*** low */
public static final int PRIORITY_LOW = 1;
/*** average */
public static final int PRIORITY_AVERAGE = 2;
/*** high */
public static final int PRIORITY_HIGH = 3;
/*** highest */
public static final int PRIORITY_CRITICAL = 4;

/*
 * Job status constants. These strings are persisted verbatim in the
 * job file (see writeJobFile), so changing them breaks reloading of
 * previously saved jobs.
 */
/*** Initial value. May not be ready to run/incomplete. */
public static final String STATUS_CREATED = "Created";
/*** Job has been successfully submitted to a CrawlJobHandler */
public static final String STATUS_PENDING = "Pending";
/*** Job is being crawled */
public static final String STATUS_RUNNING = "Running";
/*** Job was deleted by user, will not be displayed in UI. */
public static final String STATUS_DELETED = "Deleted";
/*** Job was terminated by user input while crawling */
public static final String STATUS_ABORTED = "Finished - Ended by operator";
/*** Something went very wrong */
public static final String STATUS_FINISHED_ABNORMAL =
    "Finished - Abnormal exit from crawling";
/*** Job finished normally having completed its crawl. */
public static final String STATUS_FINISHED = "Finished";
/*** Job finished normally when the specified timelimit was hit. */
public static final String STATUS_FINISHED_TIME_LIMIT =
    "Finished - Timelimit hit";
/*** Job finished normally when the specified amount of
 * data (MB) had been downloaded */
public static final String STATUS_FINISHED_DATA_LIMIT =
    "Finished - Maximum amount of data limit hit";
/*** Job finished normally when the specified number of documents had been
 * fetched.
 */
public static final String STATUS_FINISHED_DOCUMENT_LIMIT =
    "Finished - Maximum number of documents limit hit";
/*** Job is going to be temporarily stopped after active threads are finished. */
public static final String STATUS_WAITING_FOR_PAUSE = "Pausing - " +
    "Waiting for threads to finish";
/*** Job was temporarily stopped. State is kept so it can be resumed */
public static final String STATUS_PAUSED = "Paused";
/***
 * Job is being checkpointed. When finished checkpointing, job is set
 * back to STATUS_PAUSED (Job must be first paused before checkpointing
 * will run).
 */
public static final String STATUS_CHECKPOINTING = "Checkpointing";
/*** Job could not be launched due to an InitializationException */
public static final String STATUS_MISCONFIGURED = "Could not launch job " +
    "- Fatal InitializationException";
/*** Job is actually a profile */
public static final String STATUS_PROFILE = "Profile";

/*** Job is being prepared for launch. */
public static final String STATUS_PREPARING = "Preparing";

// Basic attributes. These are written to / reloaded from the job file.
/*** Unique ID issued by the CrawlJobHandler when the job was created. */
private String UID;
/*** Human-readable job name (from the crawl order's 'meta' section). */
private String name;
/*** One of the STATUS_* constants above. */
private String status;
private boolean isReadOnly = false;
private boolean isNew = true;
private boolean isProfile = false;
private boolean isRunning = false;
/*** One of the PRIORITY_* constants above. */
private int priority;
private int numberOfJournalEntries = 0;

/*** Path of the saved statistics-tracker file; empty string when none. */
private String statisticsFileSave = "";

/*** Error message for misconfigured jobs; null when there is none. */
private String errorMessage = null;

/*** The job's working directory; null for profiles. */
private File jobDir = null;

private transient CrawlJobErrorHandler errorHandler = null;

protected transient XMLSettingsHandler settingsHandler;

/*** Controller for the running crawl; null when not crawling. */
private transient CrawlController controller = null;

// Supported styles for URI import (see the importUris operation).
public static final String RECOVERY_JOURNAL_STYLE = "recoveryJournal";
public static final String CRAWL_LOG_STYLE = "crawlLog";

// OpenMBean/JMX support.

/***
 * Server we registered with. Maybe null.
 */
private transient MBeanServer mbeanServer = null;
private transient ObjectName mbeanName = null;
public static final String CRAWLJOB_JMXMBEAN_TYPE =
    JmxUtils.SERVICE + ".Job";
// BDB-JE JMX helper plus the attribute/operation names it contributes;
// transient since they are rebuilt on (re)registration.
private transient JEMBeanHelper bdbjeMBeanHelper = null;
private transient List<String> bdbjeAttributeNameList = null;
private transient List<String> bdbjeOperationsNameList = null;

/***
 * The MBean we've registered ourselves with (May be null
 * throughout life of Heritrix).
 */
private transient OpenMBeanInfoSupport openMBeanInfo;

// Names of the attributes published on this job's OpenMBean.
public static final String NAME_ATTR = "Name";
public static final String UID_ATTR = "UID";
public static final String STATUS_ATTR = "Status";
public static final String FRONTIER_SHORT_REPORT_ATTR =
    "FrontierShortReport";
public static final String THREADS_SHORT_REPORT_ATTR =
    "ThreadsShortReport";
public static final String TOTAL_DATA_ATTR = "TotalData";
public static final String CRAWL_TIME_ATTR = "CrawlTime";
public static final String DOC_RATE_ATTR = "DocRate";
public static final String CURRENT_DOC_RATE_ATTR = "CurrentDocRate";
public static final String KB_RATE_ATTR = "KbRate";
public static final String CURRENT_KB_RATE_ATTR = "CurrentKbRate";
public static final String THREAD_COUNT_ATTR = "ThreadCount";
public static final String DOWNLOAD_COUNT_ATTR = "DownloadedCount";
public static final String DISCOVERED_COUNT_ATTR = "DiscoveredCount";
public static final String [] ATTRIBUTE_ARRAY = {NAME_ATTR, UID_ATTR,
    STATUS_ATTR, FRONTIER_SHORT_REPORT_ATTR, THREADS_SHORT_REPORT_ATTR,
    TOTAL_DATA_ATTR, CRAWL_TIME_ATTR, DOC_RATE_ATTR,
    CURRENT_DOC_RATE_ATTR, KB_RATE_ATTR, CURRENT_KB_RATE_ATTR,
    THREAD_COUNT_ATTR, DOWNLOAD_COUNT_ATTR, DISCOVERED_COUNT_ATTR};
public static final List ATTRIBUTE_LIST = Arrays.asList(ATTRIBUTE_ARRAY);

// Names of the operations published on this job's OpenMBean.
public static final String IMPORT_URI_OPER = "importUri";
public static final String IMPORT_URIS_OPER = "importUris";
public static final String DUMP_URIS_OPER = "dumpUris";
public static final String PAUSE_OPER = "pause";
public static final String RESUME_OPER = "resume";
public static final String FRONTIER_REPORT_OPER = "frontierReport";
public static final String THREADS_REPORT_OPER = "threadsReport";
public static final String SEEDS_REPORT_OPER = "seedsReport";
public static final String CHECKPOINT_OPER = "startCheckpoint";
public static final String PROGRESS_STATISTICS_OPER =
    "progressStatistics";
public static final String PROGRESS_STATISTICS_LEGEND_OPER =
    "progressStatisticsLegend";

/*** Notification type used for progress-statistics events. */
public static final String PROG_STATS = "progressStatistics";

/*** Name of the BDB-JE database-statistics operation. */
public static final String OP_DB_STAT = "getDatabaseStats";

/***
 * Don't add the following crawl-order items.
 */
public static final List ORDER_EXCLUDE;
static {
    ORDER_EXCLUDE = Arrays.asList(new String [] {"bdb-cache-percent",
        "extract-processors", "DNS", "uri-included-structure"});
}

/***
 * Sequence number for jmx notifications.
 */
private static int notificationsSequenceNumber = 1;
302
/***
 * A shutdown Constructor.
 */
protected CrawlJob() {
    super();
}

/***
 * A constructor for jobs.
 *
 * <p> Create, ready to crawl, jobs.
 * @param UID A unique ID for this job. Typically emitted by the
 *            CrawlJobHandler.
 * @param name The name of the job
 * @param settingsHandler The associated settings
 * @param errorHandler The crawl job's settings error handler.
 *            <tt>null</tt> means none is set
 * @param priority job priority.
 * @param dir The directory that is considered this job's working directory.
 */
public CrawlJob(final String UID,
        final String name, final XMLSettingsHandler settingsHandler,
        final CrawlJobErrorHandler errorHandler, final int priority,
        final File dir) {
    // Delegates to the master constructor: no explicit status,
    // not a profile, considered new.
    this(UID, name, settingsHandler, errorHandler,
        priority, dir, null, false, true);
}

/***
 * A constructor for profiles.
 *
 * <p> Any job created with this constructor will be
 * considered a profile. Profiles are not stored on disk (only their
 * settings files are stored on disk). This is because their data is
 * predictable given any settings files.
 * @param UIDandName A unique ID for this job. For profiles this is the same
 *            as name
 * @param settingsHandler The associated settings
 * @param errorHandler The crawl job's settings error handler.
 *            <tt>null</tt> means none is set
 */
protected CrawlJob(final String UIDandName,
        final XMLSettingsHandler settingsHandler,
        final CrawlJobErrorHandler errorHandler) {
    // Profiles use the same string for UID and name, have no job
    // directory, and are marked STATUS_PROFILE / not-new.
    this(UIDandName, UIDandName, settingsHandler, errorHandler,
        PRIORITY_AVERAGE, null, STATUS_PROFILE, true, false);
}

/***
 * The master constructor all other constructors delegate to.
 * @param UID A unique ID for this job.
 * @param name The name of the job.
 * @param settingsHandler The associated settings.
 * @param errorHandler The crawl job's settings error handler;
 *            <tt>null</tt> means none is set.
 * @param priority job priority (one of the PRIORITY_* constants).
 * @param dir the job's working directory; null for profiles.
 * @param status initial job status; may be null for unsubmitted jobs.
 * @param isProfile true if this job is a profile.
 * @param isNew true if this job is considered new.
 */
public CrawlJob(final String UID,
        final String name, final XMLSettingsHandler settingsHandler,
        final CrawlJobErrorHandler errorHandler, final int priority,
        final File dir, final String status, final boolean isProfile,
        final boolean isNew) {
    super();
    this.UID = UID;
    this.name = name;
    this.settingsHandler = settingsHandler;
    this.errorHandler = errorHandler;
    this.status = status;
    this.isProfile = isProfile;
    this.isNew = isNew;
    this.jobDir = dir;
    this.priority = priority;
}
367
368 /***
369 * A constructor for reloading jobs from disk. Jobs (not profiles) have
370 * their data written to persistent storage in the file system. This method
371 * is used to load the job from such storage. This is done by the
372 * <code>CrawlJobHandler</code>.
373 * <p>
374 * Proper structure of a job file (TODO: Maybe one day make this an XML file)
375 * Line 1. UID <br>
376 * Line 2. Job name (string) <br>
377 * Line 3. Job status (string) <br>
378 * Line 4. is job read only (true/false) <br>
379 * Line 5. is job running (true/false) <br>
380 * Line 6. job priority (int) <br>
381 * Line 7. number of journal entries <br>
382 * Line 8. setting file (with path) <br>
383 * Line 9. statistics tracker file (with path) <br>
384 * Line 10-?. error message (String, empty for null), can be many lines <br>
385 * @param jobFile
386 * a file containing information about the job to load.
387 * @param errorHandler The crawl jobs settings error handler.
388 * null means none is set
389 * @throws InvalidJobFileException
390 * if the specified file does not refer to a valid job file.
391 * @throws IOException
392 * if io operations fail
393 */
394 protected CrawlJob(final File jobFile,
395 final CrawlJobErrorHandler errorHandler)
396 throws InvalidJobFileException, IOException {
397 this(null, null, null, errorHandler,
398 PRIORITY_AVERAGE, null, null, false, true);
399 this.jobDir = jobFile.getParentFile();
400
401
402 if (jobFile.length() == 0) {
403 throw new InvalidJobFileException(jobFile.getCanonicalPath() +
404 " is corrupt (length is zero)");
405 }
406
407
408 BufferedReader jobReader =
409 new BufferedReader(new FileReader(jobFile), 4096);
410
411 this.UID = jobReader.readLine();
412
413 this.name = jobReader.readLine();
414
415 this.status = jobReader.readLine();
416 if(status.equals(STATUS_ABORTED)==false
417 && status.equals(STATUS_CREATED)==false
418 && status.equals(STATUS_DELETED)==false
419 && status.equals(STATUS_FINISHED)==false
420 && status.equals(STATUS_FINISHED_ABNORMAL)==false
421 && status.equals(STATUS_FINISHED_DATA_LIMIT)==false
422 && status.equals(STATUS_FINISHED_DOCUMENT_LIMIT)==false
423 && status.equals(STATUS_FINISHED_TIME_LIMIT)==false
424 && status.equals(STATUS_MISCONFIGURED)==false
425 && status.equals(STATUS_PAUSED)==false
426 && status.equals(STATUS_CHECKPOINTING)==false
427 && status.equals(STATUS_PENDING)==false
428 && status.equals(STATUS_RUNNING)==false
429 && status.equals(STATUS_WAITING_FOR_PAUSE)==false
430 && status.equals(STATUS_PREPARING)==false){
431
432 throw new InvalidJobFileException("Status (line 3) in job file " +
433 "is not valid: '" + status + "'");
434 }
435
436 String tmp = jobReader.readLine();
437 if(tmp.equals("true")){
438 isReadOnly = true;
439 } else if(tmp.equals("false")){
440 isReadOnly = false;
441 } else {
442 throw new InvalidJobFileException("isReadOnly (line 4) in job" +
443 " file '" + jobFile.getAbsolutePath() + "' is not " +
444 "valid: '" + tmp + "'");
445 }
446
447 tmp = jobReader.readLine();
448 if(tmp.equals("true")){
449 this.isRunning = true;
450 } else if(tmp.equals("false")){
451 this.isRunning = false;
452 } else {
453 throw new InvalidJobFileException("isRunning (line 5) in job " +
454 "file '" + jobFile.getAbsolutePath() + "' is not valid: " +
455 "'" + tmp + "'");
456 }
457
458 tmp = jobReader.readLine();
459 try{
460 this.priority = Integer.parseInt(tmp);
461 } catch(NumberFormatException e){
462 throw new InvalidJobFileException("priority (line 5) in job " +
463 "file '" + jobFile.getAbsolutePath() + "' is not valid: " +
464 "'" + tmp + "'");
465 }
466
467 tmp = jobReader.readLine();
468 try{
469 this.numberOfJournalEntries = Integer.parseInt(tmp);
470 } catch(NumberFormatException e){
471 throw new InvalidJobFileException("numberOfJournalEntries " +
472 "(line 5) in job file '" + jobFile.getAbsolutePath() +
473 "' is not valid: " + "'" + tmp + "'");
474 }
475
476 tmp = jobReader.readLine();
477 try {
478 File f = new File(tmp);
479 this.settingsHandler = new XMLSettingsHandler((f.isAbsolute())?
480 f: new File(jobDir, f.getName()));
481 if(this.errorHandler != null){
482 this.settingsHandler.registerValueErrorHandler(errorHandler);
483 }
484 this.settingsHandler.initialize();
485 } catch (InvalidAttributeValueException e1) {
486 throw new InvalidJobFileException("Problem reading from settings " +
487 "file (" + tmp + ") specified in job file '" +
488 jobFile.getAbsolutePath() + "'\n" + e1.getMessage());
489 }
490
491 jobReader.readLine();
492
493
494 tmp = jobReader.readLine();
495 errorMessage = "";
496 while(tmp!=null){
497 errorMessage+=tmp+'\n';
498 tmp = jobReader.readLine();
499 }
500 if(errorMessage.length()==0){
501
502 errorMessage = null;
503 }
504
505
506
507 jobReader.close();
508 }
509
510 /***
511 * Cause the job to be written to persistent storage.
512 * This will also save the statistics tracker if it is not null and the
513 * job status is finished (regardless of how it's finished)
514 */
515 private void writeJobFile() {
516 if (isProfile) {
517 return;
518 }
519
520 final String jobDirAbsolute = jobDir.getAbsolutePath();
521 if (!jobDir.exists() || !jobDir.canWrite()) {
522 logger.warning("Can't update status on " +
523 jobDirAbsolute + " because file does not" +
524 " exist (or is unwriteable)");
525 return;
526 }
527 File f = new File(jobDirAbsolute, "state.job");
528
529 String settingsFile = getSettingsDirectory();
530
531
532 if(settingsFile.startsWith(jobDirAbsolute.concat(File.separator))) {
533 settingsFile = settingsFile.substring(jobDirAbsolute.length()+1);
534 }
535 try {
536 OutputStreamWriter jobWriter =
537 new OutputStreamWriter(
538 new FileOutputStream(f, false),
539 "UTF-8");
540 try {
541 jobWriter.write(UID + "\n");
542 jobWriter.write(name + "\n");
543 jobWriter.write(status + "\n");
544 jobWriter.write(isReadOnly + "\n");
545 jobWriter.write(isRunning + "\n");
546 jobWriter.write(priority + "\n");
547 jobWriter.write(numberOfJournalEntries + "\n");
548 jobWriter.write(settingsFile + "\n");
549 jobWriter.write(statisticsFileSave + "\n");
550
551
552 if (errorMessage != null) {
553 jobWriter.write(errorMessage + "\n");
554 }
555 } finally {
556 if (jobWriter != null) {
557 jobWriter.close();
558 }
559 }
560 } catch (IOException e) {
561 logger.log(Level.WARNING, "An IOException occured saving job " +
562 name + " (" + UID + ")", e);
563 }
564 }
565
/***
 * Returns this job's unique ID (UID) that was issued by the
 * CrawlJobHandler() when this job was first created.
 *
 * @return Job This job's UID.
 * @see CrawlJobHandler#getNextJobUID()
 */
public String getUID(){
    return UID;
}

/***
 * Returns this job's 'name'. The name comes from the settings for this job,
 * need not be unique and may change. For a unique identifier use
 * {@link #getUID() getUID()}.
 * <p>
 * The name corresponds to the value of the 'name' tag in the 'meta' section
 * of the settings file.
 *
 * @return This job's 'name'
 */
public String getJobName(){
    return name;
}

/***
 * Return the combination of given name and UID most commonly
 * used in administrative interface.
 *
 * @return Job's name with UID notation
 */
public String getDisplayName() {
    return getJobName()+" ["+getUID()+"]";
}

/***
 * Set this job's level of priority.
 * Note: not persisted until the next writeJobFile() call.
 *
 * @param priority The level of priority
 *
 * @see #getJobPriority()
 * @see #PRIORITY_MINIMAL
 * @see #PRIORITY_LOW
 * @see #PRIORITY_AVERAGE
 * @see #PRIORITY_HIGH
 * @see #PRIORITY_CRITICAL
 */
public void setJobPriority(int priority) {
    this.priority = priority;
}

/***
 * Get this job's level of priority.
 *
 * @return this job's priority
 * @see #setJobPriority(int)
 * @see #PRIORITY_MINIMAL
 * @see #PRIORITY_LOW
 * @see #PRIORITY_AVERAGE
 * @see #PRIORITY_HIGH
 * @see #PRIORITY_CRITICAL
 */
public int getJobPriority() {
    return priority;
}

/***
 * Once called no changes can be made to the settings for this job.
 * Typically this is done once a crawl is completed and further changes
 * to the crawl order are therefore meaningless.
 * Persists the job file immediately.
 */
public void setReadOnly() {
    isReadOnly = true;
    writeJobFile(); // Save changes
}

/***
 * Is job read only?
 * @return false until setReadOnly has been invoked, after that it returns true.
 */
public boolean isReadOnly(){
    return isReadOnly;
}

/***
 * Set the status of this CrawlJob. Persists the job file immediately.
 *
 * @param status Current status of CrawlJob
 *         (see constants defined here beginning with STATUS)
 */
public void setStatus(String status) {
    this.status = status;
    writeJobFile(); // Save changes
}

/***
 * @return Status of the crawler (Used by JMX).
 */
public String getCrawlStatus() {
    return this.controller != null?
        this.controller.getState().toString(): "Illegal State";
}

/***
 * Get the current status of this CrawlJob
 *
 * @return The current status of this CrawlJob
 *         (see constants defined here beginning with STATUS)
 */
public String getStatus() {
    return this.status;
}

/***
 * Returns the settings handler for this job. It will have been initialized.
 * @return the settings handler for this job.
 */
public XMLSettingsHandler getSettingsHandler() {
    return this.settingsHandler;
}

/***
 * Is this a new job?
 * @return True if is new.
 */
public boolean isNew() {
    return isNew;
}

/***
 * Returns whether the job is considered to be a profile.
 * @return True if is a profile.
 */
public boolean isProfile() {
    return isProfile;
}

/***
 * Set if the job is considered a new job or not.
 * Persists the job file immediately.
 * @param b Is the job considered to be new.
 */
public void setNew(boolean b) {
    isNew = b;
    writeJobFile(); // Save changes
}

/***
 * Returns true if the job is being crawled.
 * @return true if the job is being crawled
 */
public boolean isRunning() {
    return isRunning;
}

/***
 * Set if job is being crawled. Persists the job file immediately.
 * @param b Is job being crawled.
 */
protected void setRunning(boolean b) {
    isRunning = b;
    writeJobFile(); // Save changes
}
733
/***
 * Unregister this job from the MBeanServer it was registered with,
 * if any. Failures are logged rather than propagated.
 */
protected void unregisterMBean() {
    // Not registered (or already unregistered): nothing to do.
    if (this.mbeanServer == null) {
        return;
    }
    try {
        this.mbeanServer.unregisterMBean(this.mbeanName);
        // Drop the server reference so repeated calls are no-ops.
        this.mbeanServer = null;
    } catch (Exception e) {
        logger.log(Level.SEVERE, "Failed with " + this.mbeanName, e);
    }
}
746
747 /***
748 * Subclass of crawlcontroller that unregisters beans when stopped.
749 * Done as subclass so CrawlController doesn't get any JMX (or 'CrawlJob')
750 * pollution, so for sure CrawlJob is unregistered with JMX and so any
751 * listeners on the CrawlJob get a chance to get crawl ended message
752 * (These latter notifications may not actually be getting through -- TBD).
753 * <p>TODO: This override dirtys the data model since CC knows about CJs.
754 * The facility provided by this class emitting events and statistics so
755 * they can be read by JMX needs to go back into CC. Probably best to
756 * registering in JMX the CC, rather than CJ. Lets do this in Heritrix 2.0
757 * since means changing the JMX API some.
758 */
759 public class MBeanCrawlController extends CrawlController
760 implements Serializable {
761 private static final long serialVersionUID = -4608537998168407222L;
762 private CrawlJob cj = null;
763 private CompositeType ct = null;
764
765 public CrawlJob getCrawlJob() {
766 return this.cj;
767 }
768
769 public void setCrawlJob(CrawlJob cj) {
770 this.cj = cj;
771 }
772
773 @SuppressWarnings("unchecked")
774 public void progressStatisticsEvent(final EventObject e) {
775 super.progressStatisticsEvent(e);
776 if (this.cj.getMbeanName() == null) {
777
778 return;
779 }
780
781 Map s = ((StatisticsTracking)e.getSource()).getProgressStatistics();
782
783
784 CompositeData cd = null;
785 try {
786 if (this.ct == null) {
787 this.ct = JmxUtils.createCompositeType(s, PROG_STATS,
788 PROG_STATS + " for " + this.cj.getMbeanName());
789 }
790 cd = new CompositeDataSupport(this.ct, s);
791 } catch (OpenDataException ode) {
792 ode.printStackTrace();
793 }
794 if (cd != null) {
795 Notification n = new Notification(PROG_STATS,
796 this.cj.getMbeanName(), getNotificationsSequenceNumber(),
797 ((StatisticsTracking)e.getSource()).
798 getProgressStatisticsLine());
799 n.setUserData(cd);
800 this.cj.sendNotification(n);
801 }
802 }
803
804 protected void completeStop() {
805 try {
806 super.completeStop();
807 } finally {
808 if (this.cj != null) {
809 this.cj.unregisterMBean();
810 }
811 this.cj = null;
812 }
813 }
814 }
815
/***
 * Obtain the CrawlController to use for this job's launch. If the crawl
 * order carries checkpoint-recover state, the controller is deserialized
 * from the checkpoint directory; otherwise a fresh MBeanCrawlController
 * is created.
 * @return the controller to use (never null).
 * @throws InitializationException if checkpoint recovery fails.
 */
protected CrawlController setupCrawlController()
throws InitializationException {
    CrawlController controller = null;

    // Check if we're to do a checkpoint recover. If so, deserialize
    // the checkpointed controller rather than creating a new one.
    Checkpoint cp = CrawlController.
        getCheckpointRecover(getSettingsHandler().getOrder());
    if (cp != null) {
        try {
            controller = (MBeanCrawlController)CheckpointUtils.
                readObjectFromFile(MBeanCrawlController.class,
                    cp.getDirectory());
        } catch (FileNotFoundException e) {
            throw new InitializationException(e);
        } catch (IOException e) {
            throw new InitializationException(e);
        } catch (ClassNotFoundException e) {
            throw new InitializationException(e);
        }
    } else {
        controller = new MBeanCrawlController();
    }
    return controller;
}

/***
 * @return a newly created MBeanCrawlController.
 */
protected CrawlController createCrawlController() {
    return new MBeanCrawlController();
}
846
847 public void setupForCrawlStart()
848 throws InitializationException {
849 try {
850 this.controller = setupCrawlController();
851
852 this.controller.addCrawlStatusListener(this);
853 this.controller.initialize(getSettingsHandler());
854
855 ((MBeanCrawlController)this.controller).setCrawlJob(this);
856
857 this.openMBeanInfo = buildMBeanInfo();
858 try {
859 Heritrix.registerMBean(this, getJmxJobName(),
860 CRAWLJOB_JMXMBEAN_TYPE);
861 } catch (InstanceAlreadyExistsException e) {
862 throw new InitializationException(e);
863 } catch (MBeanRegistrationException e) {
864 throw new InitializationException(e);
865 } catch (NotCompliantMBeanException e) {
866 throw new InitializationException(e);
867 }
868 } catch (InitializationException e) {
869
870 setStatus(CrawlJob.STATUS_MISCONFIGURED);
871 setErrorMessage("A fatal InitializationException occured when "
872 + "loading job:\n" + e.getMessage());
873
874 e.printStackTrace();
875 this.controller = null;
876 throw e;
877 }
878 setStatus(CrawlJob.STATUS_RUNNING);
879 setRunning(true);
880 }
881
882 public void stopCrawling() {
883 if(this.controller != null) {
884 this.controller.requestCrawlStop();
885 }
886 }
887
888 /***
889 * @return One-line Frontier report.
890 */
891 public String getFrontierOneLine() {
892 if (this.controller == null || this.controller.getFrontier() == null) {
893 return "Crawler not running";
894 }
895 return this.controller.getFrontier().singleLineReport();
896 }
897
898 /***
899 * @param reportName Name of report to write.
900 * @return A report of the frontier's status.
901 */
902 public String getFrontierReport(final String reportName) {
903 if (this.controller == null || this.controller.getFrontier() == null) {
904 return "Crawler not running";
905 }
906 return ArchiveUtils.writeReportToString(this.controller.getFrontier(),
907 reportName);
908 }
909
910 /***
911 * Write the requested frontier report to the given PrintWriter
912 * @param reportName Name of report to write.
913 * @param writer Where to write to.
914 */
915 public void writeFrontierReport(String reportName, PrintWriter writer) {
916 if (this.controller == null || this.controller.getFrontier() == null) {
917 writer.println("Crawler not running.");
918 return;
919 }
920 this.controller.getFrontier().reportTo(reportName,writer);
921 }
922
923 /***
924 * @return One-line threads report.
925 */
926 public String getThreadOneLine() {
927 if (this.controller == null) {
928 return "Crawler not running";
929 }
930 return this.controller.oneLineReportThreads();
931 }
932
933 /***
934 * Get the CrawlControllers ToeThreads report for the running crawl.
935 * @return The CrawlControllers ToeThreads report
936 */
937 public String getThreadsReport() {
938 if (this.controller == null) {
939 return "Crawler not running";
940 }
941 return ArchiveUtils.writeReportToString(this.controller.getToePool(),
942 null);
943 }
944
945 /***
946 * Write the requested threads report to the given PrintWriter
947 * @param reportName Name of report to write.
948 * @param writer Where to write to.
949 */
950 public void writeThreadsReport(String reportName, PrintWriter writer) {
951 if (this.controller == null || this.controller.getFrontier() == null) {
952 writer.println("Crawler not running.");
953 return;
954 }
955 this.controller.getToePool().reportTo(reportName, writer);
956 }
957
958 /***
959 * Kills a thread. For details see
960 * {@link org.archive.crawler.framework.ToePool#killThread(int, boolean)
961 * ToePool.killThread(int, boolean)}.
962 * @param threadNumber Thread to kill.
963 * @param replace Should thread be replaced.
964 * @see org.archive.crawler.framework.ToePool#killThread(int, boolean)
965 */
966 public void killThread(int threadNumber, boolean replace) {
967 if (this.controller == null) {
968 return;
969 }
970 this.controller.killThread(threadNumber, replace);
971 }
972
973 /***
974 * Get the Processors report for the running crawl.
975 * @return The Processors report for the running crawl.
976 */
977 public String getProcessorsReport() {
978 if (this.controller == null) {
979 return "Crawler not running";
980 }
981 return ArchiveUtils.writeReportToString(this.controller,
982 CrawlController.PROCESSORS_REPORT);
983 }
984
    /**
     * Returns the directory where the configuration files for this job are
     * located.
     *
     * NOTE(review): despite the name and javadoc, this returns the path of
     * the order FILE (settingsHandler.getOrderFile()), not of a directory —
     * confirm callers expect the order-file path.
     *
     * @return the directory where the configuration files for this job are
     * located
     */
    public String getSettingsDirectory() {
        return settingsHandler.getOrderFile().getPath();
    }
995
996 /***
997 * Returns the path of the job's base directory. For profiles this is always
998 * equal to <code>new File(getSettingsDirectory())</code>.
999 * @return the path of the job's base directory.
1000 */
1001 public File getDirectory(){
1002 return isProfile? new File(getSettingsDirectory()): jobDir;
1003 }
1004
    /**
     * Get the error message associated with this job. Will return null if there
     * is no error message.
     * @return the error message associated with this job, or null if none.
     */
    public String getErrorMessage() {
        return errorMessage;
    }
1013
    /**
     * Set an error message for this job. Generally this only occurs if the job
     * is misconfigured. Persists the updated job state via writeJobFile().
     * @param string the error message associated with this job
     */
    public void setErrorMessage(String string) {
        errorMessage = string;
        // Persist the updated job state.
        writeJobFile();
    }
1023
    /**
     * @return Returns the number of journal entries.
     */
    public int getNumberOfJournalEntries() {
        return numberOfJournalEntries;
    }
1030
    /**
     * Set the number of journal entries; persists the updated job state via
     * writeJobFile().
     * @param numberOfJournalEntries The number of journal entries to set.
     */
    public void setNumberOfJournalEntries(int numberOfJournalEntries) {
        this.numberOfJournalEntries = numberOfJournalEntries;
        // Persist the updated job state.
        writeJobFile();
    }
1038
    /**
     * @return Returns the error handler for this crawl job, or null if none
     * has been set (field assignment not in view — confirm).
     */
    public CrawlJobErrorHandler getErrorHandler() {
        return errorHandler;
    }
1045
1046 /***
1047 * Read all the checkpoints found in the job's checkpoints
1048 * directory into Checkpoint instances
1049 * @return Collection containing list of all checkpoints.
1050 */
1051 public Collection scanCheckpoints() {
1052 File checkpointsDirectory =
1053 settingsHandler.getOrder().getCheckpointsDirectory();
1054 File[] perCheckpointDirs = checkpointsDirectory.listFiles();
1055 Collection<Checkpoint> checkpoints = new ArrayList<Checkpoint>();
1056 if (perCheckpointDirs != null) {
1057 for (int i = 0; i < perCheckpointDirs.length; i++) {
1058 Checkpoint cp = new Checkpoint(perCheckpointDirs[i]);
1059 checkpoints.add(cp);
1060 }
1061 }
1062 return checkpoints;
1063 }
1064
1065 /***
1066 * Returns the absolute path of the specified log.
1067 * Note: If crawl has not begun, this file may not exist.
1068 * @param log
1069 * @return the absolute path for the specified log.
1070 * @throws AttributeNotFoundException
1071 * @throws ReflectionException
1072 * @throws MBeanException
1073 */
1074 public String getLogPath(String log)
1075 throws AttributeNotFoundException, MBeanException, ReflectionException {
1076 String logsPath = (String)settingsHandler.getOrder().
1077 getAttribute(CrawlOrder.ATTR_LOGS_PATH);
1078 CrawlOrder order = settingsHandler.getOrder();
1079 String diskPath = (String) order.getAttribute(null,
1080 CrawlOrder.ATTR_DISK_PATH);
1081 File disk = settingsHandler.
1082 getPathRelativeToWorkingDirectory(diskPath);
1083 File f = new File(logsPath, log);
1084 if (!f.isAbsolute()) {
1085 f = new File(disk.getPath(), f.getPath());
1086 }
1087 return f.getAbsolutePath();
1088 }
1089
1090
1091
1092 protected void pause() {
1093 if (this.controller != null && this.controller.isPaused() == false) {
1094 this.controller.requestCrawlPause();
1095 }
1096 }
1097
1098 protected void resume() {
1099 if (this.controller != null) {
1100 this.controller.requestCrawlResume();
1101 }
1102 }
1103
1104 /***
1105 * @throws IllegalStateException Thrown if crawl is not paused.
1106 */
1107 protected void checkpoint() throws IllegalStateException {
1108 if (this.controller != null) {
1109 this.controller.requestCrawlCheckpoint();
1110 }
1111 }
1112
1113 /***
1114 * @return True if checkpointing.
1115 */
1116 public boolean isCheckpointing() {
1117 return this.controller != null? this.controller.isCheckpointing(): false;
1118 }
1119
    /**
     * If its a HostQueuesFrontier, needs to be flushed for the queued.
     *
     * NOTE(review): intentionally empty hook here; presumably overridden or
     * specialized where a buffering frontier is in use — confirm.
     */
    protected void flush() {
        // Deliberately a no-op in this class.
    }
1126
    /**
     * Delete any URI from the frontier of the current (paused) job that match
     * the specified regular expression. If the current job is not paused (or
     * there is no current job) nothing will be done.
     * @param regexpr Regular expression to delete URIs by.
     * @return the number of URIs deleted
     */
    public long deleteURIsFromPending(String regexpr){
        // Null queue pattern: match URIs across all queues.
        return deleteURIsFromPending(regexpr,null);
    }
1137
    /**
     * Delete any URI from the frontier of the current (paused) job that match
     * the specified regular expression. If the current job is not paused (or
     * there is no current job) nothing will be done.
     * @param uriPattern Regular expression to delete URIs by.
     * @param queuePattern Regular expression limiting deletion to matching
     * queues; null means all queues (see the single-argument overload).
     * @return the number of URIs deleted
     */
    public long deleteURIsFromPending(String uriPattern, String queuePattern){
        return (this.controller != null &&
                this.controller.getFrontier() != null &&
                this.controller.isPaused())?
            this.controller.getFrontier().deleteURIs(uriPattern,queuePattern): 0;
    }
1151
    /**
     * Import URIs with the force-revisit flag given as a string.
     * @param file Name of file w/ URIs (or URL).
     * @param style Style of the file: crawl log, recovery journal, or
     * default seeds format.
     * @param force "true" (exactly) to force-revisit already-seen URIs;
     * any other value, including null, means false.
     * @return A display string that has a count of all added.
     */
    public String importUris(String file, String style, String force) {
        return importUris(file, style, "true".equals(force));
    }
1155
    /**
     * Import URIs from the named file or URL, not treating them as seeds.
     * @param fileOrUrl Name of file w/ URIs.
     * @param style Style of the file: crawl log, recovery journal, or
     * default seeds format.
     * @param forceRevisit Should we revisit even if seen before?
     * @return A display string that has a count of all added.
     */
    public String importUris(final String fileOrUrl, final String style,
            final boolean forceRevisit) {
        return importUris(fileOrUrl, style, forceRevisit, false);
    }
1160
1161 /***
1162 * @param fileOrUrl Name of file w/ seeds.
1163 * @param style What style of seeds -- crawl log, recovery journal, or
1164 * seeds file.
1165 * @param forceRevisit Should we revisit even if seen before?
1166 * @param areSeeds Is the file exclusively seeds?
1167 * @return A display string that has a count of all added.
1168 */
1169 public String importUris(final String fileOrUrl, final String style,
1170 final boolean forceRevisit, final boolean areSeeds) {
1171 InputStream is =
1172 IoUtils.getInputStream(this.controller.getDisk(), fileOrUrl);
1173 String message = null;
1174
1175 if (is == null) {
1176 message = "Failed to get inputstream from " + fileOrUrl;
1177 logger.severe(message);
1178 } else {
1179 int addedCount = importUris(is, style, forceRevisit, areSeeds);
1180 message = Integer.toString(addedCount) + " URIs added from " +
1181 fileOrUrl;
1182 }
1183 return message;
1184 }
1185
    /**
     * Import URIs from the passed stream, not treating them as seeds.
     * @param is Stream to use as URI source.
     * @param style Style in which URIs are rendered.
     * @param forceRevisit Whether to revisit URIs seen previously.
     * @return Count of added URIs.
     */
    protected int importUris(InputStream is, String style,
            boolean forceRevisit) {
        return importUris(is, style, forceRevisit, false);
    }
1190
1191 /***
1192 * Import URIs.
1193 * @param is Stream to use as URI source.
1194 * @param style Style in which URIs are rendored. Currently support for
1195 * <code>recoveryJournal</code>, <code>crawlLog</code>, and seeds file
1196 * format (i.e <code>default</code>) where <code>default</code> style is
1197 * a UURI per line (comments allowed).
1198 * @param forceRevisit Whether we should revisit this URI even if we've
1199 * visited it previously.
1200 * @param areSeeds Are the imported URIs seeds?
1201 * @return Count of added URIs.
1202 */
1203 protected int importUris(InputStream is, String style,
1204 boolean forceRevisit, final boolean areSeeds) {
1205
1206 String extractor;
1207 String output;
1208 if(CRAWL_LOG_STYLE.equals(style)) {
1209
1210 extractor = "//S+//s+//S+//s+//S+//s+(//S+//s+//S+//s+//S+//s+).*";
1211 output = "$1";
1212 } else if (RECOVERY_JOURNAL_STYLE.equals(style)) {
1213
1214 extractor = "//S+//s+((//S+)(?://s+//S+//s+//S+)?)//s*";
1215 output = "$1";
1216 } else {
1217 extractor =
1218 RegexpLineIterator.NONWHITESPACE_ENTRY_TRAILING_COMMENT;
1219 output = RegexpLineIterator.ENTRY;
1220 }
1221
1222 controller.installThreadContextSettingsHandler();
1223
1224
1225 BufferedReader br = null;
1226 int addedCount = 0;
1227 try {
1228 br = new BufferedReader(new InputStreamReader(is));
1229 Iterator iter = new RegexpLineIterator(new LineReadingIterator(br),
1230 RegexpLineIterator.COMMENT_LINE, extractor, output);
1231 while(iter.hasNext()) {
1232 try {
1233 importUri((String)iter.next(), forceRevisit, areSeeds,
1234 false);
1235 addedCount++;
1236 } catch (URIException e) {
1237 e.printStackTrace();
1238 }
1239 }
1240 br.close();
1241 flush();
1242 } catch (IOException e) {
1243 e.printStackTrace();
1244 }
1245 return addedCount;
1246 }
1247
    /**
     * Schedule a uri.
     * @param uri Uri to schedule.
     * @param forceFetch Should it be forcefetched.
     * @param isSeed True if seed.
     * @throws URIException
     */
    public void importUri(final String uri, final boolean forceFetch,
            final boolean isSeed)
    throws URIException {
        // Single-URI import always flushes the frontier (isFlush == true).
        importUri(uri, forceFetch, isSeed, true);
    }
1260
    /**
     * Schedule a uri.
     * @param str String that can be: 1. a UURI, 2. a snippet of the
     * crawl.log line, or 3. a snippet from recover log. See
     * {@link #importUris(InputStream, String, boolean)} for how it subparses
     * the lines from crawl.log and recover.log.
     * @param forceFetch Should it be forcefetched.
     * @param isSeed True if seed.
     * @param isFlush If true, flush the frontier IF it implements
     * flushing.
     * @throws URIException
     */
    public void importUri(final String str, final boolean forceFetch,
            final boolean isSeed, final boolean isFlush)
    throws URIException {
        CandidateURI caUri = CandidateURI.fromString(str);
        caUri.setForceFetch(forceFetch);
        if (isSeed) {
            caUri.setIsSeed(isSeed);
            if (caUri.getVia() == null || caUri.getVia().length() <= 0) {
                // A seed with no 'via' is an original seed; register it
                // with the crawl scope before scheduling.
                this.controller.getScope().addSeed(caUri);
            }
        }
        // Hand the URI to the frontier for crawling.
        this.controller.getFrontier().schedule(caUri);
        if (isFlush) {
            flush();
        }
    }
1292
1293
    /**
     * @return Our mbean info (Needed for CrawlJob to qualify as a
     * DynamicMBean). Presumably populated by buildMBeanInfo() — the field
     * assignment is not in view; confirm.
     */
    public MBeanInfo getMBeanInfo() {
        return this.openMBeanInfo;
    }
1301
    /**
     * Build up the MBean info for Heritrix main.
     * Assembles, in order: the job's own read-only status/statistics
     * attributes, the crawl-order (settings) attributes, a whitelisted set
     * of bdbje environment attributes, the job operations, a whitelisted
     * set of bdbje operations, and the notification metadata.
     * Side effects: initializes this.bdbjeMBeanHelper,
     * this.bdbjeAttributeNameList and this.bdbjeOperationsNameList.
     * @return Return created mbean info instance.
     * @throws InitializationException on failure to open the bdbje helper.
     */
    protected OpenMBeanInfoSupport buildMBeanInfo()
    throws InitializationException {
        // Accumulate open-mbean attribute descriptions.
        List<OpenMBeanAttributeInfo> attributes
         = new ArrayList<OpenMBeanAttributeInfo>();

        // Job identity and progress attributes (all read-only).
        attributes.add(new OpenMBeanAttributeInfoSupport(NAME_ATTR,
            "Crawl job name", SimpleType.STRING, true, false, false));
        attributes.add(new OpenMBeanAttributeInfoSupport(STATUS_ATTR,
            "Short basic status message", SimpleType.STRING, true, false,
            false));
        attributes.add(
            new OpenMBeanAttributeInfoSupport(FRONTIER_SHORT_REPORT_ATTR,
                "Short frontier report", SimpleType.STRING, true,
                false, false));
        attributes.add(
            new OpenMBeanAttributeInfoSupport(THREADS_SHORT_REPORT_ATTR,
                "Short threads report", SimpleType.STRING, true,
                false, false));
        attributes.add(new OpenMBeanAttributeInfoSupport(UID_ATTR,
            "Crawl job UID", SimpleType.STRING, true, false, false));
        attributes.add(new OpenMBeanAttributeInfoSupport(TOTAL_DATA_ATTR,
            "Total data received", SimpleType.LONG, true, false, false));
        attributes.add(new OpenMBeanAttributeInfoSupport(CRAWL_TIME_ATTR,
            "Crawl time", SimpleType.LONG, true, false, false));
        attributes.add(new OpenMBeanAttributeInfoSupport(CURRENT_DOC_RATE_ATTR,
            "Current crawling rate (Docs/sec)", SimpleType.DOUBLE,
            true, false, false));
        attributes.add(new OpenMBeanAttributeInfoSupport(CURRENT_KB_RATE_ATTR,
            "Current crawling rate (Kb/sec)", SimpleType.LONG,
            true, false, false));
        attributes.add(new OpenMBeanAttributeInfoSupport(THREAD_COUNT_ATTR,
            "Active thread count", SimpleType.INTEGER, true, false, false));
        attributes.add(new OpenMBeanAttributeInfoSupport(DOC_RATE_ATTR,
            "Crawling rate (Docs/sec)", SimpleType.DOUBLE,
            true, false, false));
        // NOTE(review): description duplicates CURRENT_KB_RATE_ATTR's
        // ("Current ...") though this is the overall-rate attribute —
        // compare with the DOC_RATE pair above; confirm intended wording.
        attributes.add(new OpenMBeanAttributeInfoSupport(KB_RATE_ATTR,
            "Current crawling rate (Kb/sec)", SimpleType.LONG,
            true, false, false));
        attributes.add(new OpenMBeanAttributeInfoSupport(DOWNLOAD_COUNT_ATTR,
            "Count of downloaded documents", SimpleType.LONG,
            true, false, false));
        attributes.add(new OpenMBeanAttributeInfoSupport(DISCOVERED_COUNT_ATTR,
            "Count of discovered documents", SimpleType.LONG,
            true, false, false));

        // Expose every crawl-order (settings) attribute as well.
        addCrawlOrderAttributes(this.getController().getOrder(), attributes);

        // Build the bdbje helper over the controller's BDB environment so
        // selected environment attributes/operations can be passed through.
        Environment env = this.controller.getBdbEnvironment();
        try {
            this.bdbjeMBeanHelper =
                new JEMBeanHelper(env.getConfig(), env.getHome(), true);
        } catch (DatabaseException e) {
            e.printStackTrace();
            InitializationException ie =
                new InitializationException(e.getMessage());
            ie.setStackTrace(e.getStackTrace());
            throw ie;
        }
        // Whitelist of bdbje attributes passed through to clients.
        this.bdbjeAttributeNameList = Arrays.asList(new String [] {
                JEMBeanHelper.ATT_ENV_HOME,
                JEMBeanHelper.ATT_OPEN,
                JEMBeanHelper.ATT_IS_READ_ONLY,
                JEMBeanHelper.ATT_IS_TRANSACTIONAL,
                JEMBeanHelper.ATT_CACHE_SIZE,
                JEMBeanHelper.ATT_CACHE_PERCENT,
                JEMBeanHelper.ATT_LOCK_TIMEOUT,
                JEMBeanHelper.ATT_IS_SERIALIZABLE,
                JEMBeanHelper.ATT_SET_READ_ONLY,
        });
        addBdbjeAttributes(attributes,
            this.bdbjeMBeanHelper.getAttributeList(env),
            this.bdbjeAttributeNameList);

        // Operations exposed on the mbean.
        List<OpenMBeanOperationInfo> operations
         = new ArrayList<OpenMBeanOperationInfo>();
        OpenMBeanParameterInfo[] args = new OpenMBeanParameterInfoSupport[3];
        args[0] = new OpenMBeanParameterInfoSupport("url",
            "URL to add to the frontier", SimpleType.STRING);
        args[1] = new OpenMBeanParameterInfoSupport("forceFetch",
            "True if URL is to be force fetched", SimpleType.BOOLEAN);
        args[2] = new OpenMBeanParameterInfoSupport("seed",
            "True if URL is a seed", SimpleType.BOOLEAN);
        operations.add(new OpenMBeanOperationInfoSupport(IMPORT_URI_OPER,
            "Add passed URL to the frontier", args, SimpleType.VOID,
            MBeanOperationInfo.ACTION));

        args = new OpenMBeanParameterInfoSupport[4];
        args[0] = new OpenMBeanParameterInfoSupport("pathOrUrl",
            "Path or URL to file of URLs", SimpleType.STRING);
        args[1] = new OpenMBeanParameterInfoSupport("style",
            "Format format:default|crawlLog|recoveryJournal",
            SimpleType.STRING);
        args[2] = new OpenMBeanParameterInfoSupport("forceFetch",
            "True if URLs are to be force fetched", SimpleType.BOOLEAN);
        args[3] = new OpenMBeanParameterInfoSupport("seed",
            "True if all content are seeds.", SimpleType.BOOLEAN);
        operations.add(new OpenMBeanOperationInfoSupport(IMPORT_URIS_OPER,
            "Add file of passed URLs to the frontier", args, SimpleType.STRING,
            MBeanOperationInfo.ACTION));

        // Dump of pending URIs; requires a paused crawl (see invoke()).
        args = new OpenMBeanParameterInfoSupport[4];
        args[0] = new OpenMBeanParameterInfoSupport("filename",
            "File to print to", SimpleType.STRING);
        args[1] = new OpenMBeanParameterInfoSupport("regexp",
            "Regular expression URLs must match", SimpleType.STRING);
        args[2] = new OpenMBeanParameterInfoSupport("numberOfMatches",
            "Maximum number of matches to return", SimpleType.INTEGER);
        args[3] = new OpenMBeanParameterInfoSupport("verbose",
            "Should they be verbose descriptions", SimpleType.BOOLEAN);
        operations.add(new OpenMBeanOperationInfoSupport(DUMP_URIS_OPER,
            "Dump pending URIs from frontier to a file", args,
            SimpleType.VOID, MBeanOperationInfo.ACTION));

        operations.add(new OpenMBeanOperationInfoSupport(PAUSE_OPER,
            "Pause crawling (noop if already paused)", null, SimpleType.VOID,
            MBeanOperationInfo.ACTION));

        operations.add(new OpenMBeanOperationInfoSupport(RESUME_OPER,
            "Resume crawling (noop if already resumed)", null,
            SimpleType.VOID, MBeanOperationInfo.ACTION));

        args = new OpenMBeanParameterInfoSupport[1];
        args[0] = new OpenMBeanParameterInfoSupport("name",
            "Name of report ('all', 'standard', etc.).", SimpleType.STRING);
        operations.add(new OpenMBeanOperationInfoSupport(FRONTIER_REPORT_OPER,
            "Full frontier report", args, SimpleType.STRING,
            MBeanOperationInfo.INFO));

        operations.add(new OpenMBeanOperationInfoSupport(THREADS_REPORT_OPER,
            "Full thread report", null, SimpleType.STRING,
            MBeanOperationInfo.INFO));

        operations.add(new OpenMBeanOperationInfoSupport(SEEDS_REPORT_OPER,
            "Seeds report", null, SimpleType.STRING, MBeanOperationInfo.INFO));

        operations.add(
            new OpenMBeanOperationInfoSupport(PROGRESS_STATISTICS_OPER,
                "Progress statistics at time of invocation", null,
                SimpleType.STRING, MBeanOperationInfo.INFO));

        operations.add(new OpenMBeanOperationInfoSupport(
            PROGRESS_STATISTICS_LEGEND_OPER,
            "Progress statistics legend", null,
            SimpleType.STRING, MBeanOperationInfo.INFO));

        operations.add(new OpenMBeanOperationInfoSupport(CHECKPOINT_OPER,
            "Start a checkpoint", null, SimpleType.VOID,
            MBeanOperationInfo.ACTION));

        // Whitelist of bdbje operations passed through to clients.
        this.bdbjeOperationsNameList = Arrays.asList(new String[] { "cleanLog",
                "evictMemory", "checkpoint", "sync",
                "getEnvironmentStatsToString", "getLockStatsToString",
                "getDatabaseNames", OP_DB_STAT
        });
        addBdbjeOperations(operations,
            this.bdbjeMBeanHelper.getOperationList(env),
            this.bdbjeOperationsNameList);

        // Notification metadata: crawl status events plus progress stats.
        List<MBeanNotificationInfo> notifications
         = new ArrayList<MBeanNotificationInfo>();
        notifications.add(
            new MBeanNotificationInfo(new String [] {"crawlStarted",
                "crawlEnding", "crawlPaused", "crawlResuming", PROG_STATS},
                this.getClass().getName() + ".notifications",
                "CrawlStatusListener events and progress statistics as " +
                "notifications"));
        MBeanNotificationInfo [] notificationsArray =
            new MBeanNotificationInfo[notifications.size()];
        notifications.toArray(notificationsArray);

        // Assemble the final OpenMBeanInfo from the collected pieces.
        OpenMBeanAttributeInfoSupport[] attributesArray =
            new OpenMBeanAttributeInfoSupport[attributes.size()];
        attributes.toArray(attributesArray);
        OpenMBeanOperationInfoSupport[] operationsArray =
            new OpenMBeanOperationInfoSupport[operations.size()];
        operations.toArray(operationsArray);
        return new OpenMBeanInfoSupport(this.getClass().getName(),
            "Current Crawl Job as OpenMBean",
            attributesArray,
            new OpenMBeanConstructorInfoSupport [] {},
            operationsArray,
            notificationsArray);
    }
1504
1505 protected void addBdbjeAttributes(
1506 final List<OpenMBeanAttributeInfo> attributes,
1507 final List<MBeanAttributeInfo> bdbjeAttributes,
1508 final List<String> bdbjeNamesToAdd) {
1509 for (MBeanAttributeInfo info: bdbjeAttributes) {
1510 if (bdbjeNamesToAdd.contains(info.getName())) {
1511 attributes.add(JmxUtils.convertToOpenMBeanAttribute(info));
1512 }
1513 }
1514 }
1515
    /**
     * Copy whitelisted bdbje operation infos into the job's operation list,
     * converting each to its OpenMBean equivalent. The OP_DB_STAT operation
     * gets special handling: its return type is forced to STRING and its
     * signature extended with a trailing database-name parameter.
     * @param operations Destination list.
     * @param bdbjeOperations All operation infos offered by bdbje.
     * @param bdbjeNamesToAdd Names of the operations to pass through.
     */
    protected void addBdbjeOperations(
            final List<OpenMBeanOperationInfo> operations,
            final List<MBeanOperationInfo> bdbjeOperations,
            final List<String> bdbjeNamesToAdd) {
        for (MBeanOperationInfo info: bdbjeOperations) {
            if (bdbjeNamesToAdd.contains(info.getName())) {
                OpenMBeanOperationInfo omboi = null;
                if (info.getName().equals(OP_DB_STAT)) {
                    // Convert with a STRING return type.
                    omboi = JmxUtils.convertToOpenMBeanOperation(info, null,
                        SimpleType.STRING);
                    // Copy the existing signature and append a database-name
                    // parameter at the end.
                    MBeanParameterInfo[] params = omboi.getSignature();
                    OpenMBeanParameterInfo[] args =
                        new OpenMBeanParameterInfoSupport[params.length + 1];
                    for (int ii = 0; ii < params.length; ii++) {
                        args[ii] = (OpenMBeanParameterInfo) params[ii];
                    }
                    args[params.length] = new OpenMBeanParameterInfoSupport(
                        "name", "Database name", SimpleType.STRING);
                    // Rebuild the operation info around the new signature.
                    omboi = new OpenMBeanOperationInfoSupport(omboi.getName(),
                        omboi.getDescription(), args, omboi.getReturnOpenType(),
                        omboi.getImpact());
                } else {
                    omboi = JmxUtils.convertToOpenMBeanOperation(info);
                }
                operations.add(omboi);
            }
        }
    }
1547
    /**
     * Recursively add the crawl order's settings attributes (and those of
     * nested ComplexTypes) to the passed list. Names in ORDER_EXCLUDE are
     * skipped; TextFields are exposed as STRING; attributes of non-open,
     * non-complex type are skipped with a fine-level log line.
     * @param type ComplexType whose attributes to walk.
     * @param attributes List to append OpenMBean attribute infos to.
     */
    protected void addCrawlOrderAttributes(final ComplexType type,
            final List<OpenMBeanAttributeInfo> attributes) {
        for (final Iterator i = type.getAttributeInfoIterator(null);
                i.hasNext();) {
            ModuleAttributeInfo info = (ModuleAttributeInfo)i.next();
            if (ORDER_EXCLUDE.contains(info.getName())) {
                // Explicitly excluded from the MBean interface.
                continue;
            }
            // Settings path of this attribute, e.g. ".../parent/name".
            String absoluteName = type.getAbsoluteName() + "/" + info.getName();
            if (JmxUtils.isOpenType(info.getType())) {
                String description = info.getDescription();
                if (description == null || description.length() <= 0) {
                    // OpenMBeanAttributeInfo requires a description; fall
                    // back to the attribute name.
                    description = info.getName();
                }
                attributes.add(new OpenMBeanAttributeInfoSupport(
                    absoluteName, description,
                    JmxUtils.getOpenType(info.getType()), true, true, false));
            } else if(info.isComplexType()) {
                // Recurse into the nested complex type.
                try {
                    ComplexType c =
                        (ComplexType)type.getAttribute(info.getName());
                    addCrawlOrderAttributes(c, attributes);
                } catch (AttributeNotFoundException e) {
                    logger.log(Level.SEVERE, "Failed get of attribute", e);
                } catch (MBeanException e) {
                    logger.log(Level.SEVERE, "Failed get of attribute", e);
                } catch (ReflectionException e) {
                    logger.log(Level.SEVERE, "Failed get of attribute", e);
                }
            } else if (info.getType().equals(TextField.class.getName())) {
                // TextFields are not open types but render fine as strings.
                attributes.add(new OpenMBeanAttributeInfoSupport(
                    absoluteName, info.getDescription(),
                    SimpleType.STRING, true, true, false));
            } else {
                // Not representable as an open type; note the type and skip.
                logger.fine(info.getType());
            }
        }
    }
1591
1592 public Object getAttribute(String attribute_name)
1593 throws AttributeNotFoundException {
1594 if (attribute_name == null) {
1595 throw new RuntimeOperationsException(
1596 new IllegalArgumentException("Attribute name cannot be null"),
1597 "Cannot call getAttribute with null attribute name");
1598 }
1599
1600
1601 if (this.controller == null) {
1602 throw new RuntimeOperationsException(
1603 new NullPointerException("Controller is null"),
1604 "Controller is null");
1605 }
1606
1607
1608 if (this.bdbjeAttributeNameList.contains(attribute_name)) {
1609 try {
1610 return this.bdbjeMBeanHelper.getAttribute(
1611 this.controller.getBdbEnvironment(), attribute_name);
1612 } catch (MBeanException e) {
1613 throw new RuntimeOperationsException(new RuntimeException(e));
1614 }
1615 }
1616
1617
1618 if (attribute_name.
1619 startsWith(this.controller.getOrder().getAbsoluteName())) {
1620 return getCrawlOrderAttribute(attribute_name);
1621 }
1622
1623 if (!ATTRIBUTE_LIST.contains(attribute_name)) {
1624 throw new AttributeNotFoundException("Attribute " +
1625 attribute_name + " is unimplemented.");
1626 }
1627
1628
1629
1630
1631
1632 if (attribute_name.equals(STATUS_ATTR)) {
1633 return getCrawlStatus();
1634 }
1635 if (attribute_name.equals(NAME_ATTR)) {
1636 return getJobName();
1637 }
1638 if (attribute_name.equals(UID_ATTR)) {
1639 return getUID();
1640 }
1641 if (attribute_name.equals(TOTAL_DATA_ATTR)) {
1642 return new Long(this.controller == null &&
1643 this.controller.getStatistics() != null? 0:
1644 this.controller.getStatistics().totalBytesCrawled());
1645 }
1646 if (attribute_name.equals(CRAWL_TIME_ATTR)) {
1647 return new Long(this.controller == null &&
1648 this.controller.getStatistics() != null? 0:
1649 this.controller.getStatistics().getCrawlerTotalElapsedTime() /
1650 1000);
1651 }
1652 if (attribute_name.equals(CURRENT_DOC_RATE_ATTR)) {
1653 return new Double(this.controller == null &&
1654 this.controller.getStatistics() != null? 0:
1655 this.controller.getStatistics().currentProcessedDocsPerSec());
1656 }
1657 if (attribute_name.equals(DOC_RATE_ATTR)) {
1658 return new Double(this.controller == null &&
1659 this.controller.getStatistics() != null? 0:
1660 this.controller.getStatistics().processedDocsPerSec());
1661 }
1662 if (attribute_name.equals(KB_RATE_ATTR)) {
1663 return new Long(this.controller == null &&
1664 this.controller.getStatistics() != null? 0:
1665 this.controller.getStatistics().currentProcessedKBPerSec());
1666 }
1667 if (attribute_name.equals(CURRENT_KB_RATE_ATTR)) {
1668 return new Long(this.controller == null &&
1669 this.controller.getStatistics() != null? 0:
1670 this.controller.getStatistics().processedKBPerSec());
1671 }
1672 if (attribute_name.equals(THREAD_COUNT_ATTR)) {
1673 return new Integer(this.controller == null &&
1674 this.controller.getStatistics() != null? 0:
1675 this.controller.getStatistics().activeThreadCount());
1676 }
1677 if (attribute_name.equals(FRONTIER_SHORT_REPORT_ATTR)) {
1678 return getFrontierOneLine();
1679 }
1680 if (attribute_name.equals(THREADS_SHORT_REPORT_ATTR)) {
1681 return getThreadOneLine();
1682 }
1683 if (attribute_name.equals(DISCOVERED_COUNT_ATTR)) {
1684 return new Long(this.controller == null &&
1685 this.controller.getStatistics() != null? 0:
1686 this.controller.getStatistics().totalCount());
1687 }
1688 if (attribute_name.equals(DOWNLOAD_COUNT_ATTR)) {
1689 return new Long(this.controller == null &&
1690 this.controller.getStatistics() != null? 0:
1691 this.controller.getStatistics().successfullyFetchedCount());
1692 }
1693
1694 throw new AttributeNotFoundException("Attribute " +
1695 attribute_name + " not found.");
1696 }
1697
    /**
     * Fetch a crawl-order attribute by its absolute settings path.
     * Failures are logged and surface as a null return rather than an
     * exception.
     * @param attribute_name Absolute attribute path (starts with the
     * order's absolute name).
     * @return Attribute value, or null on failure.
     */
    protected Object getCrawlOrderAttribute(final String attribute_name) {
        CrawlOrder order = this.getController().getOrder();
        Object result = null;
        try {
            // Strip the order's own prefix; the remainder is the relative
            // path walked by the recursive overload.
            result = getCrawlOrderAttribute(attribute_name.substring(order
                .getAbsoluteName().length()), order);
        } catch (NullPointerException e) {
            logger.log(Level.SEVERE, "Failed get of " + attribute_name, e);
        } catch (AttributeNotFoundException e) {
            logger.log(Level.SEVERE, "Failed get of " + attribute_name, e);
        } catch (MBeanException e) {
            logger.log(Level.SEVERE, "Failed get of " + attribute_name, e);
        } catch (ReflectionException e) {
            logger.log(Level.SEVERE, "Failed get of " + attribute_name, e);
        }
        return result;
    }
1715
1716 protected Object getCrawlOrderAttribute(final String attribute_name,
1717 final ComplexType ct)
1718 throws AttributeNotFoundException, MBeanException, ReflectionException {
1719 String subName = attribute_name.startsWith("/") ? attribute_name
1720 .substring(1) : attribute_name;
1721 int index = subName.indexOf("/");
1722 if (index <= 0) {
1723 MBeanAttributeInfo info = ct.getAttributeInfo(subName);
1724
1725 return info.getType().equals(TextField.class.getName()) ? ct
1726 .getAttribute(subName).toString() : ct
1727 .getAttribute(subName);
1728 }
1729 return getCrawlOrderAttribute(subName.substring(index + 1),
1730 (ComplexType) ct.getAttribute(subName.substring(0, index)));
1731 }
1732
1733 public AttributeList getAttributes(String [] attributeNames) {
1734 if (attributeNames == null) {
1735 throw new RuntimeOperationsException(
1736 new IllegalArgumentException("attributeNames[] cannot be " +
1737 "null"), "Cannot call getAttributes with null attribute " +
1738 "names");
1739 }
1740
1741
1742 if (this.controller == null) {
1743 throw new RuntimeOperationsException(
1744 new NullPointerException("Controller is null"),
1745 "Controller is null");
1746 }
1747
1748 AttributeList resultList = new AttributeList();
1749 if (attributeNames.length == 0) {
1750 return resultList;
1751 }
1752 for (int i = 0; i < attributeNames.length; i++) {
1753 try {
1754 Object value = getAttribute(attributeNames[i]);
1755 resultList.add(new Attribute(attributeNames[i], value));
1756 } catch (Exception e) {
1757 e.printStackTrace();
1758 }
1759 }
1760 return(resultList);
1761 }
1762
    /**
     * DynamicMBean single-attribute setter.
     * @param attribute Attribute to set.
     * @throws AttributeNotFoundException if the attribute is not settable.
     */
    public void setAttribute(Attribute attribute)
    throws AttributeNotFoundException {
        setAttributeInternal(attribute);
        // kickUpdate() presumably propagates the change to the running
        // crawl components — confirm; mirrors the batch setter's behavior.
        kickUpdate();
    }
1769
    /**
     * Set an attribute without triggering kickUpdate(); shared by
     * setAttribute and setAttributes. Routes crawl-order (settings)
     * attributes to the recursive setter and bdbje attributes to the
     * JEMBeanHelper; anything else is rejected as not settable.
     * @param attribute Attribute to set.
     * @throws AttributeNotFoundException if the attribute is not settable.
     */
    protected void setAttributeInternal(Attribute attribute)
    throws AttributeNotFoundException {
        // Crawl-order (settings) attribute?
        CrawlOrder order = this.getController().getOrder();
        String attName = attribute.getName();
        if (attName.startsWith(order.getAbsoluteName())) {
            try {
                // Strip the order's prefix; the remainder is the relative
                // path walked by setCrawlOrderAttribute.
                setCrawlOrderAttribute(attribute.getName().substring(
                    order.getAbsoluteName().length()), order, attribute);
            } catch (NullPointerException e) {
                logger.log(Level.SEVERE, "Failed set of " + attName, e);
            } catch (AttributeNotFoundException e) {
                logger.log(Level.SEVERE, "Failed set of " + attName, e);
            } catch (MBeanException e) {
                logger.log(Level.SEVERE, "Failed set of " + attName, e);
            } catch (ReflectionException e) {
                logger.log(Level.SEVERE, "Failed set of " + attName, e);
            } catch (InvalidAttributeValueException e) {
                logger.log(Level.SEVERE, "Failed set of " + attName, e);
            }
            return;
        }

        // bdbje environment attribute?
        if (this.bdbjeAttributeNameList.contains(attName)) {
            try {
                this.bdbjeMBeanHelper.setAttribute(this.controller
                    .getBdbEnvironment(), attribute);
            } catch (AttributeNotFoundException e) {
                throw new RuntimeOperationsException(new RuntimeException(e));
            } catch (InvalidAttributeValueException e) {
                throw new RuntimeOperationsException(new RuntimeException(e));
            }
            return;
        }

        // Nothing else is writable via JMX.
        throw new AttributeNotFoundException("Attribute " + attName +
            " can not be set.");
    }
1810
1811 protected void setCrawlOrderAttribute(final String attribute_name,
1812 final ComplexType ct, final Attribute attribute)
1813 throws AttributeNotFoundException, InvalidAttributeValueException,
1814 MBeanException, ReflectionException {
1815 String subName = attribute_name.startsWith("/") ? attribute_name
1816 .substring(1) : attribute_name;
1817 int index = subName.indexOf("/");
1818 if (index <= 0) {
1819 ct.setAttribute(new Attribute(subName, attribute.getValue()));
1820 return;
1821 }
1822 setCrawlOrderAttribute(subName.substring(index + 1), (ComplexType) ct
1823 .getAttribute(subName.substring(0, index)), attribute);
1824 }
1825
1826 public AttributeList setAttributes(AttributeList attributes) {
1827 if (attributes == null) {
1828 throw new RuntimeOperationsException(
1829 new IllegalArgumentException("attributeNames[] cannot be " +
1830 "null"), "Cannot call getAttributes with null attribute " +
1831 "names");
1832 }
1833
1834 AttributeList resultList = new AttributeList();
1835 if (attributes.size() == 0) {
1836 return resultList;
1837 }
1838 for (int i = 0; i < attributes.size(); i++) {
1839 try {
1840 Attribute attr = (Attribute)attributes.get(i);
1841 setAttributeInternal(attr);
1842 String an = attr.getName();
1843 Object newValue = getAttribute(an);
1844 resultList.add(new Attribute(an, newValue));
1845 } catch (Exception e) {
1846 e.printStackTrace();
1847 }
1848 }
1849
1850 kickUpdate();
1851 return resultList;
1852 }
1853
1854 public Object invoke(String operationName, Object[] params,
1855 String[] signature)
1856 throws ReflectionException {
1857 if (operationName == null) {
1858 throw new RuntimeOperationsException(
1859 new IllegalArgumentException("Operation name cannot be null"),
1860 "Cannot call invoke with null operation name");
1861 }
1862
1863 controller.installThreadContextSettingsHandler();
1864
1865 if (this.bdbjeOperationsNameList.contains(operationName)) {
1866 try {
1867 Object o = this.bdbjeMBeanHelper.invoke(
1868 this.controller.getBdbEnvironment(),
1869 operationName, params, signature);
1870
1871 if (operationName.equals(OP_DB_STAT)) {
1872 return o.toString();
1873 }
1874 return o;
1875 } catch (MBeanException e) {
1876 throw new RuntimeOperationsException(new RuntimeException(e));
1877 }
1878 }
1879
1880
1881
1882
1883
1884
1885
1886 if (operationName.equals(IMPORT_URI_OPER)) {
1887 JmxUtils.checkParamsCount(IMPORT_URI_OPER, params, 3);
1888 mustBeCrawling();
1889 try {
1890 importUri((String)params[0],
1891 ((Boolean)params[1]).booleanValue(),
1892 ((Boolean)params[2]).booleanValue());
1893 } catch (URIException e) {
1894 throw new RuntimeOperationsException(new RuntimeException(e));
1895 }
1896 return null;
1897 }
1898
1899 if (operationName.equals(IMPORT_URIS_OPER)) {
1900 JmxUtils.checkParamsCount(IMPORT_URIS_OPER, params, 4);
1901 mustBeCrawling();
1902 return importUris((String)params[0],
1903 ((String)params[1]).toString(),
1904 ((Boolean)params[2]).booleanValue(),
1905 ((Boolean)params[3]).booleanValue());
1906 }
1907
1908 if (operationName.equals(DUMP_URIS_OPER)) {
1909 JmxUtils.checkParamsCount(DUMP_URIS_OPER, params, 4);
1910 mustBeCrawling();
1911 if (!this.controller.isPaused()) {
1912 throw new RuntimeOperationsException(
1913 new IllegalArgumentException("Must " + "be paused"),
1914 "Cannot dump URI's from running job.");
1915 }
1916 dumpUris((String) params[0], (String) params[1],
1917 ((Integer) params[2]).intValue(), ((Boolean) params[3])
1918 .booleanValue());
1919 }
1920
1921 if (operationName.equals(PAUSE_OPER)) {
1922 JmxUtils.checkParamsCount(PAUSE_OPER, params, 0);
1923 mustBeCrawling();
1924 pause();
1925 return null;
1926 }
1927
1928 if (operationName.equals(RESUME_OPER)) {
1929 JmxUtils.checkParamsCount(RESUME_OPER, params, 0);
1930 mustBeCrawling();
1931 resume();
1932 return null;
1933 }
1934
1935 if (operationName.equals(FRONTIER_REPORT_OPER)) {
1936 JmxUtils.checkParamsCount(FRONTIER_REPORT_OPER, params, 1);
1937 mustBeCrawling();
1938 return getFrontierReport((String)params[0]);
1939 }
1940
1941 if (operationName.equals(THREADS_REPORT_OPER)) {
1942 JmxUtils.checkParamsCount(THREADS_REPORT_OPER, params, 0);
1943 mustBeCrawling();
1944 return getThreadsReport();
1945 }
1946
1947 if (operationName.equals(SEEDS_REPORT_OPER)) {
1948 JmxUtils.checkParamsCount(SEEDS_REPORT_OPER, params, 0);
1949 mustBeCrawling();
1950 StringWriter sw = new StringWriter();
1951 if (getStatisticsTracking() != null &&
1952 getStatisticsTracking() instanceof StatisticsTracker) {
1953 ((StatisticsTracker)getStatisticsTracking()).
1954 writeSeedsReportTo(new PrintWriter(sw));
1955 } else {
1956 sw.write("Unsupported");
1957 }
1958 return sw.toString();
1959 }
1960
1961 if (operationName.equals(CHECKPOINT_OPER)) {
1962 JmxUtils.checkParamsCount(CHECKPOINT_OPER, params, 0);
1963 mustBeCrawling();
1964 try {
1965 checkpoint();
1966 } catch (IllegalStateException e) {
1967 throw new RuntimeOperationsException(e);
1968 }
1969 return null;
1970 }
1971
1972 if (operationName.equals(PROGRESS_STATISTICS_OPER)) {
1973 JmxUtils.checkParamsCount(PROGRESS_STATISTICS_OPER, params, 0);
1974 mustBeCrawling();
1975 return getStatisticsTracking().getProgressStatisticsLine();
1976 }
1977
1978 if (operationName.equals(PROGRESS_STATISTICS_LEGEND_OPER)) {
1979 JmxUtils.checkParamsCount(PROGRESS_STATISTICS_LEGEND_OPER,
1980 params, 0);
1981 return getStatisticsTracking().progressStatisticsLegend();
1982 }
1983
1984 throw new ReflectionException(
1985 new NoSuchMethodException(operationName),
1986 "Cannot find the operation " + operationName);
1987 }
1988
1989 public void mustBeCrawling() {
1990 if (!isCrawling()) {
1991 throw new RuntimeOperationsException(
1992 new IllegalArgumentException("Not " +
1993 "crawling (Shouldn't ever be the case)"),
1994 "Not current crawling job?");
1995 }
1996 }
1997
1998 public boolean isCrawling() {
1999 return this.controller != null;
2000 }
2001
2002 /***
2003 * Utility method to get the stored list of ignored seed items (if any),
2004 * from the last time the seeds were imported to the frontier.
2005 *
2006 * @return String of all ignored seed items, or null if none
2007 */
2008 public String getIgnoredSeeds() {
2009 File ignoredFile = new File(getDirectory(),
2010 AbstractFrontier.IGNORED_SEEDS_FILENAME);
2011 if(!ignoredFile.exists()) {
2012 return null;
2013 }
2014 try {
2015 return FileUtils.readFileAsString(ignoredFile);
2016 } catch (IOException e) {
2017
2018 e.printStackTrace();
2019 return null;
2020 }
2021 }
2022
2023 /***
2024 * Forward a 'kick' update to current controller if any.
2025 * @see CrawlController#kickUpdate()
2026 */
2027 public void kickUpdate(){
2028 if (this.controller != null){
2029 this.controller.kickUpdate();
2030 }
2031 }
2032
2033 /***
2034 * Returns a URIFrontierMarker for the current, paused, job. If there is no
2035 * current job or it is not paused null will be returned.
2036 *
2037 * @param regexpr A regular expression that each URI must match in order to
2038 * be considered 'within' the marker.
2039 * @param inCacheOnly Limit marker scope to 'cached' URIs.
2040 * @return a URIFrontierMarker for the current job.
2041 * @see #getPendingURIsList(FrontierMarker, int, boolean)
2042 * @see org.archive.crawler.framework.Frontier#getInitialMarker(String,
2043 * boolean)
2044 * @see org.archive.crawler.framework.FrontierMarker
2045 */
2046 public FrontierMarker getInitialMarker(String regexpr,
2047 boolean inCacheOnly) {
2048 return (this.controller != null && this.controller.isPaused())?
2049 this.controller.getFrontier().getInitialMarker(regexpr, inCacheOnly):
2050 null;
2051 }
2052
2053 /***
2054 * Returns the frontiers URI list based on the provided marker. This method
2055 * will return null if there is not current job or if the current job is
2056 * not paused. Only when there is a paused current job will this method
2057 * return a URI list.
2058 *
2059 * @param marker URIFrontier marker
2060 * @param numberOfMatches Maximum number of matches to return
2061 * @param verbose Should detailed info be provided on each URI?
2062 * @return the frontiers URI list based on the provided marker
2063 * @throws InvalidFrontierMarkerException
2064 * When marker is inconsistent with the current state of the
2065 * frontier.
2066 * @see #getInitialMarker(String, boolean)
2067 * @see org.archive.crawler.framework.FrontierMarker
2068 */
2069 public ArrayList<String> getPendingURIsList(FrontierMarker marker,
2070 int numberOfMatches, boolean verbose)
2071 throws InvalidFrontierMarkerException {
2072 return (this.controller != null && this.controller.isPaused())?
2073 this.controller.getFrontier().getURIsList(marker, numberOfMatches,
2074 verbose):
2075 null;
2076 }
2077
2078 public void dumpUris(String filename, String regexp, int numberOfMatches,
2079 boolean verbose) {
2080 try {
2081 PrintWriter out = new PrintWriter(filename);
2082 FrontierMarker marker =
2083 controller.getFrontier().getInitialMarker(regexp, false);
2084 int matchesDumped = 0;
2085
2086 while(matchesDumped<numberOfMatches) {
2087 int batchMatches = Math.min(100, numberOfMatches-matchesDumped);
2088
2089 ArrayList<String> batchOfUris =
2090 getPendingURIsList(marker,batchMatches,false);
2091 for(String uriLine : batchOfUris) {
2092 out.write(uriLine);
2093 out.write("\n");
2094 matchesDumped++;
2095 }
2096 if (batchOfUris.size()<batchMatches) {
2097
2098 break;
2099 }
2100 }
2101 IOUtils.closeQuietly(out);
2102 } catch (FileNotFoundException e) {
2103 logger.log(Level.SEVERE, "Failed dumpUris write", e);
2104 } catch (InvalidFrontierMarkerException e) {
2105 logger.log(Level.SEVERE, "Failed dumpUris", e);
2106 }
2107 }
2108
2109 public void crawlStarted(String message) {
2110 if (this.mbeanName != null) {
2111
2112 sendNotification(new Notification("crawlStarted",
2113 this.mbeanName, getNotificationsSequenceNumber(), message));
2114 }
2115 }
2116
2117 public void crawlEnding(String sExitMessage) {
2118 setRunning(false);
2119 setStatus(sExitMessage);
2120 setReadOnly();
2121 if (this.mbeanName != null) {
2122 sendNotification(new Notification("crawlEnding", this.mbeanName,
2123 getNotificationsSequenceNumber(), sExitMessage));
2124 }
2125 }
2126
2127 public void crawlEnded(String sExitMessage) {
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139 }
2140
    /**
     * Records the transitional "pausing" status. No JMX notification is sent
     * here; that happens once actually paused (see crawlPaused).
     *
     * @param statusMessage status text to record.
     */
    public void crawlPausing(String statusMessage) {
        setStatus(statusMessage);
    }
2144
2145 public void crawlPaused(String statusMessage) {
2146 setStatus(statusMessage);
2147 if (this.mbeanName != null) {
2148
2149 sendNotification(new Notification("crawlPaused", this.mbeanName,
2150 getNotificationsSequenceNumber(), statusMessage));
2151 }
2152 }
2153
2154 public void crawlResuming(String statusMessage) {
2155 setStatus(statusMessage);
2156 if (this.mbeanName != null) {
2157
2158 sendNotification(new Notification("crawlResuming", this.mbeanName,
2159 getNotificationsSequenceNumber(), statusMessage));
2160 }
2161 }
2162
    /**
     * Records that a checkpoint is in progress. The checkpointDir argument
     * is unused here.
     *
     * @param checkpointDir directory of the checkpoint being taken (unused).
     * @throws Exception never thrown by this implementation.
     */
    public void crawlCheckpoint(File checkpointDir) throws Exception {
        setStatus(CrawlJob.STATUS_CHECKPOINTING);
    }
2166
    /**
     * @return The crawl controller attached to this job, or null if none.
     */
    public CrawlController getController() {
        return this.controller;
    }
2170
2171 public ObjectName preRegister(final MBeanServer server, ObjectName on)
2172 throws Exception {
2173 this.mbeanServer = server;
2174 @SuppressWarnings("unchecked")
2175 Hashtable<String,String> ht = on.getKeyPropertyList();
2176 if (!ht.containsKey(JmxUtils.NAME)) {
2177 throw new IllegalArgumentException("Name property required" +
2178 on.getCanonicalName());
2179 }
2180
2181
2182
2183 Heritrix h = getHostingHeritrix();
2184 if (h == null || h.getMBeanName() == null) {
2185 throw new IllegalArgumentException("Hosting heritrix not found " +
2186 "or not registered with JMX: " + on.getCanonicalName());
2187 }
2188 @SuppressWarnings("unchecked")
2189 Map<String,String> hht = h.getMBeanName().getKeyPropertyList();
2190 ht.put(JmxUtils.MOTHER, hht.get(JmxUtils.NAME));
2191 String port = hht.get(JmxUtils.JMX_PORT);
2192 if (port != null) {
2193 ht.put(JmxUtils.JMX_PORT, port);
2194 }
2195 ht.put(JmxUtils.HOST, hht.get(JmxUtils.HOST));
2196 if (!ht.containsKey(JmxUtils.TYPE)) {
2197 ht.put(JmxUtils.TYPE, CRAWLJOB_JMXMBEAN_TYPE);
2198 }
2199 this.mbeanName = new ObjectName(on.getDomain(), ht);
2200 return this.mbeanName;
2201 }
2202
2203 public void postRegister(Boolean registrationDone) {
2204 if (logger.isLoggable(Level.INFO)) {
2205 logger.info(
2206 JmxUtils.getLogRegistrationMsg(this.mbeanName.getCanonicalName(),
2207 this.mbeanServer, registrationDone.booleanValue()));
2208 }
2209 }
2210
    /**
     * JMX pre-deregistration hook. Nothing to do; intentionally empty.
     *
     * @throws Exception never thrown by this implementation.
     */
    public void preDeregister() throws Exception {

    }
2214
2215 public void postDeregister() {
2216 if (mbeanName == null) {
2217 return;
2218 }
2219 if (logger.isLoggable(Level.INFO)) {
2220 logger.info(JmxUtils.getLogUnregistrationMsg(
2221 this.mbeanName.getCanonicalName(), this.mbeanServer));
2222 }
2223 this.mbeanName = null;
2224 }
2225
2226 /***
2227 * @return Heritrix that is hosting this job.
2228 */
2229 protected Heritrix getHostingHeritrix() {
2230 Heritrix hostingHeritrix = null;
2231 Map heritrice = Heritrix.getInstances();
2232 for (final Iterator i = heritrice.keySet().iterator(); i.hasNext();) {
2233 Heritrix h = (Heritrix)heritrice.get(i.next());
2234 if (h.getJobHandler().getCurrentJob() == this) {
2235 hostingHeritrix = h;
2236 break;
2237 }
2238 }
2239 return hostingHeritrix;
2240 }
2241
2242 /***
2243 * @return Unique name for job that is safe to use in jmx (Like display
2244 * name but without spaces).
2245 */
2246 public String getJmxJobName() {
2247 return getJobName() + "-" + getUID();
2248 }
2249
2250 /***
2251 * @return Notification sequence number (Does increment after each access).
2252 */
2253 protected static int getNotificationsSequenceNumber() {
2254 return notificationsSequenceNumber++;
2255 }
2256
    /**
     * @return This job's JMX ObjectName, or null if not (or no longer)
     * registered.
     */
    protected ObjectName getMbeanName() {
        return this.mbeanName;
    }
2260
2261 /***
2262 * @return the statistics tracking instance (of null if none yet available).
2263 */
2264 public StatisticsTracking getStatisticsTracking() {
2265 return this.controller == null ||
2266 this.controller.getStatistics() == null? null:
2267 this.controller.getStatistics();
2268 }
2269 }