package org.archive.crawler.admin;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.URI;
import java.net.URL;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.List;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.Attribute;
import javax.management.AttributeNotFoundException;
import javax.management.InvalidAttributeValueException;
import javax.management.MBeanException;
import javax.management.ReflectionException;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.Heritrix;
import org.archive.crawler.datamodel.CrawlOrder;
import org.archive.crawler.event.CrawlStatusListener;
import org.archive.crawler.framework.FrontierMarker;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.framework.exceptions.InitializationException;
import org.archive.crawler.framework.exceptions.InvalidFrontierMarkerException;
import org.archive.crawler.frontier.FrontierJournal;
import org.archive.crawler.settings.ComplexType;
import org.archive.crawler.settings.CrawlerSettings;
import org.archive.crawler.settings.SettingsHandler;
import org.archive.crawler.settings.XMLSettingsHandler;
import org.archive.util.ArchiveUtils;
import org.archive.util.FileUtils;

/**
 * This class manages CrawlJobs. Submitted crawl jobs are queued up and run
 * in order when the crawler is running.
 * <p>Basically this provides a layer between any potential user interface and
 * the CrawlJobs. It keeps the lists of completed jobs, pending jobs, etc.
 * <p>
 * The jobs managed by the handler can be divided into the following:
 * <ul>
 * <li> <code>Pending</code> - Jobs that are ready to run and are waiting their
 *                             turn. These can be edited, viewed, deleted etc.
 * <li> <code>Running</code> - Only one job can be running at a time. There may
 *                             be no job running. The running job can be viewed
 *                             and edited to some extent. It can also be
 *                             terminated. This job should have a
 *                             StatisticsTracking module attached to it for more
 *                             details on the crawl.
 * <li><code>Completed</code> - Jobs that have finished crawling or have been
 *                              deleted from the pending queue or terminated
 *                              while running. They cannot be edited but can be
 *                              viewed. They retain the StatisticsTracking
 *                              module from their run.
 * <li> <code>New job</code> - At any given time there can be one 'new job'. The
 *                             new job is not considered ready to run. It can
 *                             be edited or discarded (in which case it will be
 *                             totally destroyed, including any files on disk).
 *                             Once an operator deems the job ready to run it
 *                             can be moved to the pending queue.
 * <li> <code>Profiles</code> - Jobs under profiles are not actual jobs. They can
 *                              be edited normally but cannot be submitted to
 *                              the pending queue. New jobs can be created
 *                              using a profile as their template.
 * </ul>
 *
 * @author Kristinn Sigurdsson
 *
 * @see org.archive.crawler.admin.CrawlJob
 */
public class CrawlJobHandler implements CrawlStatusListener {
    private static final Logger logger =
        Logger.getLogger(CrawlJobHandler.class.getName());

    /**
     * Name of the system property whose value, if set, overrides the default
     * profile used.
     */
    public static final String DEFAULT_PROFILE_NAME
        = "heritrix.default.profile";

    /**
     * Default profile name.
     */
    public static final String DEFAULT_PROFILE = "default";

    /**
     * Name of the profiles directory.
     */
    public static final String PROFILES_DIR_NAME = "profiles";

    /**
     * Name of the order file a job or profile is configured by.
     */
    public static final String ORDER_FILE_NAME = "order.xml";

    /**
     * Job currently being crawled.
     */
    private CrawlJob currentJob = null;

    /**
     * A new job that is being created/configured. Not yet ready for crawling.
     */
    private CrawlJob newJob = null;

    /**
     * Thread that starts the next job in the background.
     */
    private Thread startingNextJob = null;

    /**
     * A list of pending CrawlJobs.
     */
    private TreeSet<CrawlJob> pendingCrawlJobs;

    /**
     * A list of completed CrawlJobs.
     */
    private TreeSet<CrawlJob> completedCrawlJobs;

    /**
     * A list of profile CrawlJobs.
     */
    private TreeSet<CrawlJob> profileJobs;

    /**
     * Name of the current default profile.
     */
    private String defaultProfile = null;

    /**
     * If true the crawler is 'running'. That is, the next pending job will
     * start crawling as soon as the current job (if any) is completed.
     */
    private boolean running = false;

    /**
     * String to indicate recovery should be based on the recovery log, not
     * on checkpointing.
     */
    public static final String RECOVER_LOG = "recover";

    /**
     * Jobs directory.
     */
    private final File jobsDir;

    /**
     * Constructor.
     * @param jobsDir Jobs directory.
     */
    public CrawlJobHandler(final File jobsDir) {
        this(jobsDir, true, true);
    }

    /**
     * Constructor allowing for optional loading of profiles and jobs.
     * @param jobsDir Jobs directory.
     * @param loadJobs If true then any applicable jobs will be loaded.
     * @param loadProfiles If true then any applicable profiles will be loaded.
     */
    public CrawlJobHandler(final File jobsDir,
            final boolean loadJobs, final boolean loadProfiles) {
        this.jobsDir = jobsDir;
        // Order jobs by priority, then by unique ID, so jobs of equal
        // priority never compare as equal and are not dropped by the
        // TreeSet-backed queues.
        Comparator<CrawlJob> comp = new Comparator<CrawlJob>(){
            public int compare(CrawlJob job1, CrawlJob job2) {
                if (job1.getJobPriority() < job2.getJobPriority()) {
                    return -1;
                } else if (job1.getJobPriority() > job2.getJobPriority()) {
                    return 1;
                } else {
                    // Same priority; fall back on the unique job ID.
                    return job1.getUID().compareTo(job2.getUID());
                }
            }
        };
        this.pendingCrawlJobs = new TreeSet<CrawlJob>(comp);
        this.completedCrawlJobs = new TreeSet<CrawlJob>(comp);
        this.profileJobs = new TreeSet<CrawlJob>(comp);
        if (loadProfiles){
            loadProfiles();
        }
        if (loadJobs){
            loadJobs();
        }
    }
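
    // Illustrative usage sketch (the directory name is hypothetical): build a
    // handler over a jobs directory, loading any profiles and jobs found on
    // disk.
    //
    //   CrawlJobHandler handler = new CrawlJobHandler(new File("jobs"));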

    /**
     * Find the state.job file in the job directory.
     * @param jobDir Directory to look in.
     * @return Full path to 'state.job' file or null if none found.
     */
    protected File getStateJobFile(final File jobDir) {
        // Accept any single readable file ending in '.job'.
        File[] jobFiles = jobDir.listFiles(new FilenameFilter() {
            public boolean accept(File dir, String name) {
                return name.toLowerCase().endsWith(".job") &&
                    (new File(dir, name)).canRead();
            }
        });
        return (jobFiles.length == 1)? jobFiles[0]: null;
    }

    /**
     * Loads any available jobs in the jobs directory.
     * <p>
     * Available jobs are any directory containing a file called
     * <code>state.job</code>. The file must contain valid job information.
     */
    private void loadJobs() {
        this.jobsDir.mkdirs();
        File[] jobs = this.jobsDir.listFiles();
        for (int i = 0; i < jobs.length; i++) {
            if (jobs[i].isDirectory()) {
                File jobFile = getStateJobFile(jobs[i]);
                if (jobFile != null) {
                    loadJob(jobFile);
                }
            }
        }
    }

    /**
     * Loads a job given a specific job file. The loaded job will be placed in
     * the list of completed jobs or the pending queue depending on its status.
     * Running jobs will have their status set to 'finished abnormally' and be
     * put into the completed list.
     * @param job The job file of the job to load.
     */
    protected void loadJob(final File job) {
        CrawlJob cjob = null;
        try {
            // Load the CrawlJob from its state file.
            cjob = new CrawlJob(job, new CrawlJobErrorHandler());
        } catch (InvalidJobFileException e) {
            logger.log(Level.INFO,
                "Invalid job file for " + job.getAbsolutePath(), e);
            return;
        } catch (IOException e) {
            logger.log(Level.INFO, "IOException for " + job.getName() +
                ", " + job.getAbsolutePath(), e);
            return;
        }

        // Place the job in the appropriate list based on its recorded status.
        if (cjob.getStatus().equals(CrawlJob.STATUS_RUNNING)
                || cjob.getStatus().equals(CrawlJob.STATUS_PAUSED)
                || cjob.getStatus().equals(CrawlJob.STATUS_CHECKPOINTING)
                || cjob.getStatus().equals(CrawlJob.STATUS_WAITING_FOR_PAUSE)) {
            // The job was running when Heritrix went down; it must have been
            // terminated abnormally.
            cjob.setStatus(CrawlJob.STATUS_FINISHED_ABNORMAL);
            this.completedCrawlJobs.add(cjob);
        } else if (cjob.getStatus().equals(CrawlJob.STATUS_PENDING)) {
            // Was a pending job.
            this.pendingCrawlJobs.add(cjob);
        } else if (cjob.getStatus().equals(CrawlJob.STATUS_CREATED)
                || cjob.getStatus().equals(CrawlJob.STATUS_DELETED)) {
            // Ignore jobs that were never submitted or were deleted.
        } else {
            // Assume the job completed.
            this.completedCrawlJobs.add(cjob);
        }
    }

    /**
     * Looks in the conf dir for a profiles dir.
     * @return the directory where profiles are stored, or null if none is
     * available
     * @throws IOException
     */
    private File getProfilesDirectory() throws IOException {
        URL webappProfilePath = Heritrix.class.getResource("/" +
            PROFILES_DIR_NAME);
        if (webappProfilePath != null) {
            try {
                return new File(new URI(webappProfilePath.toString()));
            } catch (java.lang.IllegalArgumentException e) {
                // Thrown when the resource URL is not a plain file URI
                // (e.g. the resource lives inside a jar); fall through and
                // use the conf dir below.
            } catch (java.net.URISyntaxException e) {
                e.printStackTrace();
            }
        }
        return (Heritrix.getConfdir(false) == null)? null:
            new File(Heritrix.getConfdir().getAbsolutePath(),
                PROFILES_DIR_NAME);
    }

    /**
     * Loads the default profile and all other profiles found on disk.
     */
    private void loadProfiles() {
        boolean loadedDefault = false;
        File profileDir = null;
        try {
            profileDir = getProfilesDirectory();
        } catch (IOException e) {
            e.printStackTrace();
        }
        if (profileDir != null) {
            File[] ps = profileDir.listFiles();
            if (ps != null && ps.length > 0) {
                for (int i = 0; i < ps.length; i++) {
                    File f = ps[i];
                    if (f.isDirectory()) {
                        // Each subdirectory holding a readable order file is
                        // a profile.
                        File profile = new File(f, ORDER_FILE_NAME);
                        if (profile.canRead()) {
                            boolean b = loadProfile(profile);
                            if (b) {
                                loadedDefault = b;
                            }
                        }
                    }
                }
            }
        }
        // If the default profile was not found above, try loading it from the
        // standard profiles location.
        String parent = File.separator + PROFILES_DIR_NAME + File.separator;
        if (!loadedDefault) {
            loadProfile(new File(parent + DEFAULT_PROFILE, ORDER_FILE_NAME));
        }
        defaultProfile = DEFAULT_PROFILE;
    }

    /**
     * Load one profile.
     * @param profile Profile to load.
     * @return True if the loaded profile was the default profile.
     */
    protected boolean loadProfile(File profile) {
        boolean loadedDefault = false;
        // Got the order file for this profile; try to load it.
        try {
            XMLSettingsHandler newSettingsHandler =
                new XMLSettingsHandler(profile);
            CrawlJobErrorHandler cjseh =
                new CrawlJobErrorHandler(Level.SEVERE);
            newSettingsHandler.
                setErrorReportingLevel(cjseh.getLevel());
            newSettingsHandler.initialize();
            addProfile(new CrawlJob(profile.getParentFile().getName(),
                newSettingsHandler, cjseh));
            loadedDefault = profile.getParentFile().getName().
                equals(DEFAULT_PROFILE);
        } catch (InvalidAttributeValueException e) {
            System.err.println("Failed to load profile '" +
                profile.getParentFile().getName() +
                "'. InvalidAttributeValueException.");
        }
        return loadedDefault;
    }

    /**
     * Add a new profile.
     * @param profile The new profile.
     */
    public synchronized void addProfile(CrawlJob profile){
        profileJobs.add(profile);
    }

    /**
     * Delete a profile, removing both its on-disk directory and its entry in
     * the profiles list.
     * @param cj The profile to delete.
     * @throws IOException If no such profile exists on disk.
     */
    public synchronized void deleteProfile(CrawlJob cj) throws IOException {
        File d = getProfilesDirectory();
        File p = new File(d, cj.getJobName());
        if (!p.exists()) {
            throw new IOException("No profile named " + cj.getJobName() +
                " at " + d.getAbsolutePath());
        }
        FileUtils.deleteDir(p);
        this.profileJobs.remove(cj);
    }

    /**
     * Returns a List of all known profiles.
     * @return a List of all known profiles.
     */
    public synchronized List<CrawlJob> getProfiles(){
        ArrayList<CrawlJob> tmp = new ArrayList<CrawlJob>(profileJobs.size());
        tmp.addAll(profileJobs);
        return tmp;
    }

    /**
     * Submit a job to the handler. The job will be scheduled for crawling. At
     * present it will not take the job's priority into consideration.
     *
     * @param job A new job for the handler
     * @return CrawlJob that was added or null.
     */
    public CrawlJob addJob(CrawlJob job) {
        if (job.isProfile()) {
            // Profiles cannot be crawled.
            return null;
        }
        job.setStatus(CrawlJob.STATUS_PENDING);
        if (job.isNew()) {
            // Submitting the handler's 'new job'; clear it out.
            this.newJob = null;
            job.setNew(false);
        }
        this.pendingCrawlJobs.add(job);
        if (!isCrawling() && isRunning()) {
            // Start crawling.
            startNextJob();
        }
        return job;
    }

    /**
     * Returns the default profile. If no default profile has been set it will
     * return the first profile that was set/loaded and still exists. If no
     * profiles exist it will return null.
     * @return the default profile.
     */
    public synchronized CrawlJob getDefaultProfile() {
        if (defaultProfile != null) {
            for (Iterator<CrawlJob> it = profileJobs.iterator(); it.hasNext();) {
                CrawlJob item = it.next();
                if (item.getJobName().equals(defaultProfile)) {
                    // Found it.
                    return item;
                }
            }
        }
        if (profileJobs.size() > 0) {
            return profileJobs.first();
        }
        return null;
    }

    /**
     * Set the default profile.
     * @param profile The new default profile. It must satisfy the following:
     * <code>profile.isProfile()</code> should return true and
     * <code>this.getProfiles()</code> should contain it.
     */
    public void setDefaultProfile(CrawlJob profile) {
        defaultProfile = profile.getJobName();
    }

    /**
     * A List of all pending jobs.
     *
     * @return A List of all pending jobs.
     * No promises are made about the order of the list.
     */
    public List<CrawlJob> getPendingJobs() {
        ArrayList<CrawlJob> tmp
            = new ArrayList<CrawlJob>(pendingCrawlJobs.size());
        tmp.addAll(pendingCrawlJobs);
        return tmp;
    }

    /**
     * @return The job currently being crawled.
     */
    public CrawlJob getCurrentJob() {
        return currentJob;
    }

    /**
     * @return A List of all finished jobs.
     */
    public List<CrawlJob> getCompletedJobs() {
        ArrayList<CrawlJob> tmp
            = new ArrayList<CrawlJob>(completedCrawlJobs.size());
        tmp.addAll(completedCrawlJobs);
        return tmp;
    }

    /**
     * Return a job with the given UID.
     * Doesn't matter if it's pending, currently running, has finished running,
     * is new, or is a profile.
     *
     * @param jobUID The unique ID of the job.
     * @return The job with the UID, or null if no such job is found.
     */
    public CrawlJob getJob(String jobUID) {
        if (jobUID == null) {
            return null;
        }
        // First check the currently running job.
        if (currentJob != null && currentJob.getUID().equals(jobUID)) {
            return currentJob;
        } else if (newJob != null && newJob.getUID().equals(jobUID)) {
            // Then check the 'new job'.
            return newJob;
        } else {
            // Then check the pending jobs.
            Iterator<CrawlJob> itPend = pendingCrawlJobs.iterator();
            while (itPend.hasNext()) {
                CrawlJob cj = itPend.next();
                if (cj.getUID().equals(jobUID)) {
                    return cj;
                }
            }

            // Next check the completed jobs.
            Iterator<CrawlJob> itComp = completedCrawlJobs.iterator();
            while (itComp.hasNext()) {
                CrawlJob cj = itComp.next();
                if (cj.getUID().equals(jobUID)) {
                    return cj;
                }
            }

            // And finally check the profiles.
            for (Iterator<CrawlJob> i = getProfiles().iterator(); i.hasNext();) {
                CrawlJob cj = i.next();
                if (cj.getUID().equals(jobUID)) {
                    return cj;
                }
            }
        }
        return null; // Nothing found; return null.
    }

    /**
     * @return True if we terminated a current job (false if there was no job
     * to terminate).
     */
    public boolean terminateCurrentJob() {
        if (this.currentJob == null) {
            return false;
        }
        // Make sure a pending job start isn't still in progress.
        if (startingNextJob != null) {
            try {
                startingNextJob.join();
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
        // Stop the crawl; the crawlEnding()/crawlEnded() callbacks below
        // handle the clean-up.
        this.currentJob.stopCrawling();
        synchronized (this) {
            try {
                // Wait up to a few seconds for the job to wind down;
                // crawlEnding() will notifyAll() if it happens sooner.
                wait(3000);
            } catch (InterruptedException e) {
                // Ignore.
            }
        }
        return true;
    }

    /**
     * The specified job will be removed from the pending queue or aborted if
     * currently running. It will be placed in the list of completed jobs with
     * appropriate status info. If the job is already in the completed list or
     * no job with the given UID is found, no action will be taken.
     *
     * @param jobUID The UID (unique ID) of the job that is to be deleted.
     */
    public void deleteJob(String jobUID) {
        // First check to see if the job is the current job.
        if (currentJob != null && jobUID.equals(currentJob.getUID())) {
            terminateCurrentJob();
            return;
        }

        // Then check the pending jobs.
        for (Iterator<CrawlJob> it = pendingCrawlJobs.iterator(); it.hasNext();) {
            CrawlJob cj = it.next();
            if (cj.getUID().equals(jobUID)) {
                // Found the one to delete.
                cj.setStatus(CrawlJob.STATUS_DELETED);
                it.remove();
                return;
            }
        }

        // And finally check the completed jobs.
        for (Iterator<CrawlJob> it = completedCrawlJobs.iterator(); it.hasNext();) {
            CrawlJob cj = it.next();
            if (cj.getUID().equals(jobUID)) {
                // Found the one to delete.
                cj.setStatus(CrawlJob.STATUS_DELETED);
                it.remove();
                return;
            }
        }
    }

    /**
     * Cause the current job to pause. If no current job is crawling this
     * method will have no effect.
     */
    public void pauseJob() {
        if (this.currentJob != null) {
            this.currentJob.pause();
        }
    }

    /**
     * Cause the current job to resume crawling if it was paused. Will have no
     * effect if the current job was not paused or if there is no current job.
     * If the current job is still waiting to pause, this will not take effect
     * until the job has actually paused. At that time it will immediately
     * resume crawling.
     */
    public void resumeJob() {
        if (this.currentJob != null) {
            this.currentJob.resume();
        }
    }

    /**
     * Cause the current job to write a checkpoint to disk. Currently
     * requires the job to already be paused.
     * @throws IllegalStateException Thrown if crawl is not paused.
     */
    public void checkpointJob() throws IllegalStateException {
        if (this.currentJob != null) {
            this.currentJob.checkpoint();
        }
    }

    /**
     * Returns a unique job ID.
     * <p>
     * No two calls to this method (on the same instance of this class) can
     * ever return the same value. <br>
     * Currently implemented to return a time stamp. That is subject to change
     * though.
     *
     * @return A unique job ID.
     *
     * @see ArchiveUtils#TIMESTAMP17
     */
    public String getNextJobUID() {
        return ArchiveUtils.get17DigitDate();
    }
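
    // For example, a UID produced by this method looks something like
    // "20050415103045123": ArchiveUtils.get17DigitDate() renders the current
    // time as a 17-digit timestamp (the TIMESTAMP17 format).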

    /**
     * Creates a new job. The new job will be returned and also registered as
     * the handler's 'new job'. The new job will be based on the settings
     * provided but created in a new location on disk.
     *
     * @param baseOn
     *            A CrawlJob (with a valid settingshandler) to use as the
     *            template for the new job.
     * @param recovery Whether to preinitialize the new job as a recovery of
     * the <code>baseOn</code> job. The String holds RECOVER_LOG if we are to
     * do the recovery based off the recover.gz log -- see RecoveryJournal in
     * the frontier package -- or it holds the name of
     * the checkpoint we're to use for the recovery.
     * @param name
     *            The name of the new job.
     * @param description
     *            Description of the job.
     * @param seeds
     *            The contents of the new settings' seed file.
     * @param priority
     *            The priority of the new job.
     *
     * @return The new crawl job.
     * @throws FatalConfigurationException If a problem occurs creating the
     *             settings.
     */
    public CrawlJob newJob(CrawlJob baseOn, String recovery, String name,
            String description, String seeds, int priority)
    throws FatalConfigurationException {
        // If we're recovering, resolve the source to recover from.
        File recover = null;
        try {
            if (recovery != null && recovery.length() > 0
                    && recovery.equals(RECOVER_LOG)) {
                // Recover from the recovery journal in the logs directory.
                File dir = baseOn.getSettingsHandler().getOrder()
                    .getSettingsDir(CrawlOrder.ATTR_LOGS_PATH);
                recover = new File(dir, FrontierJournal.LOGNAME_RECOVER);
            } else if (recovery != null && recovery.length() > 0) {
                // Must be the name of a checkpoint to use.
                recover = new File(baseOn.getSettingsHandler().
                    getOrder().getSettingsDir(CrawlOrder.ATTR_CHECKPOINTS_PATH),
                    recovery);
            }
        } catch (AttributeNotFoundException e1) {
            throw new FatalConfigurationException(
                "AttributeNotFoundException occurred while setting up " +
                "new job/profile " + name + "\n" + e1.getMessage());
        }

        CrawlJob cj = createNewJob(baseOn.getSettingsHandler().getOrderFile(),
            name, description, seeds, priority);

        updateRecoveryPaths(recover, cj.getSettingsHandler(), name);

        return cj;
    }
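
    // Illustrative sketch (the names, seeds and handler variable are
    // hypothetical): create a job that recovers a previous job from its
    // recovery journal, then submit it to the pending queue.
    //
    //   CrawlJob recovered = handler.newJob(oldJob, CrawlJobHandler.RECOVER_LOG,
    //       "site-recrawl", "Recovery of earlier crawl", "http://example.com/",
    //       CrawlJob.PRIORITY_AVERAGE);
    //   handler.addJob(recovered);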

    /**
     * Creates a new job. The new job will be returned and also registered as
     * the handler's 'new job'. The new job will be based on the settings
     * provided but created in a new location on disk.
     * @param orderFile Order file to use as the template for the new job.
     * @param name The name of the new job.
     * @param description Description of the job.
     * @param seeds The contents of the new settings' seed file.
     *
     * @return The new crawl job.
     * @throws FatalConfigurationException If a problem occurs creating the
     *             settings.
     */
    public CrawlJob newJob(final File orderFile, final String name,
            final String description, final String seeds)
    throws FatalConfigurationException {
        return createNewJob(orderFile, name, description, seeds,
            CrawlJob.PRIORITY_AVERAGE);
    }

    protected void checkDirectory(File dir)
    throws FatalConfigurationException {
        if (dir == null) {
            return;
        }
        if (!dir.exists() || !dir.canRead()) {
            throw new FatalConfigurationException(dir.getAbsolutePath() +
                " does not exist or is unreadable");
        }
    }

    protected CrawlJob createNewJob(final File orderFile, final String name,
            final String description, final String seeds, final int priority)
    throws FatalConfigurationException {
        if (newJob != null) {
            // There already is a 'new job'; discard it.
            discardNewJob();
        }
        String uid = getNextJobUID();
        File jobDir = new File(this.jobsDir, name + "-" + uid);
        CrawlJobErrorHandler errorHandler = new CrawlJobErrorHandler();
        XMLSettingsHandler handler =
            createSettingsHandler(orderFile, name, description,
                seeds, jobDir, errorHandler, "order.xml", "seeds.txt");
        this.newJob = new CrawlJob(uid, name, handler, errorHandler, priority,
            jobDir);
        return this.newJob;
    }

    /**
     * Creates a new profile. The new profile will be returned and also added
     * to the handler's list of profiles. The new profile will be based on
     * the settings provided but created in a new location on disk.
     *
     * @param baseOn
     *            A CrawlJob (with a valid settingshandler) to use as the
     *            template for the new profile.
     * @param name
     *            The name of the new profile.
     * @param description
     *            Description of the new profile.
     * @param seeds
     *            The contents of the new profile's seed file.
     * @return The new profile.
     * @throws FatalConfigurationException
     * @throws IOException
     */
    public CrawlJob newProfile(CrawlJob baseOn, String name, String description,
            String seeds)
    throws FatalConfigurationException, IOException {
        File profileDir = new File(getProfilesDirectory().getAbsoluteFile(),
            name);
        CrawlJobErrorHandler cjseh = new CrawlJobErrorHandler(Level.SEVERE);
        CrawlJob newProfile = new CrawlJob(name,
            createSettingsHandler(baseOn.getSettingsHandler().getOrderFile(),
                name, description, seeds, profileDir, cjseh, "order.xml",
                "seeds.txt"), cjseh);
        addProfile(newProfile);
        return newProfile;
    }

    /**
     * Creates a new settings handler based on an existing job. Basically all
     * the settings files for the 'based on' job will be copied to the
     * specified directory.
     *
     * @param orderFile Order file to base the new order file on. Cannot be null.
     * @param name Name for the new settings.
     * @param description Description of the new settings.
     * @param seeds The contents of the new settings' seed file.
     * @param newSettingsDir Directory to write the new settings to.
     * @param errorHandler Error handler for the new settings.
     * @param filename Name of new order file.
     * @param seedfile Name of new seeds file.
     *
     * @return The new settings handler.
     * @throws FatalConfigurationException
     *             If there are problems with reading the 'based on'
     *             configuration, or with writing the new configuration or its
     *             seed file.
     */
    protected XMLSettingsHandler createSettingsHandler(
            final File orderFile, final String name, final String description,
            final String seeds, final File newSettingsDir,
            final CrawlJobErrorHandler errorHandler,
            final String filename, final String seedfile)
    throws FatalConfigurationException {
        XMLSettingsHandler newHandler = null;
        try {
            newHandler = new XMLSettingsHandler(orderFile);
            if (errorHandler != null) {
                newHandler.registerValueErrorHandler(errorHandler);
                newHandler.setErrorReportingLevel(errorHandler.getLevel());
            }
            newHandler.initialize();
        } catch (InvalidAttributeValueException e2) {
            throw new FatalConfigurationException(
                "InvalidAttributeValueException occurred while creating" +
                " new settings handler for new job/profile\n" +
                e2.getMessage());
        }

        // Make sure the new settings directory exists.
        newSettingsDir.mkdirs();

        try {
            // Set the seed file name in the new order.
            ((ComplexType)newHandler.getOrder().getAttribute("scope"))
                .setAttribute(new Attribute("seedsfile", seedfile));
        } catch (AttributeNotFoundException e1) {
            throw new FatalConfigurationException(
                "AttributeNotFoundException occurred while setting up" +
                " new job/profile\n" + e1.getMessage());
        } catch (InvalidAttributeValueException e1) {
            throw new FatalConfigurationException(
                "InvalidAttributeValueException occurred while setting" +
                " up new job/profile\n" + e1.getMessage());
        } catch (MBeanException e1) {
            throw new FatalConfigurationException(
                "MBeanException occurred while setting up new" +
                " job/profile\n" + e1.getMessage());
        } catch (ReflectionException e1) {
            throw new FatalConfigurationException(
                "ReflectionException occurred while setting up" +
                " new job/profile\n" + e1.getMessage());
        }

        File newFile = new File(newSettingsDir.getAbsolutePath(), filename);
        // Copy the settings files into the new directory.
        try {
            newHandler.copySettings(newFile, (String)newHandler.getOrder()
                .getAttribute(CrawlOrder.ATTR_SETTINGS_DIRECTORY));
        } catch (IOException e3) {
            // Print the stack trace to aid debugging before rethrowing.
            e3.printStackTrace();
            throw new FatalConfigurationException(
                "IOException occurred while writing new settings files" +
                " for new job/profile\n" + e3.getMessage());
        } catch (AttributeNotFoundException e) {
            throw new FatalConfigurationException(
                "AttributeNotFoundException occurred while writing new" +
                " settings files for new job/profile\n" + e.getMessage());
        } catch (MBeanException e) {
            throw new FatalConfigurationException(
                "MBeanException occurred while writing new settings files" +
                " for new job/profile\n" + e.getMessage());
        } catch (ReflectionException e) {
            throw new FatalConfigurationException(
                "ReflectionException occurred while writing new settings" +
                " files for new job/profile\n" + e.getMessage());
        }
        CrawlerSettings orderfile = newHandler.getSettingsObject(null);
        orderfile.setName(name);
        orderfile.setDescription(description);

        if (seeds != null) {
            // Write the passed seeds to the new seed file.
            BufferedWriter writer = null;
            try {
                writer = new BufferedWriter(
                    new OutputStreamWriter(
                        new FileOutputStream(
                            newHandler.getPathRelativeToWorkingDirectory(seedfile)),
                        "UTF-8"));
                try {
                    writer.write(seeds);
                } finally {
                    writer.close();
                }
            } catch (IOException e) {
                throw new FatalConfigurationException(
                    "IOException occurred while writing the seed file for a"
                    + " new job/profile\n" + e.getMessage());
            }
        }
        return newHandler;
    }

    /**
     * @param recover
     *            Source to use recovering. Can be full path to a recovery log
     *            or full path to a checkpoint src dir.
     * @param sh
     *            Settings Handler to update.
     * @param jobName
     *            Name of this job.
     * @throws FatalConfigurationException
     */
    protected void updateRecoveryPaths(final File recover,
            final SettingsHandler sh, final String jobName)
    throws FatalConfigurationException {
        if (recover == null) {
            return;
        }
        checkDirectory(recover);
        try {
            // Point the new job at the passed recovery dir or log.
            updateRecoveryPaths(recover, sh);
        } catch (AttributeNotFoundException e1) {
            throw new FatalConfigurationException(
                "AttributeNotFoundException occurred while setting up"
                + " new job/profile " + jobName + "\n"
                + e1.getMessage());
        } catch (InvalidAttributeValueException e1) {
            throw new FatalConfigurationException(
                "InvalidAttributeValueException occurred while setting up"
                + " new job/profile " + jobName + "\n"
                + e1.getMessage());
        } catch (MBeanException e1) {
            throw new FatalConfigurationException(
                "MBeanException occurred while setting up"
                + " new job/profile " + jobName + "\n"
                + e1.getMessage());
        } catch (ReflectionException e1) {
            throw new FatalConfigurationException(
                "ReflectionException occurred while setting up"
                + " new job/profile " + jobName + "\n"
                + e1.getMessage());
        } catch (IOException e) {
            throw new FatalConfigurationException(
                "IOException occurred while setting up new job/profile "
                + jobName + "\n" + e.getMessage());
        }
    }

    /**
     * @param recover
     *            Source to use recovering. Can be full path to a recovery log
     *            or full path to a checkpoint src dir.
     * @param newHandler Settings handler of the new job.
     * @throws ReflectionException
     * @throws MBeanException
     * @throws InvalidAttributeValueException
     * @throws AttributeNotFoundException
     * @throws IOException
     */
    private void updateRecoveryPaths(final File recover,
            SettingsHandler newHandler)
    throws AttributeNotFoundException, InvalidAttributeValueException,
            MBeanException, ReflectionException, IOException {
        if (recover == null || !recover.exists()) {
            throw new IOException("Recovery src does not exist: " + recover);
        }
        newHandler.getOrder().setAttribute(
            new Attribute(CrawlOrder.ATTR_RECOVER_PATH,
                recover.getAbsolutePath()));

        // Make sure the new job's 'logs' and 'state' directories do not
        // collide with directories that are already in use; while a candidate
        // directory is non-empty, append the recovery suffix and retry.
        File newLogsDisk = null;
        final String RECOVERY_SUFFIX = "-R";
        while (true) {
            try {
                newLogsDisk = newHandler.getOrder().
                    getSettingsDir(CrawlOrder.ATTR_LOGS_PATH);
            } catch (AttributeNotFoundException e) {
                logger.log(Level.SEVERE, "Failed to get logs directory", e);
                break;
            }
            if (newLogsDisk.list().length > 0) {
                // Directory is in use; add the recovery suffix and re-check.
                String logsPath = (String) newHandler.getOrder().
                    getAttribute(CrawlOrder.ATTR_LOGS_PATH);
                if (logsPath.endsWith("/")) {
                    logsPath = logsPath.substring(0, logsPath.length() - 1);
                }
                newHandler.getOrder().setAttribute(
                    new Attribute(CrawlOrder.ATTR_LOGS_PATH,
                        logsPath + RECOVERY_SUFFIX));
            } else {
                // Directory is suitably empty; exit the loop.
                break;
            }
        }
        File newStateDisk = null;
        while (true) {
            try {
                newStateDisk = newHandler.getOrder().getSettingsDir(
                    CrawlOrder.ATTR_STATE_PATH);
            } catch (AttributeNotFoundException e) {
                logger.log(Level.SEVERE, "Failed to get state directory", e);
                break;
            }
            if (newStateDisk.list().length > 0) {
                // Directory is in use; add the recovery suffix and re-check.
                String statePath = (String) newHandler.getOrder().
                    getAttribute(CrawlOrder.ATTR_STATE_PATH);
                if (statePath.endsWith("/")) {
                    statePath = statePath.substring(0, statePath.length() - 1);
                }
                newHandler.getOrder().setAttribute(
                    new Attribute(CrawlOrder.ATTR_STATE_PATH,
                        statePath + RECOVERY_SUFFIX));
            } else {
                // Directory is suitably empty; exit the loop.
                break;
            }
        }
    }
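
    // For example, if the job being recovered wrote to "logs" and "state",
    // the loops above leave the new job writing to "logs-R" and "state-R"
    // (or "logs-R-R" and so on if those are also non-empty), so the
    // recovered-from job's files are never overwritten.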

    /**
     * Discard the handler's 'new job'. This will remove any files/directories
     * written to disk.
     */
    public void discardNewJob() {
        if (newJob != null) {
            FileUtils.deleteDir(new File(newJob.getSettingsDirectory()));
            this.newJob = null;
        }
    }

    /**
     * Get the handler's 'new job'.
     * @return the handler's 'new job'
     */
    public CrawlJob getNewJob() {
        return newJob;
    }

    /**
     * Is the crawler accepting crawl jobs to run?
     * @return True if the next available CrawlJob will be crawled. False
     * otherwise.
     */
    public boolean isRunning() {
        return running;
    }

    /**
     * Is a crawl job being crawled?
     * @return True if a job is actually being crawled (even if it is paused).
     *         False if no job is being crawled.
     */
    public boolean isCrawling() {
        return this.currentJob != null;
    }

    /**
     * Allow jobs to be crawled.
     */
    public void startCrawler() {
        running = true;
        if (pendingCrawlJobs.size() > 0 && !isCrawling()) {
            // Kick off the next job if we aren't already crawling.
            startNextJob();
        }
    }

    /**
     * Stop future jobs from being crawled.
     *
     * This action will not affect the current job.
     */
    public void stopCrawler() {
        running = false;
    }
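
    // Illustrative lifecycle sketch (the handler and job variables are
    // hypothetical): once the crawler is started, pending jobs run one at a
    // time in priority order.
    //
    //   handler.addJob(job);      // queue a configured job
    //   handler.startCrawler();   // begin crawling pending jobs
    //   // ... later ...
    //   handler.stopCrawler();    // let the current job finish; start no more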

    /**
     * Start the next crawl job.
     *
     * If a job is already running this method will do nothing.
     */
    protected final void startNextJob() {
        synchronized (this) {
            if (startingNextJob != null) {
                try {
                    startingNextJob.join();
                } catch (InterruptedException e) {
                    logger.log(Level.WARNING,
                        "interrupt waiting for job start to complete", e);
                    return;
                }
            }
            startingNextJob = new Thread(new Runnable() {
                public void run() {
                    startNextJobInternal();
                }
            }, "StartNextJob");
            startingNextJob.start();
        }
    }

    protected void startNextJobInternal() {
        if (pendingCrawlJobs.size() == 0 || isCrawling()) {
            // No job ready, or already crawling.
            return;
        }
        this.currentJob = pendingCrawlJobs.first();
        assert pendingCrawlJobs.contains(currentJob) :
            "pendingCrawlJobs is in an illegal state";
        pendingCrawlJobs.remove(currentJob);
        try {
            this.currentJob.setupForCrawlStart();
            // Register as a crawl status listener so crawlEnding()/
            // crawlEnded() fire for this job.
            this.currentJob.getController().addCrawlStatusListener(this);
            // Start the crawl.
            this.currentJob.getController().requestCrawlStart();
        } catch (InitializationException e) {
            // The job failed to initialize; reload its state file (placing it
            // in the completed list) and try the next job.
            loadJob(getStateJobFile(this.currentJob.getDirectory()));
            this.currentJob = null;
            startNextJobInternal();
        }
    }

    /**
     * Forward a 'kick' update to the current job if any.
     */
    public void kickUpdate() {
        if (this.currentJob != null) {
            this.currentJob.kickUpdate();
        }
    }

    /**
     * Loads options from a file. Typically these are a list of available
     * modules that can be plugged into some part of the configuration.
     * For example: Processors, Frontiers, Filters, etc. Leading and trailing
     * spaces are trimmed from each line.
     *
     * <p>Options are loaded from the CLASSPATH.
     * @param file the name of the option file (without path!)
     * @return The option file with each option line as a separate entry in the
     * ArrayList.
     * @throws IOException when there is trouble reading the file.
     */
    public static ArrayList<String> loadOptions(String file)
    throws IOException {
        ArrayList<String> ret = new ArrayList<String>();
        Enumeration<URL> resources =
            CrawlJob.class.getClassLoader().getResources("modules/" + file);

        boolean noFileFound = true;
        while (resources.hasMoreElements()) {
            InputStream is = resources.nextElement().openStream();
            noFileFound = false;

            String line = null;
            BufferedReader bf =
                new BufferedReader(new InputStreamReader(is), 8192);
            try {
                while ((line = bf.readLine()) != null) {
                    line = line.trim();
                    if (line.indexOf('#') < 0 && line.length() > 0) {
                        // Looks like a valid option line; add it.
                        ret.add(line);
                    }
                }
            } finally {
                bf.close();
            }
        }

        if (noFileFound) {
            throw new IOException("Failed to get " + file + " from the" +
                " CLASSPATH");
        }

        return ret;
    }
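
    // Illustrative sketch (the option-file name is hypothetical): read a list
    // of pluggable modules bundled on the CLASSPATH under "modules/".
    //
    //   ArrayList<String> options =
    //       CrawlJobHandler.loadOptions("Frontier.options");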

    /**
     * Returns a URIFrontierMarker for the current (paused) job. If there is no
     * current job, or it is not paused, null will be returned.
     *
     * @param regexpr
     *            A regular expression that each URI must match in order to be
     *            considered 'within' the marker.
     * @param inCacheOnly
     *            Limit marker scope to 'cached' URIs.
     * @return a URIFrontierMarker for the current job.
     * @see #getPendingURIsList(FrontierMarker, int, boolean)
     * @see org.archive.crawler.framework.Frontier#getInitialMarker(String,
     *      boolean)
     * @see org.archive.crawler.framework.FrontierMarker
     */
    public FrontierMarker getInitialMarker(String regexpr,
            boolean inCacheOnly) {
        return (this.currentJob != null)?
            this.currentJob.getInitialMarker(regexpr, inCacheOnly): null;
    }

    /**
     * Returns the frontier's URI list based on the provided marker. This
     * method will return null if there is no current job or if the current
     * job is not paused. Only when there is a paused current job will this
     * method return a URI list.
     *
     * @param marker
     *            URIFrontier marker
     * @param numberOfMatches
     *            maximum number of matches to return
     * @param verbose
     *            should detailed info be provided on each URI?
     * @return the frontier's URI list based on the provided marker
     * @throws InvalidFrontierMarkerException
     *             When marker is inconsistent with the current state of the
     *             frontier.
     * @see #getInitialMarker(String, boolean)
     * @see org.archive.crawler.framework.FrontierMarker
     */
    public ArrayList getPendingURIsList(FrontierMarker marker,
            int numberOfMatches, boolean verbose)
    throws InvalidFrontierMarkerException {
        return (this.currentJob != null)?
            this.currentJob.getPendingURIsList(marker, numberOfMatches, verbose):
            null;
    }

    /**
     * Delete any URIs from the frontier of the current (paused) job that match
     * the specified regular expression. If the current job is not paused (or
     * there is no current job) nothing will be done.
     * @param regexpr Regular expression to delete URIs by.
     * @return the number of URIs deleted
     */
    public long deleteURIsFromPending(String regexpr) {
        return deleteURIsFromPending(regexpr, null);
    }

    /**
     * Delete any URIs from the frontier of the current (paused) job that match
     * the specified regular expression. If the current job is not paused (or
     * there is no current job) nothing will be done.
     * @param uriPattern Regular expression to delete URIs by.
     * @param queuePattern Regular expression of target queues (or null for all)
     * @return the number of URIs deleted
     */
    public long deleteURIsFromPending(String uriPattern, String queuePattern) {
        return (this.currentJob != null)?
            this.currentJob.deleteURIsFromPending(uriPattern, queuePattern): 0;
    }
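
    // For example, to drop every queued .jpg URI from all queues of the
    // current (paused) job -- the pattern is illustrative:
    //
    //   long dropped = handler.deleteURIsFromPending(".*\\.jpg", null);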

    public String importUris(String file, String style, String force) {
        return importUris(file, style, "true".equals(force));
    }

    /**
     * @param fileOrUrl Name of file w/ seeds.
     * @param style What style of seeds -- crawl log (<code>crawlLog</code>
     * style) or recovery journal (<code>recoveryJournal</code> style), or
     * seeds file style (Pass <code>default</code> style).
     * @param forceRevisit Should we revisit even if seen before?
     * @return A display string that has a count of all added.
     */
    public String importUris(final String fileOrUrl, final String style,
            final boolean forceRevisit) {
        return (this.currentJob != null)?
            this.currentJob.importUris(fileOrUrl, style, forceRevisit): null;
    }

    protected int importUris(InputStream is, String style,
            boolean forceRevisit) {
        return (this.currentJob != null)?
            this.currentJob.importUris(is, style, forceRevisit): 0;
    }

    /**
     * Schedule a URI.
     * @param uri URI to schedule.
     * @param forceFetch Should it be force-fetched?
     * @param isSeed True if seed.
     * @throws URIException
     */
    public void importUri(final String uri, final boolean forceFetch,
            final boolean isSeed)
    throws URIException {
        importUri(uri, forceFetch, isSeed, true);
    }

    /**
     * Schedule a URI.
     * @param str String that can be: 1. a UURI, 2. a snippet of the
     * crawl.log line, or 3. a snippet from the recover log. See
     * {@link #importUris(InputStream, String, boolean)} for how it subparses
     * the lines from crawl.log and recover.log.
     * @param forceFetch Should it be force-fetched?
     * @param isSeed True if seed.
     * @param isFlush If true, flush the frontier IF it implements
     * flushing.
     * @throws URIException
     */
    public void importUri(final String str, final boolean forceFetch,
            final boolean isSeed, final boolean isFlush)
    throws URIException {
        if (this.currentJob != null) {
            this.currentJob.importUri(str, forceFetch, isSeed, isFlush);
        }
    }
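
    // Illustrative sketch (the URI is hypothetical): force-fetch a single URI
    // into the current crawl, treating it as a seed and flushing the frontier
    // if it supports flushing.
    //
    //   handler.importUri("http://example.com/", true, true);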

    /**
     * If it's a HostQueuesFrontier, it needs to be flushed for the queued
     * URIs.
     */
    protected void doFlush() {
        if (this.currentJob != null) {
            this.currentJob.flush();
        }
    }

    public void stop() {
        if (isCrawling()) {
            deleteJob(getCurrentJob().getUID());
        }
    }

    public void requestCrawlStop() {
        if (this.currentJob != null) {
            this.currentJob.stopCrawling();
        }
    }

    /**
     * Ensure order file with new name/desc is written.
     * See '[ 1066573 ] sometimes job based-on other job uses older job name'.
     * @param newJob Newly created job.
     * @param metaname Metaname for new job.
     * @param description Description for new job.
     * @return <code>newJob</code>
     */
    public static CrawlJob ensureNewJobWritten(CrawlJob newJob, String metaname,
            String description) {
        XMLSettingsHandler settingsHandler = newJob.getSettingsHandler();
        CrawlerSettings orderfile = settingsHandler.getSettingsObject(null);
        orderfile.setName(metaname);
        orderfile.setDescription(description);
        settingsHandler.writeSettingsObject(orderfile);
        return newJob;
    }

    public void crawlStarted(String message) {
        // Not interested in this event.
    }

    public void crawlEnding(String sExitMessage) {
        // Reload the job's state file so its final status places it in the
        // completed list, then clear the current-job reference.
        loadJob(getStateJobFile(this.currentJob.getDirectory()));
        currentJob = null;
        synchronized (this) {
            // Wake up anyone waiting in terminateCurrentJob().
            notifyAll();
        }
    }

    public void crawlEnded(String sExitMessage) {
        if (this.running) {
            startNextJob();
        }
    }

    public void crawlPausing(String statusMessage) {
        // Not interested in this event.
    }

    public void crawlPaused(String statusMessage) {
        // Not interested in this event.
    }

    public void crawlResuming(String statusMessage) {
        // Not interested in this event.
    }

    public void crawlCheckpoint(File checkpointDir) throws Exception {
        // Not interested in this event.
    }
}