View Javadoc

1   /* RuntimeLimitEnforcer
2    * 
3    * Created on July 7, 2006
4    *
5    * Copyright (C) 2005 Internet Archive.
6    * 
7    * This file is part of the Heritrix web crawler (crawler.archive.org).
8    * 
9    * Heritrix is free software; you can redistribute it and/or modify
10   * it under the terms of the GNU Lesser Public License as published by
11   * the Free Software Foundation; either version 2.1 of the License, or
12   * any later version.
13   * 
14   * Heritrix is distributed in the hope that it will be useful, 
15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   * GNU Lesser Public License for more details.
18   * 
19   * You should have received a copy of the GNU Lesser Public License
20   * along with Heritrix; if not, write to the Free Software
21   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22   */
23  package org.archive.crawler.prefetch;
24  
25  import java.util.logging.Level;
26  import java.util.logging.Logger;
27  
28  import org.archive.crawler.admin.CrawlJob;
29  import org.archive.crawler.datamodel.CrawlURI;
30  import org.archive.crawler.datamodel.FetchStatusCodes;
31  import org.archive.crawler.framework.Processor;
32  import org.archive.crawler.settings.SimpleType;
33  import org.archive.crawler.settings.Type;
34  
/**
 * A processor to enforce runtime limits on crawls.
 * <p>
 * This processor extends and improves on the 'max-time' capability of Heritrix.
 * Essentially, the 'Terminate job' option functions the same way as 'max-time'.
 * The processor however also enables pausing when the runtime is exceeded and
 * the blocking of all URIs.
 * <p>
 * <ol>
 * <li>Pause job - Pauses the crawl. A change (increase) to the
 *     runtime duration will make it possible to resume the crawl.
 *     Attempts to resume the crawl without modifying the run time
 *     will cause it to be immediately paused again.</li>
 * <li>Terminate job - Terminates the job. Equivalent
 *     to using the max-time setting on the CrawlController.</li>
 * <li>Block URIs - Blocks each URI with an -5002
 *     (blocked by custom processor) fetch status code. This will
 *     cause all the URIs queued to wind up in the crawl.log.</li>
 * </ol>
 * <p>
 * The processor allows variable runtime based on host (or other
 * override/refinement criteria) however using such overrides only makes sense
 * when using 'Block URIs' as pause and terminate will have global impact once
 * encountered anywhere.
 *
 * @author Kristinn Sigur&eth;sson
 */
62  public class RuntimeLimitEnforcer 
63                  extends Processor implements FetchStatusCodes {
64  
65      private static final long serialVersionUID = 1L;
66      
67      protected Logger logger = Logger.getLogger(
68              RuntimeLimitEnforcer.class.getName());
69      
70      public static final String ATTR_RUNTIME_SECONDS = "runtime-sec".intern();
71      protected static final long DEFAULT_RUNTIME_SECONDS = 86400; // 1 day
72  
73      public static final String ATTR_END_OPERATION = "end-operation".intern();
74      protected static final String OP_PAUSE = "Pause job".intern();
75      protected static final String OP_TERMINATE = "Terminate job".intern();
76      protected static final String OP_BLOCK_URIS = "Block URIs".intern();
77      protected static final String DEFAULT_END_OPERATION = OP_PAUSE;
78      protected static final String[] AVAILABLE_END_OPERATIONS = {
79          OP_PAUSE, OP_TERMINATE, OP_BLOCK_URIS};
80      
81      public RuntimeLimitEnforcer(String name) {
82          super(name, "A processor that halts further progress once a fixed " +
83                  "amount of time has elapsed since the start of a crawl. " +
84                  "It is possible to configure this processor per host, but " +
85                  "it should be noted that Heritrix does not track runtime " +
86                  "per host seperately. Especially when using facilities " +
87                  "like the BdbFrontier's hold-queues, the actual amount of " +
88                  "time spent crawling a host may have little relevance to " +
89                  "total elapsed time. Note however that using overrides " +
90                  "and/or refinements only makes sense when using the " +
91                  "'Block URIs' end operation. The pause and terminate " +
92                  "operations have global impact once encountered.");
93          Type t =  new SimpleType(
94                  ATTR_RUNTIME_SECONDS,
95                  "The amount of time, in seconds, that the crawl will be " +
96                  "allowed to run before this processor performs it's 'end " +
97                  "operation.'",
98                  DEFAULT_RUNTIME_SECONDS);
99          addElementToDefinition(t);
100         t = new SimpleType(
101                 ATTR_END_OPERATION,
102                 "The action that the processor takes once the runtime has " +
103                 "elapsed.\n " +
104                 "Operation: Pause job - Pauses the crawl. A change " +
105                 "(increase) to the runtime duration will " +
106                 "make it pausible to resume the crawl. Attempts to resume " +
107                 "the crawl without modifying the run time will cause it to " +
108                 "be immediately paused again.\n " +
109                 "Operation: Terminate job - Terminates the job. Equivalent " +
110                 "to using the max-time setting on the CrawlController.\n " +
111                 "Operation: Block URIs - Blocks each URI with an -5002 " +
112                 "(blocked by custom processor) fetch status code. This will " +
113                 "cause all the URIs queued to wind up in the crawl.log.",
114                 DEFAULT_END_OPERATION, 
115                 AVAILABLE_END_OPERATIONS);
116         addElementToDefinition(t);
117     }
118 
119     protected void innerProcess(CrawlURI curi) throws InterruptedException {
120         long allowedRuntime = getRuntime(curi);
121         long currentRuntime = getController().getStatistics().crawlDuration();
122         if(currentRuntime > allowedRuntime){
123             String op = (String)getUncheckedAttribute(curi,ATTR_END_OPERATION);
124             if(op != null){
125                 if(op.equals(OP_PAUSE)){
126                     getController().requestCrawlPause();
127                 } else if(op.equals(OP_TERMINATE)){
128                     getController().requestCrawlStop(
129                             CrawlJob.STATUS_FINISHED_TIME_LIMIT);
130                 } else if(op.equals(OP_BLOCK_URIS)){
131                     curi.setFetchStatus(S_BLOCKED_BY_RUNTIME_LIMIT);
132                     curi.addAnnotation("Runtime exceeded " + allowedRuntime + 
133                             "ms");
134                     curi.skipToProcessorChain(
135                             getController().getPostprocessorChain());
136                 }
137             } else {
138                 logger.log(Level.SEVERE,"Null value for " + ATTR_END_OPERATION + 
139                         " when processing " + curi.toString());
140             }
141         }
142     }
143     
144     /***
145      * Returns the amount of time to allow the crawl to run before this 
146      * processor interrupts.
147      * @return the amount of time in milliseconds.
148      */
149     protected long getRuntime(CrawlURI curi){
150         Object o = getUncheckedAttribute(curi,ATTR_RUNTIME_SECONDS);
151         if(o == null){
152             logger.log(Level.SEVERE,"Null value for " + ATTR_RUNTIME_SECONDS + 
153                     " when processing " + curi.toString());
154             return Long.MAX_VALUE;
155         }
156         return ((Long)o).longValue()*1000; //extract value and convert to ms.
157     }
158     
159 }