package org.archive.crawler.prefetch;

import java.util.logging.Level;
import java.util.logging.Logger;

import org.archive.crawler.admin.CrawlJob;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
/**
 * A processor to enforce runtime limits on crawls.
 * <p>
 * This processor extends and improves on the 'max-time' capability of Heritrix.
 * Essentially, the 'Terminate job' option functions the same way as 'max-time'.
 * The processor, however, also enables pausing when the runtime is exceeded
 * and the blocking of all URIs.
 * <p>
 * <ol>
 * <li>Pause job - Pauses the crawl. A change (increase) to the
 *     runtime duration will make it possible to resume the crawl.
 *     Attempts to resume the crawl without modifying the run time
 *     will cause it to be immediately paused again.</li>
 * <li>Terminate job - Terminates the job. Equivalent
 *     to using the max-time setting on the CrawlController.</li>
 * <li>Block URIs - Blocks each URI with a -5002
 *     (blocked by custom processor) fetch status code. This will
 *     cause all queued URIs to wind up in the crawl.log.</li>
 * </ol>
 * <p>
 * The processor allows variable runtime based on host (or other
 * override/refinement criteria); however, using such overrides only makes
 * sense when using 'Block URIs', as pause and terminate will have global
 * impact once encountered anywhere.
 *
 * @author Kristinn Sigurðsson
 */
public class RuntimeLimitEnforcer
        extends Processor implements FetchStatusCodes {

    private static final long serialVersionUID = 1L;

    protected Logger logger = Logger.getLogger(
            RuntimeLimitEnforcer.class.getName());

    public static final String ATTR_RUNTIME_SECONDS = "runtime-sec".intern();
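    // Default limit: 86400 seconds, i.e. 24 hours of total crawl time.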
    protected static final long DEFAULT_RUNTIME_SECONDS = 86400;

    public static final String ATTR_END_OPERATION = "end-operation".intern();
    protected static final String OP_PAUSE = "Pause job".intern();
    protected static final String OP_TERMINATE = "Terminate job".intern();
    protected static final String OP_BLOCK_URIS = "Block URIs".intern();
    protected static final String DEFAULT_END_OPERATION = OP_PAUSE;
    protected static final String[] AVAILABLE_END_OPERATIONS = {
            OP_PAUSE, OP_TERMINATE, OP_BLOCK_URIS};

    public RuntimeLimitEnforcer(String name) {
        super(name, "A processor that halts further progress once a fixed " +
                "amount of time has elapsed since the start of a crawl. " +
                "It is possible to configure this processor per host, but " +
                "it should be noted that Heritrix does not track runtime " +
                "per host separately. Especially when using facilities " +
                "like the BdbFrontier's hold-queues, the actual amount of " +
                "time spent crawling a host may have little relevance to " +
                "total elapsed time. Note however that using overrides " +
                "and/or refinements only makes sense when using the " +
                "'Block URIs' end operation. The pause and terminate " +
                "operations have global impact once encountered.");
        Type t = new SimpleType(
                ATTR_RUNTIME_SECONDS,
                "The amount of time, in seconds, that the crawl will be " +
                "allowed to run before this processor performs its " +
                "'end operation.'",
                DEFAULT_RUNTIME_SECONDS);
        addElementToDefinition(t);
        t = new SimpleType(
                ATTR_END_OPERATION,
                "The action that the processor takes once the runtime has " +
                "elapsed.\n " +
                "Operation: Pause job - Pauses the crawl. A change " +
                "(increase) to the runtime duration will " +
                "make it possible to resume the crawl. Attempts to resume " +
                "the crawl without modifying the run time will cause it to " +
                "be immediately paused again.\n " +
                "Operation: Terminate job - Terminates the job. Equivalent " +
                "to using the max-time setting on the CrawlController.\n " +
                "Operation: Block URIs - Blocks each URI with a -5002 " +
                "(blocked by custom processor) fetch status code. This will " +
                "cause all queued URIs to wind up in the crawl.log.",
                DEFAULT_END_OPERATION,
                AVAILABLE_END_OPERATIONS);
        addElementToDefinition(t);
    }

    protected void innerProcess(CrawlURI curi) throws InterruptedException {
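        // Compare total elapsed crawl time against the configured limit;
        // getRuntime() returns milliseconds, matching crawlDuration().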
        long allowedRuntime = getRuntime(curi);
        long currentRuntime = getController().getStatistics().crawlDuration();
        if(currentRuntime > allowedRuntime){
            String op = (String)getUncheckedAttribute(curi,ATTR_END_OPERATION);
            if(op != null){
                if(op.equals(OP_PAUSE)){
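                    // Pause the crawl; unless the runtime limit is raised,
                    // resuming will simply trigger another pause.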
                    getController().requestCrawlPause();
                } else if(op.equals(OP_TERMINATE)){
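                    // Stop the crawl, reporting the same finished-by-time-limit
                    // status that the CrawlController's max-time setting uses.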
                    getController().requestCrawlStop(
                            CrawlJob.STATUS_FINISHED_TIME_LIMIT);
                } else if(op.equals(OP_BLOCK_URIS)){
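                    // Mark the URI as blocked (-5002) and skip ahead to the
                    // post-processing chain so it is recorded in crawl.log
                    // without being fetched.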
                    curi.setFetchStatus(S_BLOCKED_BY_RUNTIME_LIMIT);
                    curi.addAnnotation("Runtime exceeded " + allowedRuntime +
                            "ms");
                    curi.skipToProcessorChain(
                            getController().getPostprocessorChain());
                }
            } else {
                logger.log(Level.SEVERE,"Null value for " + ATTR_END_OPERATION +
                        " when processing " + curi.toString());
            }
        }
    }

    /**
     * Returns the amount of time to allow the crawl to run before this
     * processor interrupts.
     * @return the amount of time in milliseconds.
     */
    protected long getRuntime(CrawlURI curi){
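        // The setting is configured in seconds but compared against a
        // millisecond duration, hence the conversion below. A missing value
        // falls back to Long.MAX_VALUE, effectively disabling the limit.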
        Object o = getUncheckedAttribute(curi,ATTR_RUNTIME_SECONDS);
        if(o == null){
            logger.log(Level.SEVERE,"Null value for " + ATTR_RUNTIME_SECONDS +
                    " when processing " + curi.toString());
            return Long.MAX_VALUE;
        }
        return ((Long)o).longValue()*1000;
    }

}