1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler.deciderules;
26
27 import java.io.File;
28 import java.io.FileOutputStream;
29 import java.io.FileReader;
30 import java.io.IOException;
31 import java.io.OutputStreamWriter;
32
33 import org.archive.crawler.datamodel.CandidateURI;
34 import org.archive.crawler.framework.CrawlScope;
35 import org.archive.crawler.scope.SeedListener;
36 import org.archive.crawler.settings.SimpleType;
37 import org.archive.crawler.settings.Type;
38 import org.archive.util.SurtPrefixSet;
39
40
41
42 /***
43 * Rule applies configured decision to any URIs that, when
44 * expressed in SURT form, begin with one of the prefixes
45 * in the configured set.
46 *
47 * The set can be filled with SURT prefixes implied or
48 * listed in the seeds file, or another external file.
49 *
50 * The "also-check-via" option to implement "one hop off"
51 * scoping derives from a contribution by Shifra Raffel
52 * of the California Digital Library.
53 *
54 * @author gojomo
55 */
56 public class SurtPrefixedDecideRule extends PredicatedDecideRule
57 implements SeedListener {
58
59 private static final long serialVersionUID = 2075790126085405015L;
60
61
62
63
64 public static final String ATTR_SURTS_SOURCE_FILE = "surts-source-file";
65 public static final String ATTR_SEEDS_AS_SURT_PREFIXES =
66 "seeds-as-surt-prefixes";
67 public static final String ATTR_SURTS_DUMP_FILE = "surts-dump-file";
68
69 private static final Boolean DEFAULT_SEEDS_AS_SURT_PREFIXES =
70 new Boolean(true);
71
72 /***
73 * Whether every config change should trigger a
74 * rebuilding of the prefix set.
75 */
76 public static final String
77 ATTR_REBUILD_ON_RECONFIG = "rebuild-on-reconfig";
78 public static final Boolean
79 DEFAULT_REBUILD_ON_RECONFIG = Boolean.TRUE;
80
81 /***
82 * Whether the 'via' of CrawlURIs should also be checked
83 * to see if it is prefixed by the set of SURT prefixes
84 */
85 public static final String
86 ATTR_ALSO_CHECK_VIA = "also-check-via";
87 public static final Boolean
88 DEFAULT_ALSO_CHECK_VIA = Boolean.FALSE;
89
90 protected SurtPrefixSet surtPrefixes = null;
91
92 /***
93 * Usual constructor.
94 * @param name
95 */
96 public SurtPrefixedDecideRule(String name) {
97 super(name);
98 setDescription("SurtPrefixedDecideRule. Makes the configured decision "
99 + "for any URI which, when expressed in SURT form, begins "
100 + "with any of the established prefixes (from either seeds "
101 + "specification or an external file).");
102 addElementToDefinition(new SimpleType(ATTR_SURTS_SOURCE_FILE,
103 "Source file from which to infer SURT prefixes. Any URLs " +
104 "in file will be converted to the implied SURT prefix, and " +
105 "literal SURT prefixes may be listed on lines beginning " +
106 "with a '+' character.",
107 ""));
108 addElementToDefinition(new SimpleType(ATTR_SEEDS_AS_SURT_PREFIXES,
109 "Should seeds also be interpreted as SURT prefixes.",
110 DEFAULT_SEEDS_AS_SURT_PREFIXES));
111 Type t = addElementToDefinition(new SimpleType(ATTR_SURTS_DUMP_FILE,
112 "Dump file to save SURT prefixes actually used: " +
113 "Useful debugging SURTs.", ""));
114 t.setExpertSetting(true);
115 t = addElementToDefinition(new SimpleType(ATTR_ALSO_CHECK_VIA,
116 "Whether to also make the configured decision if a " +
117 "URI's 'via' URI (the URI from which it was discovered) " +
118 "in SURT form begins with any of the established prefixes. " +
119 "For example, can be used to ACCEPT URIs that are 'one hop " +
120 "off' URIs fitting the SURT prefixes. Default is false.",
121 DEFAULT_ALSO_CHECK_VIA));
122 t.setOverrideable(false);
123 t.setExpertSetting(true);
124 t = addElementToDefinition(new SimpleType(ATTR_REBUILD_ON_RECONFIG,
125 "Whether to rebuild the internal structures from source " +
126 "files (including seeds if appropriate) every time any " +
127 "configuration change occurs. If true, " +
128 "rule is rebuilt from sources even when (for example) " +
129 "unrelated new domain overrides are set. Rereading large" +
130 "source files can take a long time.",
131 DEFAULT_REBUILD_ON_RECONFIG));
132 t.setOverrideable(false);
133 t.setExpertSetting(true);
134 }
135
136 /***
137 * Evaluate whether given object's URI is covered by the SURT prefix set
138 *
139 * @param object Item to evaluate.
140 * @return true if item, as SURT form URI, is prefixed by an item in the set
141 */
142 protected boolean evaluate(Object object) {
143 if ( (object instanceof CandidateURI) &&
144 ((Boolean) getUncheckedAttribute(null, ATTR_ALSO_CHECK_VIA))
145 .booleanValue()) {
146 if(evaluate(((CandidateURI)object).getVia())) {
147 return true;
148 }
149 }
150 String candidateSurt;
151 candidateSurt = SurtPrefixSet.getCandidateSurt(object);
152 if (candidateSurt == null) {
153 return false;
154 }
155 return getPrefixes().containsPrefixOf(candidateSurt);
156 }
157
158 /***
159 * Synchronized get of prefix set to use
160 *
161 * @return SurtPrefixSet to use for check
162 */
163 private synchronized SurtPrefixSet getPrefixes() {
164 if (surtPrefixes == null) {
165 readPrefixes();
166 }
167 return surtPrefixes;
168 }
169
170 protected void readPrefixes() {
171 buildSurtPrefixSet();
172 dumpSurtPrefixSet();
173 }
174
175 /***
176 * Dump the current prefixes in use to configured dump file (if any)
177 */
178 protected void dumpSurtPrefixSet() {
179
180 String dumpPath = (String)getUncheckedAttribute(null,
181 ATTR_SURTS_DUMP_FILE);
182 if (dumpPath.length() > 0) {
183 File dump = new File(dumpPath);
184 if (!dump.isAbsolute()) {
185 dump = new File(getSettingsHandler().getOrder().getController()
186 .getDisk(), dumpPath);
187 }
188 try {
189 OutputStreamWriter fw =
190 new OutputStreamWriter(new FileOutputStream(dump),"UTF-8");
191 try {
192 surtPrefixes.exportTo(fw);
193 } finally {
194 fw.close();
195 }
196 } catch (IOException e) {
197 e.printStackTrace();
198 throw new RuntimeException(e);
199 }
200 }
201 }
202
203 /***
204 * Construct the set of prefixes to use, from the seed list (
205 * which may include both URIs and '+'-prefixed directives).
206 */
207 protected void buildSurtPrefixSet() {
208 SurtPrefixSet newSurtPrefixes = new SurtPrefixSet();
209 FileReader fr = null;
210
211
212 String sourcePath = (String)getUncheckedAttribute(null,
213 ATTR_SURTS_SOURCE_FILE);
214 if (sourcePath.length() > 0) {
215 File source = new File(sourcePath);
216 if (!source.isAbsolute()) {
217 source = new File(getSettingsHandler().getOrder()
218 .getController().getDisk(), sourcePath);
219 }
220 try {
221 fr = new FileReader(source);
222 try {
223 newSurtPrefixes.importFromMixed(fr, true);
224 } finally {
225 fr.close();
226 }
227 } catch (IOException e) {
228 e.printStackTrace();
229 throw new RuntimeException(e);
230 }
231 }
232
233
234 boolean deduceFromSeeds = ((Boolean)getUncheckedAttribute(null,
235 ATTR_SEEDS_AS_SURT_PREFIXES)).booleanValue();
236 if(deduceFromSeeds) {
237 try {
238 fr = new FileReader(getSeedfile());
239 try {
240 newSurtPrefixes.importFromMixed(fr, deduceFromSeeds);
241 } finally {
242 fr.close();
243 }
244 } catch (IOException e) {
245 e.printStackTrace();
246 throw new RuntimeException(e);
247 }
248 }
249
250 surtPrefixes = newSurtPrefixes;
251 }
252
253 /***
254 * Re-read prefixes after an update.
255 *
256 * @see org.archive.crawler.framework.CrawlScope#kickUpdate()
257 */
258 public synchronized void kickUpdate() {
259 super.kickUpdate();
260 if (((Boolean) getUncheckedAttribute(null, ATTR_REBUILD_ON_RECONFIG))
261 .booleanValue()) {
262 readPrefixes();
263 }
264
265
266 }
267
268 /***
269 * Dig through everything to get the crawl-global seeds file.
270 * Add self as listener while at it.
271 *
272 * @return Seed list file
273 */
274 protected File getSeedfile() {
275 CrawlScope scope =
276 getSettingsHandler().getOrder().getController().getScope();
277 scope.addSeedListener(this);
278 return scope.getSeedfile();
279 }
280
281 public synchronized void addedSeed(final CandidateURI curi) {
282 SurtPrefixSet newSurtPrefixes = (SurtPrefixSet) surtPrefixes.clone();
283 newSurtPrefixes.add(prefixFrom(curi.toString()));
284 surtPrefixes = newSurtPrefixes;
285 }
286
287 protected String prefixFrom(String uri) {
288 return SurtPrefixSet.prefixFromPlain(uri);
289 }
290 }