1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler.settings;
26
27 import java.io.BufferedInputStream;
28 import java.io.BufferedOutputStream;
29 import java.io.File;
30 import java.io.FileInputStream;
31 import java.io.FileNotFoundException;
32 import java.io.FileOutputStream;
33 import java.io.IOException;
34 import java.io.InputStream;
35 import java.util.ArrayList;
36 import java.util.Collection;
37 import java.util.List;
38 import java.util.TreeSet;
39 import java.util.logging.Level;
40 import java.util.logging.Logger;
41
42 import javax.management.Attribute;
43 import javax.management.AttributeNotFoundException;
44 import javax.management.InvalidAttributeValueException;
45 import javax.management.MBeanAttributeInfo;
46 import javax.management.MBeanException;
47 import javax.management.MBeanInfo;
48 import javax.management.ReflectionException;
49 import javax.xml.parsers.FactoryConfigurationError;
50 import javax.xml.parsers.ParserConfigurationException;
51 import javax.xml.parsers.SAXParserFactory;
52 import javax.xml.transform.Source;
53 import javax.xml.transform.Transformer;
54 import javax.xml.transform.TransformerFactory;
55 import javax.xml.transform.stream.StreamResult;
56
57 import org.apache.commons.io.IOUtils;
58 import org.archive.crawler.datamodel.CrawlOrder;
59 import org.archive.util.ArchiveUtils;
60 import org.archive.util.FileUtils;
61 import org.xml.sax.InputSource;
62 import org.xml.sax.SAXException;
63 import org.xml.sax.SAXParseException;
64 import org.xml.sax.XMLReader;
65
66 /*** A SettingsHandler which uses XML files as persistent storage.
67 *
68 * @author John Erik Halse
69 */
70 public class XMLSettingsHandler extends SettingsHandler {
71 private static Logger logger =
72 Logger.getLogger(
73 "org.archive.crawler.settings.XMLSettingsHandler");
74
75
76 protected static final String XML_SCHEMA = "heritrix_settings.xsd";
77 protected static final String XML_ROOT_ORDER = "crawl-order";
78 protected static final String XML_ROOT_HOST_SETTINGS = "crawl-settings";
79 protected static final String XML_ROOT_REFINEMENT = "crawl-refinement";
80 protected static final String XML_ELEMENT_CONTROLLER = "controller";
81 protected static final String XML_ELEMENT_META = "meta";
82 protected static final String XML_ELEMENT_NAME = "name";
83 protected static final String XML_ELEMENT_DESCRIPTION = "description";
84 protected static final String XML_ELEMENT_OPERATOR = "operator";
85 protected static final String XML_ELEMENT_ORGANIZATION = "organization";
86 protected static final String XML_ELEMENT_AUDIENCE = "audience";
87 protected static final String XML_ELEMENT_DATE = "date";
88 protected static final String XML_ELEMENT_REFINEMENTLIST = "refinement-list";
89 protected static final String XML_ELEMENT_REFINEMENT = "refinement";
90 protected static final String XML_ELEMENT_REFERENCE = "reference";
91 protected static final String XML_ELEMENT_LIMITS = "limits";
92 protected static final String XML_ELEMENT_TIMESPAN = "timespan";
93 protected static final String XML_ELEMENT_PORTNUMBER = "portnumber";
94 protected static final String XML_ELEMENT_URIMATCHES = "uri-matches";
95 protected static final String XML_ELEMENT_CONTENTMATCHES = "content-type-matches";
96 protected static final String XML_ELEMENT_OBJECT = "object";
97 protected static final String XML_ELEMENT_NEW_OBJECT = "newObject";
98 protected static final String XML_ATTRIBUTE_NAME = "name";
99 protected static final String XML_ATTRIBUTE_CLASS = "class";
100 protected static final String XML_ATTRIBUTE_FROM = "from";
101 protected static final String XML_ATTRIBUTE_TO = "to";
102
103 private File orderFile;
104 private final static String settingsFilename = "settings";
105 private final static String settingsFilenameSuffix = "xml";
106 private final static String REFINEMENT_DIR = "_refinements";
107
108 /*** Create a new XMLSettingsHandler object.
109 *
110 * @param orderFile where the order file is located.
111 * @throws InvalidAttributeValueException
112 */
113 public XMLSettingsHandler(File orderFile)
114 throws InvalidAttributeValueException {
115 super();
116 this.orderFile = orderFile.getAbsoluteFile();
117 }
118
119 /*** Initialize the SettingsHandler.
120 *
121 * This method builds the settings data structure and initializes it with
122 * settings from the order file given to the constructor.
123 */
124 public void initialize() {
125 super.initialize();
126 }
127
128 /***
129 * Initialize the SettingsHandler from a source.
130 *
131 * This method builds the settings data structure and initializes it with
132 * settings from the order file given as a parameter. The intended use is
133 * to create a new order file based on a default (template) order file.
134 *
135 * @param source the order file to initialize from.
136 */
137 public void initialize(File source) {
138 File tmpOrderFile = orderFile;
139 orderFile = source.getAbsoluteFile();
140 this.initialize();
141 orderFile = tmpOrderFile;
142 }
143
144 private File getSettingsDirectory() {
145 String settingsDirectoryName = null;
146 try {
147 settingsDirectoryName =
148 (String) getOrder().getAttribute(
149 CrawlOrder.ATTR_SETTINGS_DIRECTORY);
150 } catch (AttributeNotFoundException e) {
151 e.printStackTrace();
152 } catch (MBeanException e) {
153 e.printStackTrace();
154 } catch (ReflectionException e) {
155 e.printStackTrace();
156 }
157
158 return getPathRelativeToWorkingDirectory(settingsDirectoryName);
159 }
160
161 /*** Resolves the filename for a settings object into a file path.
162 *
163 * It will also create the directory structure leading to this file
164 * if it doesn't exist.
165 *
166 * @param settings the settings object to get file path for.
167 * @return the file path for this settings object.
168 */
169 protected final File settingsToFilename(CrawlerSettings settings) {
170 File file;
171
172 if (settings.getScope() == null || settings.getScope().equals("")) {
173 if (settings.isRefinement()) {
174 file = new File(getSettingsDirectory(), File.separatorChar
175 + REFINEMENT_DIR + File.separatorChar
176 + settings.getName() + '.' + settingsFilenameSuffix);
177 } else {
178 file = orderFile;
179 }
180 } else {
181 String elements[] = settings.getScope().split("//.");
182 if (elements.length == 0) {
183 return orderFile;
184 }
185
186 StringBuffer path = new StringBuffer();
187 for (int i = elements.length - 1; i > 0; i--) {
188 path.append(elements[i]);
189 path.append(File.separatorChar);
190 }
191 path.append(elements[0]);
192
193 if (settings.isRefinement()) {
194 file = new File(getSettingsDirectory(), path.toString()
195 + File.separatorChar + REFINEMENT_DIR
196 + File.separatorChar + settings.getName() + '.'
197 + settingsFilenameSuffix);
198 } else {
199 file = new File(getSettingsDirectory(), path.toString()
200 + File.separatorChar + settingsFilename + "."
201 + settingsFilenameSuffix);
202 }
203 }
204 return file;
205 }
206
207 public final void writeSettingsObject(CrawlerSettings settings) {
208 File filename = settingsToFilename(settings);
209 writeSettingsObject(settings, filename);
210 }
211
212 /*** Write a CrawlerSettings object to a specified file.
213 *
214 * This method is similar to {@link #writeSettingsObject(CrawlerSettings)}
215 * except that it uses the submitted File object instead of trying to
216 * resolve where the file should be written.
217 *
218 * @param settings the settings object to be serialized.
219 * @param filename the file to which the settings object should be written.
220 */
221 public final void writeSettingsObject(
222 CrawlerSettings settings, File filename) {
223
224 logger.fine("Writing " + filename.getAbsolutePath());
225 filename.getParentFile().mkdirs();
226
227 FileOutputStream fos = null;
228 try {
229 long lastSaved = 0L;
230 File backup = null;
231 if (getOrder().getController() != null && filename.exists()) {
232
233 String name = filename.getName();
234 lastSaved = settings.getLastSavedTime().getTime();
235 name = name.substring(0, name.lastIndexOf('.')) + '_'
236 + ArchiveUtils.get14DigitDate(lastSaved) + "."
237 + settingsFilenameSuffix;
238 backup = new File(filename.getParentFile(), name);
239 FileUtils.copyFiles(filename, backup);
240 }
241
242 fos = new FileOutputStream(filename);
243 StreamResult result =
244 new StreamResult(
245 new BufferedOutputStream(fos));
246 Transformer transformer =
247 TransformerFactory.newInstance().newTransformer();
248 Source source = new CrawlSettingsSAXSource(settings);
249 transformer.transform(source, result);
250
251
252
253
254
255 if (lastSaved > (System.currentTimeMillis() - 2 * 60 * 1000)) {
256 backup.delete();
257 }
258 } catch (Exception e) {
259 e.printStackTrace();
260 } finally {
261 IOUtils.closeQuietly(fos);
262 }
263 }
264
265 /*** Read the CrawlerSettings object from a specific file.
266 *
267 * @param settings the settings object to be updated with data from the
268 * persistent storage.
269 * @param f the file to read from.
270 * @return the updated settings object or null if there was no data for this
271 * in the persistent storage.
272 */
273 protected final CrawlerSettings readSettingsObject(CrawlerSettings settings,
274 File f) {
275 CrawlerSettings result = null;
276 try {
277 InputStream is = null;
278 if (!f.exists()) {
279
280
281
282
283
284
285 if (!f.getName().startsWith(settingsFilename)) {
286 is = XMLSettingsHandler.class.
287 getResourceAsStream(toResourcePath(f));
288 }
289 } else {
290 is = new FileInputStream(f);
291 }
292 if (is != null) {
293 XMLReader parser = SAXParserFactory.newInstance()
294 .newSAXParser().getXMLReader();
295 InputStream file = new BufferedInputStream(is);
296 parser.setContentHandler(new CrawlSettingsSAXHandler(settings));
297 InputSource source = new InputSource(file);
298 source.setSystemId(f.toURL().toExternalForm());
299 parser.parse(source);
300 result = settings;
301 }
302 } catch (SAXParseException e) {
303 logger.log(Level.WARNING,e.getMessage() + " in '" + e.getSystemId()
304 + "', line: " + e.getLineNumber() + ", column: "
305 + e.getColumnNumber(),e);
306 } catch (SAXException e) {
307 logger.log(Level.WARNING,e.getMessage() + ": "
308 + e.getException().getMessage(),e);
309 } catch (ParserConfigurationException e) {
310 logger.log(Level.WARNING,e.getMessage() + ": "
311 + e.getCause().getMessage(),e);
312 } catch (FactoryConfigurationError e) {
313 logger.log(Level.WARNING,e.getMessage() + ": "
314 + e.getException().getMessage(),e);
315 } catch (IOException e) {
316 logger.log(Level.WARNING,"Could not access file '"
317 + f.getAbsolutePath() + "': " + e.getMessage(),e);
318 }
319 return result;
320 }
321
322 /***
323 * Convert a File to a path that might be resolved from classpath/JAR
324 * resource sources. Such paths use linux-like path-separators.
325 *
326 * @param f File
327 * @return path, shorn of any Windows-specific drive identifiers
328 */
329 public static String toResourcePath(File f) {
330 String path = f.toURI().getPath();
331 if(path.matches("^/[A-Z]:/.*")) {
332
333 path = path.substring(3);
334 }
335 return path;
336 }
337
338 protected final CrawlerSettings readSettingsObject(CrawlerSettings settings) {
339 File filename = settingsToFilename(settings);
340 return readSettingsObject(settings, filename);
341 }
342
343 /*** Get the <code>File</code> object pointing to the order file.
344 *
345 * @return File object for the order file.
346 */
347 public File getOrderFile() {
348 return orderFile;
349 }
350
351 /*** Creates a replica of the settings file structure in another directory
352 * (fully recursive, includes all per host settings). The SettingsHandler
353 * will then refer to the new files.
354 *
355 * Observe that this method should only be called after the SettingsHandler
356 * has been initialized.
357 *
358 * @param newOrderFileName where the new order file should be saved.
359 * @param newSettingsDirectory the top level directory of the per host/domain
360 * settings files.
361 * @throws IOException
362 */
363 public void copySettings(File newOrderFileName, String newSettingsDirectory)
364 throws IOException {
365 File oldSettingsDirectory = getSettingsDirectory();
366
367
368 orderFile = newOrderFileName;
369 try {
370 getOrder().setAttribute(
371 new Attribute(
372 CrawlOrder.ATTR_SETTINGS_DIRECTORY, newSettingsDirectory));
373 } catch (Exception e) {
374 throw new IOException("Could not update settings with new location: "
375 + e.getMessage());
376 }
377 writeSettingsObject(getSettingsObject(null));
378
379 File newDir = getPathRelativeToWorkingDirectory(newSettingsDirectory);
380
381
382 if (oldSettingsDirectory.compareTo(newDir) != 0) {
383 FileUtils.copyFiles(oldSettingsDirectory, newDir);
384 }
385 }
386
387 /***
388 * Transforms a relative path so that it is relative to the location of the
389 * order file. If an absolute path is given, it will be returned unchanged.<p>
390 * The location of it's order file is always considered as the 'working'
391 * directory for any given settings.
392 * @param path A relative path to a file (or directory)
393 * @return The same path modified so that it is relative to the file level
394 * location of the order file for the settings handler.
395 */
396 public File getPathRelativeToWorkingDirectory(String path) {
397 File f = new File(path);
398
399
400 if (!f.isAbsolute()) {
401 f = new File(this.getOrderFile().getParent(), path);
402 }
403 return f;
404 }
405
406 public Collection getDomainOverrides(String rootDomain) {
407 File settingsDir = getSettingsDirectory();
408
409
410 ArrayList<String> domains = new ArrayList<String>();
411
412 while(rootDomain != null && rootDomain.length()>0){
413 if(rootDomain.indexOf('.')<0){
414
415 domains.add(rootDomain);
416 break;
417 } else {
418
419 domains.add(rootDomain.substring(0,rootDomain.indexOf('.')));
420
421 rootDomain = rootDomain.substring(rootDomain.indexOf('.')+1);
422 }
423 }
424
425
426 StringBuffer subDir = new StringBuffer();
427 for(int i=(domains.size()-1) ; i>=0 ; i--){
428 subDir.append(File.separator+domains.get(i));
429 }
430
431 settingsDir = new File(settingsDir.getPath()+subDir);
432 TreeSet<String> confirmedSubDomains = new TreeSet<String>();
433 if(settingsDir.exists()){
434
435 File[] possibleSubDomains = settingsDir.listFiles();
436 for (int i = 0; i < possibleSubDomains.length; i++) {
437 if (possibleSubDomains[i].isDirectory()
438 && isOverride(possibleSubDomains[i])) {
439
440 confirmedSubDomains.add(possibleSubDomains[i].getName());
441 }
442 }
443 }
444 return confirmedSubDomains;
445 }
446
447 /***
448 * Checks if a file is a a 'per host' override or if it's a directory if it
449 * or it's subdirectories contains a 'per host' override file.
450 * @param f The file or directory to check
451 * @return True if the file is an override or it's a directory that contains
452 * such a file.
453 */
454 private boolean isOverride(File f){
455 if(f.isDirectory()){
456
457 File[] subs = f.listFiles();
458 for(int i=0 ; i < subs.length ; i++){
459 if(isOverride(subs[i])){
460
461 return true;
462 }
463 }
464 } else if (f.getName().equals(
465 settingsFilename + "." + settingsFilenameSuffix)) {
466
467 return true;
468 }
469
470 return false;
471 }
472
473 /*** Delete a settings object from persistent storage.
474 *
475 * Deletes the file represented by the submitted settings object. All empty
476 * directories that are parents to the files path are also deleted.
477 *
478 * @param settings the settings object to delete.
479 */
480 public void deleteSettingsObject(CrawlerSettings settings) {
481 super.deleteSettingsObject(settings);
482 File settingsDirectory = getSettingsDirectory();
483 File settingsFile = settingsToFilename(settings);
484
485 if(!settingsFile.delete()) {
486 throw new RuntimeException("Could not delete: "+settingsFile);
487 }
488 settingsFile = settingsFile.getParentFile();
489 while (settingsFile.isDirectory() && settingsFile.list().length == 0
490 && !settingsFile.equals(settingsDirectory)) {
491 if(!settingsFile.delete()) {
492 logger.warning("Could not delete: "+settingsFile);
493 }
494 settingsFile = settingsFile.getParentFile();
495 }
496 }
497
498
499
500
501 public List<String> getListOfAllFiles() {
502 ArrayList<String> list = new ArrayList<String>();
503
504 list.add(getOrderFile().getAbsolutePath());
505
506 if (getSettingsDirectory().exists()) {
507 recursiveFindFiles(getSettingsDirectory(),list);
508 }
509
510 recursiveFindSecondaryFiles(getOrder(),list);
511 return list;
512 }
513
514 /***
515 * Add any files being used by any of the Modules making up the settings to
516 * the list.
517 *
518 * @param mbean A ModuleType to interrogate for files. Any child modules
519 * will be recursively interrogated.
520 * @param list The list to add found files to.
521 */
522 private void recursiveFindSecondaryFiles(ComplexType mbean,
523 ArrayList<String> list) {
524 MBeanInfo info = mbean.getMBeanInfo();
525 MBeanAttributeInfo[] a = info.getAttributes();
526
527 if(mbean instanceof ModuleType){
528 ((ModuleType)mbean).listUsedFiles(list);
529 }
530
531
532 for(int n=0; n<a.length; n++) {
533 if(a[n] == null) {
534
535 } else {
536 ModuleAttributeInfo att = (ModuleAttributeInfo)a[n];
537 Object currentAttribute;
538 try {
539 currentAttribute = mbean.getAttribute(att.getName());
540 if(currentAttribute instanceof ComplexType) {
541 recursiveFindSecondaryFiles((ComplexType)currentAttribute,list);
542 }
543 } catch (AttributeNotFoundException e) {
544
545 e.printStackTrace();
546 } catch (MBeanException e) {
547
548 e.printStackTrace();
549 } catch (ReflectionException e) {
550
551 e.printStackTrace();
552 }
553 }
554 }
555 }
556
557 /***
558 * Starting at the specific directory this method will iterate through all
559 * sub directories and add each file (as absolute name, with path as a
560 * string) to the provided ArrayList. Any file found under the settings
561 * directory with the proper suffix will be considered valid and added to
562 * the list.
563 * @param dir Starting directory
564 * @param list The list to add to
565 */
566 private void recursiveFindFiles(File dir, ArrayList<String> list){
567 File[] subs = dir.listFiles();
568 if (subs != null) {
569 for(int i=0 ; i < subs.length ; i++){
570 if(subs[i].isDirectory()){
571 recursiveFindFiles(subs[i],list);
572 } else {
573 if(subs[i].getName().endsWith(settingsFilenameSuffix)){
574
575 list.add(subs[i].getAbsolutePath());
576 }
577 }
578 }
579 }
580 }
581 }