View Javadoc

1   /* XMLSettingsHandler
2    *
3    * $Id: XMLSettingsHandler.java 6873 2010-05-28 23:30:14Z gojomo $
4    *
5    * Created on Dec 18, 2003
6    *
7    * Copyright (C) 2004 Internet Archive.
8    *
9    * This file is part of the Heritrix web crawler (crawler.archive.org).
10   *
11   * Heritrix is free software; you can redistribute it and/or modify
12   * it under the terms of the GNU Lesser Public License as published by
13   * the Free Software Foundation; either version 2.1 of the License, or
14   * any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful,
17   * but WITHOUT ANY WARRANTY; without even the implied warranty of
18   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   * GNU Lesser Public License for more details.
20   *
21   * You should have received a copy of the GNU Lesser Public License
22   * along with Heritrix; if not, write to the Free Software
23   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24   */
25  package org.archive.crawler.settings;
26  
27  import java.io.BufferedInputStream;
28  import java.io.BufferedOutputStream;
29  import java.io.File;
30  import java.io.FileInputStream;
31  import java.io.FileNotFoundException;
32  import java.io.FileOutputStream;
33  import java.io.IOException;
34  import java.io.InputStream;
35  import java.util.ArrayList;
36  import java.util.Collection;
37  import java.util.List;
38  import java.util.TreeSet;
39  import java.util.logging.Level;
40  import java.util.logging.Logger;
41  
42  import javax.management.Attribute;
43  import javax.management.AttributeNotFoundException;
44  import javax.management.InvalidAttributeValueException;
45  import javax.management.MBeanAttributeInfo;
46  import javax.management.MBeanException;
47  import javax.management.MBeanInfo;
48  import javax.management.ReflectionException;
49  import javax.xml.parsers.FactoryConfigurationError;
50  import javax.xml.parsers.ParserConfigurationException;
51  import javax.xml.parsers.SAXParserFactory;
52  import javax.xml.transform.Source;
53  import javax.xml.transform.Transformer;
54  import javax.xml.transform.TransformerFactory;
55  import javax.xml.transform.stream.StreamResult;
56  
57  import org.apache.commons.io.IOUtils;
58  import org.archive.crawler.datamodel.CrawlOrder;
59  import org.archive.util.ArchiveUtils;
60  import org.archive.util.FileUtils;
61  import org.xml.sax.InputSource;
62  import org.xml.sax.SAXException;
63  import org.xml.sax.SAXParseException;
64  import org.xml.sax.XMLReader;
65  
/**
 * A SettingsHandler which uses XML files as persistent storage.
 *
 * @author John Erik Halse
 */
70  public class XMLSettingsHandler extends SettingsHandler {
71      private static Logger logger =
72          Logger.getLogger(
73              "org.archive.crawler.settings.XMLSettingsHandler");
74  
75      // XML element name constants
76      protected static final String XML_SCHEMA = "heritrix_settings.xsd";
77      protected static final String XML_ROOT_ORDER = "crawl-order";
78      protected static final String XML_ROOT_HOST_SETTINGS = "crawl-settings";
79      protected static final String XML_ROOT_REFINEMENT = "crawl-refinement";
80      protected static final String XML_ELEMENT_CONTROLLER = "controller";
81      protected static final String XML_ELEMENT_META = "meta";
82      protected static final String XML_ELEMENT_NAME = "name";
83      protected static final String XML_ELEMENT_DESCRIPTION = "description";
84      protected static final String XML_ELEMENT_OPERATOR = "operator";
85      protected static final String XML_ELEMENT_ORGANIZATION = "organization";
86      protected static final String XML_ELEMENT_AUDIENCE = "audience";
87      protected static final String XML_ELEMENT_DATE = "date";
88      protected static final String XML_ELEMENT_REFINEMENTLIST = "refinement-list";
89      protected static final String XML_ELEMENT_REFINEMENT = "refinement";
90      protected static final String XML_ELEMENT_REFERENCE = "reference";
91      protected static final String XML_ELEMENT_LIMITS = "limits";
92      protected static final String XML_ELEMENT_TIMESPAN = "timespan";
93      protected static final String XML_ELEMENT_PORTNUMBER = "portnumber";
94      protected static final String XML_ELEMENT_URIMATCHES = "uri-matches";
95      protected static final String XML_ELEMENT_CONTENTMATCHES = "content-type-matches";
96      protected static final String XML_ELEMENT_OBJECT = "object";
97      protected static final String XML_ELEMENT_NEW_OBJECT = "newObject";
98      protected static final String XML_ATTRIBUTE_NAME = "name";
99      protected static final String XML_ATTRIBUTE_CLASS = "class";
100     protected static final String XML_ATTRIBUTE_FROM = "from";
101     protected static final String XML_ATTRIBUTE_TO = "to";
102 
103     private File orderFile;
104     private final static String settingsFilename = "settings";
105     private final static String settingsFilenameSuffix = "xml";
106     private final static String REFINEMENT_DIR = "_refinements";
107 
108     /*** Create a new XMLSettingsHandler object.
109      *
110      * @param orderFile where the order file is located.
111      * @throws InvalidAttributeValueException
112      */
113     public XMLSettingsHandler(File orderFile)
114     throws InvalidAttributeValueException {
115         super();
116         this.orderFile = orderFile.getAbsoluteFile();
117     }
118 
119     /*** Initialize the SettingsHandler.
120      *
121      * This method builds the settings data structure and initializes it with
122      * settings from the order file given to the constructor.
123      */
124     public void initialize() {
125         super.initialize();
126     }
127 
128     /*** 
129      * Initialize the SettingsHandler from a source.
130      *
131      * This method builds the settings data structure and initializes it with
132      * settings from the order file given as a parameter. The intended use is
133      * to create a new order file based on a default (template) order file.
134      *
135      * @param source the order file to initialize from.
136      */
137     public void initialize(File source) {
138         File tmpOrderFile = orderFile;
139         orderFile = source.getAbsoluteFile();
140         this.initialize();
141         orderFile = tmpOrderFile;
142     }
143 
144     private File getSettingsDirectory() {
145         String settingsDirectoryName = null;
146         try {
147             settingsDirectoryName =
148                     (String) getOrder().getAttribute(
149                         CrawlOrder.ATTR_SETTINGS_DIRECTORY);
150         } catch (AttributeNotFoundException e) {
151             e.printStackTrace();
152         } catch (MBeanException e) {
153             e.printStackTrace();
154         } catch (ReflectionException e) {
155             e.printStackTrace();
156         }
157 
158         return getPathRelativeToWorkingDirectory(settingsDirectoryName);
159     }
160 
161     /*** Resolves the filename for a settings object into a file path.
162      *
163      * It will also create the directory structure leading to this file
164      * if it doesn't exist.
165      *
166      * @param settings the settings object to get file path for.
167      * @return the file path for this settings object.
168      */
169     protected final File settingsToFilename(CrawlerSettings settings) {
170         File file;
171 
172         if (settings.getScope() == null || settings.getScope().equals("")) {
173             if (settings.isRefinement()) {
174                 file = new File(getSettingsDirectory(), File.separatorChar
175                         + REFINEMENT_DIR + File.separatorChar
176                         + settings.getName() + '.' + settingsFilenameSuffix);
177             } else {
178                 file = orderFile;
179             }
180         } else {
181             String elements[] = settings.getScope().split("//.");
182             if (elements.length == 0) {
183                 return orderFile;
184             }
185 
186             StringBuffer path = new StringBuffer();
187             for (int i = elements.length - 1; i > 0; i--) {
188                 path.append(elements[i]);
189                 path.append(File.separatorChar);
190             }
191             path.append(elements[0]);
192 
193             if (settings.isRefinement()) {
194                 file = new File(getSettingsDirectory(), path.toString()
195                         + File.separatorChar + REFINEMENT_DIR
196                         + File.separatorChar + settings.getName() + '.'
197                         + settingsFilenameSuffix);
198             } else {
199                 file = new File(getSettingsDirectory(), path.toString()
200                         + File.separatorChar + settingsFilename + "."
201                         + settingsFilenameSuffix);
202             }
203         }
204         return file;
205     }
206 
207     public final void writeSettingsObject(CrawlerSettings settings) {
208         File filename = settingsToFilename(settings);
209         writeSettingsObject(settings, filename);
210     }
211 
212     /*** Write a CrawlerSettings object to a specified file.
213      *
214      * This method is similar to {@link #writeSettingsObject(CrawlerSettings)}
215      * except that it uses the submitted File object instead of trying to
216      * resolve where the file should be written.
217      *
218      * @param settings the settings object to be serialized.
219      * @param filename the file to which the settings object should be written.
220      */
221     public final void writeSettingsObject(
222             CrawlerSettings settings, File filename) {
223 
224         logger.fine("Writing " + filename.getAbsolutePath());
225         filename.getParentFile().mkdirs();
226 
227         FileOutputStream fos = null;
228         try {
229             long lastSaved = 0L;
230             File backup = null;
231             if (getOrder().getController() != null && filename.exists()) {
232                 // The crawler is running and file exists - make backup first.
233                 String name = filename.getName();
234                 lastSaved = settings.getLastSavedTime().getTime();
235                 name = name.substring(0, name.lastIndexOf('.')) + '_'
236                         + ArchiveUtils.get14DigitDate(lastSaved) + "."
237                         + settingsFilenameSuffix;
238                 backup = new File(filename.getParentFile(), name);
239                 FileUtils.copyFiles(filename, backup);
240             }
241 
242             fos = new FileOutputStream(filename);
243             StreamResult result =
244                 new StreamResult(
245                     new BufferedOutputStream(fos));
246             Transformer transformer =
247                 TransformerFactory.newInstance().newTransformer();
248             Source source = new CrawlSettingsSAXSource(settings);
249             transformer.transform(source, result);
250 
251             // Hack to get rid of unnesessary backupfiles.
252             // What happens is that the WUI often saves settings files
253             // several times during a settings change. This code removes the
254             // last backup file if its no more than 2 minutes old.
255             if (lastSaved > (System.currentTimeMillis() - 2 * 60 * 1000)) {
256                 backup.delete();
257             }
258         } catch (Exception e) {
259             e.printStackTrace();
260         } finally {
261             IOUtils.closeQuietly(fos);
262         }
263     }
264 
265     /*** Read the CrawlerSettings object from a specific file.
266      *
267      * @param settings the settings object to be updated with data from the
268      *                 persistent storage.
269      * @param f the file to read from.
270      * @return the updated settings object or null if there was no data for this
271      *         in the persistent storage.
272      */    
273     protected final CrawlerSettings readSettingsObject(CrawlerSettings settings,
274             File f) {
275         CrawlerSettings result = null;
276         try {
277             InputStream is = null;
278             if (!f.exists()) {
279                 // Perhaps the file we're looking for is on the CLASSPATH.
280                 // DON'T look on the CLASSPATH for 'settings.xml' files.  The
281                 // look for 'settings.xml' files happens frequently. Not looking
282                 // on classpath for 'settings.xml' is an optimization based on
283                 // ASSUMPTION that there will never be a 'settings.xml' saved
284                 // on classpath.
285                 if (!f.getName().startsWith(settingsFilename)) {
286                     is = XMLSettingsHandler.class.
287                         getResourceAsStream(toResourcePath(f));
288                 }
289             } else {
290                 is = new FileInputStream(f);
291             }
292             if (is != null) {
293                 XMLReader parser = SAXParserFactory.newInstance()
294                     .newSAXParser().getXMLReader();
295                 InputStream file = new BufferedInputStream(is);
296                 parser.setContentHandler(new CrawlSettingsSAXHandler(settings));
297                 InputSource source = new InputSource(file);
298                 source.setSystemId(f.toURL().toExternalForm());
299                 parser.parse(source);
300                 result = settings;
301             }
302         } catch (SAXParseException e) {
303             logger.log(Level.WARNING,e.getMessage() + " in '" + e.getSystemId()
304                 + "', line: " + e.getLineNumber() + ", column: "
305                 + e.getColumnNumber(),e);
306         } catch (SAXException e) {
307             logger.log(Level.WARNING,e.getMessage() + ": "
308                 + e.getException().getMessage(),e);
309         } catch (ParserConfigurationException e) {
310             logger.log(Level.WARNING,e.getMessage() + ": "
311                 + e.getCause().getMessage(),e);
312         } catch (FactoryConfigurationError e) {
313             logger.log(Level.WARNING,e.getMessage() + ": "
314                 + e.getException().getMessage(),e);
315         } catch (IOException e) {
316             logger.log(Level.WARNING,"Could not access file '"
317                 + f.getAbsolutePath() + "': " + e.getMessage(),e);
318         }
319         return result;
320     }
321 
322     /***
323      * Convert a File to a path that might be resolved from classpath/JAR
324      * resource sources. Such paths use linux-like path-separators. 
325      * 
326      * @param f File 
327      * @return path, shorn of any Windows-specific drive identifiers
328      */
329     public static String toResourcePath(File f) {
330         String path = f.toURI().getPath(); 
331         if(path.matches("^/[A-Z]:/.*")) {
332             // remove Windows drive-prefix, if any
333             path = path.substring(3); 
334         }
335         return path; 
336     }
337 
338     protected final CrawlerSettings readSettingsObject(CrawlerSettings settings) {
339         File filename = settingsToFilename(settings);
340         return readSettingsObject(settings, filename);
341     }
342 
343     /*** Get the <code>File</code> object pointing to the order file.
344      *
345      * @return File object for the order file.
346      */
347     public File getOrderFile() {
348         return orderFile;
349     }
350 
351     /*** Creates a replica of the settings file structure in another directory
352      * (fully recursive, includes all per host settings). The SettingsHandler
353      * will then refer to the new files.
354      *
355      * Observe that this method should only be called after the SettingsHandler
356      * has been initialized.
357      *
358      * @param newOrderFileName where the new order file should be saved.
359      * @param newSettingsDirectory the top level directory of the per host/domain
360      *                          settings files.
361      * @throws IOException
362      */
363     public void copySettings(File newOrderFileName, String newSettingsDirectory)
364       throws IOException {
365         File oldSettingsDirectory = getSettingsDirectory();
366 
367         // Write new orderfile and point the settingshandler to it
368         orderFile = newOrderFileName;
369         try {
370             getOrder().setAttribute(
371                 new Attribute(
372                     CrawlOrder.ATTR_SETTINGS_DIRECTORY, newSettingsDirectory));
373         } catch (Exception e) {
374             throw new IOException("Could not update settings with new location: "
375                 + e.getMessage());
376         }
377         writeSettingsObject(getSettingsObject(null));
378 
379         File newDir = getPathRelativeToWorkingDirectory(newSettingsDirectory);
380 
381         // Copy the per host files if src and dest directories are different.
382         if (oldSettingsDirectory.compareTo(newDir) != 0) {
383             FileUtils.copyFiles(oldSettingsDirectory, newDir);
384         }
385     }
386 
387     /***
388      * Transforms a relative path so that it is relative to the location of the
389      * order file. If an absolute path is given, it will be returned unchanged.<p>
390      * The location of it's order file is always considered as the 'working'
391      * directory for any given settings.
392      * @param path A relative path to a file (or directory)
393      * @return The same path modified so that it is relative to the file level
394      *         location of the order file for the settings handler.
395      */
396     public File getPathRelativeToWorkingDirectory(String path) {
397         File f = new File(path);
398         // If path is not absolute, set f's directory
399         // relative to the path of the order file
400         if (!f.isAbsolute()) {
401             f = new File(this.getOrderFile().getParent(), path);
402         }
403         return f;
404     }
405 
406     public Collection getDomainOverrides(String rootDomain) {
407         File settingsDir = getSettingsDirectory();
408 
409         //Find the right start directory.
410         ArrayList<String> domains = new ArrayList<String>();
411         //First we deconstruct the rootDomain string
412         while(rootDomain != null && rootDomain.length()>0){
413             if(rootDomain.indexOf('.')<0){
414                 // Last level.
415                 domains.add(rootDomain);
416                 break; //We're done.
417             } else {
418                 // Got more then one level left.
419                 domains.add(rootDomain.substring(0,rootDomain.indexOf('.')));
420                 // Strip down rootDomain.
421                 rootDomain = rootDomain.substring(rootDomain.indexOf('.')+1);
422             }
423         }
424         //Build up a proper path
425         //Since the domains are right to left, we start at the end of the array.
426         StringBuffer subDir = new StringBuffer();
427         for(int i=(domains.size()-1) ; i>=0 ; i--){
428             subDir.append(File.separator+domains.get(i));
429         }
430         //Then we move to the approprite directory.
431         settingsDir = new File(settingsDir.getPath()+subDir);
432         TreeSet<String> confirmedSubDomains = new TreeSet<String>();
433         if(settingsDir.exists()){
434             // Found our place! Search through it's subdirs.
435             File[] possibleSubDomains = settingsDir.listFiles();
436             for (int i = 0; i < possibleSubDomains.length; i++) {
437                 if (possibleSubDomains[i].isDirectory()
438                     && isOverride(possibleSubDomains[i])) {
439                     // Found one!
440                     confirmedSubDomains.add(possibleSubDomains[i].getName());
441                 }
442             }
443         }
444         return confirmedSubDomains;
445     }
446 
447     /***
448      * Checks if a file is a a 'per host' override or if it's a directory if it
449      * or it's subdirectories  contains a 'per host' override file.
450      * @param f The file or directory to check
451      * @return True if the file is an override or it's a directory that contains
452      *         such a file.
453      */
454     private boolean isOverride(File f){
455         if(f.isDirectory()){
456             // Have a directory, check it's contents.
457             File[] subs = f.listFiles();
458             for(int i=0 ; i < subs.length ; i++){
459                 if(isOverride(subs[i])){
460                     // Found one. Can stop looking.
461                     return true;
462                 }
463             }
464         } else if (f.getName().equals(
465                 settingsFilename + "." + settingsFilenameSuffix)) {
466             // This is an override file (or sure looks like one in any case).
467             return true;
468         }
469         // Didn't find an override.
470         return false;
471     }
472 
473     /*** Delete a settings object from persistent storage.
474      *
475      * Deletes the file represented by the submitted settings object. All empty
476      * directories that are parents to the files path are also deleted.
477      *
478      * @param settings the settings object to delete.
479      */
480     public void deleteSettingsObject(CrawlerSettings settings) {
481         super.deleteSettingsObject(settings);
482         File settingsDirectory = getSettingsDirectory();
483         File settingsFile = settingsToFilename(settings);
484 
485         if(!settingsFile.delete()) {
486             throw new RuntimeException("Could not delete: "+settingsFile);
487         }
488         settingsFile = settingsFile.getParentFile();
489         while (settingsFile.isDirectory() && settingsFile.list().length == 0
490                 && !settingsFile.equals(settingsDirectory)) {
491             if(!settingsFile.delete()) {
492                 logger.warning("Could not delete: "+settingsFile);
493             }
494             settingsFile = settingsFile.getParentFile();
495         }
496     }
497 
498     /* (non-Javadoc)
499      * @see org.archive.crawler.settings.SettingsHandler#getListOfAllFiles()
500      */
501     public List<String> getListOfAllFiles() {
502         ArrayList<String> list = new ArrayList<String>();
503         // Add CrawlOrder.
504         list.add(getOrderFile().getAbsolutePath());
505         // Iterate through the entire override hierarchy
506         if (getSettingsDirectory().exists()) {
507             recursiveFindFiles(getSettingsDirectory(),list);
508         }
509         // Get files used by settings modules.
510         recursiveFindSecondaryFiles(getOrder(),list);
511         return list;
512     }
513 
514     /***
515      * Add any files being used by any of the Modules making up the settings to
516      * the list.
517      *
518      * @param mbean A ModuleType to interrogate for files. Any child modules
519      *           will be recursively interrogated.
520      * @param list The list to add found files to.
521      */
522     private void recursiveFindSecondaryFiles(ComplexType mbean, 
523             ArrayList<String> list) {
524         MBeanInfo info = mbean.getMBeanInfo();
525         MBeanAttributeInfo[] a = info.getAttributes();
526         // Interrogate the current module
527         if(mbean instanceof ModuleType){
528             ((ModuleType)mbean).listUsedFiles(list);
529         }
530 
531         // Recursively interrogate all sub modules that are of ModuleType
532         for(int n=0; n<a.length; n++) {
533             if(a[n] == null) {
534                 // Error null attribute.
535             } else {
536                 ModuleAttributeInfo att = (ModuleAttributeInfo)a[n];
537                 Object currentAttribute;
538                 try {
539                     currentAttribute = mbean.getAttribute(att.getName());
540                     if(currentAttribute instanceof ComplexType) {
541                         recursiveFindSecondaryFiles((ComplexType)currentAttribute,list);
542                     }
543                 } catch (AttributeNotFoundException e) {
544                     // TODO Auto-generated catch block
545                     e.printStackTrace();
546                 } catch (MBeanException e) {
547                     // TODO Auto-generated catch block
548                     e.printStackTrace();
549                 } catch (ReflectionException e) {
550                     // TODO Auto-generated catch block
551                     e.printStackTrace();
552                 }
553             }
554         }
555     }
556 
557     /***
558      * Starting at the specific directory this method will iterate through all
559      * sub directories and add each file (as absolute name, with path as a
560      * string) to the provided ArrayList. Any file found under the settings
561      * directory with the proper suffix will be considered valid and added to
562      * the list.
563      * @param dir Starting directory
564      * @param list The list to add to
565      */
566     private void recursiveFindFiles(File dir, ArrayList<String> list){
567         File[] subs = dir.listFiles();
568         if (subs != null) {
569             for(int i=0 ; i < subs.length ; i++){
570                 if(subs[i].isDirectory()){
571                     recursiveFindFiles(subs[i],list);
572                 } else {
573                     if(subs[i].getName().endsWith(settingsFilenameSuffix)){
574                         // Add it to list
575                         list.add(subs[i].getAbsolutePath());
576                     }
577                 }
578             }
579         }
580     }
581 }