View Javadoc

1   /*
2    * CrawlSettingsSAXHandler
3    *
4    * $Id: CrawlSettingsSAXHandler.java 5111 2007-05-03 01:43:43Z gojomo $
5    *
6    * Created on Dec 8, 2003
7    *
8    * Copyright (C) 2004 Internet Archive.
9    *
10   * This file is part of the Heritrix web crawler (crawler.archive.org).
11   *
12   * Heritrix is free software; you can redistribute it and/or modify it under the
13   * terms of the GNU Lesser Public License as published by the Free Software
14   * Foundation; either version 2.1 of the License, or any later version.
15   *
16   * Heritrix is distributed in the hope that it will be useful, but WITHOUT ANY
17   * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
18   * A PARTICULAR PURPOSE. See the GNU Lesser Public License for more details.
19   *
20   * You should have received a copy of the GNU Lesser Public License along with
21   * Heritrix; if not, write to the Free Software Foundation, Inc., 59 Temple
22   * Place, Suite 330, Boston, MA 02111-1307 USA
23   */
24  package org.archive.crawler.settings;
25  
26  import java.lang.reflect.InvocationTargetException;
27  import java.text.ParseException;
28  import java.util.HashMap;
29  import java.util.Map;
30  import java.util.Stack;
31  import java.util.logging.Level;
32  import java.util.logging.Logger;
33  
34  import javax.management.Attribute;
35  import javax.management.AttributeNotFoundException;
36  import javax.management.InvalidAttributeValueException;
37  
38  import org.archive.crawler.settings.Constraint.FailedCheck;
39  import org.archive.crawler.settings.refinements.PortnumberCriteria;
40  import org.archive.crawler.settings.refinements.Refinement;
41  import org.archive.crawler.settings.refinements.RegularExpressionCriteria;
42  import org.archive.crawler.settings.refinements.TimespanCriteria;
43  import org.archive.util.ArchiveUtils;
44  import org.xml.sax.Attributes;
45  import org.xml.sax.Locator;
46  import org.xml.sax.SAXException;
47  import org.xml.sax.SAXParseException;
48  import org.xml.sax.helpers.DefaultHandler;
49  
50  /***
51   * An SAX element handler that updates a CrawlerSettings object.
52   *
53   * This is a helper class for the XMLSettingsHandler.
54   *
55   * @author John Erik Halse
56   */
57  public class CrawlSettingsSAXHandler extends DefaultHandler implements
58          ValueErrorHandler {
59  
60      private static Logger logger = Logger
61              .getLogger("org.archive.crawler.settings.XMLSettingsHandler");
62  
63      private Locator locator;
64  
65      private CrawlerSettings settings;
66  
67      private SettingsHandler settingsHandler;
68  
69      private Map<String,ElementHandler> handlers
70       = new HashMap<String,ElementHandler>();
71  
72      private Stack<ElementHandler> handlerStack = new Stack<ElementHandler>();
73  
74      private Stack<Object> stack = new Stack<Object>();
75  
76      /*** Keeps track of elements which subelements should be skipped. */
77      private Stack<Boolean> skip = new Stack<Boolean>();
78  
79      private StringBuffer buffer = new StringBuffer();
80  
81      private String value;
82  
83      /***
84       * Creates a new CrawlSettingsSAXHandler.
85       *
86       * @param settings the settings object that should be updated from this
87       *            handler.
88       */
89      public CrawlSettingsSAXHandler(CrawlerSettings settings) {
90          super();
91          this.settings = settings;
92          this.settingsHandler = settings.getSettingsHandler();
93          handlers.put(XMLSettingsHandler.XML_ROOT_ORDER, new RootHandler());
94          handlers.put(XMLSettingsHandler.XML_ROOT_HOST_SETTINGS,
95                  new RootHandler());
96          handlers.put(XMLSettingsHandler.XML_ROOT_REFINEMENT, new RootHandler());
97          handlers.put(XMLSettingsHandler.XML_ELEMENT_CONTROLLER,
98                  new ModuleHandler());
99          handlers
100                 .put(XMLSettingsHandler.XML_ELEMENT_OBJECT, new ModuleHandler());
101         handlers.put(XMLSettingsHandler.XML_ELEMENT_NEW_OBJECT,
102                 new NewModuleHandler());
103         handlers.put(XMLSettingsHandler.XML_ELEMENT_META, new MetaHandler());
104         handlers.put(XMLSettingsHandler.XML_ELEMENT_NAME, new NameHandler());
105         handlers.put(XMLSettingsHandler.XML_ELEMENT_DESCRIPTION,
106                 new DescriptionHandler());
107         handlers.put(XMLSettingsHandler.XML_ELEMENT_OPERATOR,
108                 new OperatorHandler());
109         handlers.put(XMLSettingsHandler.XML_ELEMENT_ORGANIZATION,
110                 new OrganizationHandler());
111         handlers.put(XMLSettingsHandler.XML_ELEMENT_AUDIENCE,
112                 new AudienceHandler());
113         handlers.put(XMLSettingsHandler.XML_ELEMENT_DATE, new DateHandler());
114         handlers.put(SettingsHandler.MAP, new MapHandler());
115         handlers.put(SettingsHandler.INTEGER_LIST, new ListHandler());
116         handlers.put(SettingsHandler.STRING_LIST, new ListHandler());
117         handlers.put(SettingsHandler.DOUBLE_LIST, new ListHandler());
118         handlers.put(SettingsHandler.FLOAT_LIST, new ListHandler());
119         handlers.put(SettingsHandler.LONG_LIST, new ListHandler());
120         handlers.put(SettingsHandler.STRING, new SimpleElementHandler());
121         handlers.put(SettingsHandler.TEXT, new SimpleElementHandler());
122         handlers.put(SettingsHandler.INTEGER, new SimpleElementHandler());
123         handlers.put(SettingsHandler.FLOAT, new SimpleElementHandler());
124         handlers.put(SettingsHandler.LONG, new SimpleElementHandler());
125         handlers.put(SettingsHandler.BOOLEAN, new SimpleElementHandler());
126         handlers.put(SettingsHandler.DOUBLE, new SimpleElementHandler());
127 
128         handlers.put(XMLSettingsHandler.XML_ELEMENT_REFINEMENTLIST,
129                 new RefinementListHandler());
130         handlers.put(XMLSettingsHandler.XML_ELEMENT_REFINEMENT,
131                 new RefinementHandler());
132         handlers.put(XMLSettingsHandler.XML_ELEMENT_REFERENCE,
133                 new ReferenceHandler());
134         handlers
135                 .put(XMLSettingsHandler.XML_ELEMENT_LIMITS, new LimitsHandler());
136         handlers.put(XMLSettingsHandler.XML_ELEMENT_TIMESPAN,
137                 new TimespanHandler());
138         handlers.put(XMLSettingsHandler.XML_ELEMENT_PORTNUMBER,
139                 new PortnumberHandler());
140         handlers.put(XMLSettingsHandler.XML_ELEMENT_URIMATCHES,
141                 new URIMatcherHandler());
142     }
143 
144     /*
145      * (non-Javadoc)
146      *
147      * @see org.xml.sax.ContentHandler#setDocumentLocator(org.xml.sax.Locator)
148      */
149     public void setDocumentLocator(Locator locator) {
150         super.setDocumentLocator(locator);
151         this.locator = locator;
152     }
153 
154     /*
155      * (non-Javadoc)
156      *
157      * @see org.xml.sax.ContentHandler#startDocument()
158      */
159     public void startDocument() throws SAXException {
160         settingsHandler.registerValueErrorHandler(this);
161         skip.push(new Boolean(false));
162         super.startDocument();
163     }
164 
165     /*
166      * (non-Javadoc)
167      *
168      * @see org.xml.sax.ContentHandler#endDocument()
169      */
170     public void endDocument() throws SAXException {
171         settingsHandler.unregisterValueErrorHandler(this);
172         super.endDocument();
173     }
174 
175     /*
176      * (non-Javadoc)
177      *
178      * @see org.xml.sax.ContentHandler#characters(char[], int, int)
179      */
180     public void characters(char[] ch, int start, int length)
181             throws SAXException {
182         super.characters(ch, start, length);
183         buffer.append(ch, start, length);
184     }
185 
186     /***
187      * Start of an element. Decide what handler to use, and call it.
188      *
189      * @param uri
190      * @param localName
191      * @param qName
192      * @param attributes
193      * @throws SAXException
194      */
195     public void startElement(String uri, String localName, String qName,
196             Attributes attributes) throws SAXException {
197 
198         ElementHandler handler = ((ElementHandler) handlers.get(qName));
199         if (handler != null) {
200             handlerStack.push(handler);
201 
202             if (((Boolean) skip.peek()).booleanValue()) {
203                 skip.push(new Boolean(true));
204                 String moduleName = attributes
205                         .getValue(XMLSettingsHandler.XML_ATTRIBUTE_NAME);
206                 logger.fine("Skipping: " + qName + " " + moduleName);
207             } else {
208                 try {
209                     handler.startElement(qName, attributes);
210                     skip.push(new Boolean(false));
211                 } catch (SAXException e) {
212                     if (e.getException() instanceof InvocationTargetException
213                             || e.getException() instanceof AttributeNotFoundException) {
214                         skip.push(new Boolean(true));
215                     } else {
216                         skip.push(new Boolean(false));
217                         throw e;
218                     }
219                 }
220             }
221         } else {
222             String tmp = "Unknown element '" + qName + "' in '" +
223                 locator.getSystemId() + "', line: " + locator.getLineNumber() +
224                 ", column: " + locator.getColumnNumber();
225             if (this.settingsHandler.getOrder() != null &&
226                     this.settingsHandler.getOrder().getController() !=  null) {
227                 logger.log(Level.WARNING, tmp);
228             }
229             logger.warning(tmp);
230         }
231     }
232 
233     /***
234      * End of an element.
235      *
236      * @param uri
237      * @param localName
238      * @param qName
239      * @throws SAXException
240      */
241     public void endElement(String uri, String localName, String qName)
242             throws SAXException {
243         value = buffer.toString().trim();
244         buffer.setLength(0);
245         ElementHandler handler = (ElementHandler) handlerStack.pop();
246         if (!((Boolean) skip.pop()).booleanValue()) {
247             if (handler != null) {
248                 handler.endElement(qName);
249             }
250         }
251     }
252 
253     public void illegalElementError(String name) throws SAXParseException {
254         throw new SAXParseException("Element '" + name + "' not allowed here",
255                 locator);
256     }
257 
258     /***
259      * Superclass of all the elementhandlers.
260      *
261      * This class should be subclassed for the different XML-elements.
262      *
263      * @author John Erik Halse
264      */
265     private class ElementHandler {
266 
267         /***
268          * Start of an element
269          *
270          * @param name
271          * @param atts
272          * @throws SAXException
273          */
274         public void startElement(String name, Attributes atts)
275                 throws SAXException {
276         }
277 
278         /***
279          * End of an element
280          *
281          * @param name
282          * @throws SAXException
283          */
284         public void endElement(String name) throws SAXException {
285         }
286     }
287 
288     /***
289      * Handle the root element.
290      *
291      * This class checks that the root element is of the right type.
292      *
293      * @author John Erik Halse
294      */
295     private class RootHandler extends ElementHandler {
296 
297         public void startElement(String name, Attributes atts)
298                 throws SAXException {
299             //  Check filetype
300             if ((name.equals(XMLSettingsHandler.XML_ROOT_ORDER) && settings
301                     .getScope() != null)
302                     || (name.equals(XMLSettingsHandler.XML_ROOT_HOST_SETTINGS) && settings
303                             .getScope() == null)
304                     || (name.equals(XMLSettingsHandler.XML_ROOT_REFINEMENT) && !settings
305                             .isRefinement())) {
306                 throw new SAXParseException("Wrong document type '" + name
307                         + "'", locator);
308             }
309         }
310     }
311 
312     // Meta handlers
313     private class MetaHandler extends ElementHandler {
314     }
315 
316     private class NameHandler extends ElementHandler {
317 
318         public void endElement(String name) throws SAXException {
319             if (handlerStack.peek() instanceof MetaHandler) {
320                 settings.setName(value);
321             } else {
322                 illegalElementError(name);
323             }
324         }
325     }
326 
327     private class DescriptionHandler extends ElementHandler {
328 
329         public void endElement(String name) throws SAXException {
330             if (handlerStack.peek() instanceof MetaHandler) {
331                 settings.setDescription(value);
332             } else if (handlerStack.peek() instanceof RefinementHandler) {
333                 ((Refinement) stack.peek()).setDescription(value);
334             } else {
335                 illegalElementError(name);
336             }
337         }
338     }
339 
340     private class OrganizationHandler extends ElementHandler {
341 
342         public void endElement(String name) throws SAXException {
343             if (handlerStack.peek() instanceof MetaHandler) {
344                 settings.setOrganization(value);
345             } else if (handlerStack.peek() instanceof RefinementHandler) {
346                 ((Refinement) stack.peek()).setOrganization(value);
347             } else {
348                 illegalElementError(name);
349             }
350         }
351     }
352 
353     private class OperatorHandler extends ElementHandler {
354 
355         public void endElement(String name) throws SAXException {
356             if (handlerStack.peek() instanceof MetaHandler) {
357                 settings.setOperator(value);
358             } else if (handlerStack.peek() instanceof RefinementHandler) {
359                 ((Refinement) stack.peek()).setOperator(value);
360             } else {
361                 illegalElementError(name);
362             }
363         }
364     }
365 
366     private class AudienceHandler extends ElementHandler {
367 
368         public void endElement(String name) throws SAXException {
369             if (handlerStack.peek() instanceof MetaHandler) {
370                 settings.setAudience(value);
371             } else if (handlerStack.peek() instanceof RefinementHandler) {
372                 ((Refinement) stack.peek()).setAudience(value);
373             } else {
374                 illegalElementError(name);
375             }
376         }
377     }
378 
379     private class DateHandler extends ElementHandler {
380 
381         public void endElement(String name) throws SAXException {
382             if (handlerStack.peek() instanceof MetaHandler) {
383                 try {
384                     settings.setLastSavedTime(ArchiveUtils
385                             .parse14DigitDate(value));
386                 } catch (ParseException e) {
387                     throw new SAXException(e);
388                 }
389             } else {
390                 illegalElementError(name);
391             }
392         }
393     }
394 
395     // Refinement handlers
396     private class RefinementListHandler extends ElementHandler {
397 
398         public void startElement(String name) throws SAXException {
399             if (!(handlerStack.peek() instanceof RootHandler)) {
400                 illegalElementError(name);
401             }
402         }
403     }
404 
405     private class RefinementHandler extends ElementHandler {
406         public void startElement(String name, Attributes atts)
407                 throws SAXException {
408             stack.push(new Refinement(settings, atts
409                     .getValue(XMLSettingsHandler.XML_ELEMENT_REFERENCE)));
410         }
411     }
412 
413     private class ReferenceHandler extends ElementHandler {
414 
415         public void endElement(String name) throws SAXException {
416             if (handlerStack.peek() instanceof RefinementHandler) {
417                 ((Refinement) stack.peek()).setReference(value);
418             } else {
419                 illegalElementError(name);
420             }
421         }
422     }
423 
424     private class LimitsHandler extends ElementHandler {
425     }
426 
427     private class TimespanHandler extends ElementHandler {
428 
429         public void startElement(String name, Attributes atts)
430                 throws SAXException {
431             if (stack.peek() instanceof Refinement) {
432                 String from = atts
433                         .getValue(XMLSettingsHandler.XML_ATTRIBUTE_FROM);
434                 String to = atts.getValue(XMLSettingsHandler.XML_ATTRIBUTE_TO);
435                 try {
436                     TimespanCriteria timespan = new TimespanCriteria(from, to);
437                     ((Refinement) stack.peek()).addCriteria(timespan);
438                 } catch (ParseException e) {
439                     throw new SAXException(e);
440                 }
441             } else {
442                 illegalElementError(name);
443             }
444         }
445     }
446 
447     private class PortnumberHandler extends ElementHandler {
448 
449         public void endElement(String name) throws SAXException {
450             if (handlerStack.peek() instanceof LimitsHandler) {
451                 ((Refinement) stack.peek()).addCriteria(new PortnumberCriteria(value));
452             } else {
453                 illegalElementError(name);
454             }
455         }
456     }
457 
458     private class URIMatcherHandler extends ElementHandler {
459 
460         public void endElement(String name) throws SAXException {
461             if (handlerStack.peek() instanceof LimitsHandler) {
462                 ((Refinement) stack.peek()).addCriteria(new RegularExpressionCriteria(value));
463             } else {
464                 illegalElementError(name);
465             }
466         }
467     }
468 
469 
470     // Handlers for objects and attributes
471     private class ModuleHandler extends ElementHandler {
472 
473         public void startElement(String name, Attributes atts)
474                 throws SAXException {
475             ModuleType module;
476             if (name.equals(XMLSettingsHandler.XML_ELEMENT_CONTROLLER)) {
477                 module = settingsHandler.getOrder();
478             } else {
479                 module = settingsHandler.getSettingsObject(null).getModule(
480                         atts.getValue(XMLSettingsHandler.XML_ATTRIBUTE_NAME));
481             }
482             stack.push(module);
483         }
484 
485         public void endElement(String name) throws SAXException {
486             stack.pop();
487         }
488     }
489 
490     private class NewModuleHandler extends ElementHandler {
491 
492         public void startElement(String name, Attributes atts)
493                 throws SAXException {
494             ComplexType parentModule = (ComplexType) stack.peek();
495             String moduleName = atts
496                     .getValue(XMLSettingsHandler.XML_ATTRIBUTE_NAME);
497             String moduleClass = atts
498                     .getValue(XMLSettingsHandler.XML_ATTRIBUTE_CLASS);
499             try {
500                 ModuleType module = SettingsHandler
501                         .instantiateModuleTypeFromClassName(moduleName,
502                                 moduleClass);
503                 try {
504                     parentModule.setAttribute(settings, module);
505                 } catch (AttributeNotFoundException e) {
506                     // Attribute was not found, but the complex type might
507                     // be a MapType and then we are allowed to add new
508                     // elements.
509                     try {
510                         parentModule.addElement(settings, module);
511                     } catch (IllegalStateException ise) {
512                         // An attribute in the settings file is not in the
513                         // ComplexType's definition, log and skip.
514                         logger.log(Level.WARNING,"Module '" + moduleName + "' in '"
515                                 + locator.getSystemId() + "', line: "
516                                 + locator.getLineNumber() + ", column: "
517                                 + locator.getColumnNumber()
518                                 + " is not defined in '"
519                                 + parentModule.getName() + "'.");
520                         throw new SAXException(new AttributeNotFoundException(
521                                 ise.getMessage()));
522                     }
523                 }
524                 stack.push(module);
525             } catch (InvocationTargetException e) {
526                 logger.log(Level.WARNING,"Couldn't instantiate " + moduleName
527                         + ", from class: " + moduleClass + "' in '"
528                         + locator.getSystemId() + "', line: "
529                         + locator.getLineNumber() + ", column: "
530                         + locator.getColumnNumber(), e);
531                 throw new SAXException(e);
532             } catch (InvalidAttributeValueException e) {
533                 throw new SAXException(e);
534             }
535         }
536 
537         public void endElement(String name) throws SAXException {
538             stack.pop();
539         }
540     }
541 
542     private class MapHandler extends ElementHandler {
543 
544         public void startElement(String name, Attributes atts)
545                 throws SAXException {
546             String mapName = atts
547                     .getValue(XMLSettingsHandler.XML_ATTRIBUTE_NAME);
548             ComplexType parentModule = (ComplexType) stack.peek();
549             try {
550                 stack.push(parentModule.getAttribute(settings, mapName));
551             } catch (AttributeNotFoundException e) {
552                 throw new SAXException(e);
553             }
554         }
555 
556         public void endElement(String name) throws SAXException {
557             stack.pop();
558         }
559     }
560 
561     private class SimpleElementHandler extends ElementHandler {
562 
563         public void startElement(String name, Attributes atts)
564                 throws SAXException {
565             stack.push(atts.getValue(XMLSettingsHandler.XML_ATTRIBUTE_NAME));
566         }
567 
568         public void endElement(String name) throws SAXException {
569             String elementName = (String) stack.pop();
570             Object container = stack.peek();
571             if (container instanceof ComplexType) {
572                 try {
573                     try {
574                         ((ComplexType) container).setAttribute(settings,
575                                 new Attribute(elementName, value));
576                     } catch (AttributeNotFoundException e) {
577                         // Attribute was not found, but the complex type might
578                         // be a MapType and then we are allowed to add new
579                         // elements.
580                         try {
581                             ((ComplexType) container).addElement(settings,
582                                     new SimpleType(elementName, "", value));
583                         } catch (IllegalStateException ise) {
584                             logger.warning("Unknown attribute '" + elementName
585                                     + "' in '" + locator.getSystemId()
586                                     + "', line: " + locator.getLineNumber()
587                                     + ", column: " + locator.getColumnNumber());
588                         }
589                     }
590                 } catch (InvalidAttributeValueException e) {
591                     try {
592                         logger.warning("Illegal value '"
593                                 + value
594                                 + "' for attribute '"
595                                 + elementName
596                                 + "' in '"
597                                 + locator.getSystemId()
598                                 + "', line: "
599                                 + locator.getLineNumber()
600                                 + ", column: "
601                                 + locator.getColumnNumber()
602                                 + ", Value reset to default value: "
603                                 + ((ComplexType) container).getAttribute(
604                                         settings, elementName));
605                     } catch (AttributeNotFoundException e1) {
606                         throw new SAXException(e1);
607                     }
608                 }
609             } else {
610                 if (container == null) {
611                 	// We can get here if an override is referring to a global
612                     // filter since removed.  Log it as severe; operator will
613                     // probably want to know of all overrides with references
614                     // to a global filter since removed.
615                     logger.severe("Empty container (Was a referenced parent" +
616                         " filter removed?).  Element details: elementName " +
617                         elementName + ", name " + name);
618                 } else {
619                 	((ListType) container).add(value);
620                 }
621             }
622         }
623     }
624 
625     private class ListHandler extends ElementHandler {
626 
627         public void startElement(String name, Attributes atts)
628                 throws SAXException {
629             String listName = atts
630                     .getValue(XMLSettingsHandler.XML_ATTRIBUTE_NAME);
631             ComplexType parentModule = (ComplexType) stack.peek();
632             ListType list;
633             try {
634                 list = (ListType) parentModule.getAttribute(settings, listName);
635             } catch (AttributeNotFoundException e) {
636                 throw new SAXException(e);
637             }
638             list.clear();
639             stack.push(list);
640         }
641 
642         public void endElement(String name) throws SAXException {
643             stack.pop();
644         }
645     }
646 
647     /*
648      * (non-Javadoc)
649      *
650      * @see org.archive.crawler.settings.ValueErrorHandler#handleValueError(org.archive.crawler.settings.Constraint.FailedCheck)
651      */
652     public void handleValueError(FailedCheck error) {
653         logger.warning(error.getMessage() + "\n Attribute: '"
654                 + error.getOwner().getName() + ":"
655                 + error.getDefinition().getName() + "'\n Value:     '" + value
656                 + "'\n File:      '" + locator.getSystemId() + "', line: "
657                 + locator.getLineNumber() + ", column: "
658                 + locator.getColumnNumber());
659     }
660 }