View Javadoc

1   
2   /* $Id:  $
3    *
4    * Copyright (C) 2007 Olaf Freyer
5    *
6    * This file is part of the Heritrix web crawler (crawler.archive.org).
7    *
8    * Heritrix is free software; you can redistribute it and/or modify
9    * it under the terms of the GNU Lesser Public License as published by
10   * the Free Software Foundation; either version 2.1 of the License, or
11   * any later version.
12   *
13   * Heritrix is distributed in the hope that it will be useful,
14   * but WITHOUT ANY WARRANTY; without even the implied warranty of
15   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16   * GNU Lesser Public License for more details.
17   *
18   * You should have received a copy of the GNU Lesser Public License
19   * along with Heritrix; if not, write to the Free Software
20   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
21   */
22  package org.archive.crawler.deciderules;
23  
24  import org.archive.crawler.datamodel.CrawlURI;
25  
26  /***
27   * DecideRule whose decision is applied if the URI's content-type 
28   * is present and does not match the supplied regular expression. 
29   * 
30   * @author Olaf Freyer
31   */
32  public class ContentTypeNotMatchesRegExpDecideRule extends
33          ContentTypeMatchesRegExpDecideRule {
34      private static final long serialVersionUID = 4729800377757426137L;
35  
36      public ContentTypeNotMatchesRegExpDecideRule(String name) {
37          super(name);
38          setDescription("ContentTypeNotMatchesRegExpDecideRule. Applies the " +
39              "configured decision to URIs not matching the supplied regular " +
40              "expression. Cannot be used until after fetcher processors. " +
41              "Only then is the Content-Type known. A good place for this " +
42              "rule is at the writer step processing.  If the content-type " +
43              "is null, 301s usually have no content-type, this deciderule " +
44              "will PASS.");
45      }
46      
47      /***
48       * Evaluate whether given object's string version does not match 
49       * configured regexp (by reversing the superclass's answer).
50       * 
51       * @param object Object to make decision about.
52       * @return true if the regexp is not matched
53       */
54      protected boolean evaluate(Object o) {
55          if (!(o instanceof CrawlURI)) {
56              return false;
57          }
58          String content_type = ((CrawlURI)o).getContentType();
59          String regexp = getRegexp(o);
60          return (regexp == null || content_type == null)? false:
61               ! super.evaluate(o);
62      }
63      
64  }