View Javadoc

1   /* $Id:  $
2    *
3    * Copyright (C) 2007 Olaf Freyer
4    *
5    * This file is part of the Heritrix web crawler (crawler.archive.org).
6    *
7    * Heritrix is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU Lesser Public License as published by
9    * the Free Software Foundation; either version 2.1 of the License, or
10   * any later version.
11   *
12   * Heritrix is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU Lesser Public License for more details.
16   *
17   * You should have received a copy of the GNU Lesser Public License
18   * along with Heritrix; if not, write to the Free Software
19   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
20   */
21  package org.archive.crawler.deciderules;
22  
23  import org.archive.crawler.datamodel.CrawlURI;
24  import org.archive.util.TextUtils;
25  
26  /***
27   * DecideRule whose decision is applied if the URI's content-type 
28   * is present and matches the supplied regular expression. 
29   * 
30   * @author Olaf Freyer
31   */
32  public class ContentTypeMatchesRegExpDecideRule extends MatchesRegExpDecideRule{
33      private static final long serialVersionUID = -2066930281015155843L;
34  
35      public ContentTypeMatchesRegExpDecideRule(String name) {
36          super(name);
37          setDescription("ContentTypeMatchesRegExpDecideRule. Applies the " +
38              "configured decision to URIs matching the supplied regular " +
39              "expression. Cannot be used until after fetcher processors. " +
40              "Only then is the Content-Type known. A good place for this " +
41              "rule is at the writer step processing.  If the content-type " +
42              "is null, 301s usually have no content-type, this deciderule " +
43              "will PASS.");
44      }
45      
46      @Override
47      protected boolean evaluate(Object o) {
48              if (!(o instanceof CrawlURI)) {
49                  return false;
50              }
51              String content_type = ((CrawlURI)o).getContentType();
52              String regexp = getRegexp(o);
53              return (regexp == null || content_type == null)? false:
54                      TextUtils.matches(getRegexp(o), content_type);
55          }
56  }