View Javadoc

1   /* Copyright (C) 2003 Internet Archive.
2    *
3    * This file is part of the Heritrix web crawler (crawler.archive.org).
4    *
5    * Heritrix is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU Lesser Public License as published by
7    * the Free Software Foundation; either version 2.1 of the License, or
8    * any later version.
9    *
10   * Heritrix is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU Lesser Public License for more details.
14   *
15   * You should have received a copy of the GNU Lesser Public License
16   * along with Heritrix; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   *
19   * HopsFilter.java
20   * Created on Oct 3, 2003
21   *
22   * $Header$
23   */
24  package org.archive.crawler.filter;
25  
26  import java.util.logging.Logger;
27  
28  import javax.management.AttributeNotFoundException;
29  
30  import org.archive.crawler.datamodel.CandidateURI;
31  import org.archive.crawler.datamodel.CrawlURI;
32  import org.archive.crawler.extractor.Link;
33  import org.archive.crawler.framework.CrawlScope;
34  import org.archive.crawler.framework.Filter;
35  import org.archive.crawler.scope.ClassicScope;
36  
37  /***
38   * Accepts (returns  for)) for all CandidateURIs passed in
39   * with a link-hop-count greater than the max-link-hops
40   * value.
41   *
42   * @author gojomo
43   * @deprecated As of release 1.10.0.  Replaced by {@link DecidingFilter} and
44   * equivalent {@link DecideRule}.
45   */
46  public class HopsFilter extends Filter {
47  
48      private static final long serialVersionUID = -5943030310651023640L;
49  
50      private static final Logger logger =
51          Logger.getLogger(HopsFilter.class.getName());
52  
53      /***
54       * @param name
55       */
56      public HopsFilter(String name) {
57          super(name, "Hops filter *Deprecated* Use" +
58              "DecidingFilter and equivalent DecideRule instead");
59      }
60  
61      int maxLinkHops = Integer.MAX_VALUE;
62      int maxTransHops = Integer.MAX_VALUE;
63  
64      /* (non-Javadoc)
65       * @see org.archive.crawler.framework.Filter#innerAccepts(java.lang.Object)
66       */
67      protected boolean innerAccepts(Object o) {
68          if(! (o instanceof CandidateURI)) {
69              return false;
70          }
71          String path = ((CandidateURI)o).getPathFromSeed();
72          int linkCount = 0;
73          int transCount = 0;
74          for(int i=path.length()-1;i>=0;i--) {
75              if(path.charAt(i)==Link.NAVLINK_HOP) {
76                  linkCount++;
77              } else if (linkCount==0) {
78                  transCount++;
79              }
80          }
81          if (o instanceof CrawlURI) {
82              CrawlURI curi = (CrawlURI) o;
83              CrawlScope scope =
84                  (CrawlScope) globalSettings().getModule(CrawlScope.ATTR_NAME);
85              try {
86                  maxLinkHops =
87                      ((Integer) scope
88                          .getAttribute(ClassicScope.ATTR_MAX_LINK_HOPS, curi))
89                          .intValue();
90                  maxTransHops =
91                      ((Integer) scope
92                          .getAttribute(ClassicScope.ATTR_MAX_TRANS_HOPS, curi))
93                          .intValue();
94              } catch (AttributeNotFoundException e) {
95                  logger.severe(e.getMessage());
96                  // Basically, true means the filter is PASSing this URI.
97                  return true; 
98              }
99          }
100 
101         return (linkCount > maxLinkHops)|| (transCount>maxTransHops);
102     }
103 }