
/* DomainSensitiveFrontier
 *
 * $Id: DomainSensitiveFrontier.java 4656 2006-09-25 21:34:50Z paul_jack $
 *
 * Created on 2004-may-06
 *
 * Copyright (C) 2004 Royal Library of Sweden.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package org.archive.crawler.frontier;

import java.io.IOException;
import java.util.Hashtable;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;
import javax.management.MBeanException;
import javax.management.ReflectionException;

import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.event.CrawlURIDispositionListener;
import org.archive.crawler.filter.OrFilter;
import org.archive.crawler.filter.URIRegExpFilter;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.prefetch.QuotaEnforcer;
import org.archive.crawler.scope.ClassicScope;
import org.archive.crawler.settings.CrawlerSettings;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;

/**
 * Behaves like {@link BdbFrontier} (i.e., a basic, mostly breadth-first
 * frontier), but with the addition that you can set the number of documents
 * to download on a per-site basis.
 *
 * Useful for the case of frequent revisits of a site with frequent changes.
 *
 * <p>Choose the number of docs you want to download and specify
 * the count in <code>max-docs</code>. If <code>count-per-host</code> is
 * true, the default, then the crawler will download <code>max-docs</code>
 * per host. If you create an override, the overridden <code>max-docs</code>
 * count will be downloaded instead, whether it is higher or lower.
 * <p>If <code>count-per-host</code> is false, then <code>max-docs</code>
 * acts like the crawl order <code>max-docs</code> and the crawler will
 * download this total amount of docs only. Overrides will
 * download <code>max-docs</code> total in the overridden domain.
 *
 * @author Oskar Grenholm <oskar dot grenholm at kb dot se>
 * @deprecated As of release 1.10.0. Replaced by {@link BdbFrontier} and
 * {@link QuotaEnforcer}.
 */
public class DomainSensitiveFrontier extends BdbFrontier
implements CrawlURIDispositionListener {

    private static final long serialVersionUID = -3330190056282726202L;

    private static final Logger logger =
        Logger.getLogger(DomainSensitiveFrontier.class.getName());

    public static final String ATTR_MAX_DOCS = "max-docs";
    public static final String ATTR_COUNTER_MODE = "counter-mode";
    public static final String COUNT_OVERRIDE = "count-per-override";
    public static final String COUNT_HOST = "count-per-host";
    public static final String COUNT_DOMAIN = "count-per-domain";
    public static final String[] ATTR_AVAILABLE_MODES = new String[] {
        COUNT_OVERRIDE, COUNT_HOST, COUNT_DOMAIN };
    public static final String DEFAULT_MODE = COUNT_OVERRIDE;

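    // hostCounters maps a host, domain, or override scope to the number of
    // documents downloaded for it so far. countPerOverride is true only in
    // count-per-override mode, where counters are kept per settings override
    // rather than per host or per domain; counterMode caches the
    // counter-mode setting read in initialize().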
    // TODO: Make this a BigMap.
    private Hashtable<String,Long> hostCounters = new Hashtable<String,Long>();
    private boolean countPerOverride = true;
    private String counterMode;

    public DomainSensitiveFrontier(String name) {
        super(ATTR_NAME, "DomainSensitiveFrontier. *Deprecated* Use " +
            "BdbFrontier+QuotaEnforcer instead. " +
            "Overrides BdbFrontier to add specification of number of " +
            "documents to download (Expects 'exclude-filter' " +
            "to be part of CrawlScope).");
        Type e = addElementToDefinition(new SimpleType(ATTR_MAX_DOCS,
            "Maximum number of documents to download for host or domain" +
            " (Zero means no limit).", new Long(0)));
        e.setOverrideable(true);
        e = addElementToDefinition(new SimpleType(ATTR_COUNTER_MODE,
            "If " + COUNT_OVERRIDE + ", acts like the crawl " +
            "order maximum download count and the crawler will download " +
            "this total amount of docs only. Override to change the max " +
            "count for the overridden domain or host. " +
            "Else if " + COUNT_HOST + ", the crawler will download " +
            ATTR_MAX_DOCS + " per host. Add an override to change the " +
            "max count on a per-domain or a per-host basis. For " +
            "example, if you set " + ATTR_MAX_DOCS + " to 30 in " +
            "this mode, the crawler will download 30 docs from " +
            "each host in scope. If you override for kb.se, setting " +
            ATTR_MAX_DOCS +
            " to 20, it will instead download only 20 docs from each " +
            "host of kb.se. (The override value can be larger as well " +
            "as smaller.) " +
            "Finally, " + COUNT_DOMAIN + " behaves similarly to " +
            COUNT_HOST +
            ", but instead sets the max on a per-domain basis. " +
            "Here you can do overrides on the domain level, but " +
            "not on the host level. So if you set " + ATTR_MAX_DOCS +
            " to 30 here, the crawler will download 30 docs from each " +
            "domain in scope. If you override for kb.se, setting " +
            ATTR_MAX_DOCS + " to 20, it will instead download only " +
            "20 docs in total from the whole kb.se domain. (The override " +
            "value can be larger as well as smaller.)",
            DEFAULT_MODE, ATTR_AVAILABLE_MODES));
        e.setOverrideable(false);
    }

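    /**
     * Registers this frontier as a CrawlURIDispositionListener and caches
     * the configured counter-mode. countPerOverride ends up true only for
     * count-per-override mode.
     */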
    public void initialize(CrawlController c)
    throws FatalConfigurationException, IOException {
        super.initialize(c);
        this.controller.addCrawlURIDispositionListener(this);
        try {
            counterMode = (String)getAttribute(ATTR_COUNTER_MODE);
            if (counterMode.equalsIgnoreCase(COUNT_DOMAIN) ||
                    counterMode.equalsIgnoreCase(COUNT_HOST)) {
                countPerOverride = false;
            } else {
                countPerOverride = true;
            }
        } catch (AttributeNotFoundException e) {
            e.printStackTrace();
        } catch (MBeanException e) {
            e.printStackTrace();
        } catch (ReflectionException e) {
            e.printStackTrace();
        }
    }

    /**
     * Check if the max document download limit for this host or domain has
     * been reached.
     *
     * If so, delete the rest of the URIs for this host or domain waiting in
     * the queue, then add a URIRegExpFilter for this host or domain so
     * we won't get any more URIs from it later on.
     * @param curi CrawlURI.
     * @return True if the queue was discarded.
     */
    private synchronized boolean checkDownloadLimits(CrawlURI curi) {
        long thisMaxDocs = 0;
        long thisCounter = 0;
        boolean discarded = false;
        boolean retVal = false;
        if (curi.getUURI().getScheme().equals("dns")) {
            return false;
        }
        try {
            String host = curi.getUURI().getHost();
            CrawlerSettings cs = controller.getSettingsHandler().
                getSettings(host);
            do {
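                // Walk from the most specific settings object for this host
                // up toward the global (root) settings. In count-per-override
                // mode every level is checked; in the other modes the loop
                // body runs only once (see the while condition below).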
                String scope;
                if (counterMode.equalsIgnoreCase(COUNT_OVERRIDE))
                    scope = cs.getScope() != null ? cs.getScope() : "root";
                else if (counterMode.equalsIgnoreCase(COUNT_HOST))
                    scope = host;
                else { // Get domain part of host
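                    // Keep only the last two labels of the host name, e.g.
                    // "www.kb.se" becomes "kb.se" (assumes a two-label
                    // registered domain).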
                    int i = host.lastIndexOf(".");
                    i = host.lastIndexOf(".", i - 1);
                    scope = host.substring(i + 1, host.length());
                }
                thisMaxDocs =
                    ((Long) getAttribute(cs, ATTR_MAX_DOCS)).longValue();
                thisCounter = this.hostCounters.get(scope) != null ?
                    ((Long) this.hostCounters.get(scope)).longValue() : 0;
                // Have we hit the max document download limit for this host
                // or domain?
                if (thisMaxDocs > 0 && thisCounter >= thisMaxDocs) {
                    logger.fine("Discarding Queue: " + host + " ");
                    curi.addAnnotation("dsfLimit");
                    if (!discarded) {
                        long count = 0;
                        WorkQueue wq = getQueueFor(curi);
                        wq.unpeek();
                        count += wq.deleteMatching(this, ".*");
                        decrementQueuedCount(count);
                        discarded = true;
                        // I tried adding annotation but we're past log time
                        // for Curi so it doesn't work.
                        // curi.addAnnotation("maxDocsForHost");
                    }
                    // Adding an exclude filter for this host or domain
                    OrFilter or = (OrFilter) this.controller.getScope()
                            .getAttribute(ClassicScope.ATTR_EXCLUDE_FILTER);
                    // If we have hit max for root, block everything. Else
                    // just the scope.
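                    // Example: with scope "kb.se" the filter pattern becomes
                    // "^((https?://)?[a-zA-Z0-9//.]*)kb.se($|/.*)", blocking
                    // any URI whose host part ends in the scope.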
                    String filter = scope.equalsIgnoreCase("root") ?
                        ".*" : "^((https?://)?[a-zA-Z0-9//.]*)" + scope +
                            "($|/.*)";
                    logger.fine("Adding filter: [" + filter + "].");
                    URIRegExpFilter urf =
                        new URIRegExpFilter(curi.toString(), filter);
                    or.addFilter(this.controller.getSettingsHandler().
                        getSettings(null), urf);
                    thisMaxDocs = 0;
                    thisCounter = 0;
                    retVal = true;
                }
            } while ((cs = cs.getParent()) != null && countPerOverride);
        } catch (Exception e) {
            logger.severe("ERROR: checkDownloadLimits(), "
                    + "while processing {" + curi.toString() + "}: "
                    + e.getClass() + ", message: " + e.getMessage()
                    + ". Stack trace:");
            e.printStackTrace();
        }
        return retVal;
    }

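    /**
     * Increment the download counter for the host, domain, or settings
     * override that the given URI belongs to (dns: URIs are skipped),
     * using the same scope selection as checkDownloadLimits().
     */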
    protected synchronized void incrementHostCounters(CrawlURI curi) {
        if (!curi.getUURI().toString().startsWith("dns:")) {
            try {
                String host = curi.getUURI().getHost();
                CrawlerSettings cs =
                    controller.getSettingsHandler().getSettings(host);
                do {
                    String scope;
                    if (counterMode.equalsIgnoreCase(COUNT_OVERRIDE))
                        scope = cs.getScope() != null ? cs.getScope() : "root";
                    else if (counterMode.equalsIgnoreCase(COUNT_HOST))
                        scope = host;
                    else { // Get only domain part of host
                        int i = host.lastIndexOf(".");
                        i = host.lastIndexOf(".", i - 1);
                        scope = host.substring(i + 1, host.length());
                    }
                    long counter = this.hostCounters.get(scope) != null ?
                        ((Long) this.hostCounters.get(scope)).longValue() : 0;
                    this.hostCounters.put(scope, new Long(++counter));
                } while ((cs = cs.getParent()) != null && countPerOverride);
            } catch (Exception e) {
                logger.severe("ERROR: incrementHostCounters() " +
                    e.getMessage());
            }
        }
    }

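    /**
     * CrawlURIDispositionListener callback: on every successfully crawled
     * URI, bump the relevant counter and enforce the download limit. The
     * remaining disposition callbacks are intentionally no-ops.
     */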
    public void crawledURISuccessful(CrawlURI curi) {
        incrementHostCounters(curi);
        checkDownloadLimits(curi);
    }

    public void crawledURINeedRetry(CrawlURI curi) {
    }

    public void crawledURIDisregard(CrawlURI curi) {
    }

    public void crawledURIFailure(CrawlURI curi) {
    }
}