View Javadoc

1   /* WagCostAssignmentPolicy
2   *
3   * $Id: WagCostAssignmentPolicy.java 3704 2005-07-18 17:30:21Z stack-sf $
4   *
5   * Created on Dec 10, 2004
6   *
7   * Copyright (C) 2004 Internet Archive.
8   *
9   * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24  */ 
25  package org.archive.crawler.frontier;
26  
27  import org.archive.crawler.datamodel.CrawlURI;
28  import org.archive.net.UURI;
29  
30  /***
31   * A CostAssignmentPolicy based on some wild guesses of kinds of URIs
32   * that should be deferred into the (potentially never-crawled) future.
33   * 
34   * @author gojomo
35   */
36  public class WagCostAssignmentPolicy extends CostAssignmentPolicy {
37  
38      /***
39       * Add constant penalties for certain features of URI (and
40       * its 'via') that make it more delayable/skippable. 
41       * 
42       * @param curi CrawlURI to be assigned a cost
43       * 
44       * @see org.archive.crawler.frontier.CostAssignmentPolicy#costOf(org.archive.crawler.datamodel.CrawlURI)
45       */
46      public int costOf(CrawlURI curi) {
47          int cost = 1;
48          UURI uuri = curi.getUURI();
49          if (uuri.hasQuery()) {
50              // has query string
51              cost++;
52              int qIndex = uuri.toString().indexOf('?');
53              if (curi.flattenVia().startsWith(uuri.toString().substring(0,qIndex))) {
54                  // non-query-string portion of URI is same as previous
55                  cost++;
56              }
57              // TODO: other potential query-related cost penalties:
58              //  - more than X query-string attributes
59              //  - calendarish terms
60              //  - query-string over certain size
61          }
62          // TODO: other potential path-based penalties
63          //  - new path is simply extension of via path
64          //  - many path segments
65          // TODO: other potential hops-based penalties
66          //  - more than X hops
67          //  - each speculative hop
68          return cost;
69      }
70  }