1 /* WagCostAssignmentPolicy
2 *
3 * $Id: WagCostAssignmentPolicy.java 3704 2005-07-18 17:30:21Z stack-sf $
4 *
5 * Created on Dec 10, 2004
6 *
7 * Copyright (C) 2004 Internet Archive.
8 *
9 * This file is part of the Heritrix web crawler (crawler.archive.org).
10 *
11 * Heritrix is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU Lesser Public License as published by
13 * the Free Software Foundation; either version 2.1 of the License, or
14 * any later version.
15 *
16 * Heritrix is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU Lesser Public License for more details.
20 *
21 * You should have received a copy of the GNU Lesser Public License
22 * along with Heritrix; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 */
25 package org.archive.crawler.frontier;
26
27 import org.archive.crawler.datamodel.CrawlURI;
28 import org.archive.net.UURI;
29
30 /***
31 * A CostAssignmentPolicy based on some wild guesses of kinds of URIs
32 * that should be deferred into the (potentially never-crawled) future.
33 *
34 * @author gojomo
35 */
36 public class WagCostAssignmentPolicy extends CostAssignmentPolicy {
37
38 /***
39 * Add constant penalties for certain features of URI (and
40 * its 'via') that make it more delayable/skippable.
41 *
42 * @param curi CrawlURI to be assigned a cost
43 *
44 * @see org.archive.crawler.frontier.CostAssignmentPolicy#costOf(org.archive.crawler.datamodel.CrawlURI)
45 */
46 public int costOf(CrawlURI curi) {
47 int cost = 1;
48 UURI uuri = curi.getUURI();
49 if (uuri.hasQuery()) {
50 // has query string
51 cost++;
52 int qIndex = uuri.toString().indexOf('?');
53 if (curi.flattenVia().startsWith(uuri.toString().substring(0,qIndex))) {
54 // non-query-string portion of URI is same as previous
55 cost++;
56 }
57 // TODO: other potential query-related cost penalties:
58 // - more than X query-string attributes
59 // - calendarish terms
60 // - query-string over certain size
61 }
62 // TODO: other potential path-based penalties
63 // - new path is simply extension of via path
64 // - many path segments
65 // TODO: other potential hops-based penalties
66 // - more than X hops
67 // - each speculative hop
68 return cost;
69 }
70 }