1 /* WagCostAssignmentPolicy 2 * 3 * $Id: WagCostAssignmentPolicy.java 3704 2005-07-18 17:30:21Z stack-sf $ 4 * 5 * Created on Dec 10, 2004 6 * 7 * Copyright (C) 2004 Internet Archive. 8 * 9 * This file is part of the Heritrix web crawler (crawler.archive.org). 10 * 11 * Heritrix is free software; you can redistribute it and/or modify 12 * it under the terms of the GNU Lesser Public License as published by 13 * the Free Software Foundation; either version 2.1 of the License, or 14 * any later version. 15 * 16 * Heritrix is distributed in the hope that it will be useful, 17 * but WITHOUT ANY WARRANTY; without even the implied warranty of 18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 * GNU Lesser Public License for more details. 20 * 21 * You should have received a copy of the GNU Lesser Public License 22 * along with Heritrix; if not, write to the Free Software 23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 24 */ 25 package org.archive.crawler.frontier; 26 27 import org.archive.crawler.datamodel.CrawlURI; 28 import org.archive.net.UURI; 29 30 /*** 31 * A CostAssignmentPolicy based on some wild guesses of kinds of URIs 32 * that should be deferred into the (potentially never-crawled) future. 33 * 34 * @author gojomo 35 */ 36 public class WagCostAssignmentPolicy extends CostAssignmentPolicy { 37 38 /*** 39 * Add constant penalties for certain features of URI (and 40 * its 'via') that make it more delayable/skippable. 41 * 42 * @param curi CrawlURI to be assigned a cost 43 * 44 * @see org.archive.crawler.frontier.CostAssignmentPolicy#costOf(org.archive.crawler.datamodel.CrawlURI) 45 */ 46 public int costOf(CrawlURI curi) { 47 int cost = 1; 48 UURI uuri = curi.getUURI(); 49 if (uuri.hasQuery()) { 50 // has query string 51 cost++; 52 int qIndex = uuri.toString().indexOf('?'); 53 if (curi.flattenVia().startsWith(uuri.toString().substring(0,qIndex))) { 54 // non-query-string portion of URI is same as previous 55 cost++; 56 } 57 // TODO: other potential query-related cost penalties: 58 // - more than X query-string attributes 59 // - calendarish terms 60 // - query-string over certain size 61 } 62 // TODO: other potential path-based penalties 63 // - new path is simply extension of via path 64 // - many path segments 65 // TODO: other potential hops-based penalties 66 // - more than X hops 67 // - each speculative hop 68 return cost; 69 } 70 }