1 /* AntiCalendarCostAssignmentPolicy 2 * 3 * $Id: AntiCalendarCostAssignmentPolicy.java 4953 2007-03-03 01:32:53Z gojomo $ 4 * 5 * Created on Dec 15, 2004 6 * 7 * Copyright (C) 2004 Internet Archive. 8 * 9 * This file is part of the Heritrix web crawler (crawler.archive.org). 10 * 11 * Heritrix is free software; you can redistribute it and/or modify 12 * it under the terms of the GNU Lesser Public License as published by 13 * the Free Software Foundation; either version 2.1 of the License, or 14 * any later version. 15 * 16 * Heritrix is distributed in the hope that it will be useful, 17 * but WITHOUT ANY WARRANTY; without even the implied warranty of 18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 * GNU Lesser Public License for more details. 20 * 21 * You should have received a copy of the GNU Lesser Public License 22 * along with Heritrix; if not, write to the Free Software 23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 24 */ 25 package org.archive.crawler.frontier; 26 27 import java.util.regex.Matcher; 28 29 import org.archive.crawler.datamodel.CrawlURI; 30 import org.archive.util.TextUtils; 31 32 /*** 33 * CostAssignmentPolicy that further penalizes URIs with 34 * calendar-suggestive strings in them, with an extra unit 35 * of cost. 36 * 37 * Will catch some 'innocent' URIs, but only when uncaught 38 * large-volume chaff is ranked higher than caught 'wheat' 39 * will this cause notable problems. 40 * 41 * @author gojomo 42 */ 43 public class AntiCalendarCostAssignmentPolicy extends UnitCostAssignmentPolicy { 44 public static String CALENDARISH = 45 "(?i)(calendar)|(year)|(month)|(day)|(date)|(viewcal)" + 46 "|(//D19//d//d//D)|(//D20//d//d//D)|(event)|(yr=)" + 47 "|(calendrier)|(jour)"; 48 49 /* (non-Javadoc) 50 * @see org.archive.crawler.frontier.CostAssignmentPolicy#costOf(org.archive.crawler.datamodel.CrawlURI) 51 */ 52 public int costOf(CrawlURI curi) { 53 int cost = super.costOf(curi); 54 Matcher m = TextUtils.getMatcher(CALENDARISH, curi.toString()); 55 if (m.find()) { 56 cost++; 57 // TODO: consider if multiple occurences should cost more 58 } 59 TextUtils.recycleMatcher(m); 60 return cost; 61 } 62 }