1 /* AntiCalendarCostAssignmentPolicy
2 *
3 * $Id: AntiCalendarCostAssignmentPolicy.java 4953 2007-03-03 01:32:53Z gojomo $
4 *
5 * Created on Dec 15, 2004
6 *
7 * Copyright (C) 2004 Internet Archive.
8 *
9 * This file is part of the Heritrix web crawler (crawler.archive.org).
10 *
11 * Heritrix is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU Lesser Public License as published by
13 * the Free Software Foundation; either version 2.1 of the License, or
14 * any later version.
15 *
16 * Heritrix is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU Lesser Public License for more details.
20 *
21 * You should have received a copy of the GNU Lesser Public License
22 * along with Heritrix; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 */
25 package org.archive.crawler.frontier;
26
27 import java.util.regex.Matcher;
28
29 import org.archive.crawler.datamodel.CrawlURI;
30 import org.archive.util.TextUtils;
31
32 /***
33 * CostAssignmentPolicy that further penalizes URIs with
34 * calendar-suggestive strings in them, with an extra unit
35 * of cost.
36 *
37 * Will catch some 'innocent' URIs, but only when uncaught
38 * large-volume chaff is ranked higher than caught 'wheat'
39 * will this cause notable problems.
40 *
41 * @author gojomo
42 */
43 public class AntiCalendarCostAssignmentPolicy extends UnitCostAssignmentPolicy {
44 public static String CALENDARISH =
45 "(?i)(calendar)|(year)|(month)|(day)|(date)|(viewcal)" +
46 "|(//D19//d//d//D)|(//D20//d//d//D)|(event)|(yr=)" +
47 "|(calendrier)|(jour)";
48
49 /* (non-Javadoc)
50 * @see org.archive.crawler.frontier.CostAssignmentPolicy#costOf(org.archive.crawler.datamodel.CrawlURI)
51 */
52 public int costOf(CrawlURI curi) {
53 int cost = super.costOf(curi);
54 Matcher m = TextUtils.getMatcher(CALENDARISH, curi.toString());
55 if (m.find()) {
56 cost++;
57 // TODO: consider if multiple occurences should cost more
58 }
59 TextUtils.recycleMatcher(m);
60 return cost;
61 }
62 }