View Javadoc

1   /* AntiCalendarCostAssignmentPolicy
2   *
3   * $Id: AntiCalendarCostAssignmentPolicy.java 4953 2007-03-03 01:32:53Z gojomo $
4   *
5   * Created on Dec 15, 2004
6   *
7   * Copyright (C) 2004 Internet Archive.
8   *
9   * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24  */ 
25  package org.archive.crawler.frontier;
26  
27  import java.util.regex.Matcher;
28  
29  import org.archive.crawler.datamodel.CrawlURI;
30  import org.archive.util.TextUtils;
31  
32  /***
33   * CostAssignmentPolicy that further penalizes URIs with
34   * calendar-suggestive strings in them, with an extra unit 
35   * of cost. 
36   * 
37   * Will catch some 'innocent' URIs, but only when uncaught 
38   * large-volume chaff is ranked higher than caught 'wheat' 
39   * will this cause notable problems.
40   * 
41   * @author gojomo
42   */
43  public class AntiCalendarCostAssignmentPolicy extends UnitCostAssignmentPolicy {
44      public static String CALENDARISH =
45              "(?i)(calendar)|(year)|(month)|(day)|(date)|(viewcal)" +
46              "|(//D19//d//d//D)|(//D20//d//d//D)|(event)|(yr=)" +
47              "|(calendrier)|(jour)";
48      
49      /* (non-Javadoc)
50       * @see org.archive.crawler.frontier.CostAssignmentPolicy#costOf(org.archive.crawler.datamodel.CrawlURI)
51       */
52      public int costOf(CrawlURI curi) {
53          int cost = super.costOf(curi);
54          Matcher m = TextUtils.getMatcher(CALENDARISH, curi.toString());
55          if (m.find()) {
56              cost++;
57              // TODO: consider if multiple occurences should cost more
58          }
59          TextUtils.recycleMatcher(m);
60          return cost;
61      }
62  }