View Javadoc

1   /*
2    * ExtractorImpliedURI
3    *
4    * $Id: ExtractorImpliedURI.java 4943 2007-02-27 02:54:54Z ia_igor $
5    *
6    * Created on July 20, 2006
7    *
8    * Copyright (C) 2006 Internet Archive.
9    *
10   * This file is part of the Heritrix web crawler (crawler.archive.org).
11   *
12   * Heritrix is free software; you can redistribute it and/or modify
13   * it under the terms of the GNU Lesser Public License as published by
14   * the Free Software Foundation; either version 2.1 of the License, or
15   * any later version.
16   *
17   * Heritrix is distributed in the hope that it will be useful,
18   * but WITHOUT ANY WARRANTY; without even the implied warranty of
19   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20   * GNU Lesser Public License for more details.
21   *
22   * You should have received a copy of the GNU Lesser Public License
23   * along with Heritrix; if not, write to the Free Software
24   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
25   */
26  
27  package org.archive.crawler.extractor;
28  
29  import java.util.Collection;
30  import java.util.logging.Level;
31  import java.util.logging.Logger;
32  import java.util.regex.Matcher;
33  
34  import org.apache.commons.httpclient.URIException;
35  import org.archive.crawler.datamodel.CoreAttributeConstants;
36  import org.archive.crawler.datamodel.CrawlURI;
37  import org.archive.crawler.settings.SimpleType;
38  import org.archive.util.TextUtils;
39  
40  /***
41   * An extractor for finding 'implied' URIs inside other URIs.  If the 
42   * 'trigger' regex is matched, a new URI will be constructed from the
43   * 'build' replacement pattern. 
44   * 
45   * Unlike most other extractors, this works on URIs discovered by 
46   * previous extractors. Thus it should appear near the end of any 
47   * set of extractors.
48   *
49   * Initially, only finds absolute HTTP(S) URIs in query-string or its 
50   * parameters.
51   *
52   * TODO: extend to find URIs in path-info
53   *
54   * @author Gordon Mohr
55   *
56   **/
57  
58  public class ExtractorImpliedURI extends Extractor implements CoreAttributeConstants {
59  
60      private static final long serialVersionUID = 8579045413127769497L;
61  
62      private static Logger LOGGER =
63          Logger.getLogger(ExtractorImpliedURI.class.getName());
64     
65      /*** regex which when matched triggers addition of 'implied' URI */
66      public static final String ATTR_TRIGGER_REGEXP = "trigger-regexp";
67      /*** replacement pattern used to build 'implied' URI */
68      public static final String ATTR_BUILD_PATTERN = "build-pattern";
69      
70      /*** whether to remove URIs that trigger addition of 'implied' URI;
71       * default false 
72       */
73      public static final String ATTR_REMOVE_TRIGGER_URIS = "remove-trigger-uris";
74      
75      // FIXME: these counters are not incremented atomically; totals may not
76      // be correct
77      private long numberOfCURIsHandled = 0;
78      private long numberOfLinksExtracted = 0;
79  
80      /***
81       * Constructor
82       * 
83       * @param name
84       */
85      public ExtractorImpliedURI(String name) {
86          super(name, "Implied URI Extractor. Finds URIs implied by other " +
87                  "URIs according to regex/replacement patterns. Should " +
88                  "appear after most other extractors.");
89  
90          addElementToDefinition(
91              new SimpleType(ATTR_TRIGGER_REGEXP, 
92                      "Triggering regular expression. When a discovered URI " +
93                      "matches this pattern, the 'implied' URI will be " +
94                      "built. The capturing groups of this expression are " +
95                      "available for the build replacement pattern.", ""));
96          addElementToDefinition(
97                  new SimpleType(ATTR_BUILD_PATTERN, 
98                      "Replacement pattern to build 'implied' URI, using " +
99                      "captured groups of trigger expression.", ""));
100         addElementToDefinition(
101                 new SimpleType(ATTR_REMOVE_TRIGGER_URIS, 
102                     "If true, all URIs that match trigger regular expression " +
103                     "are removed from the list of extracted URIs. " +
104                     "Default is false.", Boolean.FALSE));
105     }
106 
107     /***
108      * Perform usual extraction on a CrawlURI
109      * 
110      * @param curi Crawl URI to process.
111      */
112     public void extract(CrawlURI curi) {
113 
114         this.numberOfCURIsHandled++;
115         // use array copy because discoveriess will add to outlinks
116         Collection<Link> links = curi.getOutLinks();
117         Link[] sourceLinks = links.toArray(new Link[links.size()]);
118         for (Link wref: sourceLinks) {
119             String implied = extractImplied(
120                     wref.getDestination(),
121                     (String)getUncheckedAttribute(curi,ATTR_TRIGGER_REGEXP),
122                     (String)getUncheckedAttribute(curi,ATTR_BUILD_PATTERN));
123             if (implied!=null) {
124                 try {
125                     curi.createAndAddLink(
126                             implied, 
127                             Link.SPECULATIVE_MISC,
128                             Link.SPECULATIVE_HOP);
129                 	
130                     numberOfLinksExtracted++;
131                 	
132                     final boolean removeTriggerURI = 
133                     	((Boolean)getUncheckedAttribute(curi,
134                     			ATTR_REMOVE_TRIGGER_URIS)).booleanValue();
135 
136                     // remove trigger URI from the outlinks if configured so.
137                     if (removeTriggerURI) {
138                     	if (curi.getOutLinks().remove(wref)) {
139                     		LOGGER.log(Level.FINE, wref.getDestination() + 
140                     				" has been removed from " + 
141                     				wref.getSource() + " outlinks list.");                    	
142                     		numberOfLinksExtracted--;
143 
144                     	} else {
145                         	LOGGER.log(Level.FINE, "Failed to remove " + 
146                         			wref.getDestination() + " from " + 
147                         			wref.getSource()+ " outlinks list.");             		
148                     	}
149                     }
150                     
151                 } catch (URIException e) {
152                     LOGGER.log(Level.FINE, "bad URI", e);
153                 }
154             }
155         }
156     }
157     
158     /***
159      * Utility method for extracting 'implied' URI given a source uri, 
160      * trigger pattern, and build pattern. 
161      * 
162      * @param uri source to check for implied URI
163      * @param trigger regex pattern which if matched implies another URI
164      * @param build replacement pattern to build the implied URI
165      * @return implied URI, or null if none
166      */
167     protected static String extractImplied(CharSequence uri, String trigger, String build) {
168         if(trigger.length()==0) {
169             // short-circuit empty-string trigger
170             return null; 
171         }
172         Matcher m = TextUtils.getMatcher(trigger, uri);
173         if(m.matches()) {
174             String result = m.replaceFirst(build);
175             TextUtils.recycleMatcher(m);
176             return result; 
177         }
178         return null; 
179     }
180 
181     public String report() {
182         StringBuffer ret = new StringBuffer();
183         ret.append("Processor: "+ExtractorImpliedURI.class.getName()+"\n");
184         ret.append("  Function:          Extracts links inside other URIs\n");
185         ret.append("  CrawlURIs handled: " + numberOfCURIsHandled + "\n");
186         ret.append("  Links extracted:   " + numberOfLinksExtracted + "\n\n");
187 
188         return ret.toString();
189     }
190 }