/* RecoveryLogMapper.java
 *
 * $Id: RecoveryLogMapper.java 4647 2006-09-22 18:39:39Z paul_jack $
 *
 * Created on Mar 7, 2005
 *
 * Copyright (C) 2005 Mike Schwartz.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

/***
 * Parses a Heritrix recovery log file (recover.gz) and builds maps
 * that allow a caller to look up any seed URL and get back an Iterator of
 * all URLs successfully crawled from that seed.
 *
 * Also allows lookup on any crawled URL to find the seed URL from which
 * the crawler reached that URL (through one or more discovered-URL hops,
 * which are collapsed in this lookup).
 *
 * <p>This code creates some fairly large collections (proportionate in size
 * to the number of discovered URLs), so make sure you allocate
 * it a large heap to work in. It also takes a while to process a recover log.
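 *
 * <p>A minimal usage sketch (the log file path is hypothetical, and
 * exception handling is elided):
 * <pre>{@code
 * RecoveryLogMapper mapper = new RecoveryLogMapper("jobs/recover.log");
 * for (String seed : mapper.getSeedCollection()) {
 *     Iterator<String> crawled =
 *         mapper.getIteratorOfURLsSuccessfullyCrawledFromSeedUrl(seed);
 *     while (crawled.hasNext()) {
 *         System.out.println(seed + " -> " + crawled.next());
 *     }
 * }
 * }</pre>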
 * <p>See the {@link #main(String[])} method at the end for test/demo code.
 * @author Mike Schwartz, schwartz at CodeOnTheRoad dot com
 */
package org.archive.crawler.util;

import org.archive.crawler.frontier.RecoveryJournal;

import java.io.File;
import java.io.FileOutputStream;
import java.io.LineNumberReader;
import java.io.PrintWriter;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

public class RecoveryLogMapper {
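    // Lines of interest in the recovery journal all begin with 'F' (the
    // first character of RecoveryJournal.F_ADD), so a one-character test
    // cheaply filters lines before the startsWith() checks in load().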
    private static final char LOG_LINE_START_CHAR =
        RecoveryJournal.F_ADD.charAt(0);
    private static final Logger logger =
        Logger.getLogger(RecoveryLogMapper.class.getName());
    private PrintWriter seedNotFoundPrintWriter = null;

    /***
     * Tracks the seed for each crawled URL.
     */
    private Map<String,String> crawledUrlToSeedMap =
        new HashMap<String,String>();

    /***
     * Maps each seed URL to the Set of URLs discovered from it.
     */
    private Map<String,Set<String>> seedUrlToDiscoveredUrlsMap =
        new HashMap<String,Set<String>>();

    /***
     * Tracks which URLs were successfully crawled.
     */
    private Set<String> successfullyCrawledUrls = new HashSet<String>();

    /***
     * Normal constructor - if a not-found seed is encountered while loading
     * recoverLogFileName, throws SeedUrlNotFoundException.
     * Use {@link #RecoveryLogMapper(String, String)} if you want to just log
     * such cases and keep going.  (Those should not happen if the
     * recover log is written correctly, but we see them in practice.)
     * @param recoverLogFileName
     * @throws java.io.FileNotFoundException
     * @throws java.io.IOException
     * @throws SeedUrlNotFoundException
     */
    public RecoveryLogMapper(String recoverLogFileName)
    throws java.io.FileNotFoundException, java.io.IOException,
            SeedUrlNotFoundException {
        load(recoverLogFileName);
    }

    /***
     * Constructor to use if you want to allow not-found seeds, logging
     * them to seedNotFoundLogFileName.  In contrast, {@link
     * #RecoveryLogMapper(String)} will throw SeedUrlNotFoundException
     * when a seed isn't found.
     * @param recoverLogFileName
     * @param seedNotFoundLogFileName
     * @throws java.io.FileNotFoundException
     * @throws java.io.IOException
     * @throws SeedUrlNotFoundException
     */
    public RecoveryLogMapper(String recoverLogFileName,
                             String seedNotFoundLogFileName)
        throws java.io.FileNotFoundException, java.io.IOException,
               SeedUrlNotFoundException {
        seedNotFoundPrintWriter = new PrintWriter(new FileOutputStream(
               seedNotFoundLogFileName));
        load(recoverLogFileName);
    }

    protected void load(String recoverLogFileName)
    throws java.io.FileNotFoundException, java.io.IOException,
            SeedUrlNotFoundException {
        LineNumberReader reader = new LineNumberReader(RecoveryJournal.
            getBufferedReader(new File(recoverLogFileName)));
        String curLine = null;
        while ((curLine = reader.readLine()) != null) {
            if (curLine.length() == 0
                    || curLine.charAt(0) != LOG_LINE_START_CHAR) {
                continue;
            }
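            // A retained line looks like "F+ <uri> [... <via-uri>]": a
            // two-token line records a seed, while longer lines carry, as
            // the last token, the URI through which <uri> was discovered.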
            String args[] = curLine.split("\\s+");
            int curLineNumWords = args.length;
            String firstUrl = args[1];
            // Ignore DNS log entries
            if (firstUrl.startsWith("dns:")) {
                continue;
            }
            if (curLine.startsWith(RecoveryJournal.F_ADD)) {
                // Seed URL
                if (curLineNumWords == 2) {
                    if (logger.isLoggable(Level.FINE)) {
                        logger.fine("F_ADD with 2 words --> seed URL (" +
                            firstUrl + ")");
                    }
                    // Add seed the first time we find it
                    if (seedUrlToDiscoveredUrlsMap.get(firstUrl) == null) {
                        seedUrlToDiscoveredUrlsMap.put(firstUrl,
                            new HashSet<String>());
                    }
                } else {
                    // URL found via an earlier seeded / discovered URL.
                    // Look for the seed from which firstUrl came, so we
                    // can collapse the new URL back to that seed.
                    String viaUrl = args[curLineNumWords - 1];
                    if (logger.isLoggable(Level.FINE)) {
                        logger.fine("F_ADD with 3+ words --> new URL "
                                + firstUrl + " via URL " + viaUrl);
                    }
                    String seedForFirstUrl = crawledUrlToSeedMap.get(viaUrl);
                    if (seedForFirstUrl == null) {
                        // viaUrl is itself a seed URL
                        if (logger.isLoggable(Level.FINE)) {
                            logger.fine("\tvia URL is a seed");
                        }
                        crawledUrlToSeedMap.put(firstUrl, viaUrl);
                        seedForFirstUrl = viaUrl;
                    } else {
                        if (logger.isLoggable(Level.FINE)) {
                            logger.fine("\tvia URL discovered via seed URL " +
                                seedForFirstUrl);
                        }
                        // Collapse back to the seed
                        crawledUrlToSeedMap.put(firstUrl, seedForFirstUrl);
                    }
                    Set<String> theSeedUrlList =
                        seedUrlToDiscoveredUrlsMap.get(seedForFirstUrl);
                    if (theSeedUrlList == null) {
                        String message = "recover log " +
                                         recoverLogFileName + " at line " +
                                         reader.getLineNumber() +
                                         " listed F+ URL (" + viaUrl +
                                         ") for which no seed list was found.";
                        if (seedNotFoundPrintWriter != null) {
                            seedNotFoundPrintWriter.println(message);
                        } else {
                            throw new SeedUrlNotFoundException(message);
                        }
                    } else {
                        theSeedUrlList.add(firstUrl);
                    }
                }
            } else if (curLine.startsWith(RecoveryJournal.F_SUCCESS)) {
                if (logger.isLoggable(Level.FINE)) {
                    logger.fine("F_SUCCESS for URL " + firstUrl);
                }
                successfullyCrawledUrls.add(firstUrl);
            }
        }
        reader.close();
        if (seedNotFoundPrintWriter != null) {
            seedNotFoundPrintWriter.close();
        }
    }

    /***
     * Returns the seed for urlString: urlString itself if it is a seed,
     * otherwise the seed from which it was discovered, or null if no
     * seed is found.
     * @param urlString
     * @return Seed, or null if not found.
     */
    public String getSeedForUrl(String urlString) {
        return (seedUrlToDiscoveredUrlsMap.get(urlString) != null)
                ? urlString : crawledUrlToSeedMap.get(urlString);
    }

    /***
     * @return Returns the seedUrlToDiscoveredUrlsMap.
     */
    public Map<String,Set<String>> getSeedUrlToDiscoveredUrlsMap() {
        return this.seedUrlToDiscoveredUrlsMap;
    }

    /***
     * @return Returns the successfullyCrawledUrls.
     */
    public Set<String> getSuccessfullyCrawledUrls() {
        return this.successfullyCrawledUrls;
    }

    /***
     * @return Returns the logger.
     */
    public static Logger getLogger() {
        return logger;
    }

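    /***
     * Look-ahead Iterator over the URLs successfully crawled from a single
     * seed: walks the seed's discovered-URL set, filtering it against
     * successfullyCrawledUrls and buffering the next match in nextValue.
     */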
    private class SuccessfullyCrawledURLsIterator
    implements Iterator<String> {
        private String nextValue = null;
        private Iterator<String> discoveredUrlsIterator;

        public SuccessfullyCrawledURLsIterator(String seedUrlString)
        throws SeedUrlNotFoundException {
            Set<String> discoveredUrlList =
                getSeedUrlToDiscoveredUrlsMap().get(seedUrlString);
            if (discoveredUrlList == null) {
                throw new SeedUrlNotFoundException("Seed URL " +
                    seedUrlString + " not found in seed list");
            }
            discoveredUrlsIterator = discoveredUrlList.iterator();
        }

        /***
         * Advances the underlying iterator until the next successfully
         * crawled URL is found, caching it in nextValue.  Idempotent
         * (because of the null check on nextValue).
         */
        private void populateNextValue() {
            while (nextValue == null && discoveredUrlsIterator.hasNext()) {
                String curDiscoveredUrl = discoveredUrlsIterator.next();
                boolean succCrawled = getSuccessfullyCrawledUrls().
                    contains(curDiscoveredUrl);
                if (getLogger().isLoggable(Level.FINE)) {
                    getLogger().fine("populateNextValue: curDiscoveredUrl=" +
                            curDiscoveredUrl + ", succCrawled=" +
                            succCrawled);
                }
                if (succCrawled) {
                    nextValue = curDiscoveredUrl;
                }
            }
        }

        public boolean hasNext() {
            populateNextValue();
            return (nextValue != null);
        }

        public String next() {
            populateNextValue();
            if (nextValue == null) {
                // Iterator contract: fail rather than return null
                throw new NoSuchElementException();
            }
            String returnValue = nextValue;
            nextValue = null;
            return returnValue;
        }

        /***
         * Remove operation is unsupported by this Iterator
         * (will throw UnsupportedOperationException if called).
         */
        public void remove() {
            throw new UnsupportedOperationException(
                "SuccessfullyCrawledURLsIterator.remove: not supported.");
        }
    }
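
    /***
     * @param seedUrlString seed URL whose successfully crawled URLs are wanted
     * @return Iterator of the URLs successfully crawled from seedUrlString.
     * @throws SeedUrlNotFoundException if seedUrlString is not a known seed
     */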
    public Iterator<String> getIteratorOfURLsSuccessfullyCrawledFromSeedUrl(
            String seedUrlString) throws SeedUrlNotFoundException {
        return new SuccessfullyCrawledURLsIterator(seedUrlString);
    }

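    /***
     * @return the Collection of seed URLs found in the recover log.
     */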
    public Collection<String> getSeedCollection() {
        return seedUrlToDiscoveredUrlsMap.keySet();
    }

    public static void main(String args[]) {
        if (args.length < 1) {
            System.out.println("Usage: RecoveryLogMapper recoverLogFileName");
            System.exit(1);
        }
        String recoverLogFileName = args[0];
        try {
            RecoveryLogMapper myRecoveryLogMapper =
                new RecoveryLogMapper(recoverLogFileName);
            for (String curSeedUrl : myRecoveryLogMapper.getSeedCollection()) {
                System.out.println("URLs successfully crawled from seed URL "
                    + curSeedUrl);
                Iterator<String> iteratorOfUrlsCrawledFromSeedUrl =
                    myRecoveryLogMapper.
                        getIteratorOfURLsSuccessfullyCrawledFromSeedUrl(
                            curSeedUrl);
                while (iteratorOfUrlsCrawledFromSeedUrl.hasNext()) {
                    System.out.println("    -> "
                        + iteratorOfUrlsCrawledFromSeedUrl.next());
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}