/***
 * Parses a Heritrix recovery log file (recover.gz) and builds maps
 * that allow a caller to look up any seed URL and get back an Iterator of all
 * URLs successfully crawled from that seed.
 *
 * Also allows lookup on any crawled
 * URL to find the seed URL from which the crawler reached that URL (through
 * one or more discovered-URL hops, which are collapsed in this lookup).
 *
 * <p>This code creates some fairly large collections (proportionate in size
 * to the number of discovered URLs), so make sure you allocate
 * it a large heap to work in. It also takes a while to process a recover log.
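 *
 * <p>Typical usage (an illustrative sketch; the file name is hypothetical
 * and exception handling is omitted):
 * <pre>
 *   RecoveryLogMapper mapper = new RecoveryLogMapper("recover.gz");
 *   for (String seed : mapper.getSeedCollection()) {
 *       Iterator&lt;String&gt; crawled =
 *           mapper.getIteratorOfURLsSuccessfullyCrawledFromSeedUrl(seed);
 *       while (crawled.hasNext()) {
 *           System.out.println(seed + " -&gt; " + crawled.next());
 *       }
 *   }
 * </pre>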
 * <p>See {@link #main(String[])} method at end for test/demo code.
 * @author Mike Schwartz, schwartz at CodeOnTheRoad dot com
 */
package org.archive.crawler.util;

import org.archive.crawler.frontier.RecoveryJournal;

import java.io.File;
import java.io.FileOutputStream;
import java.io.LineNumberReader;
import java.io.PrintWriter;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

public class RecoveryLogMapper {
    private static final char LOG_LINE_START_CHAR =
        RecoveryJournal.F_ADD.charAt(0);
    private static final Logger logger =
        Logger.getLogger(RecoveryLogMapper.class.getName());
    private PrintWriter seedNotFoundPrintWriter = null;

    /***
     * Tracks the seed for each crawled URL.
     */
    private Map<String,String> crawledUrlToSeedMap
        = new HashMap<String,String>();

    /***
     * Maps each seed URL to the Set of URLs discovered from it.
     */
    private Map<String,Set<String>> seedUrlToDiscoveredUrlsMap
        = new HashMap<String,Set<String>>();

    /***
     * Tracks which URLs were successfully crawled.
     */
    private Set<String> successfullyCrawledUrls = new HashSet<String>();

    /***
     * Normal constructor - if a not-found seed is encountered while loading
     * recoverLogFileName, throws SeedUrlNotFoundException.
     * Use {@link #RecoveryLogMapper(String, String)} if you want to just log
     * such cases and keep going. (Those should not happen if the
     * recover log is written correctly, but we see them in practice.)
     * @param recoverLogFileName
     * @throws java.io.FileNotFoundException
     * @throws java.io.IOException
     * @throws SeedUrlNotFoundException
     */
    public RecoveryLogMapper(String recoverLogFileName)
        throws java.io.FileNotFoundException, java.io.IOException,
               SeedUrlNotFoundException {
        load(recoverLogFileName);
    }

    /***
     * Constructor to use if you want to allow not-found seeds, logging
     * them to seedNotFoundLogFileName. In contrast, {@link
     * #RecoveryLogMapper(String)} will throw SeedUrlNotFoundException
     * when a seed isn't found.
     * @param recoverLogFileName
     * @param seedNotFoundLogFileName
     */
    public RecoveryLogMapper(String recoverLogFileName,
                             String seedNotFoundLogFileName)
        throws java.io.FileNotFoundException, java.io.IOException,
               SeedUrlNotFoundException {
        seedNotFoundPrintWriter = new PrintWriter(new FileOutputStream(
            seedNotFoundLogFileName));
        load(recoverLogFileName);
    }
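
    /*
     * Note on the journal format: an "add" line is parsed below as
     * whitespace-separated words of the general shape
     *
     *   F+ <discovered-URL> [<hop-path>] <via-URL>
     *
     * i.e. the discovered URL is the first word after the F_ADD tag and the
     * via (referrer) URL is the last word on the line. This illustration is
     * inferred from the parsing code; the authoritative format is defined
     * by RecoveryJournal.
     */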

    protected void load(String recoverLogFileName)
        throws java.io.FileNotFoundException, java.io.IOException,
               SeedUrlNotFoundException {
        LineNumberReader reader = new LineNumberReader(RecoveryJournal.
            getBufferedReader(new File(recoverLogFileName)));
        String curLine = null;
        while ((curLine = reader.readLine()) != null) {
            if (curLine.length() == 0
                    || curLine.charAt(0) != LOG_LINE_START_CHAR) {
                continue;
            }
            String[] args = curLine.split("\\s+");
            int curLineNumWords = args.length;
            String firstUrl = args[1];
            // Ignore DNS lookup entries; they are not part of the seed graph.
            if (firstUrl.startsWith("dns:")) {
                continue;
            }
            if (curLine.startsWith(RecoveryJournal.F_ADD)) {
                // 2 words ("F+ URL") means firstUrl is a seed URL
                if (curLineNumWords == 2) {
                    if (logger.isLoggable(Level.FINE)) {
                        logger.fine("F_ADD with 2 words --> seed URL (" +
                            firstUrl + ")");
                    }
                    // Only add the seed URL if it hasn't been seen before
                    if (seedUrlToDiscoveredUrlsMap.get(firstUrl) == null) {
                        seedUrlToDiscoveredUrlsMap.put(firstUrl,
                            new HashSet<String>());
                    }
                } else {
                    // 3 or more words means firstUrl was discovered via the
                    // last URL on the line
                    String viaUrl = args[curLineNumWords - 1];
                    if (logger.isLoggable(Level.FINE)) {
                        logger.fine("F_ADD with 3+ words --> new URL "
                            + firstUrl + " via URL " + viaUrl);
                    }
                    String seedForFirstUrl = crawledUrlToSeedMap.get(viaUrl);
                    // If viaUrl has no seed mapping, it is itself a seed
                    if (seedForFirstUrl == null) {
                        if (logger.isLoggable(Level.FINE)) {
                            logger.fine("\tvia URL is a seed");
                        }
                        crawledUrlToSeedMap.put(firstUrl, viaUrl);
                        seedForFirstUrl = viaUrl;
                    } else {
                        if (logger.isLoggable(Level.FINE)) {
                            logger.fine("\tvia URL discovered via seed URL " +
                                seedForFirstUrl);
                        }
                        // Collapse the chain of hops down to the seed
                        crawledUrlToSeedMap.put(firstUrl, seedForFirstUrl);
                    }
                    Set<String> theSeedUrlList =
                        seedUrlToDiscoveredUrlsMap.get(seedForFirstUrl);
                    if (theSeedUrlList == null) {
                        String message = "recover log " +
                            recoverLogFileName + " at line " +
                            reader.getLineNumber() +
                            " listed F+ URL (" + viaUrl +
                            ") for which no seed list was found.";
                        if (seedNotFoundPrintWriter != null) {
                            seedNotFoundPrintWriter.println(message);
                        } else {
                            throw new SeedUrlNotFoundException(message);
                        }
                    } else {
                        theSeedUrlList.add(firstUrl);
                    }
                }
            } else if (curLine.startsWith(RecoveryJournal.F_SUCCESS)) {
                if (logger.isLoggable(Level.FINE)) {
                    logger.fine("F_SUCCESS for URL " + firstUrl);
                }
                successfullyCrawledUrls.add(firstUrl);
            }
        }
        reader.close();
        if (seedNotFoundPrintWriter != null) {
            seedNotFoundPrintWriter.close();
        }
    }

    /***
     * Returns the seed for urlString (urlString itself if it is a seed;
     * null if no seed is found).
     * @param urlString
     * @return Seed, or null.
     */
    public String getSeedForUrl(String urlString) {
        return (seedUrlToDiscoveredUrlsMap.get(urlString) != null)
            ? urlString : crawledUrlToSeedMap.get(urlString);
    }

    /***
     * @return Returns the seedUrlToDiscoveredUrlsMap.
     */
    public Map<String,Set<String>> getSeedUrlToDiscoveredUrlsMap() {
        return this.seedUrlToDiscoveredUrlsMap;
    }

    /***
     * @return Returns the successfullyCrawledUrls.
     */
    public Set<String> getSuccessfullyCrawledUrls() {
        return this.successfullyCrawledUrls;
    }

    /***
     * @return Returns the logger.
     */
    public static Logger getLogger() {
        return logger;
    }

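    /***
     * Iterates over the URLs successfully crawled from a given seed URL,
     * lazily filtering that seed's discovered-URL set against the set of
     * successfully crawled URLs.
     */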
    private class SuccessfullyCrawledURLsIterator
        implements Iterator<String> {
        private String nextValue = null;
        private Iterator<String> discoveredUrlsIterator;

        public SuccessfullyCrawledURLsIterator(String seedUrlString)
            throws SeedUrlNotFoundException {
            Set<String> discoveredUrlList =
                getSeedUrlToDiscoveredUrlsMap().get(seedUrlString);
            if (discoveredUrlList == null) {
                throw new SeedUrlNotFoundException("Seed URL " +
                    seedUrlString + " not found in seed list");
            }
            discoveredUrlsIterator = discoveredUrlList.iterator();
        }

        /***
         * Idempotent method (because of the null check on nextValue).
         */
        private void populateNextValue() {
            while (nextValue == null && discoveredUrlsIterator.hasNext()) {
                String curDiscoveredUrl = discoveredUrlsIterator.next();
                boolean succCrawled = getSuccessfullyCrawledUrls().
                    contains(curDiscoveredUrl);
                if (getLogger().isLoggable(Level.FINE)) {
                    getLogger().fine("populateNextValue: curDiscoveredUrl=" +
                        curDiscoveredUrl + ", succCrawled=" +
                        succCrawled);
                }
                if (succCrawled) {
                    nextValue = curDiscoveredUrl;
                }
            }
        }

        public boolean hasNext() {
            populateNextValue();
            return (nextValue != null);
        }

        public String next() {
            populateNextValue();
            // Honor the Iterator contract: throw rather than return null
            // when the iteration is exhausted.
            if (nextValue == null) {
                throw new NoSuchElementException(
                    "no more successfully crawled URLs");
            }
            String returnValue = nextValue;
            nextValue = null;
            return returnValue;
        }

        /***
         * Remove operation is unsupported in this Iterator
         * (will throw UnsupportedOperationException if called).
         */
        public void remove() {
            throw new UnsupportedOperationException(
                "SuccessfullyCrawledURLsIterator.remove: not supported.");
        }
    }

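    /***
     * Returns an Iterator over the URLs successfully crawled from
     * seedUrlString.
     * @param seedUrlString
     * @return Iterator of successfully crawled URL Strings.
     * @throws SeedUrlNotFoundException if seedUrlString is not a known seed.
     */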
    public Iterator<String> getIteratorOfURLsSuccessfullyCrawledFromSeedUrl(
            String seedUrlString) throws SeedUrlNotFoundException {
        return new SuccessfullyCrawledURLsIterator(seedUrlString);
    }

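    /***
     * @return Returns the seed URLs, as a Collection.
     */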
    public Collection<String> getSeedCollection() {
        return seedUrlToDiscoveredUrlsMap.keySet();
    }

    public static void main(String[] args) {
        if (args.length < 1) {
            System.out.println("Usage: RecoveryLogMapper recoverLogFileName");
            System.exit(-1);
        }
        String recoverLogFileName = args[0];
        try {
            RecoveryLogMapper myRecoveryLogMapper =
                new RecoveryLogMapper(recoverLogFileName);
            for (String curSeedUrl : myRecoveryLogMapper.getSeedCollection()) {
                System.out.println("URLs successfully crawled from seed URL "
                    + curSeedUrl);
                Iterator<String> iteratorOfUrlsCrawledFromSeedUrl =
                    myRecoveryLogMapper.
                        getIteratorOfURLsSuccessfullyCrawledFromSeedUrl(
                            curSeedUrl);
                while (iteratorOfUrlsCrawledFromSeedUrl.hasNext()) {
                    String curCrawledUrlString =
                        iteratorOfUrlsCrawledFromSeedUrl.next();
                    System.out.println("  -> " + curCrawledUrlString);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}