View Javadoc

1   /* DiskFPMergeUriUniqFilter
2   *
3   * $Id: DiskFPMergeUriUniqFilter.java 4340 2006-07-13 06:04:11Z gojomo $
4   *
5   * Created on Dec 14, 2005
6   *
7   * Copyright (C) 2005 Internet Archive.
8   *
9   * This file is part of the Heritrix web crawler (crawler.archive.org).
10  *
11  * Heritrix is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * any later version.
15  *
16  * Heritrix is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU Lesser Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser Public License
22  * along with Heritrix; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
24  */ 
25  package org.archive.crawler.util;
26  
import it.unimi.dsi.fastutil.longs.LongIterator;
import it.unimi.dsi.fastutil.longs.LongIterators;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.NoSuchElementException;

import org.archive.util.ArchiveUtils;
42  
43  /***
44   * Crude FPMergeUriUniqFilter using a disk data file of raw longs as the
45   * overall FP record. 
46   * 
47   * @author gojomo
48   */
49  public class DiskFPMergeUriUniqFilter extends FPMergeUriUniqFilter {
50      long count = 0; 
51      File scratchDir; 
52      File currentFps;
53      File newFpsFile;
54      DataOutputStream newFps; 
55      long newCount; 
56      DataInputStream oldFps; 
57      
58      public DiskFPMergeUriUniqFilter(File scratchDir) {
59          super();
60          this.scratchDir = scratchDir; 
61          // TODO: Use two scratch locations, to allow IO to be split
62          // over separate disks
63      }
64      
65      /* (non-Javadoc)
66       * @see org.archive.crawler.util.FPMergeUriUniqFilter#beginFpMerge()
67       */
68      protected LongIterator beginFpMerge() {
69          newFpsFile = new File(scratchDir,ArchiveUtils.get17DigitDate()+".fp");
70          if(newFpsFile.exists()) {
71              throw new RuntimeException(newFpsFile+" exists");
72          }
73          try {
74              newFps = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(newFpsFile)));
75          } catch (FileNotFoundException e) {
76              throw new RuntimeException(e);
77          }
78          newCount = 0;
79          if(currentFps==null) {
80              return LongIterators.EMPTY_ITERATOR;
81          }
82          try {
83              oldFps = new DataInputStream(new BufferedInputStream(new FileInputStream(currentFps)));
84          } catch (FileNotFoundException e1) {
85              throw new RuntimeException(e1);
86          }
87          return new DataFileLongIterator(oldFps);
88      }
89  
90      /* (non-Javadoc)
91       * @see org.archive.crawler.util.FPMergeUriUniqFilter#addNewFp(long)
92       */
93      protected void addNewFp(long fp) {
94          try {
95              newFps.writeLong(fp);
96              newCount++;
97          } catch (IOException e) {
98              throw new RuntimeException(e);
99          }
100     }
101 
102     /* (non-Javadoc)
103      * @see org.archive.crawler.util.FPMergeUriUniqFilter#finishFpMerge()
104      */
105     protected void finishFpMerge() {
106         try {
107             newFps.close();
108             File oldFpsFile = currentFps;
109             currentFps = newFpsFile;
110             if(oldFps!=null) {
111                 oldFps.close();
112             }
113             if(oldFpsFile!=null) {
114                 oldFpsFile.delete();
115             }
116         } catch (IOException e) {
117             throw new RuntimeException(e);
118         }
119         count = newCount;
120     }
121 
122     /* (non-Javadoc)
123      * @see org.archive.crawler.datamodel.UriUniqFilter#count()
124      */
125     public long count() {
126         return count;
127     }
128 
129     public class DataFileLongIterator implements LongIterator {
130         DataInputStream in; 
131         long next;
132         boolean nextIsValid = false; 
133         
134         /***
135          * Construct a long iterator reading from the given 
136          * stream. 
137          * 
138          * @param disStream DataInputStream from which to read longs
139          */
140         public DataFileLongIterator(DataInputStream disStream) {
141             this.in = disStream;
142         }
143 
144         /*** 
145          * Test whether any items remain; loads next item into
146          * holding 'next' field. 
147          * 
148          * @see java.util.Iterator#hasNext()
149          */
150         public boolean hasNext() {
151             return nextIsValid ? true: lookahead();
152         }
153         
154         /***
155          * Check if there's a next by trying to read it. 
156          * 
157          * @return true if 'next' field is filled with a valid next, false otherwise
158          */
159         protected boolean lookahead() {
160             try {
161                 next = in.readLong();
162             } catch (IOException e) {
163                 return false; 
164             }
165             nextIsValid = true; 
166             return true; 
167         }
168 
169         /*** 
170          * Return the next item.
171          * 
172          * @see java.util.Iterator#next()
173          */
174         public Long next() {
175             if (!hasNext()) {
176                 throw new NoSuchElementException();
177             }
178             // 'next' is guaranteed set by a hasNext() which returned true
179             Long returnObj = new Long(this.next);
180             this.nextIsValid = false;
181             return returnObj;
182         }
183         
184         /* (non-Javadoc)
185          * @see java.util.Iterator#remove()
186          */
187         public void remove() {
188             throw new UnsupportedOperationException();
189         }
190         
191         
192         /* (non-Javadoc)
193          * @see it.unimi.dsi.fastutil.longs.LongIterator#nextLong()
194          */
195         public long nextLong() {
196             if (!hasNext()) {
197                 throw new NoSuchElementException();
198             }
199             // 'next' is guaranteed non-null by a hasNext() which returned true
200             this.nextIsValid = false; // after this return, 'next' needs refresh
201             return this.next;
202         }
203 
204         /* (non-Javadoc)
205          * @see it.unimi.dsi.fastutil.longs.LongIterator#skip(int)
206          */
207         public int skip(int arg0) {
208             return 0;
209         }
210     }
211 
212 }