1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25 package org.archive.crawler.util;
26
import it.unimi.dsi.fastutil.longs.LongIterator;
import it.unimi.dsi.fastutil.longs.LongIterators;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.NoSuchElementException;

import org.archive.util.ArchiveUtils;
42
43 /***
44 * Crude FPMergeUriUniqFilter using a disk data file of raw longs as the
45 * overall FP record.
46 *
47 * @author gojomo
48 */
49 public class DiskFPMergeUriUniqFilter extends FPMergeUriUniqFilter {
50 long count = 0;
51 File scratchDir;
52 File currentFps;
53 File newFpsFile;
54 DataOutputStream newFps;
55 long newCount;
56 DataInputStream oldFps;
57
58 public DiskFPMergeUriUniqFilter(File scratchDir) {
59 super();
60 this.scratchDir = scratchDir;
61
62
63 }
64
65
66
67
68 protected LongIterator beginFpMerge() {
69 newFpsFile = new File(scratchDir,ArchiveUtils.get17DigitDate()+".fp");
70 if(newFpsFile.exists()) {
71 throw new RuntimeException(newFpsFile+" exists");
72 }
73 try {
74 newFps = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(newFpsFile)));
75 } catch (FileNotFoundException e) {
76 throw new RuntimeException(e);
77 }
78 newCount = 0;
79 if(currentFps==null) {
80 return LongIterators.EMPTY_ITERATOR;
81 }
82 try {
83 oldFps = new DataInputStream(new BufferedInputStream(new FileInputStream(currentFps)));
84 } catch (FileNotFoundException e1) {
85 throw new RuntimeException(e1);
86 }
87 return new DataFileLongIterator(oldFps);
88 }
89
90
91
92
93 protected void addNewFp(long fp) {
94 try {
95 newFps.writeLong(fp);
96 newCount++;
97 } catch (IOException e) {
98 throw new RuntimeException(e);
99 }
100 }
101
102
103
104
105 protected void finishFpMerge() {
106 try {
107 newFps.close();
108 File oldFpsFile = currentFps;
109 currentFps = newFpsFile;
110 if(oldFps!=null) {
111 oldFps.close();
112 }
113 if(oldFpsFile!=null) {
114 oldFpsFile.delete();
115 }
116 } catch (IOException e) {
117 throw new RuntimeException(e);
118 }
119 count = newCount;
120 }
121
122
123
124
125 public long count() {
126 return count;
127 }
128
129 public class DataFileLongIterator implements LongIterator {
130 DataInputStream in;
131 long next;
132 boolean nextIsValid = false;
133
134 /***
135 * Construct a long iterator reading from the given
136 * stream.
137 *
138 * @param disStream DataInputStream from which to read longs
139 */
140 public DataFileLongIterator(DataInputStream disStream) {
141 this.in = disStream;
142 }
143
144 /***
145 * Test whether any items remain; loads next item into
146 * holding 'next' field.
147 *
148 * @see java.util.Iterator#hasNext()
149 */
150 public boolean hasNext() {
151 return nextIsValid ? true: lookahead();
152 }
153
154 /***
155 * Check if there's a next by trying to read it.
156 *
157 * @return true if 'next' field is filled with a valid next, false otherwise
158 */
159 protected boolean lookahead() {
160 try {
161 next = in.readLong();
162 } catch (IOException e) {
163 return false;
164 }
165 nextIsValid = true;
166 return true;
167 }
168
169 /***
170 * Return the next item.
171 *
172 * @see java.util.Iterator#next()
173 */
174 public Long next() {
175 if (!hasNext()) {
176 throw new NoSuchElementException();
177 }
178
179 Long returnObj = new Long(this.next);
180 this.nextIsValid = false;
181 return returnObj;
182 }
183
184
185
186
187 public void remove() {
188 throw new UnsupportedOperationException();
189 }
190
191
192
193
194
195 public long nextLong() {
196 if (!hasNext()) {
197 throw new NoSuchElementException();
198 }
199
200 this.nextIsValid = false;
201 return this.next;
202 }
203
204
205
206
207 public int skip(int arg0) {
208 return 0;
209 }
210 }
211
212 }