View Javadoc

1   
2   package st.ata.util;
3   
4   import java.util.Hashtable;
5   
6   /***
7   
8   <p> This class provides methods that construct fingerprints of strings
9   of bytes via operations in <i>GF[2^d]</i> for <i>0 < d <= 64</i>.
10  <i>GF[2^d]</i> is represented as the set of polynomials of degree
11  <i>d</i> with coefficients in <i>Z(2)</i>, modulo an irreducible
12  polynomial <i>P</i> of degree <i>d</i>.  The representation of
13  polynomials is as an unsigned binary number in which the least
14  significant exponent is kept in the most significant bit.
15  
16  <p> Let S be a string of bytes and <i>g(S)</i> the string obtained by
17  taking the byte <code>0x01</code> followed by eight <code>0x00</code>
18  bytes followed by <code>S</code>.  Let <i>f(S)</i> be the polynomial
19  associated to the string <i>S</i> viewed as a polynomial with
20  coefficients in the field <i>Z(2)</i>.  The fingerprint of S is simply
21  the value <i>f(g(S))</i> modulo <i>P</i>.  Because polynomials are
22  represented with the least significant coefficient in the most
23  significant bit, fingerprints of degree <i>d</i> are stored in the
24  <code>d</code> <strong>most</code> significant bits of a long word.
25  
26  <p> Fingerprints can be used as a probably unique id for the input
27  string.  More precisely, if <i>P</i> is chosen at random among
28  irreducible polynomials of degree <i>d</i>, then the probability that
29  any two strings <i>A</i> and <i>B</i> have the same fingerprint is
30  less than <i>max(|A|,|B|)/2^(d+1)</i> where <i>|A|</i> is the length
31  of A in bits.
32  
33  <p> The routines named <code>extend[8]</code> and <code>fp[8]</code>
34  return reduced results, while <code>extend_[byte/char/int/long]</code>
35  do not.  An <em>un</em>reduced result is a number that is equal (mod
36  </code>polynomial</code> to the desired fingerprint but may have
37  degree <code>degree</code> or higher.  The method <code>reduce</code>
38  reduces such a result to a polynomial of degree less than
39  <code>degree</code>.  Obtaining reduced results takes longer than
40  obtaining unreduced results; thus, when fingerprinting long strings,
41  it's better to obtain irreduced results inside the fingerprinting loop
42  and use <code>reduce</code> to reduce to a fingerprint after the loop.
43  
44  */
45  
46  // Tested by: TestFPGenerator
47  @SuppressWarnings("unchecked")
48  public final class FPGenerator {
49  
50      /*** Return a fingerprint generator.  The fingerprints generated
51          will have degree <code>degree</code> and will be generated by
52          <code>polynomial</code>.  If a generator based on
53          <code>polynomial</code> has already been created, it will be
54          returned.  Requires that <code>polynomial</code> is an
55          irreducible polynomial of degree <code>degree</code> (the
56          array <code>polynomials</code> contains some irreducible
57          polynomials). */
58      public static FPGenerator make(long polynomial, int degree) {
59          Long l = new Long(polynomial);
60          FPGenerator fpgen = (FPGenerator) generators.get(l);
61          if (fpgen == null) {
62              fpgen = new FPGenerator(polynomial, degree);
63              generators.put(l, fpgen);
64          }
65          return fpgen;
66      }
67      private static final Hashtable generators = new Hashtable(10);
68  
69      private static final long zero = 0;
70      private static final long one = 0x8000000000000000L;
71  
72  
73      /*** Return a value equal (mod <code>polynomial</code>) to
74          <code>fp</code> and of degree less than <code>degree</code>. */
75      public long reduce(long fp) {
76          int N = (8 - degree/8);
77          long local = (N == 8 ? 0 : fp & (-1L << 8*N));
78          long temp = zero;
79          for (int i = 0; i < N; i++) {
80              temp ^= ByteModTable[8+i][((int)fp) & 0xff];
81              fp >>>= 8;
82          };
83          return local ^ temp;
84      }
85  
86      /*** Extends <code>f</code> with lower eight bits of <code>v</code>
87          with<em>out</em> full reduction.  In other words, returns a
88          polynomial that is equal (mod <code>polynomial</code>) to the
89          desired fingerprint but may be of higher degree than the
90          desired fingerprint. */
91      public long extend_byte(long f, int v) {
92          f ^= (0xff & v);
93          int i = (int)f;
94          long result = (f>>>8);
95          result ^= ByteModTable[7][i & 0xff];
96          return result;
97      }
98  
99      /*** Extends <code>f</code> with lower sixteen bits of <code>v</code>.
100         Does not reduce. */
101     public long extend_char(long f, int v) {
102         f ^= (0xffff & v);
103         int i = (int)f;
104         long result = (f>>>16);
105         result ^= ByteModTable[6][i & 0xff]; i >>>= 8;
106         result ^= ByteModTable[7][i & 0xff];
107         return result;
108     }
109 
110     /*** Extends <code>f</code> with (all bits of) <code>v</code>.
111         Does not reduce. */
112     public long extend_int(long f, int v) {
113         f ^= (0xffffffffL & v);
114         int i = (int)f;
115         long result = (f>>>32);
116         result ^= ByteModTable[4][i & 0xff]; i >>>= 8;
117         result ^= ByteModTable[5][i & 0xff]; i >>>= 8;
118         result ^= ByteModTable[6][i & 0xff]; i >>>= 8;
119         result ^= ByteModTable[7][i & 0xff];
120         return result;
121     }
122 
123     /*** Extends <code>f</code> with <code>v</code>.
124         Does not reduce. */
125     public long extend_long(long f, long v) {
126         f ^= v;
127         long result = ByteModTable[0][(int)(f & 0xff)]; f >>>= 8;
128         result ^= ByteModTable[1][(int)(f & 0xff)]; f >>>= 8;
129         result ^= ByteModTable[2][(int)(f & 0xff)]; f >>>= 8;
130         result ^= ByteModTable[3][(int)(f & 0xff)]; f >>>= 8;
131         result ^= ByteModTable[4][(int)(f & 0xff)]; f >>>= 8;
132         result ^= ByteModTable[5][(int)(f & 0xff)]; f >>>= 8;
133         result ^= ByteModTable[6][(int)(f & 0xff)]; f >>>= 8;
134         result ^= ByteModTable[7][(int)(f & 0xff)];
135         return result;
136     }
137 
138 
139     /*** Compute fingerprint of "n" bytes of "buf" starting from
140         "buf[start]".  Requires "[start, start+n)" is in bounds. */
141     public long fp(byte[] buf, int start, int n) {
142         return extend(empty, buf, start, n);
143     }
144 
145     /*** Compute fingerprint of (all bits of) "n" characters of "buf"
146         starting from "buf[i]".  Requires "[i, i+n)" is in bounds. */
147     public long fp(char[] buf, int start, int n) {
148         return extend(empty, buf, start, n);
149     }
150 
151 // COMMENTED OUT TO REMOVE Dependency on st.ata.util.Text
152 //    /*** Compute fingerprint of (all bits of) <code>t</code> */
153 //    public long fp(Text t) {
154 //        return extend(empty, t);
155 //    }
156     /*** Compute fingerprint of (all bits of) the characters of "s". */
157     public long fp(CharSequence s) {
158         return extend(empty, s);
159     }
160 
161     /*** Compute fingerprint of (all bits of) "n" characters of "buf"
162         starting from "buf[i]".  Requires "[i, i+n)" is in bounds. */
163     public long fp(int[] buf, int start, int n) {
164         return extend(empty, buf, start, n);
165     }
166 
167     /*** Compute fingerprint of (all bits of) "n" characters of "buf"
168         starting from "buf[i]".  Requires "[i, i+n)" is in bounds. */
169     public long fp(long[] buf, int start, int n) {
170         return extend(empty, buf, start, n);
171     }
172 
173     /*** Compute fingerprint of the lower eight bits of the characters
174         of "s". */
175     public long fp8(String s) {
176         return extend8(empty, s);
177     }
178 
179     /*** Compute fingerprint of the lower eight bits of "n" characters
180         of "buf" starting from "buf[i]".  Requires "[i, i+n)" is in
181         bounds. */
182     public long fp8(char[] buf, int start, int n) {
183         return extend8(empty, buf, start, n);
184     }
185 
186 
187     /*** Extends fingerprint <code>f</code> by adding the low eight
188         bits of "b". */
189     public long extend(long f, byte v) {
190         return reduce(extend_byte(f, v));
191     }
192 
193     /*** Extends fingerprint <code>f</code> by adding (all bits of)
194         "v". */
195     public long extend(long f, char v) {
196         return reduce(extend_char(f, v));
197     }
198 
199     /*** Extends fingerprint <code>f</code> by adding (all bits of)
200         "v". */
201     public long extend(long f, int v) {
202         return reduce(extend_int(f, v));
203     }
204 
205     /*** Extends fingerprint <code>f</code> by adding (all bits of)
206         "v". */
207     public long extend(long f, long v) {
208         return reduce(extend_long(f, v));
209     }
210 
211     /*** Extends fingerprint <code>f</code> by adding "n" bytes of
212         "buf" starting from "buf[start]".
213         Result is reduced.
214         Requires "[i,&nbsp;i+n)" is in bounds. */
215     public long extend(long f, byte[] buf, int start, int n) {
216         for (int i = 0; i < n; i++) {
217             f = extend_byte(f, buf[start+i]);
218         }
219         return reduce(f);
220     }
221 
222     /*** Extends fingerprint <code>f</code> by adding (all bits of) "n"
223         characters of "buf" starting from "buf[i]".
224         Result is reduced.
225         Requires "[i,&nbsp;i+n)" is in bounds. */
226     public long extend(long f, char[] buf, int start, int n) {
227         for (int i = 0; i < n; i++) {
228             f = extend_char(f, buf[start+i]);
229         }
230         return reduce(f);
231     }
232 
233     /*** Extends fingerprint <code>f</code> by adding (all bits of)
234         the characters of "s".
235         Result is reduced. */
236     public long extend(long f, CharSequence s) {
237         int n = s.length();
238         for (int i = 0; i < n; i++) {
239             int v = (int) s.charAt(i);
240             f = extend_char(f, v);
241         }
242         return reduce(f);
243     }
244 
245 //  COMMENTED OUT TO REMOVE Dependency on st.ata.util.Text
246 //    /*** Extends fingerprint <code>f</code> by adding (all bits of)
247 //     *  <code>t</code> */
248 //    public long extend(long f, Text t) {
249 //        return extend(f, t.buf, t.start, t.length());
250 //    }
251 
252 
253     /*** Extends fingerprint <code>f</code> by adding (all bits of) "n"
254         characters of "buf" starting from "buf[i]".
255         Result is reduced.
256         Requires "[i,&nbsp;i+n)" is in bounds. */
257     public long extend(long f, int[] buf, int start, int n) {
258         for (int i = 0; i < n; i++) {
259             f = extend_int(f, buf[start+i]);
260         }
261         return reduce(f);
262     }
263 
264     /*** Extends fingerprint <code>f</code> by adding (all bits of) "n"
265         characters of "buf" starting from "buf[i]".
266         Result is reduced.
267         Requires "[i,&nbsp;i+n)" is in bounds. */
268     public long extend(long f, long[] buf, int start, int n) {
269         for (int i = 0; i < n; i++) {
270             f = extend_long(f, buf[start+i]);
271         }
272         return reduce(f);
273     }
274 
275     /*** Extends fingerprint <code>f</code> by adding the lower eight
276         bits of the characters of "s".
277         Result is reduced. */
278     public long extend8(long f, String s) {
279         int n = s.length();
280         for (int i = 0; i < n; i++) {
281             int x = (int) s.charAt(i);
282             f = extend_byte(f, x);
283         }
284         return reduce(f);
285     }
286 
287     /*** Extends fingerprint <code>f</code> by adding the lower eight
288         bits of "n" characters of "buf" starting from "buf[i]".
289         Result is reduced.
290         Requires "[i, i+n)" is in bounds. */
291     public long extend8(long f, char[] buf, int start, int n) {
292         for (int i = 0; i < n; i++) {
293             f = extend_byte(f, buf[start+i]);
294         }
295         return reduce(f);
296     }
297 
298 
299     /*** Fingerprint of the empty string of bytes. */
300     public final long empty;
301 
302     /*** The number of bits in fingerprints generated by
303         <code>this</code>. */
304     public final int degree;
305 
306     /*** The polynomial used by <code>this</code> to generate
307         fingerprints. */
308     public long polynomial;
309 
310     /*** Result of reducing certain polynomials.  Specifically, if
311         <code>f(S)</code> is bit string <code>S</code> interpreted as
312         a polynomial, <code>f(ByteModTable[i][j])</code> equals
313         <code>mod(x^(127&nbsp;-&nbsp;8*i)&nbsp;*&nbsp;f(j),&nbsp;P)</code>. */
314     private long[][] ByteModTable;
315 
316     /*** Create a fingerprint generator.  The fingerprints generated
317         will have degree <code>degree</code> and will be generated by
318         <code>polynomial</code>.  Requires that
319         <code>polynomial</code> is an irreducible polynomial of degree
320         <code>degree</code> (the array <code>polynomials</code>
321         contains some irreducible polynomials). */
322     private FPGenerator(long polynomial, int degree) {
323         this.degree = degree;
324         this.polynomial = polynomial;
325         ByteModTable = new long[16][256];
326 
327         long[] PowerTable = new long[128];
328 
329         long x_to_the_i = one;
330         long x_to_the_degree_minus_one = (one >>> (degree-1));
331         for (int i = 0; i < 128; i++) {
332             // Invariants:
333             //   x_to_the_i = mod(x^i, polynomial)
334             //   forall 0 <= j < i, PowerTable[i] = mod(x^i, polynomial)
335             PowerTable[i] = x_to_the_i;
336             boolean overflow = ((x_to_the_i & x_to_the_degree_minus_one) != 0);
337             x_to_the_i >>>= 1;
338             if (overflow) {
339                 x_to_the_i ^= polynomial;
340             }
341         }
342         this.empty = PowerTable[64];
343 
344         for (int i = 0; i < 16; i++) {
345             // Invariant: forall 0 <= i' < i, forall 0 <= j' < 256,
346             //   ByteModTable[i'][j'] = mod(x^(127 - 8*i') * f(j'), polynomial)
347             for (int j = 0; j < 256; j++) {
348                 // Invariant: forall 0 <= i' < i, forall 0 <= j' < j,
349                 //   ByteModTable[i'][j'] = mod(x^(degree+i')*f(j'),polynomial)
350                 long v = zero;
351                 for (int k = 0; k < 8; k++) {
352                     // Invariant:
353                     //   v = mod(x^(degree+i) * f(j & ((1<<k)-1)), polynomial)
354                     if ((j & (1 << k)) != 0) {
355                         v ^= PowerTable[127 - i*8 - k];
356                     }
357                 }
358                 ByteModTable[i][j] = v;
359             }
360         }
361     }
362 
363     /*** Array of irreducible polynomials.  For each degree
364         <code>d</code> between 1 and 64 (inclusive),
365         <code>polynomials[d][i]</code> is an irreducible polynomial of
366         degree <code>d</code>.  There are at least two irreducible
367         polynomials for each degree. */
368     public static final long polynomials[][] = {
369         null,
370         {0xC000000000000000L, 0xC000000000000000L},
371         {0xE000000000000000L, 0xE000000000000000L},
372         {0xD000000000000000L, 0xB000000000000000L},
373         {0xF800000000000000L, 0xF800000000000000L},
374         {0xEC00000000000000L, 0xBC00000000000000L},
375         {0xDA00000000000000L, 0xB600000000000000L},
376         {0xE500000000000000L, 0xE500000000000000L},
377         {0x9680000000000000L, 0xD480000000000000L},
378         {0x80C0000000000000L, 0x8840000000000000L},
379         {0xB0A0000000000000L, 0xE9A0000000000000L},
380         {0xD9F0000000000000L, 0xC9B0000000000000L},
381         {0xE758000000000000L, 0xDE98000000000000L},
382         {0xE42C000000000000L, 0x94E4000000000000L},
383         {0xD4CE000000000000L, 0xB892000000000000L},
384         {0xE2AB000000000000L, 0x9E39000000000000L},
385         {0xCCE4800000000000L, 0x9783800000000000L},
386         {0xD8F8C00000000000L, 0xA9CDC00000000000L},
387         {0x9A28200000000000L, 0xFD79E00000000000L},
388         {0xC782500000000000L, 0x96CD300000000000L},
389         {0xBEE6880000000000L, 0xE902C80000000000L},
390         {0x86D7E40000000000L, 0xF066340000000000L},
391         {0x9888060000000000L, 0x910ABE0000000000L},
392         {0xFF30E30000000000L, 0xB27EFB0000000000L},
393         {0x8E375B8000000000L, 0xA03D948000000000L},
394         {0xD1415C4000000000L, 0xF5357CC000000000L},
395         {0x91A916E000000000L, 0xB6CE66E000000000L},
396         {0xE6D2FC5000000000L, 0xD55882B000000000L},
397         {0x9A3BA0B800000000L, 0xFBD654E800000000L},
398         {0xAEA5D2E400000000L, 0xF0E533AC00000000L},
399         {0xDA88B7BE00000000L, 0xAA3AAEDE00000000L},
400         {0xBA75BB4300000000L, 0xF5A811C500000000L},
401         {0x9B6C9A2F80000000L, 0x9603CCED80000000L},
402         {0xFABB538840000000L, 0xE2747090C0000000L},
403         {0x8358898EA0000000L, 0x8C698D3D20000000L},
404         {0xDA8ABD5BF0000000L, 0xF6DF3A0AF0000000L},
405         {0xB090C3F758000000L, 0xD3B4D3D298000000L},
406         {0xAD9882F5BC000000L, 0x88DA4FB544000000L},
407         {0xC3C366272A000000L, 0xDCCF2A2262000000L},
408         {0x9BC0224A97000000L, 0xAF5D96F273000000L},
409         {0x8643FFF621800000L, 0x8E390C6EDC800000L},
410         {0xE45C01919BC00000L, 0xCBB34C8945C00000L},
411         {0x80D8141BC2E00000L, 0x886AFC3912200000L},
412         {0xF605807C26500000L, 0xA3B92D28F6300000L},
413         {0xCE9A2CFC76280000L, 0x98400C1921280000L},
414         {0xF61894904C040000L, 0xC8BE6DBCEC8C0000L},
415         {0xE3A44C104D160000L, 0xCA84A59443760000L},
416         {0xC7E84953A11B0000L, 0xD9D4F6AA02CB0000L},
417         {0xC26CDD1C9A358000L, 0x8BE8478434328000L},
418         {0xAE125DBEB198C000L, 0xFCC5DBFD5AAAC000L},
419         {0x86DE52A079A6A000L, 0xC5F16BD883816000L},
420         {0xDF82486AAFE37000L, 0xA293EC735692D000L},
421         {0xE91ABA275C272800L, 0xD686192874E3C800L},
422         {0x963D0960DAB3FC00L, 0xBA9DE62072621400L},
423         {0xA2188C4E8A46CE00L, 0xD31F75BCB4977E00L},
424         {0xC43A416020A6CB00L, 0x99F57FECA12B3900L},
425         {0xA4F72EF82A58AE80L, 0xCECE4391B81DA380L},
426         {0xB39F119264BC0940L, 0x80A277D20DABB9C0L},
427         {0xFD6616C0CBFA0B20L, 0xED16E64117DC11A0L},
428         {0xFFA8BC44327B5390L, 0xEDFB13DB3B66C210L},
429         {0xCAE8EB99E73AB548L, 0xC86135B6EA2F0B98L},
430         {0xBA49BADCDD19B16CL, 0x8F1944AFB18564C4L},
431         {0xECFC86D765EABBEEL, 0x9190E1C46CC13702L},
432         {0xE1F8D6B3195D6D97L, 0xDF70267FF5E4C979L},
433         {0xD74307D3FD3382DBL, 0x9999B3FFDC769B48L}
434     };
435 
436     /*** The standard 64-bit fingerprint generator using
437         <code>polynomials[0][64]</code>. */
438     public static final FPGenerator std64 = make(polynomials[64][0], 64);
439 
440     /*** A standard 32-bit fingerprint generator using
441         <code>polynomials[0][32]</code>. */
442     public static final FPGenerator std32 = make(polynomials[32][0], 32);
443     
444     // Below added by St.Ack on 09/30/2004.
445     /*** A standard 40-bit fingerprint generator using
446         <code>polynomials[0][40]</code>. */
447     public static final FPGenerator std40 = make(polynomials[40][0], 40);
448     /*** A standard 24-bit fingerprint generator using
449         <code>polynomials[0][24]</code>. */
450     public static final FPGenerator std24 = make(polynomials[24][0], 24);
451 }