First commit

Alberto Venturini 2021-09-08 13:02:57 +02:00
commit 0fbc439593
17 changed files with 24331 additions and 0 deletions

20
.gitignore vendored Normal file

@@ -0,0 +1,20 @@
**/bin/
**/target/
# Eclipse files
**/.classpath
**/.factorypath
**/.project
**/.settings/
# IntelliJ files
**/.idea/
**/*.iml
# Misc
.mtj.tmp/
**/hs_err_pid*
config.properties
#

1252
corpus1.txt Normal file

File diff suppressed because one or more lines are too long

22316
corpus2.txt Normal file

File diff suppressed because it is too large

81
pom.xml Normal file

@@ -0,0 +1,81 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.albertoventurini</groupId>
<artifactId>probabilistic-data-structures</artifactId>
<version>1.0-SNAPSHOT</version>
<name>probabilistic-data-structures</name>
<dependencies>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-api</artifactId>
<version>5.7.2</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-engine</artifactId>
<version>5.7.2</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
<version>1.15</version>
</dependency>
</dependencies>
<build>
<pluginManagement><!-- lock down plugin versions to avoid using Maven defaults (may be moved to parent pom) -->
<plugins>
<!-- clean lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#clean_Lifecycle -->
<plugin>
<artifactId>maven-clean-plugin</artifactId>
<version>3.1.0</version>
</plugin>
<!-- default lifecycle, jar packaging: see https://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_jar_packaging -->
<plugin>
<artifactId>maven-resources-plugin</artifactId>
<version>3.0.2</version>
</plugin>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.8.0</version>
<configuration>
<release>11</release>
</configuration>
</plugin>
<plugin>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.22.2</version>
</plugin>
<plugin>
<artifactId>maven-jar-plugin</artifactId>
<version>3.0.2</version>
</plugin>
<plugin>
<artifactId>maven-install-plugin</artifactId>
<version>2.5.2</version>
</plugin>
<plugin>
<artifactId>maven-deploy-plugin</artifactId>
<version>2.8.2</version>
</plugin>
<!-- site lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#site_Lifecycle -->
<plugin>
<artifactId>maven-site-plugin</artifactId>
<version>3.7.1</version>
</plugin>
<plugin>
<artifactId>maven-project-info-reports-plugin</artifactId>
<version>3.0.0</version>
</plugin>
</plugins>
</pluginManagement>
</build>
</project>

63
BloomFilter.java Normal file

@@ -0,0 +1,63 @@
package com.albertoventurini.datastructures.probabilistic;
/**
* This class implements a thread-safe Bloom filter that allows adding strings
* and checking the presence of strings.
*/
final class BloomFilter {
private final int hashCount;
private final ConcurrentAddOnlyBitSet bitSet;
private final Hash hash;
BloomFilter(final int hashCount, final int size, final Hash hash) {
this.hashCount = hashCount;
this.bitSet = new ConcurrentAddOnlyBitSet(size);
this.hash = hash;
}
// As explained here: https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf
// it is possible to simulate n hash functions using 2 hash functions.
private long[] calculateHashes(final String string) {
final long[] hash = this.hash.calculate(string);
final int max = bitSet.getCapacity();
if (hashCount <= 2) {
return new long[]{Math.abs(hash[0] % max), Math.abs(hash[1] % max)};
}
final long[] result = new long[hashCount];
for (int i = 0; i < hashCount; ++i) {
result[i] = Math.abs((hash[0] + (long) i * hash[1]) % max);
}
return result;
}
/**
* Add a string to this Bloom filter
*
* @param key the string to add
* @return true if the string was not already present in the Bloom filter
*/
boolean add(final String key) {
boolean notPresent = false;
for (long hash : calculateHashes(key)) {
notPresent |= bitSet.add(hash);
}
return notPresent;
}
/**
* Test whether this Bloom filter contains a string
*
* @param key the string to test
* @return true if the Bloom filter contains the string
*/
boolean contains(final String key) {
boolean present = true;
for (long hash : calculateHashes(key)) {
present &= bitSet.get(hash);
}
return present;
}
}
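
The constructor takes the number of hash functions and the bit-set size as raw numbers. A common way to derive them from an expected item count n and a target false-positive rate p is m = -n·ln(p)/(ln 2)² bits and k = (m/n)·ln 2 hash functions. The helper below is a minimal sketch of that calculation; the class and method names are illustrative and not part of this commit.

package com.albertoventurini.datastructures.probabilistic;

// Illustrative helper (not part of this commit): derives Bloom filter parameters
// from an expected number of items and a target false-positive probability.
final class BloomFilterSizing {

    // m = -n * ln(p) / (ln 2)^2  -> number of bits
    static int bits(final int expectedItems, final double falsePositiveRate) {
        return (int) Math.ceil(-expectedItems * Math.log(falsePositiveRate) / (Math.log(2) * Math.log(2)));
    }

    // k = (m / n) * ln 2  -> number of hash functions
    static int hashFunctions(final int bits, final int expectedItems) {
        return Math.max(1, (int) Math.round((double) bits / expectedItems * Math.log(2)));
    }

    public static void main(String[] args) {
        final int n = 50_000;   // expected distinct items
        final double p = 0.01;  // target false-positive rate
        final int m = bits(n, p);
        final int k = hashFunctions(m, n);
        // e.g. new BloomFilter(k, m, new MurmurHash())
        System.out.println("bits = " + m + ", hash functions = " + k);
    }
}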

44
BloomFilterTest.java Normal file

@@ -0,0 +1,44 @@
package com.albertoventurini.datastructures.probabilistic;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.ThreadLocalRandom;
import java.util.stream.Collectors;
public class BloomFilterTest {
public static void main(String[] args) throws Exception {
final List<String> words =
Arrays.stream(Files.readString(Path.of("corpus2.txt")).split(" ")).collect(Collectors.toList());
final Set<String> wordSet = new HashSet<>(words);
final BloomFilter bloomFilter = new BloomFilter(2, 200000, new MurmurHash());
words.forEach(bloomFilter::add);
final List<Integer> errors = new ArrayList<>();
// Query words
words.stream().limit(100).forEach(w -> {
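// Build a variation that is almost certainly not in the corpus: nextInt('a', 'z') returns an
// int in [97, 122), which is appended to the word as decimal digits (e.g. "word105").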
final String variation = w + ThreadLocalRandom.current().nextInt('a', 'z');
System.out.println("HashSet contains " + w + ": " + wordSet.contains(w));
System.out.println("HashSet contains " + variation + ": " + wordSet.contains(variation));
System.out.println("BloomFilter contains " + w + ": " + bloomFilter.contains(w));
System.out.println("BloomFilter contains " + variation + ": " + bloomFilter.contains(variation));
errors.add(wordSet.contains(w) != bloomFilter.contains(w) ? 1 : 0);
errors.add(wordSet.contains(variation) != bloomFilter.contains(variation) ? 1 : 0);
});
final double avgError = errors.stream().mapToDouble(i -> (double) i).average().getAsDouble();
System.out.println("Average error: " + avgError);
}
}

66
ConcurrentAddOnlyBitSet.java Normal file

@@ -0,0 +1,66 @@
package com.albertoventurini.datastructures.probabilistic;
import java.util.concurrent.atomic.AtomicLongArray;
/**
* This class implements a thread-safe set that allows adding and checking the presence of a long value.
* The set is backed by a bit sequence.
*/
final class ConcurrentAddOnlyBitSet {
private static final int BASE = 64;
private final int capacity;
private final AtomicLongArray buckets;
ConcurrentAddOnlyBitSet(final int capacity) {
this.capacity = capacity;
final int bucketsCount = (capacity / BASE) + 1;
buckets = new AtomicLongArray(bucketsCount);
for (int i = 0; i < buckets.length(); i++) {
buckets.set(i, 0);
}
}
public int getCapacity() {
return capacity;
}
/**
* Adds the given value to the set.
*
* @param value the value to add to the set
* @return true if the set did not already contain the specified element
*/
boolean add(final long value) {
final int bucketIdx = (int) value / BASE;
final int bitIdx = (int) value - (BASE * bucketIdx);
return atomicSet(bucketIdx, bitIdx);
}
/**
* Tests whether the given value is contained in the set.
*
* @param value the value to test
* @return true if the set contains the specified element
*/
boolean get(final long value) {
final int bucketIdx = (int) value / BASE;
final int bitIdx = (int) value - (BASE * bucketIdx);
return atomicGet(bucketIdx, bitIdx);
}
private boolean atomicSet(final int bucketIdx, final int bitIdx) {
final long mask = mask(bitIdx);
return (buckets.getAndUpdate(bucketIdx, l -> l | mask) & mask) == 0;
}
private boolean atomicGet(final int bucketIdx, final int bitIdx) {
final long mask = mask(bitIdx);
return (buckets.get(bucketIdx) & mask) == mask;
}
private static long mask(final int id) {
return 1L << id;
}
}
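
Because atomicSet uses getAndUpdate, the OR is applied atomically and the previous word is returned, so when several threads race to set the same bit, exactly one of them observes the 0-to-1 transition. The demo below is a minimal sketch of that contract; the class name is illustrative and not part of this commit.

package com.albertoventurini.datastructures.probabilistic;

// Illustrative usage (not part of this commit): two threads add the same value;
// exactly one of them gets `true` back, and get() subsequently returns true.
final class ConcurrentAddOnlyBitSetDemo {
    public static void main(String[] args) throws InterruptedException {
        final ConcurrentAddOnlyBitSet set = new ConcurrentAddOnlyBitSet(1024);
        final Thread t1 = new Thread(() -> System.out.println("t1 newly added: " + set.add(42)));
        final Thread t2 = new Thread(() -> System.out.println("t2 newly added: " + set.add(42)));
        t1.start(); t2.start();
        t1.join(); t2.join();
        System.out.println("contains 42: " + set.get(42)); // true
    }
}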

92
CountMinSketch.java Normal file

@@ -0,0 +1,92 @@
package com.albertoventurini.datastructures.probabilistic;
import java.util.Arrays;
import java.util.concurrent.atomic.AtomicLongArray;
import java.util.concurrent.atomic.LongAdder;
final class CountMinSketch {
private final int width;
private final int depth;
private final Hash hash;
private final AtomicLongArray sketch;
private final LongAdder count = new LongAdder();
CountMinSketch(final int width, final int depth, final Hash hash) {
this.width = width;
this.depth = depth;
this.hash = hash;
sketch = new AtomicLongArray(width * depth);
}
// As explained here: https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf
// it is possible to simulate n hash functions using 2 hash functions.
private long[] calculateHashes(final String string) {
final long[] hash = this.hash.calculate(string);
if (depth <= 2) {
return new long[]{Math.abs(hash[0] % width), Math.abs(hash[1] % width)};
}
final long[] result = new long[depth];
for (int i = 0; i < depth; ++i) {
result[i] = Math.abs((hash[0] + (long) i * hash[1]) % width);
}
return result;
}
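// Overwrites (rather than adds to) the sketch cells along this string's hash path,
// and returns the minimum of the cells' previous values.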
long update(final String string, long value) {
long min = Long.MAX_VALUE;
long[] hashes = calculateHashes(string);
for (int i = 0; i < hashes.length; i++) {
long cell = sketch.getAndSet((int) (i * width + hashes[i]), value);
if (cell < min) {
min = cell;
}
}
return min;
}
long increment(final String string) {
count.increment();
long min = Long.MAX_VALUE;
long[] hashes = calculateHashes(string);
for (int i = 0; i < hashes.length; i++) {
long cell = sketch.getAndIncrement((int) (i * width + hashes[i]));
if (cell < min) {
min = cell;
}
}
return min;
}
long get(final String string) {
long min = Long.MAX_VALUE;
long[] hashes = calculateHashes(string);
for (int i = 0; i < hashes.length; i++) {
long cell = sketch.get((int) (i * width + hashes[i]));
if (cell < min) {
min = cell;
}
}
return min;
}
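// Count-Mean-Min style estimate: subtract the estimated collision noise,
// (count - cell) / (width - 1), from each row's cell and return the median of the corrected values.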
long get2(final String string) {
long[] e = new long[depth];
long[] hashes = calculateHashes(string);
for (int i = 0; i < hashes.length; i++) {
long cell = sketch.get((int) (i * width + hashes[i]));
long noiseEstimation = (count.longValue() - cell) / (width - 1);
e[i] = cell - noiseEstimation;
}
Arrays.sort(e);
return e[e.length / 2];
}
}
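
Width and depth control the estimation error: with width w = ⌈e/ε⌉ and depth d = ⌈ln(1/δ)⌉, the standard Count-Min guarantee is that an estimate exceeds the true count by more than ε·N with probability at most δ, where N is the total number of increments. The helper below is a minimal sizing sketch under that assumption; its name is illustrative and not part of this commit.

package com.albertoventurini.datastructures.probabilistic;

// Illustrative helper (not part of this commit): derives Count-Min Sketch dimensions
// from a relative error epsilon (as a fraction of the total count) and a failure probability delta.
final class CountMinSketchSizing {

    static int width(final double epsilon) {
        return (int) Math.ceil(Math.E / epsilon);      // w = ceil(e / epsilon)
    }

    static int depth(final double delta) {
        return (int) Math.ceil(Math.log(1.0 / delta)); // d = ceil(ln(1 / delta))
    }

    public static void main(String[] args) {
        final int w = width(0.0005); // overestimate by at most 0.05% of all increments...
        final int d = depth(0.01);   // ...with probability at least 99%
        // e.g. new CountMinSketch(w, d, new MurmurHash())
        System.out.println("width = " + w + ", depth = " + d);
    }
}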

53
CountMinSketchTest.java Normal file

@@ -0,0 +1,53 @@
package com.albertoventurini.datastructures.probabilistic;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ThreadLocalRandom;
import java.util.function.Function;
import java.util.stream.Collectors;
import static java.util.stream.Collectors.summingInt;
public class CountMinSketchTest {
public static void main(String[] args) throws Exception {
final List<String> words =
Arrays.stream(Files.readString(Path.of("corpus2.txt")).split(" ")).collect(Collectors.toList());
// Calculate word frequencies with a map
final Map<String, Integer> wordFrequencies = words
.stream()
.collect(Collectors.groupingBy(Function.identity(), summingInt(e -> 1)));
// Calculate word frequencies with a count min sketch
final CountMinSketch countMinSketch = new CountMinSketch(6000, 4, new MurmurHash());
words.forEach(countMinSketch::increment);
final List<Integer> errors = new ArrayList<>();
// Query words
words.stream().limit(100).forEach(w -> {
final int freq1 = wordFrequencies.getOrDefault(w, 0);
final int freq2 = (int) countMinSketch.get(w);
System.out.println("HashMap " + w + " -> " + freq1);
System.out.println("CountMinSketch " + w + " -> " + freq2);
errors.add(Math.abs(freq1 - freq2));
});
Collections.sort(errors);
final int medianError = errors.get(errors.size() / 2);
final double avgError = errors.stream().mapToDouble(i -> (double) i).average().getAsDouble();
System.out.println("\nMedian error: " + medianError);
System.out.println("Average error: " + avgError);
}
}

7
Hash.java Normal file

@@ -0,0 +1,7 @@
package com.albertoventurini.datastructures.probabilistic;
public interface Hash {
long[] calculate(String string);
}

11
Int32MurmurHash.java Normal file

@@ -0,0 +1,11 @@
package com.albertoventurini.datastructures.probabilistic;
import org.apache.commons.codec.digest.MurmurHash3;
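/**
 * Adapts the 32-bit MurmurHash3 from commons-codec to the two-slot Hash interface.
 * Only the first slot carries a hash; the second is always 0, so this suits consumers
 * that use a single hash value (e.g. LogLog) rather than the double-hashing scheme
 * in BloomFilter and CountMinSketch.
 */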
public class Int32MurmurHash implements Hash {
@Override
public long[] calculate(final String string) {
return new long[]{MurmurHash3.hash32(string), 0};
}
}

12
JavaHash.java Normal file

@@ -0,0 +1,12 @@
package com.albertoventurini.datastructures.probabilistic;
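/**
 * Derives two hash values from String.hashCode() by splitting the 32-bit result into
 * its low and high 16-bit halves, which limits each hash to the range [0, 65536).
 */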
public class JavaHash implements Hash {
@Override
public long[] calculate(final String string) {
final int hash = string.hashCode();
final int hash1 = hash & 0x0000FFFF;
final int hash2 = hash >>> 16;
return new long[]{hash1, hash2};
}
}

67
LogLog.java Normal file

@@ -0,0 +1,67 @@
package com.albertoventurini.datastructures.probabilistic;
// See https://storage.googleapis.com/pub-tools-public-publication-data/pdf/40671.pdf
public class LogLog {
private final int m;
private final Hash hash;
private final long[] M;
private final double alpha;
// Number of bits to reserve for bucket index
private final int k;
public LogLog(final int k, final Hash hash) {
this.k = k;
this.m = 1 << k;
this.hash = hash;
M = new long[m];
alpha = m == 16 ? 0.673 : m == 32 ? 0.697 : m == 64 ? 0.709 : 0.7213 / (1 + 1.079 / m);
}
public void add(final String string) {
final int hash = (int) (this.hash.calculate(string)[0]);
final int bucketIdx = calculateBucketIdx(hash);
final int leadingZeros = calculateLeadingZeros(hash);
M[bucketIdx] = Math.max(M[bucketIdx], leadingZeros);
}
public long estimateCount() {
double c = 0.0;
for (final long l : M) {
c += 1.0 / (1L << l); // accumulate 2^(-M[j]) for the harmonic mean
}
double estimate = alpha * m * m / c;
if (estimate <= 5.0/2.0 * m) {
double V = 0;
for (final long l : M) {
if (l == 0) V++;
}
if (V > 0) {
estimate = m * Math.log(m / V);
} else if (estimate > 1.0/30*(0x100000000L)) {
estimate = - 0x100000000L * Math.log(1 - estimate/0x100000000L);
}
}
return (long) estimate;
}
private int calculateBucketIdx(final int l) {
// Extract k most significant bits
return (int) l >>> (32 - k);
}
// leading zeros plus one, as per the paper
private int calculateLeadingZeros(final int l) {
int i = 1;
int mask = 0x80000000 >>> k;
while (mask > 0 && (l & mask) == 0) {
mask = mask >>> 1;
i++;
}
return i;
}
}
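
Despite the class name, the estimator above is the HyperLogLog one: a bias-corrected harmonic mean of 2^(-M[j]) over m = 2^k buckets, with small- and large-range corrections, whose typical relative error is about 1.04/√m. The helper below is a minimal sketch of picking k from a target error under that assumption; its name is illustrative and not part of this commit.

package com.albertoventurini.datastructures.probabilistic;

// Illustrative helper (not part of this commit): picks the bucket-index bit count k
// so that the expected relative error (~1.04 / sqrt(2^k)) stays below a target.
final class LogLogSizing {

    static int bucketBits(final double targetRelativeError) {
        final double m = Math.pow(1.04 / targetRelativeError, 2); // m >= (1.04 / err)^2
        return (int) Math.ceil(Math.log(m) / Math.log(2));        // k = ceil(log2(m))
    }

    public static void main(String[] args) {
        final int k = bucketBits(0.05); // ~5% relative error -> k = 9, i.e. 512 buckets
        // e.g. new LogLog(k, new Int32MurmurHash())
        System.out.println("k = " + k + " (" + (1 << k) + " buckets)");
    }
}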

25
LogLogTest.java Normal file

@@ -0,0 +1,25 @@
package com.albertoventurini.datastructures.probabilistic;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
public class LogLogTest {
public static void main(String[] args) throws Exception {
final List<String> words =
Arrays.stream(Files.readString(Path.of("corpus2.txt")).split(" ")).collect(Collectors.toList());
final Set<String> wordSet = new HashSet<>(words);
final LogLog logLog = new LogLog(5, new Int32MurmurHash());
words.forEach(logLog::add);
System.out.println(wordSet.size());
System.out.println(logLog.estimateCount());
}
}

8
Membership.java Normal file

@@ -0,0 +1,8 @@
package com.albertoventurini.datastructures.probabilistic;
interface Membership {
void add(String item);
boolean contains(String item);
}

157
MurmurHash.java Normal file

@@ -0,0 +1,157 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
This file was taken from Apache Cassandra source code:
https://github.com/apache/cassandra/blob/trunk/src/java/org/apache/cassandra/utils/MurmurHash.java
*/
package com.albertoventurini.datastructures.probabilistic;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
/**
* This is a very fast, non-cryptographic hash suitable for general hash-based
* lookup. See http://murmurhash.googlepages.com/ for more details.
*
* hash32() and hash64() are MurmurHash 2.0.
*
* hash3_x64_128() is *almost* MurmurHash 3.0. It was supposed to match, but we didn't catch a sign bug with
* the result that it doesn't. Unfortunately, we can't change it now without breaking Murmur3Partitioner. *
*
* <p>
* The C version of MurmurHash 2.0 found at that site was ported to Java by
* Andrzej Bialecki (ab at getopt org).
* </p>
*/
public final class MurmurHash implements Hash
{
private final Charset charset;
public MurmurHash() {
charset = Charset.defaultCharset();
}
private static long getBlock(ByteBuffer key, int offset, int index)
{
int i_8 = index << 3;
int blockOffset = offset + i_8;
return ((long) key.get(blockOffset + 0) & 0xff) + (((long) key.get(blockOffset + 1) & 0xff) << 8) +
(((long) key.get(blockOffset + 2) & 0xff) << 16) + (((long) key.get(blockOffset + 3) & 0xff) << 24) +
(((long) key.get(blockOffset + 4) & 0xff) << 32) + (((long) key.get(blockOffset + 5) & 0xff) << 40) +
(((long) key.get(blockOffset + 6) & 0xff) << 48) + (((long) key.get(blockOffset + 7) & 0xff) << 56);
}
private static long rotl64(long v, int n)
{
return ((v << n) | (v >>> (64 - n)));
}
private static long fmix(long k)
{
k ^= k >>> 33;
k *= 0xff51afd7ed558ccdL;
k ^= k >>> 33;
k *= 0xc4ceb9fe1a85ec53L;
k ^= k >>> 33;
return k;
}
static long[] hash3_x64_128(ByteBuffer key, int offset, int length, long seed)
{
final int nblocks = length >> 4; // Process as 128-bit blocks.
long h1 = seed;
long h2 = seed;
long c1 = 0x87c37b91114253d5L;
long c2 = 0x4cf5ad432745937fL;
//----------
// body
for(int i = 0; i < nblocks; i++)
{
long k1 = getBlock(key, offset, i * 2 + 0);
long k2 = getBlock(key, offset, i * 2 + 1);
k1 *= c1; k1 = rotl64(k1,31); k1 *= c2; h1 ^= k1;
h1 = rotl64(h1,27); h1 += h2; h1 = h1*5+0x52dce729;
k2 *= c2; k2 = rotl64(k2,33); k2 *= c1; h2 ^= k2;
h2 = rotl64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5;
}
//----------
// tail
// Advance offset to the unprocessed tail of the data.
offset += nblocks * 16;
long k1 = 0;
long k2 = 0;
switch(length & 15)
{
case 15: k2 ^= ((long) key.get(offset+14)) << 48;
case 14: k2 ^= ((long) key.get(offset+13)) << 40;
case 13: k2 ^= ((long) key.get(offset+12)) << 32;
case 12: k2 ^= ((long) key.get(offset+11)) << 24;
case 11: k2 ^= ((long) key.get(offset+10)) << 16;
case 10: k2 ^= ((long) key.get(offset+9)) << 8;
case 9: k2 ^= ((long) key.get(offset+8)) << 0;
k2 *= c2; k2 = rotl64(k2,33); k2 *= c1; h2 ^= k2;
case 8: k1 ^= ((long) key.get(offset+7)) << 56;
case 7: k1 ^= ((long) key.get(offset+6)) << 48;
case 6: k1 ^= ((long) key.get(offset+5)) << 40;
case 5: k1 ^= ((long) key.get(offset+4)) << 32;
case 4: k1 ^= ((long) key.get(offset+3)) << 24;
case 3: k1 ^= ((long) key.get(offset+2)) << 16;
case 2: k1 ^= ((long) key.get(offset+1)) << 8;
case 1: k1 ^= ((long) key.get(offset));
k1 *= c1; k1 = rotl64(k1,31); k1 *= c2; h1 ^= k1;
};
//----------
// finalization
h1 ^= length; h2 ^= length;
h1 += h2;
h2 += h1;
h1 = fmix(h1);
h2 = fmix(h2);
h1 += h2;
h2 += h1;
return(new long[] {h1, h2});
}
@Override
public long[] calculate(final String string) {
final ByteBuffer b = charset.encode(string);
return MurmurHash.hash3_x64_128(b, 0, b.limit(), 0L);
}
}

57
CountMinSketchTest.java Normal file

@@ -0,0 +1,57 @@
package com.albertoventurini.datastructures.probabilistic;
import org.junit.jupiter.api.Test;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ThreadLocalRandom;
import java.util.stream.Collectors;
public class CountMinSketchTest {
private final Set<String> hostnames;
public CountMinSketchTest() throws Exception {
final List<String> hostnames = Files.readAllLines(Path.of("hostnames2.csv"))
.stream()
.filter(l -> l.indexOf('"') == 0 && l.lastIndexOf('"') > 0)
.map(l -> l.substring(1, l.lastIndexOf('"')))
.collect(Collectors.toList());
this.hostnames = new HashSet<>();
for (int i = 0; i < 20; i++) {
this.hostnames.add(hostnames.get(ThreadLocalRandom.current().nextInt(hostnames.size())));
}
}
@Test
public void testHash() {
final CountMinSketch countMinSketch = new CountMinSketch(200, 2, new JavaHash());
final Map<String, Integer> numbers = new HashMap<>();
for (final String hostname : hostnames) {
final int random = ThreadLocalRandom.current().nextInt(1000);
numbers.put(hostname, random);
countMinSketch.update(hostname, random);
}
int misses = 0;
int error = 0;
for (final String hostname : hostnames) {
if (countMinSketch.get(hostname) != numbers.get(hostname)) {
misses++;
error += Math.abs(countMinSketch.get(hostname) - numbers.get(hostname));
}
}
System.out.println("There were " + misses + " misses over " + hostnames.size() + " hostnames. Error = " + error);
}
}