First commit
Commit 0fbc439593
17 changed files with 24331 additions and 0 deletions

.gitignore (vendored, new file, 20 additions)
@@ -0,0 +1,20 @@
**/bin/
**/target/

# Eclipse files
**/.classpath
**/.factorypath
**/.project
**/.settings/

# IntelliJ files
**/.idea/
**/*.iml

# Misc
.mtj.tmp/
**/hs_err_pid*
config.properties

#

corpus1.txt (new file, 1252 additions)
File diff suppressed because one or more lines are too long

corpus2.txt (new file, 22316 additions)
File diff suppressed because it is too large

pom.xml (new file, 81 additions)
@@ -0,0 +1,81 @@
<?xml version="1.0" encoding="UTF-8"?>

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.albertoventurini</groupId>
    <artifactId>probabilistic-data-structures</artifactId>
    <version>1.0-SNAPSHOT</version>

    <name>probabilistic-data-structures</name>

    <dependencies>
        <dependency>
            <groupId>org.junit.jupiter</groupId>
            <artifactId>junit-jupiter-api</artifactId>
            <version>5.7.2</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.junit.jupiter</groupId>
            <artifactId>junit-jupiter-engine</artifactId>
            <version>5.7.2</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>commons-codec</groupId>
            <artifactId>commons-codec</artifactId>
            <version>1.15</version>
        </dependency>
    </dependencies>

    <build>
        <pluginManagement><!-- lock down plugins versions to avoid using Maven defaults (may be moved to parent pom) -->
            <plugins>
                <!-- clean lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#clean_Lifecycle -->
                <plugin>
                    <artifactId>maven-clean-plugin</artifactId>
                    <version>3.1.0</version>
                </plugin>
                <!-- default lifecycle, jar packaging: see https://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_jar_packaging -->
                <plugin>
                    <artifactId>maven-resources-plugin</artifactId>
                    <version>3.0.2</version>
                </plugin>
                <plugin>
                    <artifactId>maven-compiler-plugin</artifactId>
                    <version>3.8.0</version>
                    <configuration>
                        <release>11</release>
                    </configuration>
                </plugin>
                <plugin>
                    <artifactId>maven-surefire-plugin</artifactId>
                    <version>2.22.2</version>
                </plugin>
                <plugin>
                    <artifactId>maven-jar-plugin</artifactId>
                    <version>3.0.2</version>
                </plugin>
                <plugin>
                    <artifactId>maven-install-plugin</artifactId>
                    <version>2.5.2</version>
                </plugin>
                <plugin>
                    <artifactId>maven-deploy-plugin</artifactId>
                    <version>2.8.2</version>
                </plugin>
                <!-- site lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#site_Lifecycle -->
                <plugin>
                    <artifactId>maven-site-plugin</artifactId>
                    <version>3.7.1</version>
                </plugin>
                <plugin>
                    <artifactId>maven-project-info-reports-plugin</artifactId>
                    <version>3.0.0</version>
                </plugin>
            </plugins>
        </pluginManagement>
    </build>
</project>

@@ -0,0 +1,63 @@
package com.albertoventurini.datastructures.probabilistic;

/**
 * This class implements a thread-safe Bloom filter that allows adding strings
 * and checking the presence of strings.
 */
final class BloomFilter {
    private final int hashCount;
    private final ConcurrentAddOnlyBitSet bitSet;
    private final Hash hash;

    BloomFilter(final int hashCount, final int size, final Hash hash) {
        this.hashCount = hashCount;
        this.bitSet = new ConcurrentAddOnlyBitSet(size);
        this.hash = hash;
    }

    // As explained here: https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf
    // it is possible to simulate n hash functions using 2 hash functions.
    private long[] calculateHashes(final String string) {
        final long[] hash = this.hash.calculate(string);

        final int max = bitSet.getCapacity();

        if (hashCount <= 2) {
            return new long[]{Math.abs(hash[0] % max), Math.abs(hash[1] % max)};
        }

        final long[] result = new long[hashCount];
        for (int i = 0; i < hashCount; ++i) {
            result[i] = Math.abs((hash[0] + (long) i * hash[1]) % max);
        }
        return result;
    }

    /**
     * Add a string to this Bloom filter
     *
     * @param key the string to add
     * @return true if the string was not already present in the Bloom filter
     */
    boolean add(final String key) {
        boolean notPresent = false;
        for (long hash : calculateHashes(key)) {
            notPresent |= bitSet.add(hash);
        }
        return notPresent;
    }

    /**
     * Test whether this Bloom filter contains a string
     *
     * @param key the string to test
     * @return true if the Bloom filter contains the string
     */
    boolean contains(final String key) {
        boolean present = true;
        for (long hash : calculateHashes(key)) {
            present &= bitSet.get(hash);
        }
        return present;
    }
}
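
Not part of the commit: a minimal usage sketch for the BloomFilter class above. The class name, sizes and keys are made up, and the sketch has to live in the same package because the class and its methods are package-private.

package com.albertoventurini.datastructures.probabilistic;

// Illustrative sketch only: exercises BloomFilter with the MurmurHash implementation from this commit.
public class BloomFilterUsageSketch {
    public static void main(String[] args) {
        final BloomFilter filter = new BloomFilter(2, 1000, new MurmurHash());
        filter.add("alice");                          // returns true: "alice" was definitely not present before
        System.out.println(filter.contains("alice")); // true
        System.out.println(filter.contains("bob"));   // false, or (rarely) a false positive
    }
}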

@@ -0,0 +1,44 @@
package com.albertoventurini.datastructures.probabilistic;

import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.ThreadLocalRandom;
import java.util.stream.Collectors;

public class BloomFilterTest {

    public static void main(String[] args) throws Exception {
        final List<String> words =
                Arrays.stream(Files.readString(Path.of("corpus2.txt")).split(" ")).collect(Collectors.toList());

        final Set<String> wordSet = new HashSet<>(words);

        final BloomFilter bloomFilter = new BloomFilter(2, 200000, new MurmurHash());

        words.forEach(bloomFilter::add);

        final List<Integer> errors = new ArrayList<>();

        // Query words
        words.stream().limit(100).forEach(w -> {
            final String variation = w + ThreadLocalRandom.current().nextInt('a', 'z');
            System.out.println("HashSet contains " + w + ": " + wordSet.contains(w));
            System.out.println("HashSet contains " + variation + ": " + wordSet.contains(variation));

            System.out.println("BloomFilter contains " + w + ": " + bloomFilter.contains(w));
            System.out.println("BloomFilter contains " + variation + ": " + bloomFilter.contains(variation));

            errors.add(wordSet.contains(w) != bloomFilter.contains(w) ? 1 : 0);
            errors.add(wordSet.contains(variation) != bloomFilter.contains(variation) ? 1 : 0);
        });

        final double avgError = errors.stream().mapToDouble(i -> (double) i).average().getAsDouble();
        System.out.println("Average error: " + avgError);
    }
}
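
For context, not part of the commit: under the standard analysis, a Bloom filter with m bits, k hash functions and n distinct inserted items has a false positive rate of roughly (1 - e^(-k*n/m))^k. For the parameters used in the test above (k = 2, m = 200000), inserting for example n = 20000 distinct words would give about (1 - e^(-0.2))^2, roughly 0.033, i.e. a false positive rate around 3%. The value of n here is only an assumed figure, since the number of distinct words in corpus2.txt is not stated.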

@@ -0,0 +1,66 @@
package com.albertoventurini.datastructures.probabilistic;

import java.util.concurrent.atomic.AtomicLongArray;

/**
 * This class implements a thread-safe set that allows adding and checking the presence of a long value.
 * The set is backed by a bit sequence.
 */
final class ConcurrentAddOnlyBitSet {
    private static final int BASE = 64;
    private final int capacity;
    private final AtomicLongArray buckets;

    ConcurrentAddOnlyBitSet(final int capacity) {
        this.capacity = capacity;

        final int bucketsCount = (capacity / BASE) + 1;
        buckets = new AtomicLongArray(bucketsCount);

        for (int i = 0; i < buckets.length(); i++) {
            buckets.set(i, 0);
        }
    }

    public int getCapacity() {
        return capacity;
    }

    /**
     * Adds the given value to the set.
     *
     * @param value the value to add to the set
     * @return true if the set did not already contain the specified element
     */
    boolean add(final long value) {
        final int bucketIdx = (int) value / BASE;
        final int bitIdx = (int) value - (BASE * bucketIdx);
        return atomicSet(bucketIdx, bitIdx);
    }

    /**
     * Tests whether the given value is contained in the set.
     *
     * @param value the value to test
     * @return true if the set contains the specified element
     */
    boolean get(final long value) {
        final int bucketIdx = (int) value / BASE;
        final int bitIdx = (int) value - (BASE * bucketIdx);
        return atomicGet(bucketIdx, bitIdx);
    }

    private boolean atomicSet(final int bucketIdx, final int bitIdx) {
        final long mask = mask(bitIdx);
        return (buckets.getAndUpdate(bucketIdx, l -> l | mask) & mask) == 0;
    }

    private boolean atomicGet(final int bucketIdx, final int bitIdx) {
        final long mask = mask(bitIdx);
        return (buckets.get(bucketIdx) & mask) == mask;
    }

    private static long mask(final int id) {
        return 1L << id;
    }
}
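
Not part of the commit: a small sketch showing how the bucket and bit arithmetic above behaves. The class name and the value 130 are made up for illustration.

package com.albertoventurini.datastructures.probabilistic;

// Illustrative sketch only: with BASE = 64, the value 130 lands in bucket 130 / 64 = 2
// at bit 130 - 128 = 2, so add(130) atomically ORs the mask 1L << 2 into buckets[2].
public class BitSetUsageSketch {
    public static void main(String[] args) {
        final ConcurrentAddOnlyBitSet bits = new ConcurrentAddOnlyBitSet(200);
        System.out.println(bits.add(130)); // true: the bit was previously clear
        System.out.println(bits.add(130)); // false: the bit was already set
        System.out.println(bits.get(130)); // true
    }
}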

@@ -0,0 +1,92 @@
package com.albertoventurini.datastructures.probabilistic;

import java.util.Arrays;
import java.util.concurrent.atomic.AtomicLongArray;
import java.util.concurrent.atomic.LongAdder;

final class CountMinSketch {

    private final int width;
    private final int depth;
    private final Hash hash;

    private final AtomicLongArray sketch;

    private final LongAdder count = new LongAdder();

    CountMinSketch(final int width, final int depth, final Hash hash) {
        this.width = width;
        this.depth = depth;
        this.hash = hash;
        sketch = new AtomicLongArray(width * depth);
    }

    // As explained here: https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf
    // it is possible to simulate n hash functions using 2 hash functions.
    private long[] calculateHashes(final String string) {
        final long[] hash = this.hash.calculate(string);

        if (depth <= 2) {
            return new long[]{Math.abs(hash[0] % width), Math.abs(hash[1] % width)};
        }

        final long[] result = new long[depth];
        for (int i = 0; i < depth; ++i) {
            result[i] = Math.abs((hash[0] + (long) i * hash[1]) % width);
        }
        return result;
    }

    long update(final String string, long value) {
        long min = Long.MAX_VALUE;
        long[] hashes = calculateHashes(string);
        for (int i = 0; i < hashes.length; i++) {
            long cell = sketch.getAndSet((int) (i * width + hashes[i]), value);
            if (cell < min) {
                min = cell;
            }
        }
        return min;
    }

    long increment(final String string) {
        count.increment();

        long min = Long.MAX_VALUE;
        long[] hashes = calculateHashes(string);
        for (int i = 0; i < hashes.length; i++) {
            long cell = sketch.getAndIncrement((int) (i * width + hashes[i]));
            if (cell < min) {
                min = cell;
            }
        }
        return min;
    }

    long get(final String string) {
        long min = Long.MAX_VALUE;
        long[] hashes = calculateHashes(string);
        for (int i = 0; i < hashes.length; i++) {
            long cell = sketch.get((int) (i * width + hashes[i]));
            if (cell < min) {
                min = cell;
            }
        }
        return min;
    }

    long get2(final String string) {
        long[] e = new long[depth];

        long[] hashes = calculateHashes(string);

        for (int i = 0; i < hashes.length; i++) {
            long cell = sketch.get((int) (i * width + hashes[i]));
            long noiseEstimation = (count.longValue() - cell) / (width - 1);
            e[i] = cell - noiseEstimation;
        }

        Arrays.sort(e);
        return e[e.length / 2];
    }
}
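
Not part of the commit: a minimal usage sketch for CountMinSketch. The class name, sketch dimensions and keys are made up for illustration.

package com.albertoventurini.datastructures.probabilistic;

// Illustrative sketch only: counting a tiny stream of strings with a 1000 x 4 sketch.
public class CountMinSketchUsageSketch {
    public static void main(String[] args) {
        final CountMinSketch cms = new CountMinSketch(1000, 4, new MurmurHash());
        cms.increment("apple");
        cms.increment("apple");
        cms.increment("banana");
        System.out.println(cms.get("apple"));   // 2: never underestimates, may overestimate on collisions
        System.out.println(cms.get("banana"));  // 1
        System.out.println(cms.get2("banana")); // variant that subtracts an estimate of collision noise
    }
}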

@@ -0,0 +1,53 @@
package com.albertoventurini.datastructures.probabilistic;

import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ThreadLocalRandom;
import java.util.function.Function;
import java.util.stream.Collectors;

import static java.util.stream.Collectors.summingInt;

public class CountMinSketchTest {

    public static void main(String[] args) throws Exception {
        final List<String> words =
                Arrays.stream(Files.readString(Path.of("corpus2.txt")).split(" ")).collect(Collectors.toList());

        // Calculate word frequencies with a map
        final Map<String, Integer> wordFrequencies = words
                .stream()
                .collect(Collectors.groupingBy(Function.identity(), summingInt(e -> 1)));

        // Calculate word frequencies with a count min sketch
        final CountMinSketch countMinSketch = new CountMinSketch(6000, 4, new MurmurHash());
        words.forEach(countMinSketch::increment);

        final List<Integer> errors = new ArrayList<>();

        // Query words
        words.stream().limit(100).forEach(w -> {
            final int freq1 = wordFrequencies.getOrDefault(w, 0);
            final int freq2 = (int) countMinSketch.get(w);

            System.out.println("HashMap " + w + " -> " + freq1);
            System.out.println("CountMinSketch " + w + " -> " + freq2);

            errors.add(Math.abs(freq1 - freq2));
        });

        Collections.sort(errors);
        final int medianError = errors.get(errors.size() / 2);
        final double avgError = errors.stream().mapToDouble(i -> (double) i).average().getAsDouble();

        System.out.println("\nMedian error: " + medianError);
        System.out.println("Average error: " + avgError);
    }
}
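
For context, not part of the commit: under the usual count-min analysis, which assumes independent hash functions per row, a sketch of width w and depth d overestimates each count by at most (e / w) * N with probability at least 1 - e^(-d), where N is the total number of increments. For the 6000 x 4 sketch above that works out to an additive error of roughly N / 2200 with probability around 98%; the guarantee is only approximate here because the rows are derived from two base hashes.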

@@ -0,0 +1,7 @@
package com.albertoventurini.datastructures.probabilistic;

public interface Hash {

    long[] calculate(String string);

}

@@ -0,0 +1,11 @@
package com.albertoventurini.datastructures.probabilistic;

import org.apache.commons.codec.digest.MurmurHash3;

public class Int32MurmurHash implements Hash {

    @Override
    public long[] calculate(final String string) {
        return new long[]{MurmurHash3.hash32(string), 0};
    }
}

@@ -0,0 +1,12 @@
package com.albertoventurini.datastructures.probabilistic;

public class JavaHash implements Hash {

    @Override
    public long[] calculate(final String string) {
        final int hash = string.hashCode();
        final int hash1 = hash & 0x0000FFFF;
        final int hash2 = hash >>> 16;
        return new long[]{hash1, hash2};
    }
}

@@ -0,0 +1,67 @@
package com.albertoventurini.datastructures.probabilistic;

// See https://storage.googleapis.com/pub-tools-public-publication-data/pdf/40671.pdf
public class LogLog {

    private final int m;
    private final Hash hash;
    private final long[] M;
    private final double alpha;

    // Number of bits to reserve for bucket index
    private final int k;

    public LogLog(final int k, final Hash hash) {
        this.k = k;
        this.m = 1 << k;
        this.hash = hash;
        M = new long[m];
        alpha = m == 16 ? 0.673 : m == 32 ? 0.697 : m == 64 ? 0.709 : 0.7213 / (1 + 1.079 / m);
    }

    public void add(final String string) {
        final int hash = (int) (this.hash.calculate(string)[0]);
        final int bucketIdx = calculateBucketIdx(hash);
        final int leadingZeros = calculateLeadingZeros(hash);
        M[bucketIdx] = Math.max(M[bucketIdx], leadingZeros);
    }

    public long estimateCount() {
        double c = 0.0;
        for (final long l : M) {
            c += 1.0 / (1 << l); //Math.pow(2, l);
        }

        double estimate = alpha * m * m / c;

        if (estimate <= 5.0/2.0 * m) {
            double V = 0;
            for (final long l : M) {
                if (l == 0) V++;
            }
            if (V > 0) {
                estimate = m * Math.log(m / V);
            } else if (estimate > 1.0/30*(0x100000000L)) {
                estimate = - 0x100000000L * Math.log(1 - estimate/0x100000000L);
            }
        }

        return (long) estimate;
    }

    private int calculateBucketIdx(final int l) {
        // Extract k most significant bits
        return (int) l >>> (32 - k);
    }

    // leading zeros plus one, as per the paper
    private int calculateLeadingZeros(final int l) {
        int i = 1;
        int mask = 0x80000000 >>> k;
        while (mask > 0 && (l & mask) == 0) {
            mask = mask >>> 1;
            i++;
        }
        return i;
    }
}
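
A worked example of the bucket and rank extraction above, not part of the commit (the hash value is made up): with k = 5 there are m = 32 buckets. For the 32-bit hash 0x09000000, calculateBucketIdx takes the top 5 bits (0x09000000 >>> 27 = 1), and calculateLeadingZeros scans the remaining bits just below the bucket bits: it sees two zero bits before the first one bit and returns 2 + 1 = 3, so bucket 1 is updated to max(M[1], 3).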

@@ -0,0 +1,25 @@
package com.albertoventurini.datastructures.probabilistic;

import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

public class LogLogTest {

    public static void main(String[] args) throws Exception {
        final List<String> words =
                Arrays.stream(Files.readString(Path.of("corpus2.txt")).split(" ")).collect(Collectors.toList());

        final Set<String> wordSet = new HashSet<>(words);

        final LogLog logLog = new LogLog(5, new Int32MurmurHash());
        words.forEach(logLog::add);

        System.out.println(wordSet.size());
        System.out.println(logLog.estimateCount());
    }
}

@@ -0,0 +1,8 @@
package com.albertoventurini.datastructures.probabilistic;

interface Membership {

    void add(String item);

    boolean contains(String item);
}

@@ -0,0 +1,157 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
    This file was taken from Apache Cassandra source code:
    https://github.com/apache/cassandra/blob/trunk/src/java/org/apache/cassandra/utils/MurmurHash.java
 */

package com.albertoventurini.datastructures.probabilistic;

import java.nio.ByteBuffer;
import java.nio.charset.Charset;

/**
 * This is a very fast, non-cryptographic hash suitable for general hash-based
 * lookup. See http://murmurhash.googlepages.com/ for more details.
 *
 * hash32() and hash64() are MurmurHash 2.0.
 *
 * hash3_x64_128() is *almost* MurmurHash 3.0. It was supposed to match, but we didn't catch a sign bug with
 * the result that it doesn't. Unfortunately, we can't change it now without breaking Murmur3Partitioner. *
 *
 * <p>
 * The C version of MurmurHash 2.0 found at that site was ported to Java by
 * Andrzej Bialecki (ab at getopt org).
 * </p>
 */
public final class MurmurHash implements Hash
{
    private final Charset charset;

    public MurmurHash() {
        charset = Charset.defaultCharset();
    }

    private static long getBlock(ByteBuffer key, int offset, int index)
    {
        int i_8 = index << 3;
        int blockOffset = offset + i_8;
        return ((long) key.get(blockOffset + 0) & 0xff) + (((long) key.get(blockOffset + 1) & 0xff) << 8) +
               (((long) key.get(blockOffset + 2) & 0xff) << 16) + (((long) key.get(blockOffset + 3) & 0xff) << 24) +
               (((long) key.get(blockOffset + 4) & 0xff) << 32) + (((long) key.get(blockOffset + 5) & 0xff) << 40) +
               (((long) key.get(blockOffset + 6) & 0xff) << 48) + (((long) key.get(blockOffset + 7) & 0xff) << 56);
    }

    private static long rotl64(long v, int n)
    {
        return ((v << n) | (v >>> (64 - n)));
    }

    private static long fmix(long k)
    {
        k ^= k >>> 33;
        k *= 0xff51afd7ed558ccdL;
        k ^= k >>> 33;
        k *= 0xc4ceb9fe1a85ec53L;
        k ^= k >>> 33;

        return k;
    }

    static long[] hash3_x64_128(ByteBuffer key, int offset, int length, long seed)
    {
        final int nblocks = length >> 4; // Process as 128-bit blocks.

        long h1 = seed;
        long h2 = seed;

        long c1 = 0x87c37b91114253d5L;
        long c2 = 0x4cf5ad432745937fL;

        //----------
        // body

        for(int i = 0; i < nblocks; i++)
        {
            long k1 = getBlock(key, offset, i * 2 + 0);
            long k2 = getBlock(key, offset, i * 2 + 1);

            k1 *= c1; k1 = rotl64(k1,31); k1 *= c2; h1 ^= k1;

            h1 = rotl64(h1,27); h1 += h2; h1 = h1*5+0x52dce729;

            k2 *= c2; k2 = rotl64(k2,33); k2 *= c1; h2 ^= k2;

            h2 = rotl64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5;
        }

        //----------
        // tail

        // Advance offset to the unprocessed tail of the data.
        offset += nblocks * 16;

        long k1 = 0;
        long k2 = 0;

        switch(length & 15)
        {
            case 15: k2 ^= ((long) key.get(offset+14)) << 48;
            case 14: k2 ^= ((long) key.get(offset+13)) << 40;
            case 13: k2 ^= ((long) key.get(offset+12)) << 32;
            case 12: k2 ^= ((long) key.get(offset+11)) << 24;
            case 11: k2 ^= ((long) key.get(offset+10)) << 16;
            case 10: k2 ^= ((long) key.get(offset+9)) << 8;
            case 9:  k2 ^= ((long) key.get(offset+8)) << 0;
                     k2 *= c2; k2 = rotl64(k2,33); k2 *= c1; h2 ^= k2;

            case 8:  k1 ^= ((long) key.get(offset+7)) << 56;
            case 7:  k1 ^= ((long) key.get(offset+6)) << 48;
            case 6:  k1 ^= ((long) key.get(offset+5)) << 40;
            case 5:  k1 ^= ((long) key.get(offset+4)) << 32;
            case 4:  k1 ^= ((long) key.get(offset+3)) << 24;
            case 3:  k1 ^= ((long) key.get(offset+2)) << 16;
            case 2:  k1 ^= ((long) key.get(offset+1)) << 8;
            case 1:  k1 ^= ((long) key.get(offset));
                     k1 *= c1; k1 = rotl64(k1,31); k1 *= c2; h1 ^= k1;
        };

        //----------
        // finalization

        h1 ^= length; h2 ^= length;

        h1 += h2;
        h2 += h1;

        h1 = fmix(h1);
        h2 = fmix(h2);

        h1 += h2;
        h2 += h1;

        return(new long[] {h1, h2});
    }

    @Override
    public long[] calculate(final String string) {
        final ByteBuffer b = charset.encode(string);
        return MurmurHash.hash3_x64_128(b, 0, b.limit(), 0L);
    }
}
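
Not part of the commit: a sketch of how MurmurHash.calculate() is consumed by the other classes in this commit. calculate() returns the two 64-bit halves of the 128-bit digest, which BloomFilter and CountMinSketch combine as hash[0] + i * hash[1] to simulate additional hash functions. The class name, input string and modulus are made up for illustration.

package com.albertoventurini.datastructures.probabilistic;

// Illustrative sketch only: derive several bit positions from the two base hash values.
public class MurmurHashUsageSketch {
    public static void main(String[] args) {
        final long[] h = new MurmurHash().calculate("example");
        final int max = 200000; // bit-set capacity, as in BloomFilterTest
        for (int i = 0; i < 4; i++) {
            System.out.println("derived hash " + i + ": " + Math.abs((h[0] + (long) i * h[1]) % max));
        }
    }
}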

@@ -0,0 +1,57 @@
package com.albertoventurini.datastructures.probabilistic;

import org.junit.jupiter.api.Test;

import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ThreadLocalRandom;
import java.util.stream.Collectors;

public class CountMinSketchTest {

    private final Set<String> hostnames;

    public CountMinSketchTest() throws Exception {
        final List<String> hostnames = Files.readAllLines(Path.of("hostnames2.csv"))
                .stream()
                .filter(l -> l.indexOf('"') == 0 && l.lastIndexOf('"') > 0)
                .map(l -> l.substring(1, l.lastIndexOf('"')))
                .collect(Collectors.toList());

        this.hostnames = new HashSet<>();

        for (int i = 0; i < 20; i++) {
            this.hostnames.add(hostnames.get(ThreadLocalRandom.current().nextInt(hostnames.size())));
        }
    }

    @Test
    public void testHash() {
        final CountMinSketch countMinSketch = new CountMinSketch(200, 2, new JavaHash());

        final Map<String, Integer> numbers = new HashMap<>();

        for (final String hostname : hostnames) {
            final int random = ThreadLocalRandom.current().nextInt(1000);
            numbers.put(hostname, random);
            countMinSketch.update(hostname, random);
        }

        int misses = 0;
        int error = 0;

        for (final String hostname : hostnames) {
            if (countMinSketch.get(hostname) != numbers.get(hostname)) {
                misses++;
                error += Math.abs(countMinSketch.get(hostname) - numbers.get(hostname));
            }
        }

        System.out.println("There were " + misses + " misses over " + hostnames.size() + " hostnames. Error = " + error);
    }
}