First commit

Alberto Venturini 2021-09-08 13:02:57 +02:00
commit 0fbc439593
17 changed files with 24331 additions and 0 deletions

20
.gitignore vendored Normal file

@@ -0,0 +1,20 @@
**/bin/
**/target/
# Eclipse files
**/.classpath
**/.factorypath
**/.project
**/.settings/
# IntelliJ files
**/.idea/
**/*.iml
# Misc
.mtj.tmp/
**/hs_err_pid*
config.properties
#

1252
corpus1.txt Normal file

File diff suppressed because one or more lines are too long

22316
corpus2.txt Normal file

File diff suppressed because it is too large

81
pom.xml Normal file

@@ -0,0 +1,81 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.albertoventurini</groupId>
<artifactId>probabilistic-data-structures</artifactId>
<version>1.0-SNAPSHOT</version>
<name>probabilistic-data-structures</name>
<dependencies>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-api</artifactId>
<version>5.7.2</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-engine</artifactId>
<version>5.7.2</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
<version>1.15</version>
</dependency>
</dependencies>
<build>
<pluginManagement><!-- lock down plugin versions to avoid using Maven defaults (may be moved to parent pom) -->
<plugins>
<!-- clean lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#clean_Lifecycle -->
<plugin>
<artifactId>maven-clean-plugin</artifactId>
<version>3.1.0</version>
</plugin>
<!-- default lifecycle, jar packaging: see https://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_jar_packaging -->
<plugin>
<artifactId>maven-resources-plugin</artifactId>
<version>3.0.2</version>
</plugin>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.8.0</version>
<configuration>
<release>11</release>
</configuration>
</plugin>
<plugin>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.22.2</version>
</plugin>
<plugin>
<artifactId>maven-jar-plugin</artifactId>
<version>3.0.2</version>
</plugin>
<plugin>
<artifactId>maven-install-plugin</artifactId>
<version>2.5.2</version>
</plugin>
<plugin>
<artifactId>maven-deploy-plugin</artifactId>
<version>2.8.2</version>
</plugin>
<!-- site lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#site_Lifecycle -->
<plugin>
<artifactId>maven-site-plugin</artifactId>
<version>3.7.1</version>
</plugin>
<plugin>
<artifactId>maven-project-info-reports-plugin</artifactId>
<version>3.0.0</version>
</plugin>
</plugins>
</pluginManagement>
</build>
</project>

63
BloomFilter.java Normal file

@@ -0,0 +1,63 @@
package com.albertoventurini.datastructures.probabilistic;
/**
* This class implements a thread-safe Bloom filter that allows adding strings
* and checking the presence of strings.
*/
final class BloomFilter {
private final int hashCount;
private final ConcurrentAddOnlyBitSet bitSet;
private final Hash hash;
BloomFilter(final int hashCount, final int size, final Hash hash) {
this.hashCount = hashCount;
this.bitSet = new ConcurrentAddOnlyBitSet(size);
this.hash = hash;
}
// As explained here: https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf
// it is possible to simulate n hash functions using 2 hash functions.
private long[] calculateHashes(final String string) {
final long[] hash = this.hash.calculate(string);
final int max = bitSet.getCapacity();
if (hashCount <= 2) {
return new long[]{Math.abs(hash[0] % max), Math.abs(hash[1] % max)};
}
final long[] result = new long[hashCount];
for (int i = 0; i < hashCount; ++i) {
result[i] = Math.abs((hash[0] + (long) i * hash[1]) % max);
}
return result;
}
/**
* Add a string to this Bloom filter
*
* @param key the string to add
* @return true if the string was not already present in the Bloom filter
*/
boolean add(final String key) {
boolean notPresent = false;
for (long hash : calculateHashes(key)) {
notPresent |= bitSet.add(hash);
}
return notPresent;
}
/**
* Test whether this Bloom filter contains a string
*
* @param key the string to test
* @return true if the Bloom filter contains the string
*/
boolean contains(final String key) {
boolean present = true;
for (long hash : calculateHashes(key)) {
present &= bitSet.get(hash);
}
return present;
}
}
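
The constructor takes the number of hash functions and the bit-set size as raw numbers. A common way to derive them from an expected item count n and a target false-positive rate p is m = -n·ln(p)/(ln 2)² bits and k = (m/n)·ln 2 hash functions. The helper below is a minimal sketch of that calculation; the class and method names are illustrative and not part of this commit.

package com.albertoventurini.datastructures.probabilistic;

// Illustrative helper (not part of this commit): derives Bloom filter parameters
// from an expected number of items and a target false-positive probability.
final class BloomFilterSizing {

    // m = -n * ln(p) / (ln 2)^2  -> number of bits
    static int bits(final int expectedItems, final double falsePositiveRate) {
        return (int) Math.ceil(-expectedItems * Math.log(falsePositiveRate) / (Math.log(2) * Math.log(2)));
    }

    // k = (m / n) * ln 2  -> number of hash functions
    static int hashFunctions(final int bits, final int expectedItems) {
        return Math.max(1, (int) Math.round((double) bits / expectedItems * Math.log(2)));
    }

    public static void main(String[] args) {
        final int n = 50_000;   // expected distinct items
        final double p = 0.01;  // target false-positive rate
        final int m = bits(n, p);
        final int k = hashFunctions(m, n);
        // e.g. new BloomFilter(k, m, new MurmurHash())
        System.out.println("bits = " + m + ", hash functions = " + k);
    }
}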

44
BloomFilterTest.java Normal file

@@ -0,0 +1,44 @@
package com.albertoventurini.datastructures.probabilistic;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.ThreadLocalRandom;
import java.util.stream.Collectors;
public class BloomFilterTest {
public static void main(String[] args) throws Exception {
final List<String> words =
Arrays.stream(Files.readString(Path.of("corpus2.txt")).split(" ")).collect(Collectors.toList());
final Set<String> wordSet = new HashSet<>(words);
final BloomFilter bloomFilter = new BloomFilter(2, 200000, new MurmurHash());
words.forEach(bloomFilter::add);
final List<Integer> errors = new ArrayList<>();
// Query words
words.stream().limit(100).forEach(w -> {
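// Build a variation that is almost certainly not in the corpus: nextInt('a', 'z') returns an
// int in [97, 122), which is appended to the word as decimal digits (e.g. "word105").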
final String variation = w + ThreadLocalRandom.current().nextInt('a', 'z');
System.out.println("HashSet contains " + w + ": " + wordSet.contains(w));
System.out.println("HashSet contains " + variation + ": " + wordSet.contains(variation));
System.out.println("BloomFilter contains " + w + ": " + bloomFilter.contains(w));
System.out.println("BloomFilter contains " + variation + ": " + bloomFilter.contains(variation));
errors.add(wordSet.contains(w) != bloomFilter.contains(w) ? 1 : 0);
errors.add(wordSet.contains(variation) != bloomFilter.contains(variation) ? 1 : 0);
});
final double avgError = errors.stream().mapToDouble(i -> (double) i).average().getAsDouble();
System.out.println("Average error: " + avgError);
}
}

66
ConcurrentAddOnlyBitSet.java Normal file

@@ -0,0 +1,66 @@
package com.albertoventurini.datastructures.probabilistic;
import java.util.concurrent.atomic.AtomicLongArray;
/**
* This class implements a thread-safe set that allows adding and checking the presence of a long value.
* The set is backed by a bit sequence.
*/
final class ConcurrentAddOnlyBitSet {
private static final int BASE = 64;
private final int capacity;
private final AtomicLongArray buckets;
ConcurrentAddOnlyBitSet(final int capacity) {
this.capacity = capacity;
final int bucketsCount = (capacity / BASE) + 1;
buckets = new AtomicLongArray(bucketsCount);
for (int i = 0; i < buckets.length(); i++) {
buckets.set(i, 0);
}
}
public int getCapacity() {
return capacity;
}
/**
* Adds the given value to the set.
*
* @param value the value to add to the set
* @return true if the set did not already contain the specified element
*/
boolean add(final long value) {
final int bucketIdx = (int) value / BASE;
final int bitIdx = (int) value - (BASE * bucketIdx);
return atomicSet(bucketIdx, bitIdx);
}
/**
* Tests whether the given value is contained in the set.
*
* @param value the value to test
* @return true if the set contains the specified element
*/
boolean get(final long value) {
final int bucketIdx = (int) value / BASE;
final int bitIdx = (int) value - (BASE * bucketIdx);
return atomicGet(bucketIdx, bitIdx);
}
private boolean atomicSet(final int bucketIdx, final int bitIdx) {
final long mask = mask(bitIdx);
return (buckets.getAndUpdate(bucketIdx, l -> l | mask) & mask) == 0;
}
private boolean atomicGet(final int bucketIdx, final int bitIdx) {
final long mask = mask(bitIdx);
return (buckets.get(bucketIdx) & mask) == mask;
}
private static long mask(final int id) {
return 1L << id;
}
}
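
Because atomicSet uses getAndUpdate, the OR is applied atomically and the previous word is returned, so when several threads race to set the same bit, exactly one of them observes the 0-to-1 transition. The demo below is a minimal sketch of that contract; the class name is illustrative and not part of this commit.

package com.albertoventurini.datastructures.probabilistic;

// Illustrative usage (not part of this commit): two threads add the same value;
// exactly one of them gets `true` back, and get() subsequently returns true.
final class ConcurrentAddOnlyBitSetDemo {
    public static void main(String[] args) throws InterruptedException {
        final ConcurrentAddOnlyBitSet set = new ConcurrentAddOnlyBitSet(1024);
        final Thread t1 = new Thread(() -> System.out.println("t1 newly added: " + set.add(42)));
        final Thread t2 = new Thread(() -> System.out.println("t2 newly added: " + set.add(42)));
        t1.start(); t2.start();
        t1.join(); t2.join();
        System.out.println("contains 42: " + set.get(42)); // true
    }
}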

92
CountMinSketch.java Normal file

@@ -0,0 +1,92 @@
package com.albertoventurini.datastructures.probabilistic;
import java.util.Arrays;
import java.util.concurrent.atomic.AtomicLongArray;
import java.util.concurrent.atomic.LongAdder;
final class CountMinSketch {
private final int width;
private final int depth;
private final Hash hash;
private final AtomicLongArray sketch;
private final LongAdder count = new LongAdder();
CountMinSketch(final int width, final int depth, final Hash hash) {
this.width = width;
this.depth = depth;
this.hash = hash;
sketch = new AtomicLongArray(width * depth);
}
// As explained here: https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf
// it is possible to simulate n hash functions using 2 hash functions.
private long[] calculateHashes(final String string) {
final long[] hash = this.hash.calculate(string);
if (depth <= 2) {
return new long[]{Math.abs(hash[0] % width), Math.abs(hash[1] % width)};
}
final long[] result = new long[depth];
for (int i = 0; i < depth; ++i) {
result[i] = Math.abs((hash[0] + (long) i * hash[1]) % width);
}
return result;
}
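// Overwrites (rather than adds to) the sketch cells along this string's hash path,
// and returns the minimum of the cells' previous values.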
long update(final String string, long value) {
long min = Long.MAX_VALUE;
long[] hashes = calculateHashes(string);
for (int i = 0; i < hashes.length; i++) {
long cell = sketch.getAndSet((int) (i * width + hashes[i]), value);
if (cell < min) {
min = cell;
}
}
return min;
}
long increment(final String string) {
count.increment();
long min = Long.MAX_VALUE;
long[] hashes = calculateHashes(string);
for (int i = 0; i < hashes.length; i++) {
long cell = sketch.getAndIncrement((int) (i * width + hashes[i]));
if (cell < min) {
min = cell;
}
}
return min;
}
long get(final String string) {
long min = Long.MAX_VALUE;
long[] hashes = calculateHashes(string);
for (int i = 0; i < hashes.length; i++) {
long cell = sketch.get((int) (i * width + hashes[i]));
if (cell < min) {
min = cell;
}
}
return min;
}
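// Count-Mean-Min style estimate: subtract the estimated collision noise,
// (count - cell) / (width - 1), from each row's cell and return the median of the corrected values.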
long get2(final String string) {
long[] e = new long[depth];
long[] hashes = calculateHashes(string);
for (int i = 0; i < hashes.length; i++) {
long cell = sketch.get((int) (i * width + hashes[i]));
long noiseEstimation = (count.longValue() - cell) / (width - 1);
e[i] = cell - noiseEstimation;
}
Arrays.sort(e);
return e[e.length / 2];
}
}
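
Width and depth control the estimation error: with width w = ⌈e/ε⌉ and depth d = ⌈ln(1/δ)⌉, the standard Count-Min guarantee is that an estimate exceeds the true count by more than ε·N with probability at most δ, where N is the total number of increments. The helper below is a minimal sizing sketch under that assumption; its name is illustrative and not part of this commit.

package com.albertoventurini.datastructures.probabilistic;

// Illustrative helper (not part of this commit): derives Count-Min Sketch dimensions
// from a relative error epsilon (as a fraction of the total count) and a failure probability delta.
final class CountMinSketchSizing {

    static int width(final double epsilon) {
        return (int) Math.ceil(Math.E / epsilon);      // w = ceil(e / epsilon)
    }

    static int depth(final double delta) {
        return (int) Math.ceil(Math.log(1.0 / delta)); // d = ceil(ln(1 / delta))
    }

    public static void main(String[] args) {
        final int w = width(0.0005); // overestimate by at most 0.05% of all increments...
        final int d = depth(0.01);   // ...with probability at least 99%
        // e.g. new CountMinSketch(w, d, new MurmurHash())
        System.out.println("width = " + w + ", depth = " + d);
    }
}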

53
CountMinSketchTest.java Normal file

@@ -0,0 +1,53 @@
package com.albertoventurini.datastructures.probabilistic;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ThreadLocalRandom;
import java.util.function.Function;
import java.util.stream.Collectors;
import static java.util.stream.Collectors.summingInt;
public class CountMinSketchTest {
public static void main(String[] args) throws Exception {
final List<String> words =
Arrays.stream(Files.readString(Path.of("corpus2.txt")).split(" ")).collect(Collectors.toList());
// Calculate word frequencies with a map
final Map<String, Integer> wordFrequencies = words
.stream()
.collect(Collectors.groupingBy(Function.identity(), summingInt(e -> 1)));
// Calculate word frequencies with a count min sketch
final CountMinSketch countMinSketch = new CountMinSketch(6000, 4, new MurmurHash());
words.forEach(countMinSketch::increment);
final List<Integer> errors = new ArrayList<>();
// Query words
words.stream().limit(100).forEach(w -> {
final int freq1 = wordFrequencies.getOrDefault(w, 0);
final int freq2 = (int) countMinSketch.get(w);
System.out.println("HashMap " + w + " -> " + freq1);
System.out.println("CountMinSketch " + w + " -> " + freq2);
errors.add(Math.abs(freq1 - freq2));
});
Collections.sort(errors);
final int medianError = errors.get(errors.size() / 2);
final double avgError = errors.stream().mapToDouble(i -> (double) i).average().getAsDouble();
System.out.println("\nMedian error: " + medianError);
System.out.println("Average error: " + avgError);
}
}

7
Hash.java Normal file

@@ -0,0 +1,7 @@
package com.albertoventurini.datastructures.probabilistic;
public interface Hash {
long[] calculate(String string);
}

11
Int32MurmurHash.java Normal file

@@ -0,0 +1,11 @@
package com.albertoventurini.datastructures.probabilistic;
import org.apache.commons.codec.digest.MurmurHash3;
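/**
 * Adapts the 32-bit MurmurHash3 from commons-codec to the two-slot Hash interface.
 * Only the first slot carries a hash; the second is always 0, so this suits consumers
 * that use a single hash value (e.g. LogLog) rather than the double-hashing scheme
 * in BloomFilter and CountMinSketch.
 */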
public class Int32MurmurHash implements Hash {
@Override
public long[] calculate(final String string) {
return new long[]{MurmurHash3.hash32(string), 0};
}
}

12
JavaHash.java Normal file

@@ -0,0 +1,12 @@
package com.albertoventurini.datastructures.probabilistic;
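/**
 * Derives two hash values from String.hashCode() by splitting the 32-bit result into
 * its low and high 16-bit halves, which limits each hash to the range [0, 65536).
 */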
public class JavaHash implements Hash {
@Override
public long[] calculate(final String string) {
final int hash = string.hashCode();
final int hash1 = hash & 0x0000FFFF;
final int hash2 = hash >>> 16;
return new long[]{hash1, hash2};
}
}

67
LogLog.java Normal file

@@ -0,0 +1,67 @@
package com.albertoventurini.datastructures.probabilistic;
// See https://storage.googleapis.com/pub-tools-public-publication-data/pdf/40671.pdf
public class LogLog {
private final int m;
private final Hash hash;
private final long[] M;
private final double alpha;
// Number of bits to reserve for bucket index
private final int k;
public LogLog(final int k, final Hash hash) {
this.k = k;
this.m = 1 << k;
this.hash = hash;
M = new long[m];
alpha = m == 16 ? 0.673 : m == 32 ? 0.697 : m == 64 ? 0.709 : 0.7213 / (1 + 1.079 / m);
}
public void add(final String string) {
final int hash = (int) (this.hash.calculate(string)[0]);
final int bucketIdx = calculateBucketIdx(hash);
final int leadingZeros = calculateLeadingZeros(hash);
M[bucketIdx] = Math.max(M[bucketIdx], leadingZeros);
}
public long estimateCount() {
double c = 0.0;
for (final long l : M) {
c += 1.0 / (1L << l); // accumulate 2^(-M[j]) for the harmonic mean
}
double estimate = alpha * m * m / c;
if (estimate <= 5.0/2.0 * m) {
double V = 0;
for (final long l : M) {
if (l == 0) V++;
}
if (V > 0) {
estimate = m * Math.log(m / V);
} else if (estimate > 1.0/30*(0x100000000L)) {
estimate = - 0x100000000L * Math.log(1 - estimate/0x100000000L);
}
}
return (long) estimate;
}
private int calculateBucketIdx(final int l) {
// Extract k most significant bits
return (int) l >>> (32 - k);
}
// leading zeros plus one, as per the paper
private int calculateLeadingZeros(final int l) {
int i = 1;
int mask = 0x80000000 >>> k;
while (mask > 0 && (l & mask) == 0) {
mask = mask >>> 1;
i++;
}
return i;
}
}
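
Despite the class name, the estimator above is the HyperLogLog one: a bias-corrected harmonic mean of 2^(-M[j]) over m = 2^k buckets, with small- and large-range corrections, whose typical relative error is about 1.04/√m. The helper below is a minimal sketch of picking k from a target error under that assumption; its name is illustrative and not part of this commit.

package com.albertoventurini.datastructures.probabilistic;

// Illustrative helper (not part of this commit): picks the bucket-index bit count k
// so that the expected relative error (~1.04 / sqrt(2^k)) stays below a target.
final class LogLogSizing {

    static int bucketBits(final double targetRelativeError) {
        final double m = Math.pow(1.04 / targetRelativeError, 2); // m >= (1.04 / err)^2
        return (int) Math.ceil(Math.log(m) / Math.log(2));        // k = ceil(log2(m))
    }

    public static void main(String[] args) {
        final int k = bucketBits(0.05); // ~5% relative error -> k = 9, i.e. 512 buckets
        // e.g. new LogLog(k, new Int32MurmurHash())
        System.out.println("k = " + k + " (" + (1 << k) + " buckets)");
    }
}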

25
LogLogTest.java Normal file

@@ -0,0 +1,25 @@
package com.albertoventurini.datastructures.probabilistic;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
public class LogLogTest {
public static void main(String[] args) throws Exception {
final List<String> words =
Arrays.stream(Files.readString(Path.of("corpus2.txt")).split(" ")).collect(Collectors.toList());
final Set<String> wordSet = new HashSet<>(words);
final LogLog logLog = new LogLog(5, new Int32MurmurHash());
words.forEach(logLog::add);
System.out.println(wordSet.size());
System.out.println(logLog.estimateCount());
}
}

8
Membership.java Normal file

@@ -0,0 +1,8 @@
package com.albertoventurini.datastructures.probabilistic;
interface Membership {
void add(String item);
boolean contains(String item);
}

157
MurmurHash.java Normal file

@@ -0,0 +1,157 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
This file was taken from Apache Cassandra source code:
https://github.com/apache/cassandra/blob/trunk/src/java/org/apache/cassandra/utils/MurmurHash.java
*/
package com.albertoventurini.datastructures.probabilistic;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
/**
* This is a very fast, non-cryptographic hash suitable for general hash-based
* lookup. See http://murmurhash.googlepages.com/ for more details.
*
* hash32() and hash64() are MurmurHash 2.0.
*
* hash3_x64_128() is *almost* MurmurHash 3.0. It was supposed to match, but we didn't catch a sign bug with
* the result that it doesn't. Unfortunately, we can't change it now without breaking Murmur3Partitioner. *
*
* <p>
* The C version of MurmurHash 2.0 found at that site was ported to Java by
* Andrzej Bialecki (ab at getopt org).
* </p>
*/
public final class MurmurHash implements Hash
{
private final Charset charset;
public MurmurHash() {
charset = Charset.defaultCharset();
}
private static long getBlock(ByteBuffer key, int offset, int index)
{
int i_8 = index << 3;
int blockOffset = offset + i_8;
return ((long) key.get(blockOffset + 0) & 0xff) + (((long) key.get(blockOffset + 1) & 0xff) << 8) +
(((long) key.get(blockOffset + 2) & 0xff) << 16) + (((long) key.get(blockOffset + 3) & 0xff) << 24) +
(((long) key.get(blockOffset + 4) & 0xff) << 32) + (((long) key.get(blockOffset + 5) & 0xff) << 40) +
(((long) key.get(blockOffset + 6) & 0xff) << 48) + (((long) key.get(blockOffset + 7) & 0xff) << 56);
}
private static long rotl64(long v, int n)
{
return ((v << n) | (v >>> (64 - n)));
}
private static long fmix(long k)
{
k ^= k >>> 33;
k *= 0xff51afd7ed558ccdL;
k ^= k >>> 33;
k *= 0xc4ceb9fe1a85ec53L;
k ^= k >>> 33;
return k;
}
static long[] hash3_x64_128(ByteBuffer key, int offset, int length, long seed)
{
final int nblocks = length >> 4; // Process as 128-bit blocks.
long h1 = seed;
long h2 = seed;
long c1 = 0x87c37b91114253d5L;
long c2 = 0x4cf5ad432745937fL;
//----------
// body
for(int i = 0; i < nblocks; i++)
{
long k1 = getBlock(key, offset, i * 2 + 0);
long k2 = getBlock(key, offset, i * 2 + 1);
k1 *= c1; k1 = rotl64(k1,31); k1 *= c2; h1 ^= k1;
h1 = rotl64(h1,27); h1 += h2; h1 = h1*5+0x52dce729;
k2 *= c2; k2 = rotl64(k2,33); k2 *= c1; h2 ^= k2;
h2 = rotl64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5;
}
//----------
// tail
// Advance offset to the unprocessed tail of the data.
offset += nblocks * 16;
long k1 = 0;
long k2 = 0;
switch(length & 15)
{
case 15: k2 ^= ((long) key.get(offset+14)) << 48;
case 14: k2 ^= ((long) key.get(offset+13)) << 40;
case 13: k2 ^= ((long) key.get(offset+12)) << 32;
case 12: k2 ^= ((long) key.get(offset+11)) << 24;
case 11: k2 ^= ((long) key.get(offset+10)) << 16;
case 10: k2 ^= ((long) key.get(offset+9)) << 8;
case 9: k2 ^= ((long) key.get(offset+8)) << 0;
k2 *= c2; k2 = rotl64(k2,33); k2 *= c1; h2 ^= k2;
case 8: k1 ^= ((long) key.get(offset+7)) << 56;
case 7: k1 ^= ((long) key.get(offset+6)) << 48;
case 6: k1 ^= ((long) key.get(offset+5)) << 40;
case 5: k1 ^= ((long) key.get(offset+4)) << 32;
case 4: k1 ^= ((long) key.get(offset+3)) << 24;
case 3: k1 ^= ((long) key.get(offset+2)) << 16;
case 2: k1 ^= ((long) key.get(offset+1)) << 8;
case 1: k1 ^= ((long) key.get(offset));
k1 *= c1; k1 = rotl64(k1,31); k1 *= c2; h1 ^= k1;
};
//----------
// finalization
h1 ^= length; h2 ^= length;
h1 += h2;
h2 += h1;
h1 = fmix(h1);
h2 = fmix(h2);
h1 += h2;
h2 += h1;
return(new long[] {h1, h2});
}
@Override
public long[] calculate(final String string) {
final ByteBuffer b = charset.encode(string);
return MurmurHash.hash3_x64_128(b, 0, b.limit(), 0L);
}
}

57
CountMinSketchTest.java Normal file

@@ -0,0 +1,57 @@
package com.albertoventurini.datastructures.probabilistic;
import org.junit.jupiter.api.Test;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ThreadLocalRandom;
import java.util.stream.Collectors;
public class CountMinSketchTest {
private final Set<String> hostnames;
public CountMinSketchTest() throws Exception {
final List<String> hostnames = Files.readAllLines(Path.of("hostnames2.csv"))
.stream()
.filter(l -> l.indexOf('"') == 0 && l.lastIndexOf('"') > 0)
.map(l -> l.substring(1, l.lastIndexOf('"')))
.collect(Collectors.toList());
this.hostnames = new HashSet<>();
for (int i = 0; i < 20; i++) {
this.hostnames.add(hostnames.get(ThreadLocalRandom.current().nextInt(hostnames.size())));
}
}
@Test
public void testHash() {
final CountMinSketch countMinSketch = new CountMinSketch(200, 2, new JavaHash());
final Map<String, Integer> numbers = new HashMap<>();
for (final String hostname : hostnames) {
final int random = ThreadLocalRandom.current().nextInt(1000);
numbers.put(hostname, random);
countMinSketch.update(hostname, random);
}
int misses = 0;
int error = 0;
for (final String hostname : hostnames) {
if (countMinSketch.get(hostname) != numbers.get(hostname)) {
misses++;
error += Math.abs(countMinSketch.get(hostname) - numbers.get(hostname));
}
}
System.out.println("There were " + misses + " misses over " + hostnames.size() + " hostnames. Error = " + error);
}
}