Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

COLLECTIONS-844 - allow counting Bloom filters with cell size other than Integer.SIZE #406

Merged
merged 24 commits into from
Aug 15, 2023
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -311,7 +311,7 @@ default int estimateIntersection(final BloomFilter other) {
}

/**
* Most Bloom filter's create unique IndexProducers.
* Most Bloom filters create unique IndexProducers.
*/
@Override
default IndexProducer uniqueIndices() {
Expand Down
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This file has lost the git history from BitCountProducer. It is showing as a new file and the old interface as deleted.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems to be an issue with github display local git showed that it was renamed during commit.

Original file line number Diff line number Diff line change
Expand Up @@ -67,11 +67,6 @@ default boolean forEachIndex(final IntPredicate predicate) {
return forEachCell((i, v) -> predicate.test(i));
}

@Override
default IndexProducer uniqueIndices() {
return this;
}

/**
* Creates a CellProducer from an IndexProducer.
*
Expand Down Expand Up @@ -115,14 +110,14 @@ private void populate() {
@Override
public int[] asIndexArray() {
populate();
return counterCells.keySet().stream().mapToInt( c -> c.idx ).toArray();
return counterCells.keySet().stream().mapToInt(c -> c.idx ).toArray();
Claudenw marked this conversation as resolved.
Show resolved Hide resolved
}

@Override
public boolean forEachCell(CellConsumer consumer) {
populate();
for (CounterCell cell : counterCells.values()) {
if (!consumer.test(cell.idx, cell.count) ) {
if (!consumer.test(cell.idx, cell.count)) {
return false;
}
}
Expand All @@ -143,14 +138,14 @@ final class CounterCell implements Comparable<CounterCell> {

@Override
public int compareTo(CounterCell other) {
return Integer.compare( idx, other.idx);
return Integer.compare(idx, other.idx);
Claudenw marked this conversation as resolved.
Show resolved Hide resolved
}
}
};
}

/**
* Represents an operation that accepts an {@code <index, cell>} pair.
* Represents an operation that accepts an {@code <index, count>} pair.
* Returns {@code true} if processing should continue, {@code false} otherwise.
*
* <p>Note: This is a functional interface as a specialization of
Expand All @@ -168,3 +163,4 @@ interface CellConsumer {
boolean test(int index, int count);
}
}

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All these changes are not related to this PR. Please revert.

They can be considered separately. By chaining predicates using and, or and negate you are adding a layer of obfuscation where the method calls may make the filter less efficient. A performance benchmark would useful here.

Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@
* @since 4.5
*/
public final class IndexFilter {

// do not instantiate.
private IndexFilter() {}
private final IntPredicate tracker;
private final int size;
private final IntPredicate consumer;

/**
* Creates an instance optimized for the specified shape.
Expand All @@ -38,20 +38,44 @@ private IndexFilter() {}
* @return an IndexFilter optimized for the specified shape.
*/
public static IntPredicate create(final Shape shape, final IntPredicate consumer) {
int size = shape.getNumberOfBits();
IntPredicate result = number -> {
if (number >= size) {
throw new IndexOutOfBoundsException(String.format("number too large %d >= %d", number, size));
}
return true;
};
return new IndexFilter(shape, consumer)::test;
}

/**
* Creates an instance optimized for the specified shape.
* @param shape The shape that is being generated.
* @param consumer The consumer to accept the values.
*/
private IndexFilter(final Shape shape, final IntPredicate consumer) {
this.size = shape.getNumberOfBits();
this.consumer = consumer;
if (BitMap.numberOfBitMaps(shape.getNumberOfBits()) * Long.BYTES < (long) shape.getNumberOfHashFunctions()
* Integer.BYTES) {
result = result.and(new BitMapTracker(shape).negate());
this.tracker = new BitMapTracker(shape);
} else {
result = result.and(new ArrayTracker(shape).negate());
this.tracker = new ArrayTracker(shape);
}
}

/**
* Test if the number should be processed by the {@code consumer}.
*
* <p>If the number has <em>not</em> been seen before it is passed to the {@code consumer} and the result returned.
* If the number has been seen before the {@code consumer} is not called and {@code true} returned.</p>
*
* <p><em>If the input is not in the range [0,size) an IndexOutOfBoundsException exception is thrown.</em></p>
*
* @param number the number to check.
* @return {@code true} if processing should continue, {@code false} otherwise.
*/
public boolean test(final int number) {
if (number >= size) {
throw new IndexOutOfBoundsException(String.format("number too large %d >= %d", number, size));
}
if (tracker.test(number)) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You reverted the IndexFilter file but left this change. I think the statements are identical. Since the plan is to replace this with a separate change to the IndexFilter then can you revert this too. The IndexFilter should be unchanged by this PR.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change fixes code coverage check. In the original case there are 4 branches, one of which can not be tested. In this case there are only 3 and the code coverage report shows 100% coverage for the Bloomfilter code.

return consumer.test(number);
}
return result.or(consumer);
return true;
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,6 @@
@FunctionalInterface
public interface IndexProducer {

int MAX_ARRAY_SIZE = Integer.MAX_VALUE - 8;

/**
* Each index is passed to the predicate. The predicate is applied to each
* index value, if the predicate returns {@code false} the execution is stopped, {@code false}
Expand Down Expand Up @@ -110,13 +108,11 @@ public boolean test(long word) {
* <p>Indices ordering and uniqueness is not guaranteed.</p>
*
* <p><em>
* The default implementation of this method is slow. It is recommended
* that implementing classes reimplement this method.
* The default implementation of this method creates an array and populates
* it. Implementations that have access to an index array should consider
* returning a copy of that array if possible.
* </em></p>
*
* <p><em>
* The default implementation of this method returns unique values in order.
* </em></p>
* @return An int array of the data.
*/
default int[] asIndexArray() {
Expand All @@ -125,9 +121,7 @@ class Indices {
private int size;

boolean add(final int index) {
if (size == data.length) {
data = Arrays.copyOf(data, (int) Math.min(MAX_ARRAY_SIZE, size * 2L));
}
data = IndexUtils.ensureCapacityForAdd(data, index);
Claudenw marked this conversation as resolved.
Show resolved Hide resolved
data[size++] = index;
return true;
}
Expand All @@ -143,14 +137,17 @@ int[] toArray() {
}

/**
* Creates an IndexProducer of unique indices for this index.
* Creates an IndexProducer comprising the unique indices for this producer.
*
* <p>By default creates a new producer with some overhead to remove
* duplicates. IndexProducers that return unique indices by default
* should override this to return {@code this}.</p>
*
* <p>This is like the `indices(Shape)` method except that it adds the guarantee that no
* duplicate values will be returned. The indices produced are equivalent to those returned
* from by a Bloom filter created from this hasher.</p>
* <p>The default implementation will filter the indices from this instance
* and return them in ascending order.</p>
*
* @return the iterator of integers
* @throws IndexOutOfBoundsException if any index is less than 0
* @return the IndexProducer of unique values.
* @throws IndexOutOfBoundsException if any index is less than zero.
*/
default IndexProducer uniqueIndices() {
final BitSet bitSet = new BitSet();
Expand All @@ -162,7 +159,7 @@ default IndexProducer uniqueIndices() {
return new IndexProducer() {
@Override
public boolean forEachIndex(IntPredicate predicate) {
for (int idx = bitSet.nextSetBit(0); idx >= 0; idx = bitSet.nextSetBit(idx+1)) {
for (int idx = bitSet.nextSetBit(0); idx >= 0; idx = bitSet.nextSetBit(idx + 1)) {
if (!predicate.test(idx)) {
return false;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.collections4.bloomfilter;

import java.util.Arrays;

/**
* Provides functions to assist in IndexProducer creation and manipulation.
* @see IndexProducer
*/
public class IndexUtils {
Claudenw marked this conversation as resolved.
Show resolved Hide resolved

/**
* The maximum array size for the methods in this class.
*/
public static final int MAX_ARRAY_SIZE = Integer.MAX_VALUE - 8;
Claudenw marked this conversation as resolved.
Show resolved Hide resolved

// do not instantiate
private IndexUtils() {}

/**
* Ensure the array can add an element at the specified index.
* @param array the array to check.
* @param index the index to add at.
* @return the array or a newly allocated copy of the array.
*/
static int[] ensureCapacityForAdd(int[] array, int index) {
if (index >= array.length) {
return Arrays.copyOf(array, (int) Math.min(IndexUtils.MAX_ARRAY_SIZE, Math.max( array.length * 2L, index+1)));
Claudenw marked this conversation as resolved.
Show resolved Hide resolved
}
return array;
}
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This file has lost the git history from AbstractBitCountProducerTest. It is showing as a new file and the old test as deleted.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Every action I tried left github with delete and new. Local git showed renamed during commit.

Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ public abstract class AbstractCellProducerTest extends AbstractIndexProducerTest
private static final CellConsumer FALSE_CONSUMER = (i, j) -> false;

/**
* Creates an array of expected values that alignes with the expected indices entries.
* Creates an array of expected values that aligns with the expected indices entries.
* @return an array of expected values.
* @see AbstractIndexProducerTest#getExpectedIndices()
*/
Expand Down Expand Up @@ -108,11 +108,11 @@ public final void testIndexConsistency() {
public void testForEachCellValues() {
int[] expectedIdx = getExpectedIndices();
int[] expectedValue = getExpectedValues();
assertEquals( expectedIdx.length, expectedValue.length, "expected index length and value length do not match");
assertEquals(expectedIdx.length, expectedValue.length, "expected index length and value length do not match");
int[] idx = {0};
createProducer().forEachCell((i, j) -> {
assertEquals(expectedIdx[idx[0]], i, "bad index at "+idx[0]);
assertEquals(expectedValue[idx[0]], j, "bad value at "+idx[0]);
assertEquals(expectedIdx[idx[0]], i, "bad index at " + idx[0]);
assertEquals(expectedValue[idx[0]], j, "bad value at " + idx[0]);
idx[0]++;
return true;
});
Expand Down Expand Up @@ -152,3 +152,4 @@ public void testForEachCellEarlyExit() {
assertEquals(1, passes[0]);
}
}

Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,7 @@ public void testExcludesDuplicates() {
assertTrue(bf1.forEachCell((x, y) -> false), "Hasher in removes results in value not equal to 0");
}

private void verifyMaxInsert( CountingBloomFilter bf, int from1, int from11) {
private void verifyMaxInsert(CountingBloomFilter bf, int from1, int from11) {
BloomFilter bfFrom0 = new DefaultBloomFilterTest.SparseDefaultBloomFilter(getTestShape());
bfFrom0.merge(new IncrementingHasher(0, 1));
BloomFilter bfFrom1 = new DefaultBloomFilterTest.SparseDefaultBloomFilter(getTestShape());
Expand Down Expand Up @@ -346,9 +346,9 @@ public void testGetMaxInsert() {
private void assertCell3(CountingBloomFilter bf, int value) {
bf.forEachCell((k, v) -> {
if (k == 3) {
assertEquals(value, v, "Mismatch at position (3) "+k);
assertEquals(value, v, "Mismatch at position 3");
} else {
assertEquals(0, v, "Mismatch at position "+k);
assertEquals(0, v, "Mismatch at position " + k);
}
return true;
});
Expand All @@ -372,10 +372,6 @@ public void mergeIncrementsAllCellsTest() {
// The add should increment cells 3 by 2
f2.add(CellProducer.from(ip));
assertCell3(f2, 2);

// This merge will increment by 1 as the round-trip makes the indices unique
f3.merge(IndexProducer.fromIndexArray(ip.asIndexArray()));
assertCell3(f3, 1);
}

@Test
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,6 @@ protected int[] getExpectedIndices() {

@Override
protected int[] getExpectedValues() {
return new int[] { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1};
return new int[] {1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1};
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

public class CellProducerFromDefaultIndexProducerTest extends AbstractCellProducerTest {

int[] data = { 0, 63, 1, 64, 128, 1, 127 };
int[] data = {0, 63, 1, 64, 128, 1, 127};
int[] indices = {0, 1, 63, 64, 127, 128};
int[] values = {1, 2, 1, 1, 1, 1 };

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ protected int[] getExpectedValues() {
@Override
protected CellProducer createProducer() {
return consumer -> {
for (int i=0; i<indices.length; i++) {
for (int i = 0; i < indices.length; i++) {
if (!consumer.test(indices[i], values[i] )) {
return false;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,11 @@
import java.util.BitSet;
import java.util.Objects;
import java.util.concurrent.ThreadLocalRandom;
import java.util.stream.IntStream;

import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ValueSource;

public class DefaultIndexProducerTest extends AbstractIndexProducerTest {

Expand Down Expand Up @@ -119,31 +122,10 @@ public void testFromIndexArray() {
}
}

@Test
public void testMoreThan32Entries() {
int[] values = new int[33];
for (int i=0; i<33; i++) {
values[i] = i;
}
IndexProducer producer = predicate -> {
Objects.requireNonNull(predicate);
for (final int i : values) {
if (!predicate.test(i)) {
return false;
}
}
return true;
};
int[] other = producer.asIndexArray();
assertArrayEquals(values, other);
}

@Test
public void test32Entries() {
int[] values = new int[32];
for (int i=0; i<32; i++) {
values[i] = i;
}
@ParameterizedTest
@ValueSource(ints = {32, 33})
public void testEntries(int size) {
int[] values = IntStream.range(0, size).toArray();
IndexProducer producer = predicate -> {
Objects.requireNonNull(predicate);
for (final int i : values) {
Expand Down