Skip to content

Commit

Permalink
[COLLECTIONS-843] Implement Layered Bloom filter (#402)
Browse files Browse the repository at this point in the history
* Adjusted tests to handle bloom filter implementations that utilized
automatic decay.

* cleaned up spacing

* fixed indent

* updated for layered testing

* removed spaces

* fixed merge issue

* initial checkin

* cleaned up tests

* fixed timing on test

* fixed formatting

* added javadoc

* fixed typos

* removed blank lines

* fixed javadocs

* Fix Javadoc

* Add Javadoc @since 4.5

* Add Javadoc @since 4.5

* updated tests and added BloomFilterProducer code

* Cleaned up javadoc and BiPredicate<BloomFilter,BloomFilter> processing

* fixed javadoc issues

* fixed typography issue

* Fixed a documentation error

* code format cleanup

* code simplification and documentation

* added isEmpty and associated tests

* Changes as requested by review

* cleaned up formatting errors

* fixed javadoc issues

* added LayeredBloomFilter to overview.

* added coco driven test cases.

* attempt to fix formatting

* cleaned up javadoc differences

* cleaned up javadoc

* Made flatten() part of BloomFilterProducer

* fixed since tag.

* changed X() methods to setX()

* updated javadoc

* fixed javadoc errors

* merged changes from master

* renamed to Test to CellProducerFromLayeredBloomFilterTest

* changed to jupiter from junit.

* added override for uniqueIndices as optimization.

* fixed checkstyle issue

* modified as per review

* Updated tests as per review

* fixed variable initialization issues

* made suggested test changes

* fixed broken test

* Remove dead comments per code reviews

---------

Co-authored-by: Gary Gregory <[email protected]>
  • Loading branch information
Claudenw and garydgregory committed Dec 22, 2023
1 parent 3b8dce4 commit 0438ede
Show file tree
Hide file tree
Showing 27 changed files with 2,379 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,21 @@ default boolean isFull() {
*/
int cardinality();

/**
* Determines if all the bits are off. This is equivalent to
* {@code cardinality() == 0}.
*
* <p>
* <em>Note: This method is optimised for non-sparse filters.</em> Implementers
* are encouraged to implement faster checks if possible.
* </p>
*
* @return {@code true} if no bits are enabled, {@code false} otherwise.
*/
default boolean isEmpty() {
    // The filter is empty when every bit map word offered to the predicate is zero.
    return forEachBitMap(word -> word == 0);
}

/**
* Estimates the number of items in the Bloom filter.
*
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.collections4.bloomfilter;

import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.function.BiPredicate;
import java.util.function.Predicate;

/**
* Produces Bloom filters from a collection (e.g. LayeredBloomFilter).
*
* @since 4.5
*/
public interface BloomFilterProducer {

    /**
     * Executes a Bloom filter Predicate on each Bloom filter in the collection. The
     * ordering of the Bloom filters is not specified by this interface.
     *
     * @param bloomFilterPredicate the predicate to evaluate each Bloom filter with.
     * @return {@code false} when the first filter fails the predicate test. Returns
     * {@code true} if all filters pass the test.
     */
    boolean forEachBloomFilter(Predicate<BloomFilter> bloomFilterPredicate);

    /**
     * Returns an array of the Bloom filters in the collection.
     * <p><em>Implementations should specify if the array contains deep copies, immutable instances,
     * or references to the filters in the collection.</em></p>
     * <p>The default method calls {@code copy()} on every enclosed filter, so the returned
     * array holds copies rather than the filters themselves.</p>
     *
     * @return An array of Bloom filters.
     */
    default BloomFilter[] asBloomFilterArray() {
        final List<BloomFilter> filters = new ArrayList<>();
        // List.add always returns true, so every filter is visited and copied.
        forEachBloomFilter(f -> filters.add(f.copy()));
        return filters.toArray(new BloomFilter[0]);
    }

    /**
     * Applies the {@code func} to each Bloom filter pair in order. Will apply all
     * of the Bloom filters from the other BloomFilterProducer to this producer. If
     * either {@code this} producer or {@code other} producer has fewer BloomFilters
     * this method will provide {@code null} for all excess calls to the {@code func}.
     *
     * <p><em>This implementation takes the x values from {@link #asBloomFilterArray()}, so
     * whether {@code func} sees copies or references of the filters in this producer is
     * decided by that method (its default implementation supplies copies). The y values are
     * whatever {@code other.forEachBloomFilter} passes to the predicate. Other implementations
     * should specify if they use deep copies, immutable instances,
     * or references to the filters in the collection.</em></p>
     *
     * @param other The other BloomFilterProducer that provides the y values in the
     *              (x,y) pair.
     * @param func  The function to apply.
     * @return {@code true} if the {@code func} returned {@code true} for every pair,
     *         {@code false} otherwise.
     */
    default boolean forEachBloomFilterPair(final BloomFilterProducer other,
            final BiPredicate<BloomFilter, BloomFilter> func) {
        final CountingPredicate<BloomFilter> p = new CountingPredicate<>(asBloomFilterArray(), func);
        // First pair up everything "other" supplies, then drain any filters of this
        // producer that were left without a partner (paired with null).
        return other.forEachBloomFilter(p) && p.forEachRemaining();
    }

    /**
     * Creates a standard (non-layered) Bloom filter by merging all of the layers.
     *
     * <p>The result is a new {@code SimpleBloomFilter} built with the shape of the first
     * filter supplied by {@link #forEachBloomFilter(Predicate)}; every supplied filter is
     * merged into it. Processing stops at the first merge that returns {@code false}.</p>
     *
     * <p><em>If this producer supplies no Bloom filters there is no shape to build a result
     * from and this default implementation returns {@code null}.</em> Implementations that
     * know their shape are encouraged to override this method and return an empty Bloom
     * filter instead.</p>
     *
     * @return the merged Bloom filter, or {@code null} if this producer supplied no filters.
     */
    default BloomFilter flatten() {
        // Single-element array used as a mutable holder that the lambda can assign to.
        final BloomFilter[] bf = {null};
        forEachBloomFilter(x -> {
            if (bf[0] == null) {
                // Lazily create the target once the first filter reveals the shape.
                bf[0] = new SimpleBloomFilter(x.getShape());
            }
            return bf[0].merge(x);
        });
        return bf[0];
    }

    /**
     * Creates a BloomFilterProducer from an array of Bloom filters.
     *
     * <ul>
     * <li>The asBloomFilterArray() method returns a copy of the original array
     * with references to the original filters.</li>
     * <li>The forEachBloomFilterPair() method uses references to the original filters.</li>
     * </ul>
     * <p><em>All modifications to the Bloom filters are reflected in the original filters</em></p>
     *
     * @param filters The filters to be returned by the producer.
     * @return The BloomFilterProducer containing the filters.
     * @throws NullPointerException if {@code filters} is {@code null}.
     */
    static BloomFilterProducer fromBloomFilterArray(final BloomFilter... filters) {
        Objects.requireNonNull(filters, "filters");
        return new BloomFilterProducer() {
            @Override
            public boolean forEachBloomFilter(final Predicate<BloomFilter> predicate) {
                for (final BloomFilter filter : filters) {
                    if (!predicate.test(filter)) {
                        return false;
                    }
                }
                return true;
            }

            /**
             * This implementation returns a copy of the original array, the contained Bloom filters
             * are references to the originals, any modifications to them are reflected in the original
             * filters.
             */
            @Override
            public BloomFilter[] asBloomFilterArray() {
                return filters.clone();
            }

            /**
             * This implementation uses references to the original filters. Any modifications to the
             * filters are reflected in the originals.
             */
            @Override
            public boolean forEachBloomFilterPair(final BloomFilterProducer other,
                    final BiPredicate<BloomFilter, BloomFilter> func) {
                final CountingPredicate<BloomFilter> p = new CountingPredicate<>(filters, func);
                return other.forEachBloomFilter(p) && p.forEachRemaining();
            }
        };
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,11 @@ default boolean forEachIndex(final IntPredicate predicate) {
return forEachCell((i, v) -> predicate.test(i));
}

/**
 * Returns this producer as the producer of unique indices.
 *
 * <p>NOTE(review): presumably this is safe because a cell producer reports each
 * index only once, so no separate de-duplication step is needed; confirm against
 * the {@code forEachCell} contract.</p>
 *
 * @return this instance.
 */
@Override
default IndexProducer uniqueIndices() {
return this;
}

/**
* Creates a CellProducer from an IndexProducer.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@
* A long predicate that applies the test func to each member of the {@code ary} in sequence for each call to {@code test()}.
* if the {@code ary} is exhausted, the subsequent calls to {@code test} are executed with a zero value.
* If the calls to {@code test} do not exhaust the {@code ary} the {@code forEachRemaining} method can be called to
* execute the @code{text} with a zero value for each remaining {@code idx} value.
* execute the {@code test} with a zero value for each remaining {@code idx} value.
* @since 4.5
*/
class CountingLongPredicate implements LongPredicate {
private int idx;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.collections4.bloomfilter;

import java.util.function.BiPredicate;
import java.util.function.Predicate;

/**
* A predicate that applies the test {@code func} to each member of the {@code ary} in
* sequence for each call to {@code test()}. If the {@code ary} is exhausted,
* the subsequent calls to {@code test} are executed with a {@code null} value.
* If the calls to {@code test} do not exhaust the {@code ary} the {@code
* forEachRemaining} method can be called to execute the {@code test} with a
* {@code null} value for each remaining {@code idx} value.
*
* @param <T> the type of object being compared.
* @since 4.5
*/
class CountingPredicate<T> implements Predicate<T> {
    private int idx;
    private final T[] ary;
    private final BiPredicate<T, T> func;

    /**
     * Constructs an instance that pairs the elements of {@code ary}, in order, with
     * the values passed to {@link #test(Object)}. Each pair is evaluated as
     * {@code func.test(idxValue, otherValue)}. Once {@code ary} has been used up,
     * further values are evaluated as {@code func.test(null, otherValue)}.
     *
     * @param ary  The values supplied as the first argument of {@code func}.
     * @param func The function to apply to each pair of values.
     */
    CountingPredicate(final T[] ary, final BiPredicate<T, T> func) {
        this.func = func;
        this.ary = ary;
    }

    /**
     * Pairs {@code other} with the next unused element of the array, or with
     * {@code null} when the array is exhausted, and evaluates the function.
     *
     * @param other the value supplied as the second argument of the function.
     * @return the result of the function for the pair.
     */
    @Override
    public boolean test(final T other) {
        T current = null;
        if (idx < ary.length) {
            // Consume the element before invoking the function.
            current = ary[idx];
            idx++;
        }
        return func.test(current, other);
    }

    /**
     * Calls the {@code BiPredicate<T, T>} for each element of the input array that
     * has not yet been paired. This method should be invoked after the predicate
     * has been passed to a {@code Producer.forEach<T>(BiPredicate<T, T>)} to
     * consume any unpaired {@code <T>}s. The second argument to the BiPredicate
     * will be {@code null}. The position used by {@code test} is not advanced.
     *
     * @return true if all calls to the predicate were successful
     */
    boolean forEachRemaining() {
        for (int pos = idx; pos < ary.length; pos++) {
            if (!func.test(ary[pos], null)) {
                return false;
            }
        }
        return true;
    }
}
Loading

0 comments on commit 0438ede

Please sign in to comment.