diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java index 21d7ce7c9e..5fa0296f84 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java @@ -23,17 +23,16 @@ import java.util.stream.IntStream; /** - * A counting Bloom filter using an int array to track counts for each enabled bit - * index. + * A counting Bloom filter using an int array to track cells for each enabled bit. * *
Any operation that results in negative counts or integer overflow of * counts will mark this filter as invalid. This transition is not reversible. * The operation is completed in full, no exception is raised and the state is - * set to invalid. This allows the counts for the filter immediately prior to the + * set to invalid. This allows the cells for the filter immediately prior to the * operation that created the invalid state to be recovered. See the documentation * in {@link #isValid()} for details.
* - *All the operations in the filter assume the counts are currently valid, + *
All the operations in the filter assume the cells are currently valid, * for example {@code cardinality} or {@code contains} operations. Behavior of an invalid * filter is undefined. It will no longer function identically to a standard * Bloom filter that is the merge of all the Bloom filters that have been added @@ -47,6 +46,7 @@ * consumption of approximately 8 GB. * * @see Shape + * @see CellProducer * @since 4.5 */ public final class ArrayCountingBloomFilter implements CountingBloomFilter { @@ -57,30 +57,30 @@ public final class ArrayCountingBloomFilter implements CountingBloomFilter { private final Shape shape; /** - * The count of each bit index in the filter. + * The cell for each bit index in the filter. */ - private final int[] counts; + private final int[] cells; /** * The state flag. This is a bitwise @{code OR} of the entire history of all updated - * counts. If negative then a negative count or integer overflow has occurred on - * one or more counts in the history of the filter and the state is invalid. + * cells. If negative then a negative cell or integer overflow has occurred on + * one or more cells in the history of the filter and the state is invalid. * *
Maintenance of this state flag is branch-free for improved performance. It - * eliminates a conditional check for a negative count during remove/subtract + * eliminates a conditional check for a negative cell during remove/subtract * operations and a conditional check for integer overflow during merge/add * operations.
* - *Note: Integer overflow is unlikely in realistic usage scenarios. A count + *
Note: Integer overflow is unlikely in realistic usage scenarios. A cell * that overflows indicates that the number of items in the filter exceeds the * maximum possible size (number of bits) of any Bloom filter constrained by * integer indices. At this point the filter is most likely full (all bits are * non-zero) and thus useless.
* - *Negative counts are a concern if the filter is used incorrectly by + *
Negative cells are a concern if the filter is used incorrectly by * removing an item that was never added. It is expected that a user of a * counting Bloom filter will not perform this action as it is a mistake. - * Enabling an explicit recovery path for negative or overflow counts is a major + * Enabling an explicit recovery path for negative or overflow cells is a major * performance burden not deemed necessary for the unlikely scenarios when an * invalid state is created. Maintenance of the state flag is a concession to * flag improper use that should not have a major performance impact.
@@ -96,18 +96,23 @@ public final class ArrayCountingBloomFilter implements CountingBloomFilter { public ArrayCountingBloomFilter(final Shape shape) { Objects.requireNonNull(shape, "shape"); this.shape = shape; - counts = new int[shape.getNumberOfBits()]; + cells = new int[shape.getNumberOfBits()]; } private ArrayCountingBloomFilter(final ArrayCountingBloomFilter source) { this.shape = source.shape; this.state = source.state; - this.counts = source.counts.clone(); + this.cells = source.cells.clone(); } @Override public void clear() { - Arrays.fill(counts, 0); + Arrays.fill(cells, 0); + } + + @Override + public int getMaxCell() { + return Integer.MAX_VALUE; } @Override @@ -122,20 +127,20 @@ public int characteristics() { @Override public int cardinality() { - return (int) IntStream.range(0, counts.length).filter(i -> counts[i] > 0).count(); + return (int) IntStream.range(0, cells.length).filter(i -> cells[i] > 0).count(); } @Override - public boolean add(final BitCountProducer other) { + public boolean add(final CellProducer other) { Objects.requireNonNull(other, "other"); - other.forEachCount(this::add); + other.forEachCell(this::add); return isValid(); } @Override - public boolean subtract(final BitCountProducer other) { + public boolean subtract(final CellProducer other) { Objects.requireNonNull(other, "other"); - other.forEachCount(this::subtract); + other.forEachCell(this::subtract); return isValid(); } @@ -146,12 +151,12 @@ public boolean subtract(final BitCountProducer other) { * *The state transition to invalid is permanent.
* - *This implementation does not correct negative counts to zero or integer - * overflow counts to {@link Integer#MAX_VALUE}. Thus the operation that - * generated invalid counts can be reversed by using the complement of the - * original operation with the same Bloom filter. This will restore the counts - * to the state prior to the invalid operation. Counts can then be extracted - * using {@link #forEachCount(BitCountConsumer)}.
+ *This implementation does not correct negative cells to zero or integer + * overflow cells to {@link Integer#MAX_VALUE}. Thus the operation that + * generated invalid cells can be reversed by using the complement of the + * original operation with the same Bloom filter. This will restore the cells + * to the state prior to the invalid operation. Cells can then be extracted + * using {@link #forEachCell(CellConsumer)}.
*/ @Override public boolean isValid() { @@ -159,10 +164,10 @@ public boolean isValid() { } @Override - public boolean forEachCount(final BitCountProducer.BitCountConsumer consumer) { + public boolean forEachCell(final CellProducer.CellConsumer consumer) { Objects.requireNonNull(consumer, "consumer"); - for (int i = 0; i < counts.length; i++) { - if (counts[i] != 0 && !consumer.test(i, counts[i])) { + for (int i = 0; i < cells.length; i++) { + if (cells[i] != 0 && !consumer.test(i, cells[i])) { return false; } } @@ -172,8 +177,8 @@ public boolean forEachCount(final BitCountProducer.BitCountConsumer consumer) { @Override public boolean forEachIndex(final IntPredicate consumer) { Objects.requireNonNull(consumer, "consumer"); - for (int i = 0; i < counts.length; i++) { - if (counts[i] != 0 && !consumer.test(i)) { + for (int i = 0; i < cells.length; i++) { + if (cells[i] != 0 && !consumer.test(i)) { return false; } } @@ -183,14 +188,14 @@ public boolean forEachIndex(final IntPredicate consumer) { @Override public boolean forEachBitMap(final LongPredicate consumer) { Objects.requireNonNull(consumer, "consumer"); - final int blocksm1 = BitMap.numberOfBitMaps(counts.length) - 1; + final int blocksm1 = BitMap.numberOfBitMaps(cells.length) - 1; int i = 0; long value; // must break final block separate as the number of bits may not fall on the long boundary for (int j = 0; j < blocksm1; j++) { value = 0; for (int k = 0; k < Long.SIZE; k++) { - if (counts[i++] != 0) { + if (cells[i++] != 0) { value |= BitMap.getLongBit(k); } } @@ -200,8 +205,8 @@ public boolean forEachBitMap(final LongPredicate consumer) { } // Final block value = 0; - for (int k = 0; i < counts.length; k++) { - if (counts[i++] != 0) { + for (int k = 0; i < cells.length; k++) { + if (cells[i++] != 0) { value |= BitMap.getLongBit(k); } } @@ -209,31 +214,41 @@ public boolean forEachBitMap(final LongPredicate consumer) { } /** - * Add to the count for the bit index. + * Add to the cell for the bit index. 
* * @param idx the index * @param addend the amount to add * @return {@code true} always. */ private boolean add(final int idx, final int addend) { - final int updated = counts[idx] + addend; - state |= updated; - counts[idx] = updated; - return true; + try { + final int updated = cells[idx] + addend; + state |= updated; + cells[idx] = updated; + return true; + } catch (final IndexOutOfBoundsException e) { + throw new IllegalArgumentException( + String.format("Filter only accepts values in the [0,%d) range", getShape().getNumberOfBits()), e); + } } /** - * Subtract from the count for the bit index. + * Subtract from the cell for the bit index. * * @param idx the index * @param subtrahend the amount to subtract * @return {@code true} always. */ private boolean subtract(final int idx, final int subtrahend) { - final int updated = counts[idx] - subtrahend; - state |= updated; - counts[idx] = updated; - return true; + try { + final int updated = cells[idx] - subtrahend; + state |= updated; + cells[idx] = updated; + return true; + } catch (final IndexOutOfBoundsException e) { + throw new IllegalArgumentException( + String.format("Filter only accepts values in the [0,%d) range", getShape().getNumberOfBits()), e); + } } @Override @@ -243,7 +258,7 @@ public Shape getShape() { @Override public boolean contains(final IndexProducer indexProducer) { - return indexProducer.forEachIndex(idx -> this.counts[idx] != 0); + return indexProducer.forEachIndex(idx -> this.cells[idx] != 0); } @Override @@ -253,6 +268,19 @@ public boolean contains(final BitMapProducer bitMapProducer) { @Override public int[] asIndexArray() { - return IntStream.range(0, counts.length).filter(i -> counts[i] > 0).toArray(); + return IntStream.range(0, cells.length).filter(i -> cells[i] > 0).toArray(); + } + + @Override + public int getMaxInsert(CellProducer cellProducer) { + int[] max = {Integer.MAX_VALUE}; + cellProducer.forEachCell( (x, y) -> { + int count = cells[x] / y; + if (count < max[0]) { + max[0] = 
count; + } + return max[0] > 0; + }); + return max[0]; } } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BitCountProducer.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BitCountProducer.java deleted file mode 100644 index 7ccd8bc924..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/BitCountProducer.java +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter; - -import java.util.function.IntPredicate; - -/** - * Defines a mapping of index to counts. - * - *Note that a BitCountProducer may return duplicate indices and may be unordered. - * - *
Implementations must guarantee that: - * - *
Note that implementations that do not output duplicate indices for BitCountProducer and - * do for IndexProducer, or vice versa, are consistent if the distinct indices from each are - * the same. - * - *
For example the mapping [(1,2),(2,3),(3,1)] can be output with many combinations including: - *
- * [(1,2),(2,3),(3,1)] - * [(1,1),(1,1),(2,1),(2,1),(2,1),(3,1)] - * [(1,1),(3,1),(1,1),(2,1),(2,1),(2,1)] - * [(3,1),(1,1),(2,2),(1,1),(2,1)] - * ... - *- * - * @since 4.5 - */ -@FunctionalInterface -public interface BitCountProducer extends IndexProducer { - - /** - * Performs the given action for each {@code
Note that the BitCountProducer does not remove duplicates. Any use of the - * BitCountProducer to create an aggregate mapping of index to counts, such as a - * CountingBloomFilter, should use the same BitCountProducer in both add and - * subtract operations to maintain consistency. - *
- * @param idx An index producer. - * @return A BitCountProducer with the same indices as the IndexProducer. - */ - static BitCountProducer from(final IndexProducer idx) { - return new BitCountProducer() { - @Override - public boolean forEachCount(final BitCountConsumer consumer) { - return idx.forEachIndex(i -> consumer.test(i, 1)); - } - - @Override - public int[] asIndexArray() { - return idx.asIndexArray(); - } - - @Override - public boolean forEachIndex(final IntPredicate predicate) { - return idx.forEachIndex(predicate); - } - }; - } - - /** - * Represents an operation that accepts an {@codeNote: This is a functional interface as a specialization of - * {@link java.util.function.BiPredicate} for {@code int}.
- */ - @FunctionalInterface - interface BitCountConsumer { - /** - * Performs an operation on the given {@codeNote that a CellProducer must not return duplicate indices and must be ordered.
+ * + *Implementations must guarantee that:
+ * + *Some Bloom filter implementations use a count rather than a bit flag. The term {@code Cell} is used to + * refer to these counts.
+ * + *Any exceptions thrown by the action are relayed to the caller. The consumer is applied to each + * cell. If the consumer returns {@code false} the execution is stopped, {@code false} + * is returned, and no further pairs are processed.
+ * + * @param consumer the action to be performed for each non-zero cell. + * @return {@code true} if all cells return true from consumer, {@code false} otherwise. + * @throws NullPointerException if the specified consumer is null + */ + boolean forEachCell(CellConsumer consumer); + + /** + * The default implementation returns distinct and ordered indices for all cells with a non-zero count. + */ + @Override + default boolean forEachIndex(final IntPredicate predicate) { + return forEachCell((i, v) -> predicate.test(i)); + } + + /** + * Creates a CellProducer from an IndexProducer. + * + *Note the following properties: + *
A CellProducer that outputs the mapping [(1,2),(2,3),(3,1)] can be created from many combinations + * of indices including: + *
+ * [1, 1, 2, 2, 2, 3] + * [1, 3, 1, 2, 2, 2] + * [3, 2, 1, 2, 1, 2] + * ... + *+ * + * @param producer An index producer. + * @return A CellProducer with the same indices as the IndexProducer. + */ + static CellProducer from(final IndexProducer producer) { + return new CellProducer() { + TreeMap
Note: This is a functional interface as a specialization of + * {@link java.util.function.BiPredicate} for {@code int}.
+ */ + @FunctionalInterface + interface CellConsumer { + /** + * Performs an operation on the given {@codeA counting Bloom filter is expected to function identically to a standard * Bloom filter that is the merge of all the Bloom filters that have been added @@ -30,29 +31,30 @@ * remove order, is expected to be the same.
* *Removal of a filter that has not previously been merged results in an - * invalid state where the counts no longer represent a sum of merged Bloom + * invalid state where the cells no longer represent a sum of merged Bloom * filters. It is impossible to validate merge and remove exactly without * explicitly storing all filters. Consequently such an operation may go * undetected. The CountingBloomFilter maintains a state flag that is used as a - * warning that an operation was performed that resulted in invalid counts and - * thus an invalid state. For example this may occur if a count for an index was + * warning that an operation was performed that resulted in invalid cells and + * thus an invalid state. For example this may occur if a cell for an index was * set to negative following a remove operation.
* *Implementations should document the expected state of the filter after an - * operation that generates invalid counts, and any potential recovery options. + * operation that generates invalid cells, and any potential recovery options. * An implementation may support a reversal of the operation to restore the - * state to that prior to the operation. In the event that invalid counts are + * state to that prior to the operation. In the event that invalid cells are * adjusted to a valid range then it should be documented if there has been * irreversible information loss.
* *Implementations may choose to throw an exception during an operation that - * generates invalid counts. Implementations should document the expected state - * of the filter after such an operation. For example are the counts not updated, + * generates invalid cells. Implementations should document the expected state + * of the filter after such an operation. For example are the cells not updated, * partially updated or updated entirely before the exception is raised.
* + * @see CellProducer * @since 4.5 */ -public interface CountingBloomFilter extends BloomFilter, BitCountProducer { +public interface CountingBloomFilter extends BloomFilter, CellProducer { // Query Operations @@ -60,9 +62,9 @@ public interface CountingBloomFilter extends BloomFilter, BitCountProducer { * Returns {@code true} if the internal state is valid. * *This flag is a warning that an addition or - * subtraction of counts from this filter resulted in an invalid count for one or more - * indexes. For example this may occur if a count for an index was - * set to negative following a subtraction operation, or overflows an {@code int} following an + * subtraction of cells from this filter resulted in an invalid cell for one or more + * indexes. For example this may occur if a cell for an index was + * set to negative following a subtraction operation, or overflows the value specified by {@code getMaxCell()} following an * addition operation.
* *A counting Bloom filter that has an invalid state is no longer ensured to function @@ -77,14 +79,81 @@ public interface CountingBloomFilter extends BloomFilter, BitCountProducer { */ boolean isValid(); + /** + * Returns the maximum allowable value for a cell count in this Counting filter. + * @return the maximum allowable value for a cell count in this Counting filter. + */ + int getMaxCell(); + + /** + * Determines the maximum number of times the Bloom filter could have been merged + * into this counting filter. + * @param bloomFilter the Bloom filter the check for. + * @return the maximum number of times the Bloom filter could have been inserted. + */ + default int getMaxInsert(BloomFilter bloomFilter) { + return getMaxInsert((BitMapProducer) bloomFilter); + } + + /** + * Determines the maximum number of times the IndexProducer could have been merged + * into this counting filter. + *
To determine how many times an IndexProducer could have been added, create a CellProducer + * from the IndexProducer and check that with {@code getMaxInsert(CellProducer)}.
+ * @param idxProducer the producer to drive the count check. + * @return the maximum number of times the IndexProducer could have been inserted. + * @see #getMaxInsert(CellProducer) + */ + default int getMaxInsert(IndexProducer idxProducer) { + return getMaxInsert(CellProducer.from(idxProducer.uniqueIndices()) ); + } + + /** + * Determines the maximum number of times the Cell Producer could have been add. + * @param cellProducer the producer of cells. + * @return the maximum number of times the CellProducer could have been inserted. + */ + int getMaxInsert(CellProducer cellProducer); + + /** + * Determines the maximum number of times the Hasher could have been merged into this + * counting filter. + * @param hasher the Hasher to provide the indices. + * @return the maximum number of times the hasher could have been inserted. + */ + default int getMaxInsert(Hasher hasher) { + return getMaxInsert(hasher.indices(getShape())); + } + + /** + * Determines the maximum number of times the BitMapProducer could have been merged into this + * counting filter. + * @param bitMapProducer the BitMapProducer to provide the indices. + * @return the maximum number of times the BitMapProducer could have been inserted. + */ + default int getMaxInsert(BitMapProducer bitMapProducer) { + if (!contains(bitMapProducer)) { + return 0; + } + long[] bitMaps = bitMapProducer.asBitMapArray(); + int[] max = { Integer.MAX_VALUE }; + forEachCell((x, y) -> { + if ((bitMaps[BitMap.getLongIndex(x)] & BitMap.getLongBit(x)) != 0) { + max[0] = max[0] <= y ? max[0] : y; + } + return true; + }); + return max[0]; + } + // Modification Operations /** * Merges the specified Bloom filter into this Bloom filter. * - *Specifically: all counts for the indexes identified by the {@code other} filter will be incremented by 1.
+ *Specifically: all cells for the indexes identified by the {@code other} filter will be incremented by 1.
* - *Note: If the other filter is a counting Bloom filter the index counts are ignored and it is treated as an + *
Note: If the other filter is a counting Bloom filter the other filter's cells are ignored and it is treated as an * IndexProducer.
* *This method will return {@code true} if the filter is valid after the operation.
@@ -92,7 +161,7 @@ public interface CountingBloomFilter extends BloomFilter, BitCountProducer { * @param other the other Bloom filter * @return {@code true} if the removal was successful and the state is valid * @see #isValid() - * @see #add(BitCountProducer) + * @see #add(CellProducer) */ @Override default boolean merge(final BloomFilter other) { @@ -103,40 +172,41 @@ default boolean merge(final BloomFilter other) { /** * Merges the specified Hasher into this Bloom filter. * - *Specifically: all counts for the unique indexes identified by the {@code hasher} will be incremented by 1.
+ *Specifically: all cells for the unique indexes identified by the {@code hasher} will be incremented by 1.
* *This method will return {@code true} if the filter is valid after the operation.
* * @param hasher the hasher * @return {@code true} if the removal was successful and the state is valid * @see #isValid() - * @see #add(BitCountProducer) + * @see #add(CellProducer) */ @Override default boolean merge(final Hasher hasher) { Objects.requireNonNull(hasher, "hasher"); - return merge(hasher.uniqueIndices(getShape())); + return merge(hasher.indices(getShape())); } /** * Merges the specified index producer into this Bloom filter. * - *Specifically: all counts for the indexes identified by the {@code indexProducer} will be incremented by 1.
Specifically: all cells for the unique indices identified by the {@code indexProducer} will be incremented by 1.
* *This method will return {@code true} if the filter is valid after the operation.
* - *Note: Indices that are returned multiple times will be incremented multiple times.
Note: If indices that are returned multiple times should be incremented multiple times, convert the IndexProducer + * to a CellProducer and add that.
* * @param indexProducer the IndexProducer * @return {@code true} if the removal was successful and the state is valid * @see #isValid() - * @see #add(BitCountProducer) + * @see #add(CellProducer) */ @Override default boolean merge(final IndexProducer indexProducer) { Objects.requireNonNull(indexProducer, "indexProducer"); try { - return add(BitCountProducer.from(indexProducer)); + return add(CellProducer.from(indexProducer.uniqueIndices())); } catch (final IndexOutOfBoundsException e) { throw new IllegalArgumentException( String.format("Filter only accepts values in the [0,%d) range", getShape().getNumberOfBits()), e); @@ -146,14 +216,14 @@ default boolean merge(final IndexProducer indexProducer) { /** * Merges the specified BitMap producer into this Bloom filter. * - *Specifically: all counts for the indexes identified by the {@code bitMapProducer} will be incremented by 1.
+ *Specifically: all cells for the indexes identified by the {@code bitMapProducer} will be incremented by 1.
* *This method will return {@code true} if the filter is valid after the operation.
* * @param bitMapProducer the BitMapProducer * @return {@code true} if the removal was successful and the state is valid * @see #isValid() - * @see #add(BitCountProducer) + * @see #add(CellProducer) */ @Override default boolean merge(final BitMapProducer bitMapProducer) { @@ -164,9 +234,9 @@ default boolean merge(final BitMapProducer bitMapProducer) { /** * Removes the specified Bloom filter from this Bloom filter. * - *Specifically: all counts for the indexes identified by the {@code other} filter will be decremented by 1.
+ *Specifically: all cells for the indexes identified by the {@code other} filter will be decremented by 1.
* - *Note: If the other filter is a counting Bloom filter the index counts are ignored and it is treated as an + *
Note: If the other filter is a counting Bloom filter the other filter's cells are ignored and it is treated as an * IndexProducer.
* *This method will return {@code true} if the filter is valid after the operation.
@@ -174,7 +244,7 @@ default boolean merge(final BitMapProducer bitMapProducer) { * @param other the other Bloom filter * @return {@code true} if the removal was successful and the state is valid * @see #isValid() - * @see #subtract(BitCountProducer) + * @see #subtract(CellProducer) */ default boolean remove(final BloomFilter other) { Objects.requireNonNull(other, "other"); @@ -184,7 +254,7 @@ default boolean remove(final BloomFilter other) { /** * Removes the unique values from the specified hasher from this Bloom filter. * - *Specifically all counts for the unique indices produced by the {@code hasher} will be + *
Specifically all cells for the unique indices produced by the {@code hasher} will be * decremented by 1.
* *This method will return {@code true} if the filter is valid after the operation.
@@ -192,32 +262,33 @@ default boolean remove(final BloomFilter other) { * @param hasher the hasher to provide the indexes * @return {@code true} if the removal was successful and the state is valid * @see #isValid() - * @see #subtract(BitCountProducer) + * @see #subtract(CellProducer) */ default boolean remove(final Hasher hasher) { Objects.requireNonNull(hasher, "hasher"); - return remove(hasher.uniqueIndices(getShape())); + return remove(hasher.indices(getShape())); } /** * Removes the values from the specified IndexProducer from the Bloom filter from this Bloom filter. * - *Specifically all counts for the unique indices produced by the {@code hasher} will be + *
Specifically all cells for the unique indices produced by the {@code indexProducer} will be * decremented by 1.
* *This method will return {@code true} if the filter is valid after the operation.
* - *Node: This method expects index producers that produce unique values.
Note: If indices that are returned multiple times should be decremented multiple times, convert the IndexProducer + * to a CellProducer and subtract that.
* * @param indexProducer the IndexProducer to provide the indexes * @return {@code true} if the removal was successful and the state is valid * @see #isValid() - * @see #subtract(BitCountProducer) + * @see #subtract(CellProducer) */ default boolean remove(final IndexProducer indexProducer) { Objects.requireNonNull(indexProducer, "indexProducer"); try { - return subtract(BitCountProducer.from(indexProducer)); + return subtract(CellProducer.from(indexProducer.uniqueIndices())); } catch (final IndexOutOfBoundsException e) { throw new IllegalArgumentException( String.format("Filter only accepts values in the [0,%d) range", getShape().getNumberOfBits())); @@ -227,7 +298,7 @@ default boolean remove(final IndexProducer indexProducer) { /** * Removes the specified BitMapProducer from this Bloom filter. * - *Specifically all counts for the indices produced by the {@code bitMapProducer} will be + *
Specifically all cells for the indices produced by the {@code bitMapProducer} will be * decremented by 1.
* *This method will return {@code true} if the filter is valid after the operation.
@@ -235,7 +306,7 @@ default boolean remove(final IndexProducer indexProducer) { * @param bitMapProducer the BitMapProducer to provide the indexes * @return {@code true} if the removal was successful and the state is valid * @see #isValid() - * @see #subtract(BitCountProducer) + * @see #subtract(CellProducer) */ default boolean remove(final BitMapProducer bitMapProducer) { Objects.requireNonNull(bitMapProducer, "bitMapProducer"); @@ -243,36 +314,36 @@ default boolean remove(final BitMapProducer bitMapProducer) { } /** - * Adds the specified BitCountProducer to this Bloom filter. + * Adds the specified CellProducer to this Bloom filter. * *Specifically - * all counts for the indexes identified by the {@code other} will be incremented + * all cells for the indexes identified by the {@code other} will be incremented * by their corresponding values in the {@code other}.
* *This method will return {@code true} if the filter is valid after the operation.
* - * @param other the BitCountProducer to add. + * @param other the CellProducer to add. * @return {@code true} if the addition was successful and the state is valid * @see #isValid() - * @see #subtract(BitCountProducer) + * @see #subtract(CellProducer) */ - boolean add(BitCountProducer other); + boolean add(CellProducer other); /** - * Adds the specified BitCountProducer to this Bloom filter. + * Subtracts the specified CellProducer from this Bloom filter. * *Specifically - * all counts for the indexes identified by the {@code other} will be decremented + * all cells for the indexes identified by the {@code other} will be decremented * by their corresponding values in the {@code other}.
* *This method will return true if the filter is valid after the operation.
* - * @param other the BitCountProducer to subtract. + * @param other the CellProducer to subtract. * @return {@code true} if the subtraction was successful and the state is valid * @see #isValid() - * @see #add(BitCountProducer) + * @see #add(CellProducer) */ - boolean subtract(BitCountProducer other); + boolean subtract(CellProducer other); /** @@ -281,4 +352,9 @@ default boolean remove(final BitMapProducer bitMapProducer) { */ @Override CountingBloomFilter copy(); + + @Override + default IndexProducer uniqueIndices() { + return this; + } } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/Hasher.java b/src/main/java/org/apache/commons/collections4/bloomfilter/Hasher.java index d8b3a43aa9..5b1b6a127b 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/Hasher.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/Hasher.java @@ -16,8 +16,6 @@ */ package org.apache.commons.collections4.bloomfilter; -import java.util.Objects; - /** * A Hasher creates IndexProducer based on the hash implementation and the * provided Shape. @@ -44,21 +42,4 @@ public interface Hasher { * @return the iterator of integers */ IndexProducer indices(Shape shape); - - /** - * Creates an IndexProducer of unique indices for this hasher based on the Shape. - * - *This is like the `indices(Shape)` method except that it adds the guarantee that no - * duplicate values will be returned. The indices produced are equivalent to those returned - * from by a Bloom filter created from this hasher.
- * - * @param shape the shape of the desired Bloom filter. - * @return the iterator of integers - */ - default IndexProducer uniqueIndices(final Shape shape) { - return consumer -> { - Objects.requireNonNull(consumer, "consumer"); - return indices(shape).forEachIndex(IndexFilter.create(shape, consumer)); - }; - } } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/IndexFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/IndexFilter.java index c7e6ca1861..57f70f5638 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/IndexFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/IndexFilter.java @@ -72,7 +72,10 @@ public boolean test(final int number) { if (number >= size) { throw new IndexOutOfBoundsException(String.format("number too large %d >= %d", number, size)); } - return !tracker.test(number) || consumer.test(number); + if (tracker.test(number)) { + return consumer.test(number); + } + return true; } /** diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java b/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java index dbaf0908c7..0269d34eac 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java @@ -16,6 +16,7 @@ */ package org.apache.commons.collections4.bloomfilter; +import java.util.Arrays; import java.util.BitSet; import java.util.Objects; import java.util.function.IntPredicate; @@ -107,21 +108,69 @@ public boolean test(long word) { *Indices ordering and uniqueness is not guaranteed.
* *- * The default implementation of this method is slow. It is recommended - * that implementing classes reimplement this method. + * The default implementation of this method creates an array and populates + * it. Implementations that have access to an index array should consider + * returning a copy of that array if possible. *
* - *- * The default implementation of this method returns unique values in order. - *
* @return An int array of the data. */ default int[] asIndexArray() { - final BitSet result = new BitSet(); + class Indices { + private int[] data = new int[32]; + private int size; + + boolean add(final int index) { + data = IndexUtils.ensureCapacityForAdd(data, size); + data[size++] = index; + return true; + } + + int[] toArray() { + // Edge case to avoid a large array copy + return size == data.length ? data : Arrays.copyOf(data, size); + } + } + Indices indices = new Indices(); + forEachIndex(indices::add); + return indices.toArray(); + } + + /** + * Creates an IndexProducer comprising the unique indices for this producer. + * + *By default creates a new producer with some overhead to remove + * duplicates. IndexProducers that return unique indices by default + * should override this to return {@code this}.
+ * + *The default implementation will filter the indices from this instance + * and return them in ascending order.
+ * + * @return the IndexProducer of unique values. + * @throws IndexOutOfBoundsException if any index is less than zero. + */ + default IndexProducer uniqueIndices() { + final BitSet bitSet = new BitSet(); forEachIndex(i -> { - result.set(i); + bitSet.set(i); return true; }); - return result.stream().toArray(); + + return new IndexProducer() { + @Override + public boolean forEachIndex(IntPredicate predicate) { + for (int idx = bitSet.nextSetBit(0); idx >= 0; idx = bitSet.nextSetBit(idx + 1)) { + if (!predicate.test(idx)) { + return false; + } + } + return true; + } + + @Override + public IndexProducer uniqueIndices() { + return this; + } + }; } } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/IndexUtils.java b/src/main/java/org/apache/commons/collections4/bloomfilter/IndexUtils.java new file mode 100644 index 0000000000..96bfefec02 --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/IndexUtils.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import java.util.Arrays; + +/** + * Provides functions to assist in IndexProducer creation and manipulation. 
+ * @see IndexProducer + */ +final class IndexUtils { + + /** + * The maximum array size for the methods in this class. + */ + static final int MAX_ARRAY_SIZE = Integer.MAX_VALUE - 8; + + // do not instantiate + private IndexUtils() {} + + /** + * Ensure the array can add an element at the specified index. + * @param array the array to check. + * @param index the index to add at. + * @return the array or a newly allocated copy of the array. + */ + static int[] ensureCapacityForAdd(int[] array, int index) { + if (index >= array.length) { + return Arrays.copyOf(array, (int) Math.min(IndexUtils.MAX_ARRAY_SIZE, Math.max(array.length * 2L, index + 1))); + } + return array; + } +} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/package-info.java b/src/main/java/org/apache/commons/collections4/bloomfilter/package-info.java index a7fb009540..7df764182d 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/package-info.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/package-info.java @@ -32,6 +32,9 @@ * list. There are lots of other uses, and in most cases the reason is to perform a fast check as a gateway for a longer * operation. * + *Some Bloom filters (e.g. CountingBloomFilter) use counters rather than bits. In this case each counter + * is called a {@code cell}.
+ * *The Bloom filter architecture here is designed for speed of execution, so some methods like {@code merge}, {@code remove},
diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBitCountProducerTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBitCountProducerTest.java
deleted file mode 100644
index 2a5aa0a622..0000000000
--- a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBitCountProducerTest.java
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.commons.collections4.bloomfilter;
-
-import static org.junit.jupiter.api.Assertions.assertArrayEquals;
-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.assertFalse;
-import static org.junit.jupiter.api.Assertions.assertTrue;
-import static org.junit.jupiter.api.Assertions.fail;
-import static org.junit.jupiter.api.Assumptions.assumeTrue;
-
-import java.util.Arrays;
-import java.util.BitSet;
-
-import org.apache.commons.collections4.bag.TreeBag;
-import org.apache.commons.collections4.bloomfilter.BitCountProducer.BitCountConsumer;
-import org.junit.jupiter.api.Test;
-
-public abstract class AbstractBitCountProducerTest extends AbstractIndexProducerTest {
-
- /**
- * A testing BitCountConsumer that always returns true.
- */
- private static final BitCountConsumer TRUE_CONSUMER = (i, j) -> true;
- /**
- * A testing BitCountConsumer that always returns false.
- */
- private static final BitCountConsumer FALSE_CONSUMER = (i, j) -> false;
-
- /**
- * Creates an array of integer pairs comprising the index and the expected count for the index.
- * The order and count for each index is dependent upon the producer created by the {@code createProducer()}
- * method.
- * By default returns the each {@code getExpectedIndices()} value paired with 1 (one).
- * @return an array of integer pairs comprising the index and the expected count for the index.
- */
- protected int[][] getExpectedBitCount() {
- return Arrays.stream(getExpectedIndices()).mapToObj(x -> new int[] {x, 1}).toArray(int[][]::new);
- }
-
- /**
- * Creates a producer with some data.
- * @return a producer with some data
- */
- @Override
- protected abstract BitCountProducer createProducer();
-
- /**
- * Creates a producer without data.
- * @return a producer that has no data.
- */
- @Override
- protected abstract BitCountProducer createEmptyProducer();
-
- /**
- * Gets the behavior of the {@link BitCountProducer#forEachCount(BitCountConsumer)} method.
- * By default returns the value of {@code getAsIndexArrayBehaviour()} method.
- * @return the behavior.
- */
- protected int getForEachCountBehaviour() {
- return getAsIndexArrayBehaviour();
- }
-
- @Test
- public final void testForEachCountPredicates() {
- final BitCountProducer populated = createProducer();
- final BitCountProducer empty = createEmptyProducer();
-
- assertFalse(populated.forEachCount(FALSE_CONSUMER), "non-empty should be false");
- assertTrue(empty.forEachCount(FALSE_CONSUMER), "empty should be true");
-
- assertTrue(populated.forEachCount(TRUE_CONSUMER), "non-empty should be true");
- assertTrue(empty.forEachCount(TRUE_CONSUMER), "empty should be true");
- }
-
- @Test
- public final void testEmptyBitCountProducer() {
- final BitCountProducer empty = createEmptyProducer();
- final int ary[] = empty.asIndexArray();
- assertEquals(0, ary.length);
- assertTrue(empty.forEachCount((i, j) -> {
- fail("forEachCount consumer should not be called");
- return false;
- }));
- }
-
- @Test
- public final void testIndexConsistency() {
- final BitCountProducer producer = createProducer();
- final BitSet bs1 = new BitSet();
- final BitSet bs2 = new BitSet();
- producer.forEachIndex(i -> {
- bs1.set(i);
- return true;
- });
- producer.forEachCount((i, j) -> {
- bs2.set(i);
- return true;
- });
- assertEquals(bs1, bs2);
- }
-
- @Test
- public void testForEachCountValues() {
- // Assumes the collections bag works. Could be replaced with Map