From 488393e60081b89fb1178bd93a66f414008ea6e1 Mon Sep 17 00:00:00 2001 From: Claude Warren Date: Tue, 15 Aug 2023 09:34:38 +0200 Subject: [PATCH] COLLECTIONS-844: Allow counting Bloom filters with cell size other than Integer.SIZE (#406) * Added getMaxInsert() and getMaxValue() to CountingBloomFilter. * Changed 'BitCount' to 'Cell' to match the literature for counting Bloom filters. * Updated documentation. * Changed CellProducer to require ordered distinct cell indices. * Updated asIndexArray to respect the order of forEachIndex. --- .../bloomfilter/ArrayCountingBloomFilter.java | 122 ++++++++----- .../bloomfilter/BitCountProducer.java | 126 ------------- .../collections4/bloomfilter/BloomFilter.java | 8 + .../bloomfilter/CellProducer.java | 166 +++++++++++++++++ .../bloomfilter/CountingBloomFilter.java | 168 +++++++++++++----- .../collections4/bloomfilter/Hasher.java | 19 -- .../collections4/bloomfilter/IndexFilter.java | 5 +- .../bloomfilter/IndexProducer.java | 65 ++++++- .../collections4/bloomfilter/IndexUtils.java | 47 +++++ .../bloomfilter/package-info.java | 3 + .../AbstractBitCountProducerTest.java | 168 ------------------ .../bloomfilter/AbstractBloomFilterTest.java | 8 - .../bloomfilter/AbstractCellProducerTest.java | 155 ++++++++++++++++ .../AbstractCountingBloomFilterTest.java | 149 ++++++++++++++-- .../AbstractIndexProducerTest.java | 7 + .../collections4/bloomfilter/ArrayHasher.java | 6 - .../BitCountProducerFromHasherTest.java | 47 ----- ...ducerFromArrayCountingBloomFilterTest.java | 45 +++++ ...lProducerFromDefaultIndexProducerTest.java | 45 +++++ ...Test.java => DefaultCellProducerTest.java} | 30 ++-- .../bloomfilter/DefaultIndexProducerTest.java | 23 ++- ...ucerFromArrayCountingBloomFilterTest.java} | 21 +-- ....java => IndexProducerFromHasherTest.java} | 19 +- ...ava => IndexProducerFromIntArrayTest.java} | 19 +- ...dexProducerFromSimpleBloomFilterTest.java} | 19 +- ...dexProducerFromSparseBloomFilterTest.java} | 21 +-- ...=> 
IndexProducerFromUniqueHasherTest.java} | 19 +- .../bloomfilter/IndexProducerTest.java | 16 ++ .../collections4/bloomfilter/NullHasher.java | 6 - 29 files changed, 966 insertions(+), 586 deletions(-) delete mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/BitCountProducer.java create mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/CellProducer.java create mode 100644 src/main/java/org/apache/commons/collections4/bloomfilter/IndexUtils.java delete mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBitCountProducerTest.java create mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/AbstractCellProducerTest.java delete mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/BitCountProducerFromHasherTest.java create mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/CellProducerFromArrayCountingBloomFilterTest.java create mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/CellProducerFromDefaultIndexProducerTest.java rename src/test/java/org/apache/commons/collections4/bloomfilter/{DefaultBitCountProducerTest.java => DefaultCellProducerTest.java} (69%) rename src/test/java/org/apache/commons/collections4/bloomfilter/{BitCountProducerFromArrayCountingBloomFilterTest.java => IndexProducerFromArrayCountingBloomFilterTest.java} (71%) rename src/test/java/org/apache/commons/collections4/bloomfilter/{BitCountProducerFromDefaultIndexProducerTest.java => IndexProducerFromHasherTest.java} (69%) rename src/test/java/org/apache/commons/collections4/bloomfilter/{BitCountProducerFromIntArrayTest.java => IndexProducerFromIntArrayTest.java} (76%) rename src/test/java/org/apache/commons/collections4/bloomfilter/{BitCountProducerFromSimpleBloomFilterTest.java => IndexProducerFromSimpleBloomFilterTest.java} (77%) rename src/test/java/org/apache/commons/collections4/bloomfilter/{BitCountProducerFromSparseBloomFilterTest.java => 
IndexProducerFromSparseBloomFilterTest.java} (70%) rename src/test/java/org/apache/commons/collections4/bloomfilter/{BitCountProducerFromUniqueHasherTest.java => IndexProducerFromUniqueHasherTest.java} (73%) diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java index 21d7ce7c9e..5fa0296f84 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java @@ -23,17 +23,16 @@ import java.util.stream.IntStream; /** - * A counting Bloom filter using an int array to track counts for each enabled bit - * index. + * A counting Bloom filter using an int array to track cells for each enabled bit. * *

Any operation that results in negative counts or integer overflow of * counts will mark this filter as invalid. This transition is not reversible. * The operation is completed in full, no exception is raised and the state is - * set to invalid. This allows the counts for the filter immediately prior to the + * set to invalid. This allows the cells for the filter immediately prior to the * operation that created the invalid state to be recovered. See the documentation * in {@link #isValid()} for details.

* - *

All the operations in the filter assume the counts are currently valid, + *

All the operations in the filter assume the cells are currently valid, * for example {@code cardinality} or {@code contains} operations. Behavior of an invalid * filter is undefined. It will no longer function identically to a standard * Bloom filter that is the merge of all the Bloom filters that have been added @@ -47,6 +46,7 @@ * consumption of approximately 8 GB. * * @see Shape + * @see CellProducer * @since 4.5 */ public final class ArrayCountingBloomFilter implements CountingBloomFilter { @@ -57,30 +57,30 @@ public final class ArrayCountingBloomFilter implements CountingBloomFilter { private final Shape shape; /** - * The count of each bit index in the filter. + * The cell for each bit index in the filter. */ - private final int[] counts; + private final int[] cells; /** * The state flag. This is a bitwise @{code OR} of the entire history of all updated - * counts. If negative then a negative count or integer overflow has occurred on - * one or more counts in the history of the filter and the state is invalid. + * cells. If negative then a negative cell or integer overflow has occurred on + * one or more cells in the history of the filter and the state is invalid. * *

Maintenance of this state flag is branch-free for improved performance. It - * eliminates a conditional check for a negative count during remove/subtract + * eliminates a conditional check for a negative cell during remove/subtract * operations and a conditional check for integer overflow during merge/add * operations.

* - *

Note: Integer overflow is unlikely in realistic usage scenarios. A count + *

Note: Integer overflow is unlikely in realistic usage scenarios. A cell * that overflows indicates that the number of items in the filter exceeds the * maximum possible size (number of bits) of any Bloom filter constrained by * integer indices. At this point the filter is most likely full (all bits are * non-zero) and thus useless.

* - *

Negative counts are a concern if the filter is used incorrectly by + *

Negative cells are a concern if the filter is used incorrectly by * removing an item that was never added. It is expected that a user of a * counting Bloom filter will not perform this action as it is a mistake. - * Enabling an explicit recovery path for negative or overflow counts is a major + * Enabling an explicit recovery path for negative or overflow cells is a major * performance burden not deemed necessary for the unlikely scenarios when an * invalid state is created. Maintenance of the state flag is a concession to * flag improper use that should not have a major performance impact.

@@ -96,18 +96,23 @@ public final class ArrayCountingBloomFilter implements CountingBloomFilter { public ArrayCountingBloomFilter(final Shape shape) { Objects.requireNonNull(shape, "shape"); this.shape = shape; - counts = new int[shape.getNumberOfBits()]; + cells = new int[shape.getNumberOfBits()]; } private ArrayCountingBloomFilter(final ArrayCountingBloomFilter source) { this.shape = source.shape; this.state = source.state; - this.counts = source.counts.clone(); + this.cells = source.cells.clone(); } @Override public void clear() { - Arrays.fill(counts, 0); + Arrays.fill(cells, 0); + } + + @Override + public int getMaxCell() { + return Integer.MAX_VALUE; } @Override @@ -122,20 +127,20 @@ public int characteristics() { @Override public int cardinality() { - return (int) IntStream.range(0, counts.length).filter(i -> counts[i] > 0).count(); + return (int) IntStream.range(0, cells.length).filter(i -> cells[i] > 0).count(); } @Override - public boolean add(final BitCountProducer other) { + public boolean add(final CellProducer other) { Objects.requireNonNull(other, "other"); - other.forEachCount(this::add); + other.forEachCell(this::add); return isValid(); } @Override - public boolean subtract(final BitCountProducer other) { + public boolean subtract(final CellProducer other) { Objects.requireNonNull(other, "other"); - other.forEachCount(this::subtract); + other.forEachCell(this::subtract); return isValid(); } @@ -146,12 +151,12 @@ public boolean subtract(final BitCountProducer other) { * *

The state transition to invalid is permanent.

* - *

This implementation does not correct negative counts to zero or integer - * overflow counts to {@link Integer#MAX_VALUE}. Thus the operation that - * generated invalid counts can be reversed by using the complement of the - * original operation with the same Bloom filter. This will restore the counts - * to the state prior to the invalid operation. Counts can then be extracted - * using {@link #forEachCount(BitCountConsumer)}.

+ *

This implementation does not correct negative cells to zero or integer + * overflow cells to {@link Integer#MAX_VALUE}. Thus the operation that + * generated invalid cells can be reversed by using the complement of the + * original operation with the same Bloom filter. This will restore the cells + * to the state prior to the invalid operation. Cells can then be extracted + * using {@link #forEachCell(CellConsumer)}.

*/ @Override public boolean isValid() { @@ -159,10 +164,10 @@ public boolean isValid() { } @Override - public boolean forEachCount(final BitCountProducer.BitCountConsumer consumer) { + public boolean forEachCell(final CellProducer.CellConsumer consumer) { Objects.requireNonNull(consumer, "consumer"); - for (int i = 0; i < counts.length; i++) { - if (counts[i] != 0 && !consumer.test(i, counts[i])) { + for (int i = 0; i < cells.length; i++) { + if (cells[i] != 0 && !consumer.test(i, cells[i])) { return false; } } @@ -172,8 +177,8 @@ public boolean forEachCount(final BitCountProducer.BitCountConsumer consumer) { @Override public boolean forEachIndex(final IntPredicate consumer) { Objects.requireNonNull(consumer, "consumer"); - for (int i = 0; i < counts.length; i++) { - if (counts[i] != 0 && !consumer.test(i)) { + for (int i = 0; i < cells.length; i++) { + if (cells[i] != 0 && !consumer.test(i)) { return false; } } @@ -183,14 +188,14 @@ public boolean forEachIndex(final IntPredicate consumer) { @Override public boolean forEachBitMap(final LongPredicate consumer) { Objects.requireNonNull(consumer, "consumer"); - final int blocksm1 = BitMap.numberOfBitMaps(counts.length) - 1; + final int blocksm1 = BitMap.numberOfBitMaps(cells.length) - 1; int i = 0; long value; // must break final block separate as the number of bits may not fall on the long boundary for (int j = 0; j < blocksm1; j++) { value = 0; for (int k = 0; k < Long.SIZE; k++) { - if (counts[i++] != 0) { + if (cells[i++] != 0) { value |= BitMap.getLongBit(k); } } @@ -200,8 +205,8 @@ public boolean forEachBitMap(final LongPredicate consumer) { } // Final block value = 0; - for (int k = 0; i < counts.length; k++) { - if (counts[i++] != 0) { + for (int k = 0; i < cells.length; k++) { + if (cells[i++] != 0) { value |= BitMap.getLongBit(k); } } @@ -209,31 +214,41 @@ public boolean forEachBitMap(final LongPredicate consumer) { } /** - * Add to the count for the bit index. + * Add to the cell for the bit index. 
* * @param idx the index * @param addend the amount to add * @return {@code true} always. */ private boolean add(final int idx, final int addend) { - final int updated = counts[idx] + addend; - state |= updated; - counts[idx] = updated; - return true; + try { + final int updated = cells[idx] + addend; + state |= updated; + cells[idx] = updated; + return true; + } catch (final IndexOutOfBoundsException e) { + throw new IllegalArgumentException( + String.format("Filter only accepts values in the [0,%d) range", getShape().getNumberOfBits()), e); + } } /** - * Subtract from the count for the bit index. + * Subtract from the cell for the bit index. * * @param idx the index * @param subtrahend the amount to subtract * @return {@code true} always. */ private boolean subtract(final int idx, final int subtrahend) { - final int updated = counts[idx] - subtrahend; - state |= updated; - counts[idx] = updated; - return true; + try { + final int updated = cells[idx] - subtrahend; + state |= updated; + cells[idx] = updated; + return true; + } catch (final IndexOutOfBoundsException e) { + throw new IllegalArgumentException( + String.format("Filter only accepts values in the [0,%d) range", getShape().getNumberOfBits()), e); + } } @Override @@ -243,7 +258,7 @@ public Shape getShape() { @Override public boolean contains(final IndexProducer indexProducer) { - return indexProducer.forEachIndex(idx -> this.counts[idx] != 0); + return indexProducer.forEachIndex(idx -> this.cells[idx] != 0); } @Override @@ -253,6 +268,19 @@ public boolean contains(final BitMapProducer bitMapProducer) { @Override public int[] asIndexArray() { - return IntStream.range(0, counts.length).filter(i -> counts[i] > 0).toArray(); + return IntStream.range(0, cells.length).filter(i -> cells[i] > 0).toArray(); + } + + @Override + public int getMaxInsert(CellProducer cellProducer) { + int[] max = {Integer.MAX_VALUE}; + cellProducer.forEachCell( (x, y) -> { + int count = cells[x] / y; + if (count < max[0]) { + max[0] = 
count; + } + return max[0] > 0; + }); + return max[0]; } } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BitCountProducer.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BitCountProducer.java deleted file mode 100644 index 7ccd8bc924..0000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/BitCountProducer.java +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter; - -import java.util.function.IntPredicate; - -/** - * Defines a mapping of index to counts. - * - *

Note that a BitCountProducer may return duplicate indices and may be unordered. - * - *

Implementations must guarantee that: - * - *

- * - *

Note that implementations that do not output duplicate indices for BitCountProducer and - * do for IndexProducer, or vice versa, are consistent if the distinct indices from each are - * the same. - * - *

For example the mapping [(1,2),(2,3),(3,1)] can be output with many combinations including: - *

- * [(1,2),(2,3),(3,1)]
- * [(1,1),(1,1),(2,1),(2,1),(2,1),(3,1)]
- * [(1,1),(3,1),(1,1),(2,1),(2,1),(2,1)]
- * [(3,1),(1,1),(2,2),(1,1),(2,1)]
- * ...
- * 
- * - * @since 4.5 - */ -@FunctionalInterface -public interface BitCountProducer extends IndexProducer { - - /** - * Performs the given action for each {@code } pair where the count is non-zero. - * Any exceptions thrown by the action are relayed to the caller. The consumer is applied to each - * index-count pair, if the consumer returns {@code false} the execution is stopped, {@code false} - * is returned, and no further pairs are processed. - * - * Duplicate indices are not required to be aggregated. Duplicates may be output by the producer as - * noted in the class javadoc. - * - * @param consumer the action to be performed for each non-zero bit count - * @return {@code true} if all count pairs return true from consumer, {@code false} otherwise. - * @throws NullPointerException if the specified consumer is null - */ - boolean forEachCount(BitCountConsumer consumer); - - /** - * The default implementation returns indices with ordering and uniqueness of {@code forEachCount()}. - */ - @Override - default boolean forEachIndex(final IntPredicate predicate) { - return forEachCount((i, v) -> predicate.test(i)); - } - - /** - * Creates a BitCountProducer from an IndexProducer. The resulting - * producer will return every index from the IndexProducer with a count of 1. - * - *

Note that the BitCountProducer does not remove duplicates. Any use of the - * BitCountProducer to create an aggregate mapping of index to counts, such as a - * CountingBloomFilter, should use the same BitCountProducer in both add and - * subtract operations to maintain consistency. - *

- * @param idx An index producer. - * @return A BitCountProducer with the same indices as the IndexProducer. - */ - static BitCountProducer from(final IndexProducer idx) { - return new BitCountProducer() { - @Override - public boolean forEachCount(final BitCountConsumer consumer) { - return idx.forEachIndex(i -> consumer.test(i, 1)); - } - - @Override - public int[] asIndexArray() { - return idx.asIndexArray(); - } - - @Override - public boolean forEachIndex(final IntPredicate predicate) { - return idx.forEachIndex(predicate); - } - }; - } - - /** - * Represents an operation that accepts an {@code } pair representing - * the count for a bit index. Returns {@code true} - * if processing should continue, {@code false} otherwise. - * - *

Note: This is a functional interface as a specialization of - * {@link java.util.function.BiPredicate} for {@code int}.

- */ - @FunctionalInterface - interface BitCountConsumer { - /** - * Performs an operation on the given {@code } pair. - * - * @param index the bit index. - * @param count the count at the specified bit index. - * @return {@code true} if processing should continue, {@code false} if processing should stop. - */ - boolean test(int index, int count); - } -} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java index f51eb081fc..e4783bf3e1 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java @@ -309,4 +309,12 @@ default int estimateIntersection(final BloomFilter other) { } return estimate>Integer.MAX_VALUE?Integer.MAX_VALUE:(int) estimate; } + + /** + * Most Bloom filters create unique IndexProducers. + */ + @Override + default IndexProducer uniqueIndices() { + return this; + } } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/CellProducer.java b/src/main/java/org/apache/commons/collections4/bloomfilter/CellProducer.java new file mode 100644 index 0000000000..6949a13c64 --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/CellProducer.java @@ -0,0 +1,166 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import java.util.TreeMap; +import java.util.function.IntPredicate; + + +/** + * Some Bloom filter implementations use a count rather than a bit flag. The term {@code Cell} is used to + * refer to these counts and their associated index. This class is the equivalent of the index producer except + * that it produces cells. + * + *

Note that a CellProducer must not return duplicate indices and must be ordered.

+ * + *

Implementations must guarantee that:

+ * + *
    + *
  • The IndexProducer implementation returns unique ordered indices.
  • + *
  • The cells are produced in IndexProducer order.
  • + *
  • For every value produced by the IndexProducer there will be only one matching + * cell produced by the CellProducer.
  • + *
  • The CellProducer will not generate cells with indices that are not output by the IndexProducer.
  • + *
  • The IndexProducer will not generate indices that have a zero count for the cell.
  • + *
+ * + * @since 4.5 + */ +@FunctionalInterface +public interface CellProducer extends IndexProducer { + + /** + * Performs the given action for each {@code cell} where the cell count is non-zero. + * + *

Some Bloom filter implementations use a count rather than a bit flag. The term {@code Cell} is used to + * refer to these counts.

+ * + *

Any exceptions thrown by the action are relayed to the caller. The consumer is applied to each + * cell. If the consumer returns {@code false} the execution is stopped, {@code false} + * is returned, and no further pairs are processed.

+ * + * @param consumer the action to be performed for each non-zero cell. + * @return {@code true} if all cells return true from consumer, {@code false} otherwise. + * @throws NullPointerException if the specified consumer is null + */ + boolean forEachCell(CellConsumer consumer); + + /** + * The default implementation returns distinct and ordered indices for all cells with a non-zero count. + */ + @Override + default boolean forEachIndex(final IntPredicate predicate) { + return forEachCell((i, v) -> predicate.test(i)); + } + + /** + * Creates a CellProducer from an IndexProducer. + * + *

Note the following properties: + *

    + *
  • Each index returned from the IndexProducer is assumed to have a cell value of 1.
  • + *
  • The CellProducer aggregates duplicate indices from the IndexProducer.
  • + *
+ * + *

A CellProducer that outputs the mapping [(1,2),(2,3),(3,1)] can be created from many combinations + * of indices including: + *

+     * [1, 1, 2, 2, 2, 3]
+     * [1, 3, 1, 2, 2, 2]
+     * [3, 2, 1, 2, 1, 2]
+     * ...
+     * 
+ * + * @param producer An index producer. + * @return A CellProducer with the same indices as the IndexProducer. + */ + static CellProducer from(final IndexProducer producer) { + return new CellProducer() { + TreeMap counterCells = new TreeMap<>(); + + private void populate() { + if (counterCells.isEmpty()) { + producer.forEachIndex( idx -> { + CounterCell cell = new CounterCell(idx, 1); + CounterCell counter = counterCells.get(cell); + if (counter == null) { + counterCells.put(cell, cell); + } else { + counter.count++; + } + return true; + }); + } + } + + @Override + public int[] asIndexArray() { + populate(); + return counterCells.keySet().stream().mapToInt(c -> c.idx).toArray(); + } + + @Override + public boolean forEachCell(CellConsumer consumer) { + populate(); + for (CounterCell cell : counterCells.values()) { + if (!consumer.test(cell.idx, cell.count)) { + return false; + } + } + return true; + } + + /** + * Class to track cell values in the TreeMap. + */ + final class CounterCell implements Comparable { + final int idx; + int count; + + CounterCell(int idx, int count) { + this.idx = idx; + this.count = count; + } + + @Override + public int compareTo(CounterCell other) { + return Integer.compare(idx, other.idx); + } + } + }; + } + + /** + * Represents an operation that accepts an {@code } pair. + * Returns {@code true} if processing should continue, {@code false} otherwise. + * + *

Note: This is a functional interface as a specialization of + * {@link java.util.function.BiPredicate} for {@code int}.

+ */ + @FunctionalInterface + interface CellConsumer { + /** + * Performs an operation on the given {@code } pair. + * + * @param index the bit index. + * @param count the cell value at the specified bit index. + * @return {@code true} if processing should continue, {@code false} if processing should stop. + */ + boolean test(int index, int count); + } +} + diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java index a86afa1f36..7c9310f406 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java @@ -20,7 +20,8 @@ /** * The interface that describes a Bloom filter that associates a count with each - * bit index to allow reversal of merge operations with remove operations. + * bit index rather than a bit. This allows reversal of merge operations with + * remove operations. * *

A counting Bloom filter is expected to function identically to a standard * Bloom filter that is the merge of all the Bloom filters that have been added @@ -30,29 +31,30 @@ * remove order, is expected to be the same.

* *

Removal of a filter that has not previously been merged results in an - * invalid state where the counts no longer represent a sum of merged Bloom + * invalid state where the cells no longer represent a sum of merged Bloom * filters. It is impossible to validate merge and remove exactly without * explicitly storing all filters. Consequently such an operation may go * undetected. The CountingBloomFilter maintains a state flag that is used as a - * warning that an operation was performed that resulted in invalid counts and - * thus an invalid state. For example this may occur if a count for an index was + * warning that an operation was performed that resulted in invalid cells and + * thus an invalid state. For example this may occur if a cell for an index was * set to negative following a remove operation.

* *

Implementations should document the expected state of the filter after an - * operation that generates invalid counts, and any potential recovery options. + * operation that generates invalid cells, and any potential recovery options. * An implementation may support a reversal of the operation to restore the - * state to that prior to the operation. In the event that invalid counts are + * state to that prior to the operation. In the event that invalid cells are * adjusted to a valid range then it should be documented if there has been * irreversible information loss.

* *

Implementations may choose to throw an exception during an operation that - * generates invalid counts. Implementations should document the expected state - * of the filter after such an operation. For example are the counts not updated, + * generates invalid cells. Implementations should document the expected state + * of the filter after such an operation. For example are the cells not updated, * partially updated or updated entirely before the exception is raised.

* + * @see CellProducer * @since 4.5 */ -public interface CountingBloomFilter extends BloomFilter, BitCountProducer { +public interface CountingBloomFilter extends BloomFilter, CellProducer { // Query Operations @@ -60,9 +62,9 @@ public interface CountingBloomFilter extends BloomFilter, BitCountProducer { * Returns {@code true} if the internal state is valid. * *

This flag is a warning that an addition or - * subtraction of counts from this filter resulted in an invalid count for one or more - * indexes. For example this may occur if a count for an index was - * set to negative following a subtraction operation, or overflows an {@code int} following an + * subtraction of cells from this filter resulted in an invalid cell for one or more + * indexes. For example this may occur if a cell for an index was + * set to negative following a subtraction operation, or overflows the value specified by {@code getMaxCell()} following an * addition operation.

* *

A counting Bloom filter that has an invalid state is no longer ensured to function @@ -77,14 +79,81 @@ public interface CountingBloomFilter extends BloomFilter, BitCountProducer { */ boolean isValid(); + /** + * Returns the maximum allowable value for a cell count in this Counting filter. + * @return the maximum allowable value for a cell count in this Counting filter. + */ + int getMaxCell(); + + /** + * Determines the maximum number of times the Bloom filter could have been merged + * into this counting filter. + * @param bloomFilter the Bloom filter the check for. + * @return the maximum number of times the Bloom filter could have been inserted. + */ + default int getMaxInsert(BloomFilter bloomFilter) { + return getMaxInsert((BitMapProducer) bloomFilter); + } + + /** + * Determines the maximum number of times the IndexProducer could have been merged + * into this counting filter. + *

To determine how many times an indexProducer could have been added, create a CellProducer + * from the indexProducer and check that with {@code getMaxInsert(CellProducer)}.

+ * @param idxProducer the producer to drive the count check. + * @return the maximum number of times the IndexProducer could have been inserted. + * @see #getMaxInsert(CellProducer) + */ + default int getMaxInsert(IndexProducer idxProducer) { + return getMaxInsert(CellProducer.from(idxProducer.uniqueIndices()) ); + } + + /** + * Determines the maximum number of times the Cell Producer could have been add. + * @param cellProducer the producer of cells. + * @return the maximum number of times the CellProducer could have been inserted. + */ + int getMaxInsert(CellProducer cellProducer); + + /** + * Determines the maximum number of times the Hasher could have been merged into this + * counting filter. + * @param hasher the Hasher to provide the indices. + * @return the maximum number of times the hasher could have been inserted. + */ + default int getMaxInsert(Hasher hasher) { + return getMaxInsert(hasher.indices(getShape())); + } + + /** + * Determines the maximum number of times the BitMapProducer could have been merged into this + * counting filter. + * @param bitMapProducer the BitMapProducer to provide the indices. + * @return the maximum number of times the BitMapProducer could have been inserted. + */ + default int getMaxInsert(BitMapProducer bitMapProducer) { + if (!contains(bitMapProducer)) { + return 0; + } + long[] bitMaps = bitMapProducer.asBitMapArray(); + int[] max = { Integer.MAX_VALUE }; + forEachCell((x, y) -> { + if ((bitMaps[BitMap.getLongIndex(x)] & BitMap.getLongBit(x)) != 0) { + max[0] = max[0] <= y ? max[0] : y; + } + return true; + }); + return max[0]; + } + // Modification Operations /** * Merges the specified Bloom filter into this Bloom filter. * - *

Specifically: all counts for the indexes identified by the {@code other} filter will be incremented by 1.

+ *

Specifically: all cells for the indexes identified by the {@code other} filter will be incremented by 1.

* - *

Note: If the other filter is a counting Bloom filter the index counts are ignored and it is treated as an + *

Note: If the other filter is a counting Bloom filter the other filter's cells are ignored and it is treated as an * IndexProducer.

* *

This method will return {@code true} if the filter is valid after the operation.

@@ -92,7 +161,7 @@ public interface CountingBloomFilter extends BloomFilter, BitCountProducer { * @param other the other Bloom filter * @return {@code true} if the removal was successful and the state is valid * @see #isValid() - * @see #add(BitCountProducer) + * @see #add(CellProducer) */ @Override default boolean merge(final BloomFilter other) { @@ -103,40 +172,41 @@ default boolean merge(final BloomFilter other) { /** * Merges the specified Hasher into this Bloom filter. * - *

Specifically: all counts for the unique indexes identified by the {@code hasher} will be incremented by 1.

+ *

Specifically: all cells for the unique indexes identified by the {@code hasher} will be incremented by 1.

* *

This method will return {@code true} if the filter is valid after the operation.

* * @param hasher the hasher * @return {@code true} if the removal was successful and the state is valid * @see #isValid() - * @see #add(BitCountProducer) + * @see #add(CellProducer) */ @Override default boolean merge(final Hasher hasher) { Objects.requireNonNull(hasher, "hasher"); - return merge(hasher.uniqueIndices(getShape())); + return merge(hasher.indices(getShape())); } /** * Merges the specified index producer into this Bloom filter. * - *

Specifically: all counts for the indexes identified by the {@code indexProducer} will be incremented by 1.

+ *

Specifically: all cells for the unique indices identified by the {@code indexProducer} will be incremented by 1.

* *

This method will return {@code true} if the filter is valid after the operation.

* - *

Note: Indices that are returned multiple times will be incremented multiple times.

+ *

Note: If indices that are returned multiple times should be incremented multiple times, convert the IndexProducer + * to a CellProducer and add that.

* * @param indexProducer the IndexProducer * @return {@code true} if the removal was successful and the state is valid * @see #isValid() - * @see #add(BitCountProducer) + * @see #add(CellProducer) */ @Override default boolean merge(final IndexProducer indexProducer) { Objects.requireNonNull(indexProducer, "indexProducer"); try { - return add(BitCountProducer.from(indexProducer)); + return add(CellProducer.from(indexProducer.uniqueIndices())); } catch (final IndexOutOfBoundsException e) { throw new IllegalArgumentException( String.format("Filter only accepts values in the [0,%d) range", getShape().getNumberOfBits()), e); @@ -146,14 +216,14 @@ default boolean merge(final IndexProducer indexProducer) { /** * Merges the specified BitMap producer into this Bloom filter. * - *

Specifically: all counts for the indexes identified by the {@code bitMapProducer} will be incremented by 1.

+ *

Specifically: all cells for the indexes identified by the {@code bitMapProducer} will be incremented by 1.

* *

This method will return {@code true} if the filter is valid after the operation.

* * @param bitMapProducer the BitMapProducer * @return {@code true} if the removal was successful and the state is valid * @see #isValid() - * @see #add(BitCountProducer) + * @see #add(CellProducer) */ @Override default boolean merge(final BitMapProducer bitMapProducer) { @@ -164,9 +234,9 @@ default boolean merge(final BitMapProducer bitMapProducer) { /** * Removes the specified Bloom filter from this Bloom filter. * - *

Specifically: all counts for the indexes identified by the {@code other} filter will be decremented by 1.

+ *

Specifically: all cells for the indexes identified by the {@code other} filter will be decremented by 1.

* - *

Note: If the other filter is a counting Bloom filter the index counts are ignored and it is treated as an + *

Note: If the other filter is a counting Bloom filter the other filter's cells are ignored and it is treated as an * IndexProducer.

* *

This method will return {@code true} if the filter is valid after the operation.

@@ -174,7 +244,7 @@ default boolean merge(final BitMapProducer bitMapProducer) { * @param other the other Bloom filter * @return {@code true} if the removal was successful and the state is valid * @see #isValid() - * @see #subtract(BitCountProducer) + * @see #subtract(CellProducer) */ default boolean remove(final BloomFilter other) { Objects.requireNonNull(other, "other"); @@ -184,7 +254,7 @@ default boolean remove(final BloomFilter other) { /** * Removes the unique values from the specified hasher from this Bloom filter. * - *

Specifically all counts for the unique indices produced by the {@code hasher} will be + *

Specifically all cells for the unique indices produced by the {@code hasher} will be * decremented by 1.

* *

This method will return {@code true} if the filter is valid after the operation.

@@ -192,32 +262,33 @@ default boolean remove(final BloomFilter other) { * @param hasher the hasher to provide the indexes * @return {@code true} if the removal was successful and the state is valid * @see #isValid() - * @see #subtract(BitCountProducer) + * @see #subtract(CellProducer) */ default boolean remove(final Hasher hasher) { Objects.requireNonNull(hasher, "hasher"); - return remove(hasher.uniqueIndices(getShape())); + return remove(hasher.indices(getShape())); } /** * Removes the values from the specified IndexProducer from the Bloom filter from this Bloom filter. * - *

Specifically all counts for the unique indices produced by the {@code hasher} will be + *

Specifically all cells for the unique indices produced by the {@code indexProducer} will be * decremented by 1.

* *

This method will return {@code true} if the filter is valid after the operation.

* - *

Node: This method expects index producers that produce unique values.

+ *

Note: If indices that are returned multiple times should be decremented multiple times, convert the IndexProducer + * to a CellProducer and subtract that.

* * @param indexProducer the IndexProducer to provide the indexes * @return {@code true} if the removal was successful and the state is valid * @see #isValid() - * @see #subtract(BitCountProducer) + * @see #subtract(CellProducer) */ default boolean remove(final IndexProducer indexProducer) { Objects.requireNonNull(indexProducer, "indexProducer"); try { - return subtract(BitCountProducer.from(indexProducer)); + return subtract(CellProducer.from(indexProducer.uniqueIndices())); } catch (final IndexOutOfBoundsException e) { throw new IllegalArgumentException( String.format("Filter only accepts values in the [0,%d) range", getShape().getNumberOfBits())); @@ -227,7 +298,7 @@ default boolean remove(final IndexProducer indexProducer) { /** * Removes the specified BitMapProducer from this Bloom filter. * - *

Specifically all counts for the indices produced by the {@code bitMapProducer} will be + *

Specifically all cells for the indices produced by the {@code bitMapProducer} will be * decremented by 1.

* *

This method will return {@code true} if the filter is valid after the operation.

@@ -235,7 +306,7 @@ default boolean remove(final IndexProducer indexProducer) { * @param bitMapProducer the BitMapProducer to provide the indexes * @return {@code true} if the removal was successful and the state is valid * @see #isValid() - * @see #subtract(BitCountProducer) + * @see #subtract(CellProducer) */ default boolean remove(final BitMapProducer bitMapProducer) { Objects.requireNonNull(bitMapProducer, "bitMapProducer"); @@ -243,36 +314,36 @@ default boolean remove(final BitMapProducer bitMapProducer) { } /** - * Adds the specified BitCountProducer to this Bloom filter. + * Adds the specified CellProducer to this Bloom filter. * *

Specifically - * all counts for the indexes identified by the {@code other} will be incremented + * all cells for the indexes identified by the {@code other} will be incremented * by their corresponding values in the {@code other}.

* *

This method will return {@code true} if the filter is valid after the operation.

* - * @param other the BitCountProducer to add. + * @param other the CellProducer to add. * @return {@code true} if the addition was successful and the state is valid * @see #isValid() - * @see #subtract(BitCountProducer) + * @see #subtract(CellProducer) */ - boolean add(BitCountProducer other); + boolean add(CellProducer other); /** - * Adds the specified BitCountProducer to this Bloom filter. + * Subtracts the specified CellProducer from this Bloom filter. * *

Specifically - * all counts for the indexes identified by the {@code other} will be decremented + * all cells for the indexes identified by the {@code other} will be decremented * by their corresponding values in the {@code other}.

* *

This method will return true if the filter is valid after the operation.

* - * @param other the BitCountProducer to subtract. + * @param other the CellProducer to subtract. * @return {@code true} if the subtraction was successful and the state is valid * @see #isValid() - * @see #add(BitCountProducer) + * @see #add(CellProducer) */ - boolean subtract(BitCountProducer other); + boolean subtract(CellProducer other); /** @@ -281,4 +352,9 @@ default boolean remove(final BitMapProducer bitMapProducer) { */ @Override CountingBloomFilter copy(); + + @Override + default IndexProducer uniqueIndices() { + return this; + } } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/Hasher.java b/src/main/java/org/apache/commons/collections4/bloomfilter/Hasher.java index d8b3a43aa9..5b1b6a127b 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/Hasher.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/Hasher.java @@ -16,8 +16,6 @@ */ package org.apache.commons.collections4.bloomfilter; -import java.util.Objects; - /** * A Hasher creates IndexProducer based on the hash implementation and the * provided Shape. @@ -44,21 +42,4 @@ public interface Hasher { * @return the iterator of integers */ IndexProducer indices(Shape shape); - - /** - * Creates an IndexProducer of unique indices for this hasher based on the Shape. - * - *

This is like the `indices(Shape)` method except that it adds the guarantee that no - * duplicate values will be returned. The indices produced are equivalent to those returned - * from by a Bloom filter created from this hasher.

- * - * @param shape the shape of the desired Bloom filter. - * @return the iterator of integers - */ - default IndexProducer uniqueIndices(final Shape shape) { - return consumer -> { - Objects.requireNonNull(consumer, "consumer"); - return indices(shape).forEachIndex(IndexFilter.create(shape, consumer)); - }; - } } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/IndexFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/IndexFilter.java index c7e6ca1861..57f70f5638 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/IndexFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/IndexFilter.java @@ -72,7 +72,10 @@ public boolean test(final int number) { if (number >= size) { throw new IndexOutOfBoundsException(String.format("number too large %d >= %d", number, size)); } - return !tracker.test(number) || consumer.test(number); + if (tracker.test(number)) { + return consumer.test(number); + } + return true; } /** diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java b/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java index dbaf0908c7..0269d34eac 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java @@ -16,6 +16,7 @@ */ package org.apache.commons.collections4.bloomfilter; +import java.util.Arrays; import java.util.BitSet; import java.util.Objects; import java.util.function.IntPredicate; @@ -107,21 +108,69 @@ public boolean test(long word) { *

Indices ordering and uniqueness is not guaranteed.

* *

- * The default implementation of this method is slow. It is recommended - * that implementing classes reimplement this method. + * The default implementation of this method creates an array and populates + * it. Implementations that have access to an index array should consider + * returning a copy of that array if possible. *

* - *

- * The default implementation of this method returns unique values in order. - *

* @return An int array of the data. */ default int[] asIndexArray() { - final BitSet result = new BitSet(); + class Indices { + private int[] data = new int[32]; + private int size; + + boolean add(final int index) { + data = IndexUtils.ensureCapacityForAdd(data, size); + data[size++] = index; + return true; + } + + int[] toArray() { + // Edge case to avoid a large array copy + return size == data.length ? data : Arrays.copyOf(data, size); + } + } + Indices indices = new Indices(); + forEachIndex(indices::add); + return indices.toArray(); + } + + /** + * Creates an IndexProducer comprising the unique indices for this producer. + * + *

By default creates a new producer with some overhead to remove + * duplicates. IndexProducers that return unique indices by default + * should override this to return {@code this}.

+ * + *

The default implementation will filter the indices from this instance + * and return them in ascending order.

+ * + * @return the IndexProducer of unique values. + * @throws IndexOutOfBoundsException if any index is less than zero. + */ + default IndexProducer uniqueIndices() { + final BitSet bitSet = new BitSet(); forEachIndex(i -> { - result.set(i); + bitSet.set(i); return true; }); - return result.stream().toArray(); + + return new IndexProducer() { + @Override + public boolean forEachIndex(IntPredicate predicate) { + for (int idx = bitSet.nextSetBit(0); idx >= 0; idx = bitSet.nextSetBit(idx + 1)) { + if (!predicate.test(idx)) { + return false; + } + } + return true; + } + + @Override + public IndexProducer uniqueIndices() { + return this; + } + }; } } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/IndexUtils.java b/src/main/java/org/apache/commons/collections4/bloomfilter/IndexUtils.java new file mode 100644 index 0000000000..96bfefec02 --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/IndexUtils.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import java.util.Arrays; + +/** + * Provides functions to assist in IndexProducer creation and manipulation. 
+ * @see IndexProducer + */ +final class IndexUtils { + + /** + * The maximum array size for the methods in this class. + */ + static final int MAX_ARRAY_SIZE = Integer.MAX_VALUE - 8; + + // do not instantiate + private IndexUtils() {} + + /** + * Ensure the array can add an element at the specified index. + * @param array the array to check. + * @param index the index to add at. + * @return the array or a newly allocated copy of the array. + */ + static int[] ensureCapacityForAdd(int[] array, int index) { + if (index >= array.length) { + return Arrays.copyOf(array, (int) Math.min(IndexUtils.MAX_ARRAY_SIZE, Math.max(array.length * 2L, index + 1))); + } + return array; + } +} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/package-info.java b/src/main/java/org/apache/commons/collections4/bloomfilter/package-info.java index a7fb009540..7df764182d 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/package-info.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/package-info.java @@ -32,6 +32,9 @@ * list. There are lots of other uses, and in most cases the reason is to perform a fast check as a gateway for a longer * operation.

* + *

Some Bloom filters (e.g. CountingBloomFilter) use counters rather than bits. In this case each counter + * is called a {@code cell}.

+ * *

BloomFilter

* *

The Bloom filter architecture here is designed for speed of execution, so some methods like {@code merge}, {@code remove}, diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBitCountProducerTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBitCountProducerTest.java deleted file mode 100644 index 2a5aa0a622..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBitCountProducerTest.java +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.commons.collections4.bloomfilter; - -import static org.junit.jupiter.api.Assertions.assertArrayEquals; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertTrue; -import static org.junit.jupiter.api.Assertions.fail; -import static org.junit.jupiter.api.Assumptions.assumeTrue; - -import java.util.Arrays; -import java.util.BitSet; - -import org.apache.commons.collections4.bag.TreeBag; -import org.apache.commons.collections4.bloomfilter.BitCountProducer.BitCountConsumer; -import org.junit.jupiter.api.Test; - -public abstract class AbstractBitCountProducerTest extends AbstractIndexProducerTest { - - /** - * A testing BitCountConsumer that always returns true. - */ - private static final BitCountConsumer TRUE_CONSUMER = (i, j) -> true; - /** - * A testing BitCountConsumer that always returns false. - */ - private static final BitCountConsumer FALSE_CONSUMER = (i, j) -> false; - - /** - * Creates an array of integer pairs comprising the index and the expected count for the index. - * The order and count for each index is dependent upon the producer created by the {@code createProducer()} - * method. - * By default returns the each {@code getExpectedIndices()} value paired with 1 (one). - * @return an array of integer pairs comprising the index and the expected count for the index. - */ - protected int[][] getExpectedBitCount() { - return Arrays.stream(getExpectedIndices()).mapToObj(x -> new int[] {x, 1}).toArray(int[][]::new); - } - - /** - * Creates a producer with some data. - * @return a producer with some data - */ - @Override - protected abstract BitCountProducer createProducer(); - - /** - * Creates a producer without data. - * @return a producer that has no data. 
- */ - @Override - protected abstract BitCountProducer createEmptyProducer(); - - /** - * Gets the behavior of the {@link BitCountProducer#forEachCount(BitCountConsumer)} method. - * By default returns the value of {@code getAsIndexArrayBehaviour()} method. - * @return the behavior. - */ - protected int getForEachCountBehaviour() { - return getAsIndexArrayBehaviour(); - } - - @Test - public final void testForEachCountPredicates() { - final BitCountProducer populated = createProducer(); - final BitCountProducer empty = createEmptyProducer(); - - assertFalse(populated.forEachCount(FALSE_CONSUMER), "non-empty should be false"); - assertTrue(empty.forEachCount(FALSE_CONSUMER), "empty should be true"); - - assertTrue(populated.forEachCount(TRUE_CONSUMER), "non-empty should be true"); - assertTrue(empty.forEachCount(TRUE_CONSUMER), "empty should be true"); - } - - @Test - public final void testEmptyBitCountProducer() { - final BitCountProducer empty = createEmptyProducer(); - final int ary[] = empty.asIndexArray(); - assertEquals(0, ary.length); - assertTrue(empty.forEachCount((i, j) -> { - fail("forEachCount consumer should not be called"); - return false; - })); - } - - @Test - public final void testIndexConsistency() { - final BitCountProducer producer = createProducer(); - final BitSet bs1 = new BitSet(); - final BitSet bs2 = new BitSet(); - producer.forEachIndex(i -> { - bs1.set(i); - return true; - }); - producer.forEachCount((i, j) -> { - bs2.set(i); - return true; - }); - assertEquals(bs1, bs2); - } - - @Test - public void testForEachCountValues() { - // Assumes the collections bag works. Could be replaced with Map with more work. 
- final TreeBag expected = new TreeBag<>(); - Arrays.stream(getExpectedBitCount()).forEach(c -> expected.add(c[0], c[1])); - final TreeBag actual = new TreeBag<>(); - // can not return actual.add as it returns false on duplicate 'i' - createProducer().forEachCount((i, j) -> { - actual.add(i, j); - return true; - }); - assertEquals(expected, actual); - } - - /** - * Test the behavior of {@link BitCountProducer#forEachCount(BitCountConsumer)} with respect - * to ordered and distinct indices. Currently the behavior is assumed to be the same as - * {@link IndexProducer#forEachIndex(java.util.function.IntPredicate)}. - */ - @Test - public final void testBehaviourForEachCount() { - final int flags = getForEachCountBehaviour(); - assumeTrue((flags & (ORDERED | DISTINCT)) != 0); - final IntList list = new IntList(); - createProducer().forEachCount((i, j) -> list.add(i)); - final int[] actual = list.toArray(); - if ((flags & ORDERED) != 0) { - final int[] expected = Arrays.stream(actual).sorted().toArray(); - assertArrayEquals(expected, actual); - } - if ((flags & DISTINCT) != 0) { - final long count = Arrays.stream(actual).distinct().count(); - assertEquals(count, actual.length); - } - } - - @Test - public void testForEachCountEarlyExit() { - final int[] passes = new int[1]; - assertTrue(createEmptyProducer().forEachCount((i, j) -> { - passes[0]++; - return false; - })); - assertEquals(0, passes[0]); - - assertFalse(createProducer().forEachCount((i, j) -> { - passes[0]++; - return false; - })); - assertEquals(1, passes[0]); - } -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java index 7e1666a074..e4a9082771 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java @@ -94,9 +94,6 @@ protected final T 
createFilter(final Shape shape, final IndexProducer producer) return bf; } - /** - * - */ @Test public void testMergeWithBadHasher() { // value too large @@ -451,10 +448,5 @@ public BadHasher(final int value) { public IndexProducer indices(final Shape shape) { return producer; } - - @Override - public IndexProducer uniqueIndices(final Shape shape) { - return producer; - } } } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractCellProducerTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractCellProducerTest.java new file mode 100644 index 0000000000..8433161d1d --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractCellProducerTest.java @@ -0,0 +1,155 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.collections4.bloomfilter; + +import static org.junit.jupiter.api.Assertions.assertArrayEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.fail; + +import java.util.Arrays; +import java.util.BitSet; + +import org.apache.commons.collections4.bloomfilter.CellProducer.CellConsumer; +import org.junit.jupiter.api.Test; + +public abstract class AbstractCellProducerTest extends AbstractIndexProducerTest { + + /** + * A testing CellConsumer that always returns true. + */ + private static final CellConsumer TRUE_CONSUMER = (i, j) -> true; + /** + * A testing CellConsumer that always returns false. + */ + private static final CellConsumer FALSE_CONSUMER = (i, j) -> false; + + /** + * Creates an array of expected values that aligns with the expected indices entries. + * @return an array of expected values. + * @see AbstractIndexProducerTest#getExpectedIndices() + */ + protected abstract int[] getExpectedValues(); + + @Override + protected final int getAsIndexArrayBehaviour() { + return ORDERED | DISTINCT; + } + + /** + * Creates a producer with some data. + * @return a producer with some data + */ + @Override + protected abstract CellProducer createProducer(); + + /** + * Creates a producer without data. + * @return a producer that has no data. 
+ */ + @Override + protected abstract CellProducer createEmptyProducer(); + + @Test + public final void testForEachCellPredicates() { + final CellProducer populated = createProducer(); + final CellProducer empty = createEmptyProducer(); + + assertFalse(populated.forEachCell(FALSE_CONSUMER), "non-empty should be false"); + assertTrue(empty.forEachCell(FALSE_CONSUMER), "empty should be true"); + + assertTrue(populated.forEachCell(TRUE_CONSUMER), "non-empty should be true"); + assertTrue(empty.forEachCell(TRUE_CONSUMER), "empty should be true"); + } + + @Test + public final void testEmptyCellProducer() { + final CellProducer empty = createEmptyProducer(); + final int ary[] = empty.asIndexArray(); + assertEquals(0, ary.length); + assertTrue(empty.forEachCell((i, j) -> { + fail("forEachCell consumer should not be called"); + return false; + })); + } + + @Test + public final void testIndexConsistency() { + final CellProducer producer = createProducer(); + final BitSet bs1 = new BitSet(); + final BitSet bs2 = new BitSet(); + producer.forEachIndex(i -> { + bs1.set(i); + return true; + }); + producer.forEachCell((i, j) -> { + bs2.set(i); + return true; + }); + assertEquals(bs1, bs2); + } + + @Test + public void testForEachCellValues() { + int[] expectedIdx = getExpectedIndices(); + int[] expectedValue = getExpectedValues(); + assertEquals(expectedIdx.length, expectedValue.length, "expected index length and value length do not match"); + int[] idx = {0}; + createProducer().forEachCell((i, j) -> { + assertEquals(expectedIdx[idx[0]], i, "bad index at " + idx[0]); + assertEquals(expectedValue[idx[0]], j, "bad value at " + idx[0]); + idx[0]++; + return true; + }); + } + + /** + * Test the behavior of {@link CellProducer#forEachCell(CellConsumer)} with respect + * to ordered and distinct indices. Currently the behavior is assumed to be the same as + * {@link IndexProducer#forEachIndex(java.util.function.IntPredicate)}. 
+ */ + @Test + public final void testBehaviourForEachCell() { + final IntList list = new IntList(); + createProducer().forEachCell((i, j) -> list.add(i)); + final int[] actual = list.toArray(); + // check order + final int[] expected = Arrays.stream(actual).sorted().toArray(); + assertArrayEquals(expected, actual); + // check distinct + final long count = Arrays.stream(actual).distinct().count(); + assertEquals(count, actual.length); + } + + @Test + public void testForEachCellEarlyExit() { + final int[] passes = new int[1]; + assertTrue(createEmptyProducer().forEachCell((i, j) -> { + passes[0]++; + return false; + })); + assertEquals(0, passes[0]); + + assertFalse(createProducer().forEachCell((i, j) -> { + passes[0]++; + return false; + })); + assertEquals(1, passes[0]); + } +} + diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractCountingBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractCountingBloomFilterTest.java index de424111f0..6d489d8d13 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractCountingBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractCountingBloomFilterTest.java @@ -39,14 +39,16 @@ public abstract class AbstractCountingBloomFilterTest { - for (int i = 1; i < 18; i++) { - if (!consumer.test(i, Integer.MAX_VALUE)) { - return false; + protected final CellProducer getMaximumValueProducer(int maxValue) { + return consumer -> { + for (int i = 1; i < 18; i++) { + if (!consumer.test(i, maxValue)) { + return false; + } } - } - return true; - }; + return true; + }; + } /** * Assert the counts match the expected values. 
Values are for indices starting @@ -57,7 +59,7 @@ public abstract class AbstractCountingBloomFilterTest m = new HashMap<>(); - bf.forEachCount((i, c) -> { + bf.forEachCell((i, c) -> { m.put(i, c); return true; }); @@ -82,7 +84,7 @@ public final void testCountingSpecificConstructor() { // verify hasher duplicates are counted. // bit hasher has duplicates for 11, 12,13,14,15,16, and 17 final CountingBloomFilter bf = createFilter(getTestShape(), TestingHashers.FROM1); - bf.add(BitCountProducer.from(TestingHashers.FROM11.indices(getTestShape()))); + bf.add(CellProducer.from(TestingHashers.FROM11.indices(getTestShape()))); final long[] lb = bf.asBitMapArray(); assertEquals(2, lb.length); @@ -130,7 +132,7 @@ public final void testCountingSpecificMerge() { // test overflow final CountingBloomFilter bf5 = createEmptyFilter(getTestShape()); - assertTrue(bf5.add(maximumValueProducer), "Should add to empty"); + assertTrue(bf5.add(getMaximumValueProducer(bf5.getMaxCell())), "Should add to empty"); assertTrue(bf5.isValid(), "Should be valid"); final CountingBloomFilter bf6 = bf5.copy(); @@ -155,7 +157,7 @@ public void testAdd() { // test overflow final CountingBloomFilter bf2 = createEmptyFilter(getTestShape()); - assertTrue(bf2.add(maximumValueProducer), "Should add to empty"); + assertTrue(bf2.add(getMaximumValueProducer(bf2.getMaxCell())), "Should add to empty"); assertTrue(bf2.isValid(), "Should be valid"); assertFalse(bf2.add(createFilter(getTestShape(), TestingHashers.FROM1)), "Should not add"); @@ -169,7 +171,7 @@ public void testAdd() { @Test public final void testSubtract() { final CountingBloomFilter bf1 = createFilter(getTestShape(), TestingHashers.FROM1); - bf1.add(BitCountProducer.from(TestingHashers.FROM11.indices(getTestShape()))); + bf1.add(CellProducer.from(TestingHashers.FROM11.indices(getTestShape()))); final CountingBloomFilter bf2 = createFilter(getTestShape(), TestingHashers.FROM11); @@ -190,6 +192,9 @@ public final void testSubtract() { 
assertFalse(bf3.contains(bf4), "Should not contain"); assertCounts(bf3, new int[] {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}); + + assertThrows(IllegalArgumentException.class, () -> bf3.remove( new BadHasher(-1))); + assertThrows(IllegalArgumentException.class, () -> bf3.remove( new BadHasher(getTestShape().getNumberOfBits())));; } /** @@ -202,7 +207,7 @@ public final void testRemove() { simple.merge(TestingHashers.FROM11); final CountingBloomFilter bf1 = createFilter(getTestShape(), TestingHashers.FROM1); - bf1.add(BitCountProducer.from(TestingHashers.FROM11.indices(getTestShape()))); + bf1.add(CellProducer.from(TestingHashers.FROM11.indices(getTestShape()))); assertTrue(bf1.remove(simple), "Remove should work"); assertFalse(bf1.contains(TestingHashers.FROM11), "Should not contain"); @@ -212,7 +217,7 @@ public final void testRemove() { // with hasher final CountingBloomFilter bf2 = createFilter(getTestShape(), TestingHashers.FROM1); - bf2.add(BitCountProducer.from(TestingHashers.FROM11.indices(getTestShape()))); + bf2.add(CellProducer.from(TestingHashers.FROM11.indices(getTestShape()))); assertTrue(bf2.remove(TestingHashers.FROM11), "Remove should work"); assertFalse(bf2.contains(TestingHashers.FROM11), "Should not contain"); @@ -233,7 +238,7 @@ public final void testRemove() { final IndexProducer ip = TestingHashers.FROM11.indices(getTestShape()); final CountingBloomFilter bf4 = createFilter(getTestShape(), TestingHashers.FROM1); - bf4.add(BitCountProducer.from(TestingHashers.FROM11.indices(getTestShape()))); + bf4.add(CellProducer.from(TestingHashers.FROM11.indices(getTestShape()))); assertTrue(bf4.remove(ip), "Remove should work"); assertFalse(bf4.contains(TestingHashers.FROM11), "Should not contain"); @@ -244,7 +249,7 @@ public final void testRemove() { // with BitMapProducer final BitMapProducer bmp = BitMapProducer.fromIndexProducer(ip, getTestShape().getNumberOfBits()); final CountingBloomFilter bf5 = createFilter(getTestShape(), TestingHashers.FROM1); - 
bf5.add(BitCountProducer.from(TestingHashers.FROM11.indices(getTestShape()))); + bf5.add(CellProducer.from(TestingHashers.FROM11.indices(getTestShape()))); assertTrue(bf5.remove(bmp), "Remove should work"); assertFalse(bf5.contains(TestingHashers.FROM11), "Should not contain"); @@ -260,6 +265,8 @@ public final void testRemove() { final CountingBloomFilter bf7 = createFilter(getTestShape(), TestingHashers.FROM1); final BitMapProducer bmp2 = BitMapProducer.fromIndexProducer(ip2, getTestShape().getNumberOfBits()); assertThrows(IllegalArgumentException.class, () -> bf7.remove(bmp2)); + assertThrows(IllegalArgumentException.class, () -> bf7.remove( new BadHasher(-1))); + assertThrows(IllegalArgumentException.class, () -> bf7.remove( new BadHasher(getTestShape().getNumberOfBits())));; } @Test @@ -272,7 +279,7 @@ public void testExcludesDuplicates() { CountingBloomFilter bf1 = createFilter(shape, hasher); assertEquals(6, bf1.cardinality()); - bf1.forEachCount((x, y) -> { + bf1.forEachCell((x, y) -> { assertEquals(1, y, "Hasher in constructor results in value not equal to 1"); return true; }); @@ -280,7 +287,7 @@ public void testExcludesDuplicates() { bf1 = createEmptyFilter(shape); bf1.merge(hasher); assertEquals(6, bf1.cardinality()); - bf1.forEachCount((x, y) -> { + bf1.forEachCell((x, y) -> { assertEquals(1, y, "Hasher in merge results in value not equal to 1"); return true; }); @@ -289,6 +296,110 @@ public void testExcludesDuplicates() { bf1.merge(hasher); bf1.remove(hasher); assertEquals(0, bf1.cardinality()); - assertTrue(bf1.forEachCount((x, y) -> false), "Hasher in removes results in value not equal to 0"); + assertTrue(bf1.forEachCell((x, y) -> false), "Hasher in removes results in value not equal to 0"); + } + + private void verifyMaxInsert(CountingBloomFilter bf, int from1, int from11) { + BloomFilter bfFrom0 = new DefaultBloomFilterTest.SparseDefaultBloomFilter(getTestShape()); + bfFrom0.merge(new IncrementingHasher(0, 1)); + BloomFilter bfFrom1 = new 
DefaultBloomFilterTest.SparseDefaultBloomFilter(getTestShape()); + bfFrom1.merge(TestingHashers.FROM1); + BloomFilter bfFrom11 = new DefaultBloomFilterTest.SparseDefaultBloomFilter(getTestShape()); + bfFrom11.merge(TestingHashers.FROM11); + + assertEquals(0, bf.getMaxInsert(new IncrementingHasher(0, 1))); + assertEquals(0, bf.getMaxInsert(bfFrom0)); + assertEquals(0, bf.getMaxInsert((BitMapProducer) bfFrom0)); + assertEquals(0, bf.getMaxInsert((IndexProducer) bfFrom0)); + + assertEquals(from1, bf.getMaxInsert(TestingHashers.FROM1)); + assertEquals(from1, bf.getMaxInsert(bfFrom1)); + assertEquals(from1, bf.getMaxInsert((BitMapProducer) bfFrom1)); + assertEquals(from1, bf.getMaxInsert((IndexProducer) bfFrom1)); + + assertEquals(from11, bf.getMaxInsert(TestingHashers.FROM11)); + assertEquals(from11, bf.getMaxInsert(bfFrom11)); + assertEquals(from11, bf.getMaxInsert((BitMapProducer) bfFrom11)); + assertEquals(from11, bf.getMaxInsert((IndexProducer) bfFrom11)); + } + + @Test + public void testGetMaxInsert() { + CountingBloomFilter bf = createEmptyFilter(getTestShape()); + verifyMaxInsert(bf, 0, 0); + bf.merge(TestingHashers.FROM1); + verifyMaxInsert(bf, 1, 0); + bf.merge(TestingHashers.FROM1); + verifyMaxInsert(bf, 2, 0); + bf.merge(TestingHashers.FROM11); + verifyMaxInsert(bf, 2, 1); + bf.remove(TestingHashers.FROM1); + verifyMaxInsert(bf, 1, 1); + // verify remove false positive works + // Incrementing hasher 5,1 spans the single count cells for both FROM1 and FROM11 + assertEquals(1, bf.getMaxInsert(new IncrementingHasher(5, 1))); + bf.remove(new IncrementingHasher(5, 1)); + verifyMaxInsert(bf, 0, 0); + assertEquals(0, bf.getMaxInsert(new IncrementingHasher(5, 1))); + } + + private void assertCell3(CountingBloomFilter bf, int value) { + bf.forEachCell((k, v) -> { + if (k == 3) { + assertEquals(value, v, "Mismatch at position 3"); + } else { + assertEquals(0, v, "Mismatch at position " + k); + } + return true; + }); + } + + @Test + public void 
mergeIncrementsAllCellsTest() { + CountingBloomFilter f1 = createEmptyFilter(Shape.fromKM(1, 10)); + CountingBloomFilter f2 = f1.copy(); + CountingBloomFilter f3 = f1.copy(); + // index producer produces 3 two times. + IndexProducer ip = p -> { + p.test(3); + p.test(3); + return true; + }; + // The merge should increment cell 3 by 1 + f1.merge(ip); + assertCell3(f1, 1); + + // The add should increment cell 3 by 2 + f2.add(CellProducer.from(ip)); + assertCell3(f2, 2); + } + + @Test + public void removeDecrementsAllCellsTest() { + CountingBloomFilter f1 = createEmptyFilter(Shape.fromKM(1, 10)); + CellProducer cp = p -> { + p.test(3, 3); + return true; + }; + f1.add(cp); + CountingBloomFilter f2 = f1.copy(); + CountingBloomFilter f3 = f1.copy(); + // index producer produces 3 two times. + IndexProducer ip = p -> { + p.test(3); + p.test(3); + return true; + }; + // The remove should decrement cell 3 by 1 + f1.remove(ip); + assertCell3(f1, 2); + + // The subtract should decrement cell 3 by 2 + f2.subtract(CellProducer.from(ip)); + assertCell3(f2, 1); + + // This remove will decrement by 1 as the round-trip makes the indices unique + f3.remove(IndexProducer.fromIndexArray(ip.asIndexArray())); + assertCell3(f3, 2); } } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractIndexProducerTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractIndexProducerTest.java index 542f9a6875..917f361a2d 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractIndexProducerTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractIndexProducerTest.java @@ -16,6 +16,7 @@ */ package org.apache.commons.collections4.bloomfilter; +import static org.junit.Assert.assertSame; import static org.junit.jupiter.api.Assertions.assertArrayEquals; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; @@ -245,4 +246,10 @@ public void
testForEachIndexEarlyExit() { })); assertEquals(0, passes[0]); } + + @Test + public void testUniqueReturnsSelf() { + IndexProducer expected = createProducer().uniqueIndices(); + assertSame(expected, expected.uniqueIndices()); + } } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayHasher.java b/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayHasher.java index f2a18c426a..753decc6e2 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayHasher.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayHasher.java @@ -37,12 +37,6 @@ public IndexProducer indices(final Shape shape) { return new Producer(shape); } - @Override - public IndexProducer uniqueIndices(final Shape shape) { - Objects.requireNonNull(shape, "shape"); - return new Producer(shape); - } - private class Producer implements IndexProducer { Shape shape; diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/BitCountProducerFromHasherTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/BitCountProducerFromHasherTest.java deleted file mode 100644 index 8c1e846989..0000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/BitCountProducerFromHasherTest.java +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter; - -public class BitCountProducerFromHasherTest extends AbstractBitCountProducerTest { - - @Override - protected BitCountProducer createProducer() { - // hasher has collisions and wraps - return BitCountProducer.from(new IncrementingHasher(4, 8).indices(Shape.fromKM(17, 72))); - } - - @Override - protected BitCountProducer createEmptyProducer() { - return BitCountProducer.from(NullHasher.INSTANCE.indices(Shape.fromKM(17, 72))); - } - - @Override - protected int getAsIndexArrayBehaviour() { - // Hasher allows duplicates and may be unordered - return 0; - } - - @Override - protected int[] getExpectedIndices() { - return new int[] {4, 12, 20, 28, 36, 44, 52, 60, 68, 4, 12, 20, 28, 36, 44, 52, 60}; - } - - @Override - protected int[][] getExpectedBitCount() { - return new int[][] {{4, 2}, {12, 2}, {20, 2}, {28, 2}, {36, 2}, {44, 2}, {52, 2}, {60, 2}, {68, 1}}; - } -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/CellProducerFromArrayCountingBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/CellProducerFromArrayCountingBloomFilterTest.java new file mode 100644 index 0000000000..454e16492a --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/CellProducerFromArrayCountingBloomFilterTest.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +public class CellProducerFromArrayCountingBloomFilterTest extends AbstractCellProducerTest { + + protected Shape shape = Shape.fromKM(17, 72); + + @Override + protected CellProducer createProducer() { + final ArrayCountingBloomFilter filter = new ArrayCountingBloomFilter(shape); + filter.merge(new IncrementingHasher(0, 1)); + filter.merge(new IncrementingHasher(5, 1)); + return filter; + } + + @Override + protected CellProducer createEmptyProducer() { + return new ArrayCountingBloomFilter(shape); + } + + @Override + protected int[] getExpectedIndices() { + return new int[] {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21}; + } + + @Override + protected int[] getExpectedValues() { + return new int[] {1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1}; + } +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/CellProducerFromDefaultIndexProducerTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/CellProducerFromDefaultIndexProducerTest.java new file mode 100644 index 0000000000..8f97d8388d --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/CellProducerFromDefaultIndexProducerTest.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +public class CellProducerFromDefaultIndexProducerTest extends AbstractCellProducerTest { + + int[] data = {0, 63, 1, 64, 128, 1, 127}; + int[] indices = {0, 1, 63, 64, 127, 128}; + int[] values = {1, 2, 1, 1, 1, 1 }; + + @Override + protected CellProducer createProducer() { + return CellProducer.from(IndexProducer.fromIndexArray(data)); + } + + @Override + protected CellProducer createEmptyProducer() { + return CellProducer.from(IndexProducer.fromIndexArray(new int[0])); + } + + @Override + protected int[] getExpectedIndices() { + return indices; + } + + @Override + protected int[] getExpectedValues() { + return values; + } + +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/DefaultBitCountProducerTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/DefaultCellProducerTest.java similarity index 69% rename from src/test/java/org/apache/commons/collections4/bloomfilter/DefaultBitCountProducerTest.java rename to src/test/java/org/apache/commons/collections4/bloomfilter/DefaultCellProducerTest.java index a85b90b29d..e99a9aaeb4 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/DefaultBitCountProducerTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/DefaultCellProducerTest.java @@ -16,21 +16,27 @@ */ package 
org.apache.commons.collections4.bloomfilter; -public class DefaultBitCountProducerTest extends AbstractBitCountProducerTest { +public class DefaultCellProducerTest extends AbstractCellProducerTest { /** Make forEachIndex unordered and contain duplicates. */ - private final int[] values = {10, 1, 10, 1}; + private final int[] indices = {1, 2, 3, 5}; + private final int[] values = {1, 4, 9, 25}; @Override protected int[] getExpectedIndices() { + return indices; + } + + @Override + protected int[] getExpectedValues() { return values; } @Override - protected BitCountProducer createProducer() { + protected CellProducer createProducer() { return consumer -> { - for (final int i : values) { - if (!consumer.test(i, 1)) { + for (int i = 0; i < indices.length; i++) { + if (!consumer.test(indices[i], values[i] )) { return false; } } @@ -39,25 +45,13 @@ protected BitCountProducer createProducer() { } @Override - protected BitCountProducer createEmptyProducer() { + protected CellProducer createEmptyProducer() { return consumer -> true; } - @Override - protected int getAsIndexArrayBehaviour() { - // The default method streams a BitSet so is distinct and ordered. - return ORDERED | DISTINCT; - } - @Override protected int getForEachIndexBehaviour() { // The default method has the same behavior as the forEachCount() method. return 0; } - - @Override - protected int getForEachCountBehaviour() { - // The implemented method returns unordered duplicates. 
- return 0; - } } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/DefaultIndexProducerTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/DefaultIndexProducerTest.java index 2682a96a7e..73a3a58dc9 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/DefaultIndexProducerTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/DefaultIndexProducerTest.java @@ -22,8 +22,11 @@ import java.util.BitSet; import java.util.Objects; import java.util.concurrent.ThreadLocalRandom; +import java.util.stream.IntStream; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; public class DefaultIndexProducerTest extends AbstractIndexProducerTest { @@ -58,8 +61,7 @@ protected IndexProducer createEmptyProducer() { @Override protected int getAsIndexArrayBehaviour() { - // The default method streams a BitSet so is distinct and ordered. - return DISTINCT | ORDERED; + return 0; } @Override @@ -119,4 +121,21 @@ public void testFromIndexArray() { assertArrayEquals(expected, ip.asIndexArray()); } } + + @ParameterizedTest + @ValueSource(ints = {32, 33}) + public void testEntries(int size) { + int[] values = IntStream.range(0, size).toArray(); + IndexProducer producer = predicate -> { + Objects.requireNonNull(predicate); + for (final int i : values) { + if (!predicate.test(i)) { + return false; + } + } + return true; + }; + int[] other = producer.asIndexArray(); + assertArrayEquals(values, other); + } } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/BitCountProducerFromArrayCountingBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromArrayCountingBloomFilterTest.java similarity index 71% rename from src/test/java/org/apache/commons/collections4/bloomfilter/BitCountProducerFromArrayCountingBloomFilterTest.java rename to 
src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromArrayCountingBloomFilterTest.java index 1cfe291a50..f9d7f18fcf 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/BitCountProducerFromArrayCountingBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromArrayCountingBloomFilterTest.java @@ -16,12 +16,12 @@ */ package org.apache.commons.collections4.bloomfilter; -public class BitCountProducerFromArrayCountingBloomFilterTest extends AbstractBitCountProducerTest { +public class IndexProducerFromArrayCountingBloomFilterTest extends AbstractIndexProducerTest { protected Shape shape = Shape.fromKM(17, 72); @Override - protected BitCountProducer createProducer() { + protected IndexProducer createProducer() { final ArrayCountingBloomFilter filter = new ArrayCountingBloomFilter(shape); filter.merge(new IncrementingHasher(0, 1)); filter.merge(new IncrementingHasher(5, 1)); @@ -29,25 +29,18 @@ protected BitCountProducer createProducer() { } @Override - protected BitCountProducer createEmptyProducer() { + protected IndexProducer createEmptyProducer() { return new ArrayCountingBloomFilter(shape); } @Override - protected int getAsIndexArrayBehaviour() { - // CountingBloomFilter based on an array will be distinct and ordered - return DISTINCT | ORDERED; + protected int[] getExpectedIndices() { + return new int[] {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21}; } @Override - protected int[][] getExpectedBitCount() { - return new int[][] {{0, 1}, {1, 1}, {2, 1}, {3, 1}, {4, 1}, {5, 2}, {6, 2}, {7, 2}, - {8, 2}, {9, 2}, {10, 2}, {11, 2}, {12, 2}, {13, 2}, {14, 2}, {15, 2}, {16, 2}, - {17, 1}, {18, 1}, {19, 1}, {20, 1}, {21, 1}}; + protected int getAsIndexArrayBehaviour() { + return DISTINCT | ORDERED; } - @Override - protected int[] getExpectedIndices() { - return new int[] {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21}; - } } diff 
--git a/src/test/java/org/apache/commons/collections4/bloomfilter/BitCountProducerFromDefaultIndexProducerTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromHasherTest.java similarity index 69% rename from src/test/java/org/apache/commons/collections4/bloomfilter/BitCountProducerFromDefaultIndexProducerTest.java rename to src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromHasherTest.java index 56a5c792a6..f6fdd91008 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/BitCountProducerFromDefaultIndexProducerTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromHasherTest.java @@ -16,27 +16,26 @@ */ package org.apache.commons.collections4.bloomfilter; -public class BitCountProducerFromDefaultIndexProducerTest extends AbstractBitCountProducerTest { - - int[] data = {0, 63, 1, 1, 64, 127, 128}; +public class IndexProducerFromHasherTest extends AbstractIndexProducerTest { @Override - protected BitCountProducer createProducer() { - return BitCountProducer.from(IndexProducer.fromIndexArray(data)); + protected int getAsIndexArrayBehaviour() { + return 0; } @Override - protected BitCountProducer createEmptyProducer() { - return BitCountProducer.from(IndexProducer.fromIndexArray(new int[0])); + protected IndexProducer createProducer() { + // hasher has collisions and wraps + return new IncrementingHasher(4, 8).indices(Shape.fromKM(17, 72)); } @Override - protected int getAsIndexArrayBehaviour() { - return 0; + protected IndexProducer createEmptyProducer() { + return NullHasher.INSTANCE.indices(Shape.fromKM(17, 72)); } @Override protected int[] getExpectedIndices() { - return data; + return new int[] {4, 12, 20, 28, 36, 44, 52, 60, 68, 4, 12, 20, 28, 36, 44, 52, 60}; } } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/BitCountProducerFromIntArrayTest.java 
b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromIntArrayTest.java similarity index 76% rename from src/test/java/org/apache/commons/collections4/bloomfilter/BitCountProducerFromIntArrayTest.java rename to src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromIntArrayTest.java index d0a598a17d..4b0281dccc 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/BitCountProducerFromIntArrayTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromIntArrayTest.java @@ -16,27 +16,28 @@ */ package org.apache.commons.collections4.bloomfilter; -public class BitCountProducerFromIntArrayTest extends AbstractBitCountProducerTest { +public class IndexProducerFromIntArrayTest extends AbstractIndexProducerTest { int[] data = {6, 8, 1, 2, 4, 4, 5}; @Override - protected BitCountProducer createEmptyProducer() { - return BitCountProducer.from(IndexProducer.fromIndexArray(new int[0])); + protected IndexProducer createEmptyProducer() { + return IndexProducer.fromIndexArray(new int[0]); } @Override - protected BitCountProducer createProducer() { - return BitCountProducer.from(IndexProducer.fromIndexArray(data)); + protected IndexProducer createProducer() { + return IndexProducer.fromIndexArray(data); } @Override - protected int getAsIndexArrayBehaviour() { - return 0; + protected int[] getExpectedIndices() { + return data; } @Override - protected int[] getExpectedIndices() { - return data; + protected int getAsIndexArrayBehaviour() { + return 0; } + } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/BitCountProducerFromSimpleBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromSimpleBloomFilterTest.java similarity index 77% rename from src/test/java/org/apache/commons/collections4/bloomfilter/BitCountProducerFromSimpleBloomFilterTest.java rename to 
src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromSimpleBloomFilterTest.java index a6b2be2099..b2f3e947a6 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/BitCountProducerFromSimpleBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromSimpleBloomFilterTest.java @@ -16,31 +16,30 @@ */ package org.apache.commons.collections4.bloomfilter; -public class BitCountProducerFromSimpleBloomFilterTest extends AbstractBitCountProducerTest { +public class IndexProducerFromSimpleBloomFilterTest extends AbstractIndexProducerTest { protected Shape shape = Shape.fromKM(17, 72); @Override - protected BitCountProducer createProducer() { + protected IndexProducer createProducer() { final Hasher hasher = new IncrementingHasher(3, 2); final BloomFilter bf = new SimpleBloomFilter(shape); bf.merge(hasher); - return BitCountProducer.from(bf); + return bf; } @Override - protected BitCountProducer createEmptyProducer() { - return BitCountProducer.from(new SimpleBloomFilter(shape)); + protected IndexProducer createEmptyProducer() { + return new SimpleBloomFilter(shape); } @Override - protected int getAsIndexArrayBehaviour() { - // BloomFilter based on a bit map array will be distinct and ordered - return DISTINCT | ORDERED; + protected int[] getExpectedIndices() { + return new int[] {3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35}; } @Override - protected int[] getExpectedIndices() { - return new int[] {3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35}; + protected int getAsIndexArrayBehaviour() { + return DISTINCT | ORDERED; } } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/BitCountProducerFromSparseBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromSparseBloomFilterTest.java similarity index 70% rename from 
src/test/java/org/apache/commons/collections4/bloomfilter/BitCountProducerFromSparseBloomFilterTest.java rename to src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromSparseBloomFilterTest.java index 7e05bb9961..d7bf45cba9 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/BitCountProducerFromSparseBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromSparseBloomFilterTest.java @@ -16,33 +16,30 @@ */ package org.apache.commons.collections4.bloomfilter; -public class BitCountProducerFromSparseBloomFilterTest extends AbstractBitCountProducerTest { +public class IndexProducerFromSparseBloomFilterTest extends AbstractIndexProducerTest { protected Shape shape = Shape.fromKM(17, 72); @Override - protected BitCountProducer createProducer() { + protected IndexProducer createProducer() { final Hasher hasher = new IncrementingHasher(4, 7); final BloomFilter bf = new SparseBloomFilter(shape); bf.merge(hasher); - return BitCountProducer.from(bf); + return bf; } @Override - protected BitCountProducer createEmptyProducer() { - return BitCountProducer.from(new SparseBloomFilter(shape)); + protected IndexProducer createEmptyProducer() { + return new SparseBloomFilter(shape); } @Override - protected int getAsIndexArrayBehaviour() { - // A sparse BloomFilter will be distinct but it may not be ordered. - // Currently the ordered behavior is asserted as the implementation uses - // an ordered TreeSet. This may change in the future. 
- return DISTINCT | ORDERED; + protected int[] getExpectedIndices() { + return new int[] {2, 4, 9, 11, 16, 18, 23, 25, 30, 32, 37, 39, 44, 46, 53, 60, 67}; } @Override - protected int[] getExpectedIndices() { - return new int[] {2, 4, 9, 11, 16, 18, 23, 25, 30, 32, 37, 39, 44, 46, 53, 60, 67}; + protected int getAsIndexArrayBehaviour() { + return DISTINCT |ORDERED; } } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/BitCountProducerFromUniqueHasherTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromUniqueHasherTest.java similarity index 73% rename from src/test/java/org/apache/commons/collections4/bloomfilter/BitCountProducerFromUniqueHasherTest.java rename to src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromUniqueHasherTest.java index 18e7f7936d..c7cbd217cb 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/BitCountProducerFromUniqueHasherTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromUniqueHasherTest.java @@ -16,27 +16,26 @@ */ package org.apache.commons.collections4.bloomfilter; -public class BitCountProducerFromUniqueHasherTest extends AbstractBitCountProducerTest { +public class IndexProducerFromUniqueHasherTest extends AbstractIndexProducerTest { @Override - protected BitCountProducer createProducer() { + protected IndexProducer createProducer() { // hasher has collisions and wraps - return BitCountProducer.from(new IncrementingHasher(4, 8).uniqueIndices(Shape.fromKM(17, 72))); + return new IncrementingHasher(4, 8).indices(Shape.fromKM(17, 72)).uniqueIndices(); } @Override - protected BitCountProducer createEmptyProducer() { - return BitCountProducer.from(NullHasher.INSTANCE.indices(Shape.fromKM(17, 72))); + protected IndexProducer createEmptyProducer() { + return NullHasher.INSTANCE.indices(Shape.fromKM(17, 72)); } @Override - protected int getAsIndexArrayBehaviour() { - // Hasher may be unordered - return 
DISTINCT; + protected int[] getExpectedIndices() { + return new int[] {4, 12, 20, 28, 36, 44, 52, 60, 68}; } @Override - protected int[] getExpectedIndices() { - return new int[] {4, 12, 20, 28, 36, 44, 52, 60, 68}; + protected int getAsIndexArrayBehaviour() { + return DISTINCT; } } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerTest.java index 52f557a834..655dfeed9d 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerTest.java @@ -22,7 +22,10 @@ import java.util.List; import java.util.function.LongPredicate; +import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; public class IndexProducerTest { @@ -68,4 +71,17 @@ public boolean forEachBitMap(final LongPredicate consumer) { return true; } } + + @ParameterizedTest + @ValueSource(ints = {32, 33}) + void testAsIndexArray(int n) { + IndexProducer ip = i -> { + for (int j = 0; j < n; j++) { + // Always test index zero + i.test(0); + } + return true; + }; + Assertions.assertArrayEquals(new int[n], ip.asIndexArray()); + } } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/NullHasher.java b/src/test/java/org/apache/commons/collections4/bloomfilter/NullHasher.java index 9ab0fb76d5..6ac6d04eae 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/NullHasher.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/NullHasher.java @@ -53,10 +53,4 @@ public IndexProducer indices(final Shape shape) { Objects.requireNonNull(shape, "shape"); return PRODUCER; } - - @Override - public IndexProducer uniqueIndices(final Shape shape) { - Objects.requireNonNull(shape, "shape"); - return PRODUCER; - } }