Skip to content

Commit

Permalink
Optimize large IN filters on integer data types
Browse files Browse the repository at this point in the history
Use a bitset based filter instead of a hash set when the range of
values is narrow enough. We use bitset only when it would occupy
lesser space than the equivalent open hash set.
This is useful for making evaluation of dynamic filter more efficient
as we often collect large integer sets in dynamic filters.

BenchmarkDynamicPageFilter.filterPages
(filterSize)   (inputDataSet)  (nonNullsSelectivity)   Mode  Cnt  Before score      After score
         100  INT64_FIXED_32K                    0.2  thrpt   30  446.174 ? 10.598   449.113 ?  5.323 ops/s
        1000  INT64_FIXED_32K                    0.2  thrpt   30  407.625 ?  3.139  1379.767 ? 19.318 ops/s
        5000  INT64_FIXED_32K                    0.2  thrpt   30  426.413 ?  6.485  1254.731 ? 11.685 ops/s
  • Loading branch information
raunaqmorarka committed Sep 19, 2024
1 parent fe59130 commit f9082b4
Show file tree
Hide file tree
Showing 5 changed files with 575 additions and 2 deletions.
149 changes: 148 additions & 1 deletion core/trino-main/src/main/java/io/trino/util/FastutilSetHelper.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,27 +17,50 @@
import com.google.common.util.concurrent.UncheckedExecutionException;
import com.google.errorprone.annotations.concurrent.GuardedBy;
import io.trino.cache.NonEvictableCache;
import io.trino.spi.type.BigintType;
import io.trino.spi.type.DateType;
import io.trino.spi.type.DecimalType;
import io.trino.spi.type.IntegerType;
import io.trino.spi.type.SmallintType;
import io.trino.spi.type.TimeType;
import io.trino.spi.type.TimestampType;
import io.trino.spi.type.TinyintType;
import io.trino.spi.type.Type;
import it.unimi.dsi.fastutil.Hash;
import it.unimi.dsi.fastutil.booleans.BooleanOpenHashSet;
import it.unimi.dsi.fastutil.doubles.DoubleHash;
import it.unimi.dsi.fastutil.doubles.DoubleOpenCustomHashSet;
import it.unimi.dsi.fastutil.longs.AbstractLongIterator;
import it.unimi.dsi.fastutil.longs.AbstractLongSet;
import it.unimi.dsi.fastutil.longs.LongHash;
import it.unimi.dsi.fastutil.longs.LongIterator;
import it.unimi.dsi.fastutil.longs.LongOpenCustomHashSet;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import it.unimi.dsi.fastutil.objects.ObjectOpenCustomHashSet;
import jakarta.annotation.Nonnull;

import java.lang.invoke.MethodHandle;
import java.math.BigInteger;
import java.util.BitSet;
import java.util.Collection;
import java.util.Objects;
import java.util.PrimitiveIterator;
import java.util.Set;
import java.util.concurrent.TimeUnit;

import static com.google.common.base.MoreObjects.toStringHelper;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Throwables.throwIfUnchecked;
import static com.google.common.base.Verify.verifyNotNull;
import static io.trino.cache.CacheUtils.uncheckedCacheGet;
import static io.trino.cache.SafeCaches.buildNonEvictableCache;
import static io.trino.util.SingleAccessMethodCompiler.compileSingleAccessMethod;
import static it.unimi.dsi.fastutil.HashCommon.nextPowerOfTwo;
import static java.lang.Boolean.TRUE;
import static java.lang.Math.ceil;
import static java.lang.Math.max;
import static java.lang.Math.min;
import static java.lang.String.format;
import static java.lang.invoke.MethodType.methodType;
import static java.util.Objects.requireNonNull;

Expand All @@ -56,7 +79,7 @@ public static Set<?> toFastutilHashSet(Set<?> set, Type type, MethodHandle hashC
// The performance of InCodeGenerator heavily depends on the load factor being small.
Class<?> javaElementType = type.getJavaType();
if (javaElementType == long.class) {
return new LongOpenCustomHashSet((Collection<Long>) set, 0.25f, new LongStrategy(hashCodeHandle, equalsHandle, type));
return createSetForLongJavaType((Set<Long>) set, type, hashCodeHandle, equalsHandle);
}
if (javaElementType == double.class) {
return new DoubleOpenCustomHashSet((Collection<Double>) set, 0.25f, new DoubleStrategy(hashCodeHandle, equalsHandle, type));
Expand Down Expand Up @@ -85,11 +108,93 @@ public static boolean in(long longValue, LongOpenCustomHashSet set)
return set.contains(longValue);
}

public static boolean in(long longValue, LongBitSetFilter set)
{
return set.contains(longValue);
}

public static boolean in(long longValue, LongOpenHashSet set)
{
return set.contains(longValue);
}

public static boolean in(Object objectValue, ObjectOpenCustomHashSet<?> set)
{
return set.contains(objectValue);
}

public static final class LongBitSetFilter
extends AbstractLongSet
{
private final BitSet bitmask;
private final long min;
private final long max;
private final int size;

private LongBitSetFilter(Set<Long> values, long min, long max)
{
checkArgument(min <= max, "min must be less than or equal to max");
checkArgument(!values.isEmpty(), "values must not be empty");
this.min = min;
this.max = max;
long range = max - min + 1;
checkArgument((int) range == range, format("Values range %d is outside integer range", range));
bitmask = new BitSet((int) range);

for (long value : values) {
bitmask.set((int) (value - min));
}
this.size = values.size();
}

@Override
public boolean contains(long value)
{
if (value < min || value > max) {
return false;
}

return bitmask.get((int) (value - min));
}

@Override
@Nonnull
public LongIterator iterator()
{
PrimitiveIterator.OfInt iterator = bitmask.stream().iterator();
return new AbstractLongIterator() {
@Override
public long nextLong()
{
return iterator.nextInt();
}

@Override
public boolean hasNext()
{
return iterator.hasNext();
}
};
}

@Override
public int size()
{
return size;
}

@Override
public String toString()
{
return toStringHelper(this)
.add("bitmask", bitmask)
.add("min", min)
.add("max", max)
.add("size", size)
.toString();
}
}

private static final class LongStrategy
implements LongHash.Strategy
{
Expand Down Expand Up @@ -318,4 +423,46 @@ public synchronized T get()
}
}
}

private static Set<Long> createSetForLongJavaType(Set<Long> values, Type type, MethodHandle hashCodeHandle, MethodHandle equalsHandle)
{
if (values.isEmpty() || !isDirectLongComparisonValidType(type)) {
return new LongOpenCustomHashSet(values, 0.25f, new LongStrategy(hashCodeHandle, equalsHandle, type));
}

long min = Long.MAX_VALUE;
long max = Long.MIN_VALUE;
for (Long value : values) {
min = min(min, value);
max = max(max, value);
}

// A Set based on a fastutil open hashset uses (next power of two of size / load-factor)
// slots in a hash table which is an array of longs.
long setEntries = (int) min(1 << 30, nextPowerOfTwo((long) ceil(values.size() / 0.25f)));
BigInteger range = BigInteger.valueOf(max)
.subtract(BigInteger.valueOf(min))
.add(BigInteger.valueOf(1));
// A Set based on a bitmap uses (max - min) / 64 longs
// Create a bitset only if it uses fewer entries than the equivalent hash set
if (range.compareTo(BigInteger.valueOf(Integer.MAX_VALUE)) > 0
|| (range.intValueExact() / 64) + 1 > setEntries) {
return new LongOpenHashSet(values, 0.25f);
}
return new LongBitSetFilter(values, min, max);
}

private static boolean isDirectLongComparisonValidType(Type type)
{
// Types for which we can safely use equality and hashCode on the stored long value
// instead of going through type specific methods
return type instanceof TinyintType ||
type instanceof SmallintType ||
type instanceof IntegerType ||
type instanceof BigintType ||
type instanceof TimeType ||
type instanceof DateType ||
type instanceof TimestampType timestampType && timestampType.isShort() ||
(type instanceof DecimalType && ((DecimalType) type).isShort());
}
}
Loading

0 comments on commit f9082b4

Please sign in to comment.