diff --git a/core/trino-main/src/main/java/io/trino/server/security/jwt/FileSigningKeyResolver.java b/core/trino-main/src/main/java/io/trino/server/security/jwt/FileSigningKeyResolver.java index 853eb0addf54..a4927235d8a0 100644 --- a/core/trino-main/src/main/java/io/trino/server/security/jwt/FileSigningKeyResolver.java +++ b/core/trino-main/src/main/java/io/trino/server/security/jwt/FileSigningKeyResolver.java @@ -75,12 +75,12 @@ public Key resolveSigningKey(JwsHeader header, Claims claims) } @Override - public Key resolveSigningKey(JwsHeader header, String plaintext) + public Key resolveSigningKey(JwsHeader header, byte[] plaintext) { return getKey(header); } - private Key getKey(JwsHeader header) + private Key getKey(JwsHeader header) { SignatureAlgorithm algorithm = SignatureAlgorithm.forName(header.getAlgorithm()); @@ -93,7 +93,7 @@ private Key getKey(JwsHeader header) return key.getKey(algorithm); } - private static String getKeyId(JwsHeader header) + private static String getKeyId(JwsHeader header) { String keyId = header.getKeyId(); if (keyId == null) { diff --git a/core/trino-main/src/main/java/io/trino/server/security/jwt/JwkSigningKeyResolver.java b/core/trino-main/src/main/java/io/trino/server/security/jwt/JwkSigningKeyResolver.java index 51ca571913c8..eda07f2e2aa9 100644 --- a/core/trino-main/src/main/java/io/trino/server/security/jwt/JwkSigningKeyResolver.java +++ b/core/trino-main/src/main/java/io/trino/server/security/jwt/JwkSigningKeyResolver.java @@ -39,12 +39,12 @@ public Key resolveSigningKey(JwsHeader header, Claims claims) } @Override - public Key resolveSigningKey(JwsHeader header, String plaintext) + public Key resolveSigningKey(JwsHeader header, byte[] plaintext) { return getKey(header); } - private Key getKey(JwsHeader header) + private Key getKey(JwsHeader header) { String keyId = header.getKeyId(); if (keyId == null) { diff --git a/core/trino-main/src/main/java/io/trino/server/security/oauth2/JweTokenSerializer.java b/core/trino-main/src/main/java/io/trino/server/security/oauth2/JweTokenSerializer.java index 219fbfe3c49d..1a7565a90b07 100644 --- a/core/trino-main/src/main/java/io/trino/server/security/oauth2/JweTokenSerializer.java +++ b/core/trino-main/src/main/java/io/trino/server/security/oauth2/JweTokenSerializer.java @@ -148,7 +148,7 @@ private static SecretKey getOrGenerateKey(RefreshTokensConfig config) return signingKey; } - private static CompressionCodec resolveCompressionCodec(Header header) + private static CompressionCodec resolveCompressionCodec(Header header) throws CompressionException { if (header.getCompressionAlgorithm() != null) { diff --git a/core/trino-main/src/main/java/io/trino/server/security/oauth2/ZstdCodec.java b/core/trino-main/src/main/java/io/trino/server/security/oauth2/ZstdCodec.java index 3b4f64562853..f7446c1e0d81 100644 --- a/core/trino-main/src/main/java/io/trino/server/security/oauth2/ZstdCodec.java +++ b/core/trino-main/src/main/java/io/trino/server/security/oauth2/ZstdCodec.java @@ -18,6 +18,9 @@ import io.jsonwebtoken.CompressionCodec; import io.jsonwebtoken.CompressionException; +import java.io.InputStream; +import java.io.OutputStream; + import static java.lang.Math.toIntExact; import static java.util.Arrays.copyOfRange; @@ -50,4 +53,22 @@ public byte[] decompress(byte[] bytes) new ZstdDecompressor().decompress(bytes, 0, bytes.length, output, 0, output.length); return output; } + + @Override + public OutputStream compress(OutputStream out) + { + throw new UnsupportedOperationException("Unimplemented method 'compress'"); 
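        // Assumption (not stated in the patch): only the byte[] compress/decompress overloads above are
        // exercised by the token serializer, so these stream-based methods added to satisfy the upgraded
        // jjwt CompressionCodec interface are deliberately left unimplemented.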
+ } + + @Override + public InputStream decompress(InputStream in) + { + throw new UnsupportedOperationException("Unimplemented method 'decompress'"); + } + + @Override + public String getId() + { + throw new UnsupportedOperationException("Unimplemented method 'getId'"); + } } diff --git a/core/trino-main/src/test/java/io/trino/server/security/jwt/TestJwkDecoder.java b/core/trino-main/src/test/java/io/trino/server/security/jwt/TestJwkDecoder.java index 9e117108730c..4218fb659387 100644 --- a/core/trino-main/src/test/java/io/trino/server/security/jwt/TestJwkDecoder.java +++ b/core/trino-main/src/test/java/io/trino/server/security/jwt/TestJwkDecoder.java @@ -208,12 +208,12 @@ public Key resolveSigningKey(JwsHeader header, Claims claims) } @Override - public Key resolveSigningKey(JwsHeader header, String plaintext) + public Key resolveSigningKey(JwsHeader header, byte[] plaintext) { return getKey(header); } - private Key getKey(JwsHeader header) + private Key getKey(JwsHeader header) { String keyId = header.getKeyId(); assertEquals(keyId, "test-rsa"); @@ -344,12 +344,12 @@ public Key resolveSigningKey(JwsHeader header, Claims claims) } @Override - public Key resolveSigningKey(JwsHeader header, String plaintext) + public Key resolveSigningKey(JwsHeader header, byte[] plaintext) { return getKey(header); } - private Key getKey(JwsHeader header) + private Key getKey(JwsHeader header) { String keyId = header.getKeyId(); assertEquals(keyId, keyName); diff --git a/core/trino-main/src/test/java/io/trino/server/security/oauth2/TestJweTokenSerializer.java b/core/trino-main/src/test/java/io/trino/server/security/oauth2/TestJweTokenSerializer.java index c8712694cedf..df939402a537 100644 --- a/core/trino-main/src/test/java/io/trino/server/security/oauth2/TestJweTokenSerializer.java +++ b/core/trino-main/src/test/java/io/trino/server/security/oauth2/TestJweTokenSerializer.java @@ -194,7 +194,7 @@ static class Oauth2ClientStub implements OAuth2Client { private final Map claims = Jwts.claims() - .setSubject("user"); + .setSubject("user").build(); @Override public void load() diff --git a/core/trino-main/src/test/java/io/trino/server/ui/TestWebUi.java b/core/trino-main/src/test/java/io/trino/server/ui/TestWebUi.java index 24c7c6b10eb5..d48da56db7c5 100644 --- a/core/trino-main/src/test/java/io/trino/server/ui/TestWebUi.java +++ b/core/trino-main/src/test/java/io/trino/server/ui/TestWebUi.java @@ -29,6 +29,7 @@ import io.jsonwebtoken.Claims; import io.jsonwebtoken.JwsHeader; import io.jsonwebtoken.impl.DefaultClaims; +import io.jsonwebtoken.impl.DefaultClaimsBuilder; import io.trino.security.AccessControl; import io.trino.server.HttpRequestSessionContextFactory; import io.trino.server.ProtocolConfig; @@ -1297,11 +1298,12 @@ private static String issueToken(Claims claims) private static Claims createClaims() { - return new DefaultClaims() + return new DefaultClaimsBuilder() .setIssuer(TOKEN_ISSUER) .setAudience(OAUTH_CLIENT_ID) .setSubject("test-user") - .setExpiration(Date.from(Instant.now().plus(Duration.ofMinutes(5)))); + .setExpiration(Date.from(Instant.now().plus(Duration.ofMinutes(5)))) + .build(); } public static String randomNonce() diff --git a/lib/trino-filesystem/src/main/java/io/trino/filesystem/Location.java b/lib/trino-filesystem/src/main/java/io/trino/filesystem/Location.java index 08cbe7ef03df..62cf2502bfd2 100644 --- a/lib/trino-filesystem/src/main/java/io/trino/filesystem/Location.java +++ b/lib/trino-filesystem/src/main/java/io/trino/filesystem/Location.java @@ -23,6 +23,7 @@ import static 
com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Preconditions.checkState; import static com.google.common.collect.Iterables.getLast; +import static io.trino.filesystem.Locations.isS3Tables; import static java.lang.Integer.parseInt; import static java.util.Objects.requireNonNull; import static java.util.function.Predicate.not; @@ -95,7 +96,10 @@ public static Location of(String location) } } - checkArgument((userInfo.isEmpty() && host.isEmpty() && port.isEmpty()) || authoritySplit.size() == 2, "Path missing in file system location: %s", location); + if (!isS3Tables(location)) { + // S3 Tables create tables under the bucket like 's3://e97725d9-dbfb-4334-784sox7edps35ncq16arh546frqa1use2b--table-s3' + checkArgument((userInfo.isEmpty() && host.isEmpty() && port.isEmpty()) || authoritySplit.size() == 2, "Path missing in file system location: %s", location); + } String path = (authoritySplit.size() == 2) ? authoritySplit.get(1) : ""; return new Location(location, Optional.of(scheme), userInfo, host, port, path); diff --git a/lib/trino-filesystem/src/main/java/io/trino/filesystem/Locations.java b/lib/trino-filesystem/src/main/java/io/trino/filesystem/Locations.java index 13694965dc76..d29b80df9acf 100644 --- a/lib/trino-filesystem/src/main/java/io/trino/filesystem/Locations.java +++ b/lib/trino-filesystem/src/main/java/io/trino/filesystem/Locations.java @@ -13,10 +13,14 @@ */ package io.trino.filesystem; +import java.util.regex.Pattern; + import static com.google.common.base.Preconditions.checkArgument; public final class Locations { + private static final Pattern S3_TABLES = Pattern.compile("s3://(?!.*/).*--table-s3"); + private Locations() {} /** @@ -47,4 +51,9 @@ public static boolean areDirectoryLocationsEquivalent(Location leftLocation, Loc return leftLocation.equals(rightLocation) || leftLocation.removeOneTrailingSlash().equals(rightLocation.removeOneTrailingSlash()); } + + public static boolean isS3Tables(String location) + { + return S3_TABLES.matcher(location).matches(); + } } diff --git a/lib/trino-filesystem/src/main/java/io/trino/filesystem/s3/S3FileSystemConstants.java b/lib/trino-filesystem/src/main/java/io/trino/filesystem/s3/S3FileSystemConstants.java new file mode 100644 index 000000000000..1155026292a6 --- /dev/null +++ b/lib/trino-filesystem/src/main/java/io/trino/filesystem/s3/S3FileSystemConstants.java @@ -0,0 +1,23 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.filesystem.s3; + +public final class S3FileSystemConstants +{ + public static final String EXTRA_CREDENTIALS_ACCESS_KEY_PROPERTY = "internal$s3_aws_access_key"; + public static final String EXTRA_CREDENTIALS_SECRET_KEY_PROPERTY = "internal$s3_aws_secret_key"; + public static final String EXTRA_CREDENTIALS_SESSION_TOKEN_PROPERTY = "internal$s3_aws_session_token"; + + private S3FileSystemConstants() {} +} diff --git a/lib/trino-hive-formats/pom.xml b/lib/trino-hive-formats/pom.xml index dd8a1d3f88da..f44a3c6e7b46 100644 --- a/lib/trino-hive-formats/pom.xml +++ b/lib/trino-hive-formats/pom.xml @@ -194,12 +194,14 @@ test + org.assertj diff --git a/lib/trino-parquet/pom.xml b/lib/trino-parquet/pom.xml index ea888e03b3da..1165812e2533 100644 --- a/lib/trino-parquet/pom.xml +++ b/lib/trino-parquet/pom.xml @@ -84,6 +84,12 @@ org.apache.parquet parquet-format-structures + + + javax.annotation + javax.annotation-api + + diff --git a/plugin/trino-bigquery/pom.xml b/plugin/trino-bigquery/pom.xml index f558faf3af25..00f9869e60ae 100644 --- a/plugin/trino-bigquery/pom.xml +++ b/plugin/trino-bigquery/pom.xml @@ -36,7 +36,7 @@ org.apache.commons commons-lang3 - 3.11 + 3.18.0 diff --git a/plugin/trino-delta-lake/pom.xml b/plugin/trino-delta-lake/pom.xml index d0c01736b5bf..d4e7fdf948e0 100644 --- a/plugin/trino-delta-lake/pom.xml +++ b/plugin/trino-delta-lake/pom.xml @@ -192,6 +192,12 @@ org.apache.parquet parquet-format-structures + + + javax.annotation + javax.annotation-api + + diff --git a/plugin/trino-hive/pom.xml b/plugin/trino-hive/pom.xml index ed6caaf034ac..df75e83d7254 100644 --- a/plugin/trino-hive/pom.xml +++ b/plugin/trino-hive/pom.xml @@ -259,6 +259,12 @@ org.apache.parquet parquet-format-structures + + + javax.annotation + javax.annotation-api + + diff --git a/plugin/trino-iceberg/pom.xml b/plugin/trino-iceberg/pom.xml index c37e9c99f077..f8ba3f3f7987 100644 --- a/plugin/trino-iceberg/pom.xml +++ b/plugin/trino-iceberg/pom.xml @@ -25,7 +25,7 @@ --> instances - 0.59.0 + 0.104.4 @@ -75,6 +75,11 @@ bootstrap + + io.airlift + concurrent + + io.airlift configuration @@ -130,6 +135,11 @@ trino-filesystem-manager + + io.trino + trino-filesystem-s3 + + io.trino trino-hdfs @@ -151,6 +161,41 @@ + + io.trino + trino-main + + + io.airlift + http-server + + + + + io.airlift + tracing + + + io.trino + trino-parser + + + org.eclipse.jetty.toolchain + jetty-jakarta-servlet-api + + + + io.trino trino-memory-context @@ -161,10 +206,12 @@ trino-orc + io.trino @@ -213,6 +260,17 @@ iceberg-api + + org.apache.iceberg + iceberg-aws + + + org.jspecify + jspecify + + + + org.apache.iceberg iceberg-core @@ -247,6 +305,12 @@ org.apache.parquet parquet-format-structures + + + javax.annotation + javax.annotation-api + + @@ -281,6 +345,11 @@ jmxutils + + software.amazon.awssdk + glue + + com.fasterxml.jackson.core jackson-annotations @@ -323,12 +392,6 @@ runtime - - io.airlift - concurrent - runtime - - io.airlift log-manager @@ -368,14 +431,14 @@ org.apache.httpcomponents.client5 httpclient5 - 5.2.1 + 5.4.3 runtime org.apache.httpcomponents.core5 httpcore5 - 5.2.1 + 5.3.4 runtime @@ -477,19 +540,6 @@ test - - io.trino - trino-main - test - - - - io.trino - trino-main - test-jar - test - - io.trino trino-parser @@ -593,6 +643,22 @@ + + org.apache.maven.plugins + maven-compiler-plugin + + + ${extraJavaVectorArgs} + + + + + org.apache.maven.plugins + maven-javadoc-plugin + + ${extraJavaVectorArgs} + + org.apache.maven.plugins maven-dependency-plugin diff --git 
a/plugin/trino-iceberg/src/main/java/io/trino/filesystem/TrinoFileSystemException.java b/plugin/trino-iceberg/src/main/java/io/trino/filesystem/TrinoFileSystemException.java new file mode 100644 index 000000000000..30d09b429d98 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/filesystem/TrinoFileSystemException.java @@ -0,0 +1,35 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.filesystem; + +import java.io.IOException; + +/** + * Unrecoverable file system exception. + * This exception is thrown for fatal errors, or after retries have already been performed, + * so additional retries must not be performed when this is caught. + */ +public class TrinoFileSystemException + extends IOException +{ + public TrinoFileSystemException(String message, Throwable cause) + { + super(message, cause); + } + + public TrinoFileSystemException(String message) + { + super(message); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/metastore/TableInfo.java b/plugin/trino-iceberg/src/main/java/io/trino/metastore/TableInfo.java new file mode 100644 index 000000000000..9d0d07bcd4c0 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/metastore/TableInfo.java @@ -0,0 +1,69 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.metastore; + +import io.trino.spi.connector.RelationType; +import io.trino.spi.connector.SchemaTableName; + +import static java.util.Objects.requireNonNull; + +public record TableInfo(SchemaTableName tableName, ExtendedRelationType extendedRelationType) +{ + public static final String PRESTO_VIEW_COMMENT = "Presto View"; + public static final String ICEBERG_MATERIALIZED_VIEW_COMMENT = "Presto Materialized View"; + + public TableInfo + { + requireNonNull(tableName, "tableName is null"); + requireNonNull(extendedRelationType, "extendedRelationType is null"); + } + + public enum ExtendedRelationType + { + TABLE(RelationType.TABLE), + OTHER_VIEW(RelationType.VIEW), + OTHER_MATERIALIZED_VIEW(RelationType.MATERIALIZED_VIEW), + TRINO_VIEW(RelationType.VIEW), + TRINO_MATERIALIZED_VIEW(RelationType.MATERIALIZED_VIEW); + + private final RelationType relationType; + + ExtendedRelationType(RelationType relationType) + { + this.relationType = relationType; + } + + public RelationType toRelationType() + { + return relationType; + } + + public static ExtendedRelationType fromTableTypeAndComment(String tableType, String comment) + { + return switch (tableType) { + case "VIRTUAL_VIEW" -> { + if (PRESTO_VIEW_COMMENT.equals(comment)) { + yield TRINO_VIEW; + } + if (ICEBERG_MATERIALIZED_VIEW_COMMENT.equals(comment)) { + yield TRINO_MATERIALIZED_VIEW; + } + yield OTHER_VIEW; + } + case "MATERIALIZED_VIEW" -> ICEBERG_MATERIALIZED_VIEW_COMMENT.equals(comment) ? TRINO_MATERIALIZED_VIEW : OTHER_MATERIALIZED_VIEW; + default -> TABLE; + }; + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/AbstractParquetDataSource.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/AbstractParquetDataSource.java new file mode 100644 index 000000000000..459b6eea7be3 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/AbstractParquetDataSource.java @@ -0,0 +1,360 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.parquet; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableListMultimap; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ListMultimap; +import io.airlift.slice.Slice; +import io.airlift.slice.Slices; +import io.airlift.units.DataSize; +import io.trino.memory.context.AggregatedMemoryContext; +import io.trino.memory.context.LocalMemoryContext; +import io.trino.parquet.reader.ChunkedInputStream; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.Map; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkState; +import static com.google.common.base.Verify.verify; +import static com.google.common.collect.ImmutableMap.toImmutableMap; +import static java.lang.Math.min; +import static java.lang.Math.toIntExact; +import static java.util.Comparator.comparingLong; +import static java.util.Objects.requireNonNull; + +public abstract class AbstractParquetDataSource + implements ParquetDataSource +{ + private final ParquetDataSourceId id; + private final long estimatedSize; + private final ParquetReaderOptions options; + private long readTimeNanos; + private long readBytes; + + protected AbstractParquetDataSource(ParquetDataSourceId id, long estimatedSize, ParquetReaderOptions options) + { + this.id = requireNonNull(id, "id is null"); + this.estimatedSize = estimatedSize; + this.options = requireNonNull(options, "options is null"); + } + + protected Slice readTailInternal(int length) + throws IOException + { + int readSize = toIntExact(min(estimatedSize, length)); + return readFully(estimatedSize - readSize, readSize); + } + + protected abstract void readInternal(long position, byte[] buffer, int bufferOffset, int bufferLength) + throws IOException; + + @Override + public ParquetDataSourceId getId() + { + return id; + } + + @Override + public final long getReadBytes() + { + return readBytes; + } + + @Override + public final long getReadTimeNanos() + { + return readTimeNanos; + } + + @Override + public final long getEstimatedSize() + { + return estimatedSize; + } + + @Override + public Slice readTail(int length) + throws IOException + { + long start = System.nanoTime(); + + Slice tailSlice = readTailInternal(length); + + readTimeNanos += System.nanoTime() - start; + readBytes += tailSlice.length(); + + return tailSlice; + } + + @Override + public final Slice readFully(long position, int length) + throws IOException + { + byte[] buffer = new byte[length]; + readFully(position, buffer, 0, length); + return Slices.wrappedBuffer(buffer); + } + + private void readFully(long position, byte[] buffer, int bufferOffset, int bufferLength) + throws IOException + { + long start = System.nanoTime(); + + readInternal(position, buffer, bufferOffset, bufferLength); + + readTimeNanos += System.nanoTime() - start; + readBytes += bufferLength; + } + + @Override + public final Map planRead(ListMultimap diskRanges, AggregatedMemoryContext memoryContext) + { + requireNonNull(diskRanges, "diskRanges is null"); + + if (diskRanges.isEmpty()) { + return ImmutableMap.of(); + } + + return planChunksRead(diskRanges, memoryContext).asMap() + .entrySet().stream() + .collect(toImmutableMap(Map.Entry::getKey, entry -> new ChunkedInputStream(entry.getValue()))); + } + 
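As a rough standalone sketch (class name hypothetical, not taken from this change), the range coalescing that planChunksRead and readSmallDiskRanges below rely on can be illustrated with the DiskRange class added later in this patch; span() returns the smallest range covering both inputs, including any gap between them:

import io.trino.parquet.DiskRange;

public class DiskRangeSpanExample
{
    public static void main(String[] args)
    {
        DiskRange first = new DiskRange(0, 100);    // bytes [0, 100)
        DiskRange second = new DiskRange(150, 100); // bytes [150, 250)

        // The merged range spans both inputs plus the 50-byte gap, so a single read can later
        // serve both original ranges when they fall within the configured merge distance.
        DiskRange merged = first.span(second);
        System.out.println(merged.getOffset()); // 0
        System.out.println(merged.getLength()); // 250
    }
}

This only demonstrates the behavior that mergeAdjacentDiskRanges below builds on.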
+ @VisibleForTesting + public ListMultimap planChunksRead(ListMultimap diskRanges, AggregatedMemoryContext memoryContext) + { + checkArgument(!diskRanges.isEmpty(), "diskRanges is empty"); + + // + // Note: this code does not use the stream APIs to avoid any extra object allocation + // + + // split disk ranges into "big" and "small" + ImmutableListMultimap.Builder smallRangesBuilder = ImmutableListMultimap.builder(); + ImmutableListMultimap.Builder largeRangesBuilder = ImmutableListMultimap.builder(); + for (Map.Entry entry : diskRanges.entries()) { + if (entry.getValue().getLength() <= options.getMaxBufferSize().toBytes()) { + smallRangesBuilder.put(entry); + } + else { + largeRangesBuilder.putAll(entry.getKey(), splitLargeRange(entry.getValue())); + } + } + ListMultimap smallRanges = smallRangesBuilder.build(); + ListMultimap largeRanges = largeRangesBuilder.build(); + + // read ranges + ImmutableListMultimap.Builder slices = ImmutableListMultimap.builder(); + slices.putAll(readSmallDiskRanges(smallRanges, memoryContext)); + slices.putAll(readLargeDiskRanges(largeRanges, memoryContext)); + // Re-order ChunkReaders by their DiskRange offsets as ParquetColumnChunkIterator expects + // the input slices to be in the order that they're present in the file + slices.orderValuesBy(comparingLong(ChunkReader::getDiskOffset)); + + return slices.build(); + } + + private List splitLargeRange(DiskRange range) + { + int maxBufferSizeBytes = toIntExact(options.getMaxBufferSize().toBytes()); + checkArgument(maxBufferSizeBytes > 0, "maxBufferSize must by larger than zero but is %s bytes", maxBufferSizeBytes); + ImmutableList.Builder ranges = ImmutableList.builder(); + long endOffset = range.getOffset() + range.getLength(); + long offset = range.getOffset(); + while (offset + maxBufferSizeBytes < endOffset) { + ranges.add(new DiskRange(offset, maxBufferSizeBytes)); + offset += maxBufferSizeBytes; + } + + long lengthLeft = endOffset - offset; + if (lengthLeft > 0) { + ranges.add(new DiskRange(offset, toIntExact(lengthLeft))); + } + + return ranges.build(); + } + + private ListMultimap readSmallDiskRanges(ListMultimap diskRanges, AggregatedMemoryContext memoryContext) + { + if (diskRanges.isEmpty()) { + return ImmutableListMultimap.of(); + } + + Iterable mergedRanges = mergeAdjacentDiskRanges(diskRanges.values(), options.getMaxMergeDistance(), options.getMaxBufferSize()); + + ImmutableListMultimap.Builder slices = ImmutableListMultimap.builder(); + for (DiskRange mergedRange : mergedRanges) { + ReferenceCountedReader mergedRangeLoader = new ReferenceCountedReader(mergedRange, memoryContext); + + for (Map.Entry diskRangeEntry : diskRanges.entries()) { + DiskRange diskRange = diskRangeEntry.getValue(); + if (mergedRange.contains(diskRange)) { + mergedRangeLoader.addReference(); + + slices.put(diskRangeEntry.getKey(), new ChunkReader() + { + @Override + public long getDiskOffset() + { + return diskRange.getOffset(); + } + + @Override + public Slice read() + throws IOException + { + int offset = toIntExact(diskRange.getOffset() - mergedRange.getOffset()); + return mergedRangeLoader.read().slice(offset, toIntExact(diskRange.getLength())); + } + + @Override + public void free() + { + mergedRangeLoader.free(); + } + }); + } + } + + mergedRangeLoader.free(); + } + + ListMultimap sliceStreams = slices.build(); + verify(sliceStreams.keySet().equals(diskRanges.keySet())); + return sliceStreams; + } + + private ListMultimap readLargeDiskRanges(ListMultimap diskRanges, AggregatedMemoryContext memoryContext) + { + if 
(diskRanges.isEmpty()) { + return ImmutableListMultimap.of(); + } + + ImmutableListMultimap.Builder slices = ImmutableListMultimap.builder(); + for (Map.Entry entry : diskRanges.entries()) { + slices.put(entry.getKey(), new ReferenceCountedReader(entry.getValue(), memoryContext)); + } + return slices.build(); + } + + private static List mergeAdjacentDiskRanges(Collection diskRanges, DataSize maxMergeDistance, DataSize maxReadSize) + { + // sort ranges by start offset + List ranges = new ArrayList<>(diskRanges); + ranges.sort(comparingLong(DiskRange::getOffset)); + + long maxReadSizeBytes = maxReadSize.toBytes(); + long maxMergeDistanceBytes = maxMergeDistance.toBytes(); + + // merge overlapping ranges + ImmutableList.Builder result = ImmutableList.builder(); + DiskRange last = ranges.get(0); + for (int i = 1; i < ranges.size(); i++) { + DiskRange current = ranges.get(i); + DiskRange merged = null; + boolean blockTooLong = false; + try { + merged = last.span(current); + } + catch (ArithmeticException e) { + blockTooLong = true; + } + if (!blockTooLong && merged.getLength() <= maxReadSizeBytes && last.getEnd() + maxMergeDistanceBytes >= current.getOffset()) { + last = merged; + } + else { + result.add(last); + last = current; + } + } + result.add(last); + + return result.build(); + } + + private class ReferenceCountedReader + implements ChunkReader + { + // See jdk.internal.util.ArraysSupport.SOFT_MAX_ARRAY_LENGTH for an explanation + private static final int MAX_ARRAY_SIZE = Integer.MAX_VALUE - 8; + private final DiskRange range; + private final LocalMemoryContext readerMemoryUsage; + private Slice data; + private int referenceCount = 1; + + public ReferenceCountedReader(DiskRange range, AggregatedMemoryContext memoryContext) + { + this.range = range; + checkArgument(range.getLength() <= MAX_ARRAY_SIZE, "Cannot read range bigger than %s but got %s", MAX_ARRAY_SIZE, range); + this.readerMemoryUsage = memoryContext.newLocalMemoryContext(ReferenceCountedReader.class.getSimpleName()); + } + + public void addReference() + { + checkState(referenceCount > 0, "Chunk reader is already closed"); + referenceCount++; + } + + @Override + public long getDiskOffset() + { + return range.getOffset(); + } + + @Override + public Slice read() + throws IOException + { + checkState(referenceCount > 0, "Chunk reader is already closed"); + + if (data == null) { + byte[] buffer = new byte[toIntExact(range.getLength())]; + readerMemoryUsage.setBytes(buffer.length); + readFully(range.getOffset(), buffer, 0, buffer.length); + data = Slices.wrappedBuffer(buffer); + } + + return data; + } + + @Override + public void free() + { + checkState(referenceCount > 0, "Reference count is already 0"); + + referenceCount--; + if (referenceCount == 0) { + data = null; + readerMemoryUsage.setBytes(0); + } + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("range", range) + .add("referenceCount", referenceCount) + .toString(); + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/BloomFilterStore.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/BloomFilterStore.java new file mode 100644 index 000000000000..1afc665c1494 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/BloomFilterStore.java @@ -0,0 +1,136 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.trino.parquet;
+
+import com.google.common.collect.ImmutableMap;
+import io.airlift.slice.BasicSliceInput;
+import io.airlift.slice.Slice;
+import io.trino.parquet.metadata.BlockMetadata;
+import io.trino.parquet.metadata.ColumnChunkMetadata;
+import io.trino.spi.predicate.Domain;
+import io.trino.spi.predicate.TupleDomain;
+import org.apache.parquet.column.ColumnDescriptor;
+import org.apache.parquet.column.values.bloomfilter.BlockSplitBloomFilter;
+import org.apache.parquet.column.values.bloomfilter.BloomFilter;
+import org.apache.parquet.format.BloomFilterHeader;
+import org.apache.parquet.format.Util;
+import org.apache.parquet.hadoop.metadata.ColumnPath;
+import org.apache.parquet.io.ParquetDecodingException;
+
+import java.io.IOException;
+import java.io.UncheckedIOException;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Set;
+
+import static com.google.common.base.Verify.verify;
+import static com.google.common.collect.ImmutableSet.toImmutableSet;
+import static java.lang.String.format;
+import static java.util.Objects.requireNonNull;
+import static org.apache.parquet.column.values.bloomfilter.BlockSplitBloomFilter.UPPER_BOUND_BYTES;
+
+public class BloomFilterStore
+{
+    // since bloomfilter header is relatively small(18 bytes when testing) we can read in a larger buffer(BlockSplitBloomFilter.HEADER_SIZE*4 in this case)
+    // and get actual bytes used when deserializing header in order to calculate the correct offset for bloomfilter data.
+ private static final int MAX_HEADER_LENGTH = BlockSplitBloomFilter.HEADER_SIZE * 4; + + private final ParquetDataSource dataSource; + private final Map bloomFilterOffsets; + + public BloomFilterStore(ParquetDataSource dataSource, BlockMetadata block, Set columnsFiltered) + { + this.dataSource = requireNonNull(dataSource, "dataSource is null"); + requireNonNull(block, "block is null"); + requireNonNull(columnsFiltered, "columnsFiltered is null"); + + ImmutableMap.Builder bloomFilterOffsetBuilder = ImmutableMap.builder(); + for (ColumnChunkMetadata column : block.columns()) { + ColumnPath path = column.getPath(); + if (hasBloomFilter(column) && columnsFiltered.contains(path)) { + bloomFilterOffsetBuilder.put(path, column.getBloomFilterOffset()); + } + } + this.bloomFilterOffsets = bloomFilterOffsetBuilder.buildOrThrow(); + } + + public Optional getBloomFilter(ColumnPath columnPath) + { + BloomFilterHeader bloomFilterHeader; + long bloomFilterDataOffset; + try { + Long columnBloomFilterOffset = bloomFilterOffsets.get(columnPath); + if (columnBloomFilterOffset == null) { + return Optional.empty(); + } + BasicSliceInput headerSliceInput = dataSource.readFully(columnBloomFilterOffset, MAX_HEADER_LENGTH).getInput(); + bloomFilterHeader = Util.readBloomFilterHeader(headerSliceInput); + bloomFilterDataOffset = columnBloomFilterOffset + headerSliceInput.position(); + } + catch (IOException exception) { + throw new UncheckedIOException("Failed to read Bloom filter header", exception); + } + + if (!bloomFilterSupported(columnPath, bloomFilterHeader)) { + return Optional.empty(); + } + + try { + Slice bloomFilterData = dataSource.readFully(bloomFilterDataOffset, bloomFilterHeader.getNumBytes()); + verify(bloomFilterData.length() > 0, "Read empty bloom filter %s", bloomFilterHeader); + return Optional.of(new BlockSplitBloomFilter(bloomFilterData.getBytes())); + } + catch (IOException exception) { + throw new UncheckedIOException("Failed to read Bloom filter data", exception); + } + } + + public static Optional getBloomFilterStore( + ParquetDataSource dataSource, + BlockMetadata blockMetadata, + TupleDomain parquetTupleDomain, + ParquetReaderOptions options) + { + if (!options.useBloomFilter() || parquetTupleDomain.isAll() || parquetTupleDomain.isNone()) { + return Optional.empty(); + } + + boolean hasBloomFilter = blockMetadata.columns().stream().anyMatch(BloomFilterStore::hasBloomFilter); + if (!hasBloomFilter) { + return Optional.empty(); + } + + Map parquetDomains = parquetTupleDomain.getDomains() + .orElseThrow(() -> new IllegalStateException("Predicate other than none should have domains")); + Set columnsFilteredPaths = parquetDomains.keySet().stream() + .map(column -> ColumnPath.get(column.getPath())) + .collect(toImmutableSet()); + + return Optional.of(new BloomFilterStore(dataSource, blockMetadata, columnsFilteredPaths)); + } + + public static boolean hasBloomFilter(ColumnChunkMetadata columnMetaData) + { + return columnMetaData.getBloomFilterOffset() > 0; + } + + private static boolean bloomFilterSupported(ColumnPath columnPath, BloomFilterHeader bloomFilterHeader) + { + int numBytes = bloomFilterHeader.getNumBytes(); + if (numBytes <= 0 || numBytes > UPPER_BOUND_BYTES) { + throw new ParquetDecodingException(format("Column: %s has bloom filter number of bytes value of %d, which is out of bound of lower limit: %d and upper limit: %d", columnPath, numBytes, 0, UPPER_BOUND_BYTES)); + } + return bloomFilterHeader.getHash().isSetXXHASH() && bloomFilterHeader.getAlgorithm().isSetBLOCK() && 
bloomFilterHeader.getCompression().isSetUNCOMPRESSED(); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/ChunkKey.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/ChunkKey.java new file mode 100644 index 000000000000..473ec7ee6a0b --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/ChunkKey.java @@ -0,0 +1,56 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet; + +import java.util.Objects; + +import static java.lang.String.format; + +public class ChunkKey +{ + private final int column; + private final int rowGroup; + + public ChunkKey(int column, int rowGroup) + { + this.column = column; + this.rowGroup = rowGroup; + } + + @Override + public int hashCode() + { + return Objects.hash(column, rowGroup); + } + + @Override + public boolean equals(Object obj) + { + if (this == obj) { + return true; + } + if (obj == null || getClass() != obj.getClass()) { + return false; + } + ChunkKey other = (ChunkKey) obj; + return this.column == other.column && + this.rowGroup == other.rowGroup; + } + + @Override + public String toString() + { + return format("[rowGroup=%s, column=%s]", rowGroup, column); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/ChunkReader.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/ChunkReader.java new file mode 100644 index 000000000000..934b5ae31c4b --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/ChunkReader.java @@ -0,0 +1,39 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet; + +import io.airlift.slice.Slice; + +import java.io.IOException; +import java.io.UncheckedIOException; + +public interface ChunkReader +{ + long getDiskOffset(); + + Slice read() + throws IOException; + + default Slice readUnchecked() + { + try { + return read(); + } + catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + void free(); +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/Column.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/Column.java new file mode 100644 index 000000000000..a6c703cafb90 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/Column.java @@ -0,0 +1,25 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet; + +import static java.util.Objects.requireNonNull; + +public record Column(String name, Field field) +{ + public Column + { + requireNonNull(name, "name is null"); + requireNonNull(field, "field is null"); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/ColumnStatisticsValidation.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/ColumnStatisticsValidation.java new file mode 100644 index 000000000000..ef28f2469286 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/ColumnStatisticsValidation.java @@ -0,0 +1,173 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet; + +import com.google.common.collect.ImmutableList; +import io.trino.spi.TrinoException; +import io.trino.spi.block.Block; +import io.trino.spi.block.ColumnarArray; +import io.trino.spi.block.ColumnarMap; +import io.trino.spi.block.RowBlock; +import io.trino.spi.type.ArrayType; +import io.trino.spi.type.MapType; +import io.trino.spi.type.RowType; +import io.trino.spi.type.Type; + +import java.util.List; + +import static com.google.common.collect.ImmutableList.toImmutableList; +import static io.trino.spi.StandardErrorCode.NOT_SUPPORTED; +import static io.trino.spi.block.ColumnarArray.toColumnarArray; +import static io.trino.spi.block.ColumnarMap.toColumnarMap; +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; + +class ColumnStatisticsValidation +{ + private final Type type; + private final List fieldBuilders; + + private long valuesCount; + private long nonLeafValuesCount; + + public ColumnStatisticsValidation(Type type) + { + this.type = requireNonNull(type, "type is null"); + this.fieldBuilders = type.getTypeParameters().stream() + .map(ColumnStatisticsValidation::new) + .collect(toImmutableList()); + } + + public void addBlock(Block block) + { + addBlock(block, new ColumnStatistics(0, 0)); + } + + public List build() + { + if (fieldBuilders.isEmpty()) { + return ImmutableList.of(new ColumnStatistics(valuesCount, nonLeafValuesCount)); + } + return fieldBuilders.stream() + .flatMap(builder -> builder.build().stream()) + .collect(toImmutableList()); + } + + private void addBlock(Block block, ColumnStatistics columnStatistics) + { + if (fieldBuilders.isEmpty()) { + addPrimitiveBlock(block); + valuesCount += columnStatistics.valuesCount(); + nonLeafValuesCount += columnStatistics.nonLeafValuesCount(); + return; + } + + List fields; + ColumnStatistics mergedColumnStatistics; + if (type instanceof ArrayType) { + ColumnarArray columnarArray = 
toColumnarArray(block); + fields = ImmutableList.of(columnarArray.getElementsBlock()); + mergedColumnStatistics = columnStatistics.merge(addArrayBlock(columnarArray)); + } + else if (type instanceof MapType) { + ColumnarMap columnarMap = toColumnarMap(block); + fields = ImmutableList.of(columnarMap.getKeysBlock(), columnarMap.getValuesBlock()); + mergedColumnStatistics = columnStatistics.merge(addMapBlock(columnarMap)); + } + else if (type instanceof RowType) { + // the validation code is designed to work with null-suppressed blocks + fields = RowBlock.getNullSuppressedRowFieldsFromBlock(block); + mergedColumnStatistics = columnStatistics.merge(addRowBlock(block)); + } + else { + throw new TrinoException(NOT_SUPPORTED, format("Unsupported type: %s", type)); + } + + for (int i = 0; i < fieldBuilders.size(); i++) { + fieldBuilders.get(i).addBlock(fields.get(i), mergedColumnStatistics); + } + } + + private void addPrimitiveBlock(Block block) + { + valuesCount += block.getPositionCount(); + if (!block.mayHaveNull()) { + return; + } + int nullsCount = 0; + for (int position = 0; position < block.getPositionCount(); position++) { + nullsCount += block.isNull(position) ? 1 : 0; + } + nonLeafValuesCount += nullsCount; + } + + private static ColumnStatistics addMapBlock(ColumnarMap block) + { + if (!block.mayHaveNull()) { + int emptyEntriesCount = 0; + for (int position = 0; position < block.getPositionCount(); position++) { + emptyEntriesCount += block.getEntryCount(position) == 0 ? 1 : 0; + } + return new ColumnStatistics(emptyEntriesCount, emptyEntriesCount); + } + int nonLeafValuesCount = 0; + for (int position = 0; position < block.getPositionCount(); position++) { + nonLeafValuesCount += block.isNull(position) || block.getEntryCount(position) == 0 ? 1 : 0; + } + return new ColumnStatistics(nonLeafValuesCount, nonLeafValuesCount); + } + + private static ColumnStatistics addArrayBlock(ColumnarArray block) + { + if (!block.mayHaveNull()) { + int emptyEntriesCount = 0; + for (int position = 0; position < block.getPositionCount(); position++) { + emptyEntriesCount += block.getLength(position) == 0 ? 1 : 0; + } + return new ColumnStatistics(emptyEntriesCount, emptyEntriesCount); + } + int nonLeafValuesCount = 0; + for (int position = 0; position < block.getPositionCount(); position++) { + nonLeafValuesCount += block.isNull(position) || block.getLength(position) == 0 ? 1 : 0; + } + return new ColumnStatistics(nonLeafValuesCount, nonLeafValuesCount); + } + + private static ColumnStatistics addRowBlock(Block block) + { + if (!block.mayHaveNull()) { + return new ColumnStatistics(0, 0); + } + int nullsCount = 0; + for (int position = 0; position < block.getPositionCount(); position++) { + nullsCount += block.isNull(position) ? 1 : 0; + } + return new ColumnStatistics(nullsCount, nullsCount); + } + + /** + * @param valuesCount Count of values for a column field, including nulls, empty and defined values. 
+ * @param nonLeafValuesCount Count of non-leaf values for a column field, this is nulls count for primitives + * and count of values below the max definition level for nested types + */ + record ColumnStatistics(long valuesCount, long nonLeafValuesCount) + { + ColumnStatistics merge(ColumnStatistics other) + { + return new ColumnStatistics( + valuesCount + other.valuesCount(), + nonLeafValuesCount + other.nonLeafValuesCount()); + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/DataPage.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/DataPage.java new file mode 100644 index 000000000000..bbece17c9b7e --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/DataPage.java @@ -0,0 +1,44 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet; + +import java.util.OptionalLong; + +public abstract sealed class DataPage + extends Page + permits DataPageV1, DataPageV2 +{ + protected final int valueCount; + private final OptionalLong firstRowIndex; + + public DataPage(int uncompressedSize, int valueCount, OptionalLong firstRowIndex) + { + super(uncompressedSize); + this.valueCount = valueCount; + this.firstRowIndex = firstRowIndex; + } + + /** + * @return the index of the first row index in this page or -1 if unset. + */ + public OptionalLong getFirstRowIndex() + { + return firstRowIndex; + } + + public int getValueCount() + { + return valueCount; + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/DataPageV1.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/DataPageV1.java new file mode 100755 index 000000000000..b0895445d813 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/DataPageV1.java @@ -0,0 +1,79 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.parquet; + +import io.airlift.slice.Slice; + +import java.util.OptionalLong; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static java.util.Objects.requireNonNull; + +public final class DataPageV1 + extends DataPage +{ + private final Slice slice; + private final ParquetEncoding repetitionLevelEncoding; + private final ParquetEncoding definitionLevelEncoding; + private final ParquetEncoding valuesEncoding; + + public DataPageV1( + Slice slice, + int valueCount, + int uncompressedSize, + OptionalLong firstRowIndex, + ParquetEncoding repetitionLevelEncoding, + ParquetEncoding definitionLevelEncoding, + ParquetEncoding valuesEncoding) + { + super(uncompressedSize, valueCount, firstRowIndex); + this.slice = requireNonNull(slice, "slice is null"); + this.repetitionLevelEncoding = repetitionLevelEncoding; + this.definitionLevelEncoding = definitionLevelEncoding; + this.valuesEncoding = valuesEncoding; + } + + public Slice getSlice() + { + return slice; + } + + public ParquetEncoding getDefinitionLevelEncoding() + { + return definitionLevelEncoding; + } + + public ParquetEncoding getRepetitionLevelEncoding() + { + return repetitionLevelEncoding; + } + + public ParquetEncoding getValueEncoding() + { + return valuesEncoding; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("slice", slice) + .add("repetitionLevelEncoding", repetitionLevelEncoding) + .add("definitionLevelEncoding", definitionLevelEncoding) + .add("valuesEncoding", valuesEncoding) + .add("valueCount", valueCount) + .add("uncompressedSize", uncompressedSize) + .toString(); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/DataPageV2.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/DataPageV2.java new file mode 100644 index 000000000000..b0cbfd9ed8fc --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/DataPageV2.java @@ -0,0 +1,116 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.parquet; + +import io.airlift.slice.Slice; +import org.apache.parquet.column.statistics.Statistics; + +import java.util.OptionalLong; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static java.util.Objects.requireNonNull; + +public final class DataPageV2 + extends DataPage +{ + private final int rowCount; + private final int nullCount; + private final Slice repetitionLevels; + private final Slice definitionLevels; + private final ParquetEncoding dataEncoding; + private final Slice slice; + private final Statistics statistics; + private final boolean isCompressed; + + public DataPageV2( + int rowCount, + int nullCount, + int valueCount, + Slice repetitionLevels, + Slice definitionLevels, + ParquetEncoding dataEncoding, + Slice slice, + int uncompressedSize, + OptionalLong firstRowIndex, + Statistics statistics, + boolean isCompressed) + { + super(uncompressedSize, valueCount, firstRowIndex); + this.rowCount = rowCount; + this.nullCount = nullCount; + this.repetitionLevels = requireNonNull(repetitionLevels, "repetitionLevels slice is null"); + this.definitionLevels = requireNonNull(definitionLevels, "definitionLevels slice is null"); + this.dataEncoding = dataEncoding; + this.slice = requireNonNull(slice, "slice is null"); + this.statistics = statistics; + this.isCompressed = isCompressed; + } + + public int getRowCount() + { + return rowCount; + } + + public int getNullCount() + { + return nullCount; + } + + public Slice getRepetitionLevels() + { + return repetitionLevels; + } + + public Slice getDefinitionLevels() + { + return definitionLevels; + } + + public ParquetEncoding getDataEncoding() + { + return dataEncoding; + } + + public Slice getSlice() + { + return slice; + } + + public Statistics getStatistics() + { + return statistics; + } + + public boolean isCompressed() + { + return isCompressed; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("rowCount", rowCount) + .add("nullCount", nullCount) + .add("repetitionLevels", repetitionLevels) + .add("definitionLevels", definitionLevels) + .add("dataEncoding", dataEncoding) + .add("slice", slice) + .add("statistics", statistics) + .add("isCompressed", isCompressed) + .add("valueCount", valueCount) + .add("uncompressedSize", uncompressedSize) + .toString(); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/DictionaryPage.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/DictionaryPage.java new file mode 100644 index 000000000000..74fdf540199d --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/DictionaryPage.java @@ -0,0 +1,71 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.parquet; + +import io.airlift.slice.Slice; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkArgument; +import static java.util.Objects.requireNonNull; + +public class DictionaryPage + extends Page +{ + private final Slice slice; + private final int dictionarySize; + private final ParquetEncoding encoding; + + public DictionaryPage(Slice slice, int dictionarySize, ParquetEncoding encoding) + { + this(slice, slice.length(), dictionarySize, encoding); + } + + public DictionaryPage(Slice slice, int uncompressedSize, int dictionarySize, ParquetEncoding encoding) + { + super(uncompressedSize); + this.slice = requireNonNull(slice, "slice is null"); + this.dictionarySize = dictionarySize; + this.encoding = requireNonNull(encoding, "encoding is null"); + checkArgument( + encoding == ParquetEncoding.PLAIN_DICTIONARY || encoding == ParquetEncoding.PLAIN, + "Dictionary does not support encoding: %s", + encoding); + } + + public Slice getSlice() + { + return slice; + } + + public int getDictionarySize() + { + return dictionarySize; + } + + public ParquetEncoding getEncoding() + { + return encoding; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("slice", slice) + .add("dictionarySize", dictionarySize) + .add("encoding", encoding) + .add("uncompressedSize", uncompressedSize) + .toString(); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/DiskRange.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/DiskRange.java new file mode 100644 index 000000000000..14ad2fcb7adf --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/DiskRange.java @@ -0,0 +1,98 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet; + +import java.util.Objects; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkArgument; +import static java.util.Objects.requireNonNull; + +// same as io.trino.orc.DiskRange +public final class DiskRange +{ + private final long offset; + private final long length; + + public DiskRange(long offset, long length) + { + checkArgument(offset >= 0, "offset is negative"); + checkArgument(length > 0, "length must be at least 1"); + + this.offset = offset; + this.length = length; + } + + public long getOffset() + { + return offset; + } + + public long getLength() + { + return length; + } + + public long getEnd() + { + return offset + length; + } + + public boolean contains(DiskRange diskRange) + { + return offset <= diskRange.getOffset() && diskRange.getEnd() <= getEnd(); + } + + /** + * Returns the minimal DiskRange that encloses both this DiskRange + * and otherDiskRange. If there was a gap between the ranges the + * new range will cover that gap. 
+ */ + public DiskRange span(DiskRange otherDiskRange) + { + requireNonNull(otherDiskRange, "otherDiskRange is null"); + long start = Math.min(this.offset, otherDiskRange.getOffset()); + long end = Math.max(getEnd(), otherDiskRange.getEnd()); + return new DiskRange(start, end - start); + } + + @Override + public int hashCode() + { + return Objects.hash(offset, length); + } + + @Override + public boolean equals(Object obj) + { + if (this == obj) { + return true; + } + if (obj == null || getClass() != obj.getClass()) { + return false; + } + DiskRange other = (DiskRange) obj; + return this.offset == other.offset && + this.length == other.length; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("offset", offset) + .add("length", length) + .toString(); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/Field.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/Field.java new file mode 100644 index 000000000000..570939cd8a3e --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/Field.java @@ -0,0 +1,57 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet; + +import io.trino.spi.type.Type; + +import static java.util.Objects.requireNonNull; + +public abstract class Field +{ + private final Type type; + private final int repetitionLevel; + private final int definitionLevel; + private final boolean required; + + protected Field(Type type, int repetitionLevel, int definitionLevel, boolean required) + { + this.type = requireNonNull(type, "type is required"); + this.repetitionLevel = repetitionLevel; + this.definitionLevel = definitionLevel; + this.required = required; + } + + public Type getType() + { + return type; + } + + public int getRepetitionLevel() + { + return repetitionLevel; + } + + public int getDefinitionLevel() + { + return definitionLevel; + } + + public boolean isRequired() + { + return required; + } + + @Override + public abstract String toString(); +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/GroupField.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/GroupField.java new file mode 100644 index 000000000000..9a006d8adad2 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/GroupField.java @@ -0,0 +1,63 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.parquet; + +import com.google.common.collect.ImmutableList; +import io.trino.spi.type.Type; + +import java.util.List; +import java.util.Optional; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkArgument; +import static java.util.Objects.requireNonNull; +import static java.util.stream.Collectors.toList; + +public class GroupField + extends Field +{ + private final List<Optional<Field>> children; + + public GroupField(Type type, int repetitionLevel, int definitionLevel, boolean required, List<Optional<Field>> children) + { + super(type, repetitionLevel, definitionLevel, required); + checkArgument( + type.getTypeParameters().size() == children.size(), + "Type %s has %s parameters, but %s children: %s", + type, + type.getTypeParameters().size(), + children.size(), + children); + this.children = ImmutableList.copyOf(requireNonNull(children, "children is null")); + } + + public List<Optional<Field>> getChildren() + { + return children; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("type", getType()) + .add("repetitionLevel", getRepetitionLevel()) + .add("definitionLevel", getDefinitionLevel()) + .add("required", isRequired()) + .add("children", getChildren().stream() + .map(field -> field.orElse(null)) + .collect(toList())) + .toString(); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/Page.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/Page.java new file mode 100644 index 000000000000..69cde62cf435 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/Page.java @@ -0,0 +1,29 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet; + +public abstract class Page +{ + protected final int uncompressedSize; + + public Page(int uncompressedSize) + { + this.uncompressedSize = uncompressedSize; + } + + public int getUncompressedSize() + { + return uncompressedSize; + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/ParquetCompressionUtils.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/ParquetCompressionUtils.java new file mode 100644 index 000000000000..d482f627caac --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/ParquetCompressionUtils.java @@ -0,0 +1,152 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
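To make the Field/GroupField relationship concrete, here is a hedged sketch (not part of the patch) that builds a GroupField for a row(id bigint, name varchar) column. The repetition and definition levels are illustrative values; in the reader they are derived from the Parquet schema, and the empty children stand in for nested fields that are not read.

import com.google.common.collect.ImmutableList;
import io.trino.parquet.GroupField;
import io.trino.spi.type.RowType;

import java.util.Optional;

import static io.trino.spi.type.BigintType.BIGINT;
import static io.trino.spi.type.VarcharType.VARCHAR;

public class GroupFieldExample
{
    public static void main(String[] args)
    {
        // A row(id bigint, name varchar) column; the Trino type supplies the type parameters
        RowType rowType = RowType.rowType(
                RowType.field("id", BIGINT),
                RowType.field("name", VARCHAR));

        // One Optional child per type parameter; empty means that child is not read here
        GroupField group = new GroupField(
                rowType,
                0,     // repetition level (illustrative)
                1,     // definition level (illustrative)
                false, // the group itself is optional
                ImmutableList.of(Optional.empty(), Optional.empty()));

        System.out.println(group);
    }
}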
+ */ +package io.trino.parquet; + +import com.google.common.io.ByteStreams; +import io.airlift.compress.Decompressor; +import io.airlift.compress.lz4.Lz4Decompressor; +import io.airlift.compress.lzo.LzoDecompressor; +import io.airlift.compress.snappy.SnappyDecompressor; +import io.airlift.compress.zstd.ZstdDecompressor; +import io.airlift.slice.Slice; +import org.apache.parquet.format.CompressionCodec; + +import java.io.IOException; +import java.util.zip.GZIPInputStream; + +import static com.google.common.base.Preconditions.checkArgument; +import static io.airlift.slice.SizeOf.SIZE_OF_INT; +import static io.airlift.slice.SizeOf.SIZE_OF_LONG; +import static io.airlift.slice.Slices.EMPTY_SLICE; +import static io.airlift.slice.Slices.wrappedBuffer; +import static java.lang.Math.min; +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; + +public final class ParquetCompressionUtils +{ + private static final int GZIP_BUFFER_SIZE = 8 * 1024; + + private ParquetCompressionUtils() {} + + public static Slice decompress(CompressionCodec codec, Slice input, int uncompressedSize) + throws IOException + { + requireNonNull(input, "input is null"); + + if (input.length() == 0) { + return EMPTY_SLICE; + } + + switch (codec) { + case GZIP: + return decompressGzip(input, uncompressedSize); + case SNAPPY: + return decompressSnappy(input, uncompressedSize); + case UNCOMPRESSED: + return input; + case LZO: + return decompressLZO(input, uncompressedSize); + case LZ4: + return decompressLz4(input, uncompressedSize); + case ZSTD: + return decompressZstd(input, uncompressedSize); + case BROTLI: + case LZ4_RAW: + // unsupported + break; + } + throw new ParquetCorruptionException("Codec not supported in Parquet: " + codec); + } + + private static Slice decompressSnappy(Slice input, int uncompressedSize) + { + // Snappy decompressor is more efficient if there's at least a long's worth of extra space + // in the output buffer + byte[] buffer = new byte[uncompressedSize + SIZE_OF_LONG]; + int actualUncompressedSize = decompress(new SnappyDecompressor(), input, 0, input.length(), buffer, 0); + if (actualUncompressedSize != uncompressedSize) { + throw new IllegalArgumentException(format("Invalid uncompressedSize for SNAPPY input. Expected %s, actual: %s", uncompressedSize, actualUncompressedSize)); + } + return wrappedBuffer(buffer, 0, uncompressedSize); + } + + private static Slice decompressZstd(Slice input, int uncompressedSize) + { + byte[] buffer = new byte[uncompressedSize]; + decompress(new ZstdDecompressor(), input, 0, input.length(), buffer, 0); + return wrappedBuffer(buffer); + } + + private static Slice decompressGzip(Slice input, int uncompressedSize) + throws IOException + { + if (uncompressedSize == 0) { + return EMPTY_SLICE; + } + + try (GZIPInputStream gzipInputStream = new GZIPInputStream(input.getInput(), min(GZIP_BUFFER_SIZE, input.length()))) { + byte[] buffer = new byte[uncompressedSize]; + int bytesRead = ByteStreams.read(gzipInputStream, buffer, 0, buffer.length); + if (bytesRead != uncompressedSize) { + throw new IllegalArgumentException(format("Invalid uncompressedSize for GZIP input. Expected %s, actual: %s", uncompressedSize, bytesRead)); + } + // Verify we're at EOF and aren't truncating the input + checkArgument(gzipInputStream.read() == -1, "Invalid uncompressedSize for GZIP input. 
Actual size exceeds %s bytes", uncompressedSize); + return wrappedBuffer(buffer, 0, bytesRead); + } + } + + private static Slice decompressLz4(Slice input, int uncompressedSize) + { + return decompressFramed(new Lz4Decompressor(), input, uncompressedSize); + } + + private static Slice decompressLZO(Slice input, int uncompressedSize) + { + return decompressFramed(new LzoDecompressor(), input, uncompressedSize); + } + + private static Slice decompressFramed(Decompressor decompressor, Slice input, int uncompressedSize) + { + long totalDecompressedCount = 0; + // over allocate buffer which makes decompression easier + byte[] output = new byte[uncompressedSize + SIZE_OF_LONG]; + int outputOffset = 0; + int inputOffset = 0; + int cumulativeUncompressedBlockLength = 0; + + while (totalDecompressedCount < uncompressedSize) { + if (totalDecompressedCount == cumulativeUncompressedBlockLength) { + cumulativeUncompressedBlockLength += Integer.reverseBytes(input.getInt(inputOffset)); + inputOffset += SIZE_OF_INT; + } + int compressedChunkLength = Integer.reverseBytes(input.getInt(inputOffset)); + inputOffset += SIZE_OF_INT; + int decompressionSize = decompress(decompressor, input, inputOffset, compressedChunkLength, output, outputOffset); + totalDecompressedCount += decompressionSize; + outputOffset += decompressionSize; + inputOffset += compressedChunkLength; + } + checkArgument(outputOffset == uncompressedSize); + return wrappedBuffer(output, 0, uncompressedSize); + } + + private static int decompress(Decompressor decompressor, Slice input, int inputOffset, int inputLength, byte[] output, int outputOffset) + { + byte[] byteArray = input.byteArray(); + int byteArrayOffset = inputOffset + input.byteArrayOffset(); + return decompressor.decompress(byteArray, byteArrayOffset, inputLength, output, outputOffset, output.length - outputOffset); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/ParquetCorruptionException.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/ParquetCorruptionException.java new file mode 100644 index 000000000000..7cbddf8a07c4 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/ParquetCorruptionException.java @@ -0,0 +1,46 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet; + +import com.google.errorprone.annotations.FormatMethod; + +import java.io.IOException; + +import static java.lang.String.format; + +public class ParquetCorruptionException + extends IOException +{ + public ParquetCorruptionException(ParquetDataSourceId dataSourceId, String message) + { + this(dataSourceId, "%s", message); + } + + @FormatMethod + public ParquetCorruptionException(Throwable cause, ParquetDataSourceId dataSourceId, String messageFormat, Object... args) + { + super(formatMessage(dataSourceId, messageFormat, args), cause); + } + + @FormatMethod + public ParquetCorruptionException(ParquetDataSourceId dataSourceId, String messageFormat, Object... 
 args) + { + super(formatMessage(dataSourceId, messageFormat, args)); + } + + private static String formatMessage(ParquetDataSourceId dataSourceId, String messageFormat, Object[] args) + { + return "Malformed Parquet file. " + format(messageFormat, args) + " [" + dataSourceId + "]"; + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/ParquetDataSource.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/ParquetDataSource.java new file mode 100644 index 000000000000..62ec264fbc5b --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/ParquetDataSource.java @@ -0,0 +1,49 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet; + +import com.google.common.collect.ListMultimap; +import io.airlift.slice.Slice; +import io.trino.memory.context.AggregatedMemoryContext; +import io.trino.parquet.reader.ChunkedInputStream; + +import java.io.Closeable; +import java.io.IOException; +import java.util.Map; + +public interface ParquetDataSource + extends Closeable +{ + ParquetDataSourceId getId(); + + long getReadBytes(); + + long getReadTimeNanos(); + + long getEstimatedSize(); + + Slice readTail(int length) + throws IOException; + + Slice readFully(long position, int length) + throws IOException; + + Map<Integer, ChunkedInputStream> planRead(ListMultimap<Integer, DiskRange> diskRanges, AggregatedMemoryContext memoryContext); + + @Override + default void close() + throws IOException + { + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/ParquetDataSourceId.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/ParquetDataSourceId.java new file mode 100644 index 000000000000..5b68730f38e4 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/ParquetDataSourceId.java @@ -0,0 +1,53 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
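As a usage sketch for the ParquetDataSource interface above (editorial, not part of the patch): a caller could locate the file footer with readTail(), relying on the Parquet postscript layout of a 4-byte little-endian metadata length followed by the "PAR1" magic. The 64 KB tail guess and the helper name are assumptions made for illustration only.

import io.airlift.slice.Slice;
import io.trino.parquet.ParquetCorruptionException;
import io.trino.parquet.ParquetDataSource;

import java.io.IOException;
import java.nio.charset.StandardCharsets;

public class FooterReadSketch
{
    private static final int POST_SCRIPT_SIZE = 8; // 4-byte footer length + 4-byte "PAR1" magic

    // Sketch only: shows how readTail/readFully might be combined to find the footer slice
    public static Slice readFooter(ParquetDataSource dataSource)
            throws IOException
    {
        int tailLength = (int) Math.min(dataSource.getEstimatedSize(), 64 * 1024);
        Slice tail = dataSource.readTail(tailLength);

        String magic = new String(tail.getBytes(tail.length() - 4, 4), StandardCharsets.US_ASCII);
        if (!"PAR1".equals(magic)) {
            throw new ParquetCorruptionException(dataSource.getId(), "Expected magic PAR1 but found %s", magic);
        }

        // airlift Slice reads integers little-endian, matching the Parquet footer length encoding
        int metadataLength = tail.getInt(tail.length() - POST_SCRIPT_SIZE);
        if (metadataLength + POST_SCRIPT_SIZE > tail.length()) {
            // the tail guess was too small; re-read the footer from its absolute position
            long footerStart = dataSource.getEstimatedSize() - POST_SCRIPT_SIZE - metadataLength;
            return dataSource.readFully(footerStart, metadataLength);
        }
        return tail.slice(tail.length() - POST_SCRIPT_SIZE - metadataLength, metadataLength);
    }
}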
+ */ +package io.trino.parquet; + +import java.util.Objects; + +import static java.util.Objects.requireNonNull; + +public final class ParquetDataSourceId +{ + private final String id; + + public ParquetDataSourceId(String id) + { + this.id = requireNonNull(id, "id is null"); + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + ParquetDataSourceId that = (ParquetDataSourceId) o; + return Objects.equals(id, that.id); + } + + @Override + public int hashCode() + { + return Objects.hash(id); + } + + @Override + public String toString() + { + return id; + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/ParquetEncoding.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/ParquetEncoding.java new file mode 100644 index 000000000000..09d450a0a90f --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/ParquetEncoding.java @@ -0,0 +1,198 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet; + +import io.trino.parquet.dictionary.BinaryDictionary; +import io.trino.parquet.dictionary.Dictionary; +import io.trino.parquet.dictionary.DoubleDictionary; +import io.trino.parquet.dictionary.FloatDictionary; +import io.trino.parquet.dictionary.IntegerDictionary; +import io.trino.parquet.dictionary.LongDictionary; +import org.apache.parquet.bytes.BytesUtils; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.values.ValuesReader; +import org.apache.parquet.column.values.bitpacking.ByteBitPackingValuesReader; +import org.apache.parquet.column.values.bytestreamsplit.ByteStreamSplitValuesReaderForDouble; +import org.apache.parquet.column.values.bytestreamsplit.ByteStreamSplitValuesReaderForFloat; +import org.apache.parquet.column.values.delta.DeltaBinaryPackingValuesReader; +import org.apache.parquet.column.values.deltalengthbytearray.DeltaLengthByteArrayValuesReader; +import org.apache.parquet.column.values.deltastrings.DeltaByteArrayReader; +import org.apache.parquet.column.values.plain.BinaryPlainValuesReader; +import org.apache.parquet.column.values.plain.BooleanPlainValuesReader; +import org.apache.parquet.column.values.plain.FixedLenByteArrayPlainValuesReader; +import org.apache.parquet.column.values.plain.PlainValuesReader.DoublePlainValuesReader; +import org.apache.parquet.column.values.plain.PlainValuesReader.FloatPlainValuesReader; +import org.apache.parquet.column.values.plain.PlainValuesReader.IntegerPlainValuesReader; +import org.apache.parquet.column.values.plain.PlainValuesReader.LongPlainValuesReader; +import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridValuesReader; +import org.apache.parquet.column.values.rle.ZeroIntegerValuesReader; +import org.apache.parquet.io.ParquetDecodingException; +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; + +import java.io.IOException; + +import static com.google.common.base.Preconditions.checkArgument; 
+import static org.apache.parquet.column.values.bitpacking.Packer.BIG_ENDIAN; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BOOLEAN; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FLOAT; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64; + +public enum ParquetEncoding +{ + PLAIN { + @Override + public ValuesReader getValuesReader(ColumnDescriptor descriptor, ValuesType valuesType) + { + return switch (descriptor.getPrimitiveType().getPrimitiveTypeName()) { + case BOOLEAN -> new BooleanPlainValuesReader(); + case BINARY -> new BinaryPlainValuesReader(); + case FLOAT -> new FloatPlainValuesReader(); + case DOUBLE -> new DoublePlainValuesReader(); + case INT32 -> new IntegerPlainValuesReader(); + case INT64 -> new LongPlainValuesReader(); + case INT96 -> new FixedLenByteArrayPlainValuesReader(INT96_TYPE_LENGTH); + case FIXED_LEN_BYTE_ARRAY -> new FixedLenByteArrayPlainValuesReader(descriptor.getPrimitiveType().getTypeLength()); + }; + } + + @Override + public Dictionary initDictionary(ColumnDescriptor descriptor, DictionaryPage dictionaryPage) + throws IOException + { + return switch (descriptor.getPrimitiveType().getPrimitiveTypeName()) { + // No dictionary encoding for boolean + case BOOLEAN -> throw new ParquetDecodingException("Dictionary encoding does not support: " + descriptor.getPrimitiveType().getPrimitiveTypeName()); + case BINARY -> new BinaryDictionary(dictionaryPage); + case FIXED_LEN_BYTE_ARRAY -> new BinaryDictionary(dictionaryPage, descriptor.getPrimitiveType().getTypeLength()); + case INT96 -> new BinaryDictionary(dictionaryPage, INT96_TYPE_LENGTH); + case INT64 -> new LongDictionary(dictionaryPage); + case DOUBLE -> new DoubleDictionary(dictionaryPage); + case INT32 -> new IntegerDictionary(dictionaryPage); + case FLOAT -> new FloatDictionary(dictionaryPage); + }; + } + }, + + RLE { + @Override + public ValuesReader getValuesReader(ColumnDescriptor descriptor, ValuesType valuesType) + { + int bitWidth = BytesUtils.getWidthFromMaxInt(getMaxLevel(descriptor, valuesType)); + if (bitWidth == 0) { + return new ZeroIntegerValuesReader(); + } + return new RunLengthBitPackingHybridValuesReader(bitWidth); + } + }, + + BIT_PACKED { + @Override + public ValuesReader getValuesReader(ColumnDescriptor descriptor, ValuesType valuesType) + { + return new ByteBitPackingValuesReader(getMaxLevel(descriptor, valuesType), BIG_ENDIAN); + } + }, + + PLAIN_DICTIONARY { + @Override + public Dictionary initDictionary(ColumnDescriptor descriptor, DictionaryPage dictionaryPage) + throws IOException + { + return PLAIN.initDictionary(descriptor, dictionaryPage); + } + }, + + DELTA_BINARY_PACKED { + @Override + public ValuesReader getValuesReader(ColumnDescriptor descriptor, ValuesType valuesType) + { + PrimitiveTypeName typeName = descriptor.getPrimitiveType().getPrimitiveTypeName(); + checkArgument(typeName == INT32 || typeName == INT64, "Encoding DELTA_BINARY_PACKED is only supported for type INT32 and INT64"); + return new DeltaBinaryPackingValuesReader(); + } + }, + + DELTA_LENGTH_BYTE_ARRAY { + @Override + public ValuesReader getValuesReader(ColumnDescriptor descriptor, ValuesType 
valuesType) + { + checkArgument(descriptor.getPrimitiveType().getPrimitiveTypeName() == BINARY, "Encoding DELTA_LENGTH_BYTE_ARRAY is only supported for type BINARY"); + return new DeltaLengthByteArrayValuesReader(); + } + }, + + DELTA_BYTE_ARRAY { + @Override + public ValuesReader getValuesReader(ColumnDescriptor descriptor, ValuesType valuesType) + { + PrimitiveTypeName typeName = descriptor.getPrimitiveType().getPrimitiveTypeName(); + checkArgument(typeName == BINARY || typeName == FIXED_LEN_BYTE_ARRAY, "Encoding DELTA_BYTE_ARRAY is only supported for type BINARY and FIXED_LEN_BYTE_ARRAY"); + return new DeltaByteArrayReader(); + } + }, + + RLE_DICTIONARY { + @Override + public Dictionary initDictionary(ColumnDescriptor descriptor, DictionaryPage dictionaryPage) + throws IOException + { + return PLAIN.initDictionary(descriptor, dictionaryPage); + } + }, + + BYTE_STREAM_SPLIT { + @Override + public ValuesReader getValuesReader(ColumnDescriptor descriptor, ValuesType valuesType) + { + PrimitiveTypeName typeName = descriptor.getPrimitiveType().getPrimitiveTypeName(); + checkArgument(typeName == FLOAT || typeName == DOUBLE, "Encoding BYTE_STREAM_SPLIT is only " + + "supported for type FLOAT and DOUBLE"); + if (typeName == FLOAT) { + return new ByteStreamSplitValuesReaderForFloat(); + } + return new ByteStreamSplitValuesReaderForDouble(); + } + }; + + static final int INT96_TYPE_LENGTH = 12; + + static int getMaxLevel(ColumnDescriptor descriptor, ValuesType valuesType) + { + return switch (valuesType) { + case REPETITION_LEVEL -> descriptor.getMaxRepetitionLevel(); + case DEFINITION_LEVEL -> descriptor.getMaxDefinitionLevel(); + case VALUES -> { + if (descriptor.getPrimitiveType().getPrimitiveTypeName() == BOOLEAN) { + yield 1; + } + throw new ParquetDecodingException("Unsupported values type: " + valuesType); + } + }; + } + + public Dictionary initDictionary(ColumnDescriptor descriptor, DictionaryPage dictionaryPage) + throws IOException + { + throw new UnsupportedOperationException(" Dictionary encoding is not supported for: " + name()); + } + + public ValuesReader getValuesReader(ColumnDescriptor descriptor, ValuesType valuesType) + { + throw new UnsupportedOperationException("Error decoding values in encoding: " + this.name()); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/ParquetMetadataConverter.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/ParquetMetadataConverter.java new file mode 100644 index 000000000000..75ef2b788e8f --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/ParquetMetadataConverter.java @@ -0,0 +1,575 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
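A small dispatch sketch for the ParquetEncoding enum above (editorial, not part of the patch). It assumes ValuesType is the org.apache.parquet.column.ValuesType enum, since its import is not visible in this hunk, and it builds a ColumnDescriptor with parquet-mr's Types API; the column path and levels are illustrative.

import io.trino.parquet.ParquetEncoding;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.ValuesType;
import org.apache.parquet.column.values.ValuesReader;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.Types;

import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64;

public class EncodingDispatchSketch
{
    public static void main(String[] args)
    {
        // An optional INT64 column nested inside one optional group: max repetition 0, max definition 2
        PrimitiveType primitiveType = Types.optional(INT64).named("value");
        ColumnDescriptor descriptor = new ColumnDescriptor(new String[] {"outer", "value"}, primitiveType, 0, 2);

        // Definition levels use the RLE/bit-packing hybrid; the reader's bit width comes from the max level
        ValuesReader definitionLevels = ParquetEncoding.RLE.getValuesReader(descriptor, ValuesType.DEFINITION_LEVEL);

        // PLAIN values of an INT64 column are read with the plain long reader
        ValuesReader values = ParquetEncoding.PLAIN.getValuesReader(descriptor, ValuesType.VALUES);

        System.out.println(definitionLevels.getClass().getSimpleName() + ", " + values.getClass().getSimpleName());
    }
}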
+ */ +package io.trino.parquet; + +import io.trino.parquet.metadata.IndexReference; +import org.apache.parquet.column.EncodingStats; +import org.apache.parquet.column.statistics.BinaryStatistics; +import org.apache.parquet.format.BoundaryOrder; +import org.apache.parquet.format.BsonType; +import org.apache.parquet.format.ColumnChunk; +import org.apache.parquet.format.ColumnIndex; +import org.apache.parquet.format.ConvertedType; +import org.apache.parquet.format.DateType; +import org.apache.parquet.format.DecimalType; +import org.apache.parquet.format.Encoding; +import org.apache.parquet.format.EnumType; +import org.apache.parquet.format.IntType; +import org.apache.parquet.format.JsonType; +import org.apache.parquet.format.ListType; +import org.apache.parquet.format.LogicalType; +import org.apache.parquet.format.MapType; +import org.apache.parquet.format.MicroSeconds; +import org.apache.parquet.format.MilliSeconds; +import org.apache.parquet.format.NanoSeconds; +import org.apache.parquet.format.NullType; +import org.apache.parquet.format.OffsetIndex; +import org.apache.parquet.format.PageEncodingStats; +import org.apache.parquet.format.PageLocation; +import org.apache.parquet.format.SchemaElement; +import org.apache.parquet.format.Statistics; +import org.apache.parquet.format.StringType; +import org.apache.parquet.format.TimeType; +import org.apache.parquet.format.TimeUnit; +import org.apache.parquet.format.TimestampType; +import org.apache.parquet.format.Type; +import org.apache.parquet.format.UUIDType; +import org.apache.parquet.internal.column.columnindex.BinaryTruncator; +import org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder; +import org.apache.parquet.internal.column.columnindex.OffsetIndexBuilder; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.ColumnOrder.ColumnOrderName; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.LogicalTypeAnnotation.LogicalTypeAnnotationVisitor; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; + +import java.util.Arrays; +import java.util.List; +import java.util.Optional; + +import static org.apache.parquet.CorruptStatistics.shouldIgnoreStatistics; +import static org.apache.parquet.schema.LogicalTypeAnnotation.BsonLogicalTypeAnnotation; +import static org.apache.parquet.schema.LogicalTypeAnnotation.DateLogicalTypeAnnotation; +import static org.apache.parquet.schema.LogicalTypeAnnotation.DecimalLogicalTypeAnnotation; +import static org.apache.parquet.schema.LogicalTypeAnnotation.EnumLogicalTypeAnnotation; +import static org.apache.parquet.schema.LogicalTypeAnnotation.IntLogicalTypeAnnotation; +import static org.apache.parquet.schema.LogicalTypeAnnotation.IntervalLogicalTypeAnnotation; +import static org.apache.parquet.schema.LogicalTypeAnnotation.JsonLogicalTypeAnnotation; +import static org.apache.parquet.schema.LogicalTypeAnnotation.ListLogicalTypeAnnotation; +import static org.apache.parquet.schema.LogicalTypeAnnotation.MapKeyValueTypeAnnotation; +import static org.apache.parquet.schema.LogicalTypeAnnotation.MapLogicalTypeAnnotation; +import static org.apache.parquet.schema.LogicalTypeAnnotation.StringLogicalTypeAnnotation; +import static org.apache.parquet.schema.LogicalTypeAnnotation.TimeLogicalTypeAnnotation; +import static org.apache.parquet.schema.LogicalTypeAnnotation.TimestampLogicalTypeAnnotation; +import static org.apache.parquet.schema.LogicalTypeAnnotation.UUIDLogicalTypeAnnotation; +import 
static org.apache.parquet.schema.LogicalTypeAnnotation.bsonType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.dateType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.decimalType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.enumType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.float16Type; +import static org.apache.parquet.schema.LogicalTypeAnnotation.intType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.jsonType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.listType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.mapType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.stringType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.timeType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.timestampType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.uuidType; + +// based on org.apache.parquet.format.converter.ParquetMetadataConverter +public final class ParquetMetadataConverter +{ + public static final long MAX_STATS_SIZE = 4096; + + private ParquetMetadataConverter() {} + + public static LogicalTypeAnnotation getLogicalTypeAnnotation(ConvertedType type, SchemaElement element) + { + return switch (type) { + case UTF8 -> stringType(); + case MAP -> mapType(); + case MAP_KEY_VALUE -> MapKeyValueTypeAnnotation.getInstance(); + case LIST -> listType(); + case ENUM -> enumType(); + case DECIMAL -> { + int scale = (element == null) ? 0 : element.scale; + int precision = (element == null) ? 0 : element.precision; + yield decimalType(scale, precision); + } + case DATE -> dateType(); + case TIME_MILLIS -> timeType(true, LogicalTypeAnnotation.TimeUnit.MILLIS); + case TIME_MICROS -> timeType(true, LogicalTypeAnnotation.TimeUnit.MICROS); + case TIMESTAMP_MILLIS -> timestampType(true, LogicalTypeAnnotation.TimeUnit.MILLIS); + case TIMESTAMP_MICROS -> timestampType(true, LogicalTypeAnnotation.TimeUnit.MICROS); + case INTERVAL -> IntervalLogicalTypeAnnotation.getInstance(); + case INT_8 -> intType(8, true); + case INT_16 -> intType(16, true); + case INT_32 -> intType(32, true); + case INT_64 -> intType(64, true); + case UINT_8 -> intType(8, false); + case UINT_16 -> intType(16, false); + case UINT_32 -> intType(32, false); + case UINT_64 -> intType(64, false); + case JSON -> jsonType(); + case BSON -> bsonType(); + }; + } + + public static LogicalTypeAnnotation getLogicalTypeAnnotation(LogicalType type) + { + return switch (type.getSetField()) { + case MAP -> mapType(); + case BSON -> bsonType(); + case DATE -> dateType(); + case ENUM -> enumType(); + case JSON -> jsonType(); + case LIST -> listType(); + case TIME -> { + TimeType time = type.getTIME(); + yield timeType(time.isAdjustedToUTC, convertTimeUnit(time.unit)); + } + case STRING -> stringType(); + case DECIMAL -> { + DecimalType decimal = type.getDECIMAL(); + yield decimalType(decimal.scale, decimal.precision); + } + case INTEGER -> { + IntType integer = type.getINTEGER(); + yield intType(integer.bitWidth, integer.isSigned); + } + case UNKNOWN -> null; + case TIMESTAMP -> { + TimestampType timestamp = type.getTIMESTAMP(); + yield timestampType(timestamp.isAdjustedToUTC, convertTimeUnit(timestamp.unit)); + } + case UUID -> uuidType(); + case FLOAT16 -> float16Type(); + }; + } + + public static LogicalType convertToLogicalType(LogicalTypeAnnotation annotation) + { + return annotation.accept(new LogicalTypeConverterVisitor()).orElse(null); + } + + public static 
PrimitiveTypeName getPrimitive(Type type) + { + return switch (type) { + case BYTE_ARRAY -> PrimitiveTypeName.BINARY; + case INT64 -> PrimitiveTypeName.INT64; + case INT32 -> PrimitiveTypeName.INT32; + case BOOLEAN -> PrimitiveTypeName.BOOLEAN; + case FLOAT -> PrimitiveTypeName.FLOAT; + case DOUBLE -> PrimitiveTypeName.DOUBLE; + case INT96 -> PrimitiveTypeName.INT96; + case FIXED_LEN_BYTE_ARRAY -> PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY; + }; + } + + public static Encoding getEncoding(org.apache.parquet.column.Encoding encoding) + { + return Encoding.valueOf(encoding.name()); + } + + public static org.apache.parquet.column.Encoding getEncoding(Encoding encoding) + { + return org.apache.parquet.column.Encoding.valueOf(encoding.name()); + } + + public static EncodingStats convertEncodingStats(List stats) + { + if (stats == null) { + return null; + } + + EncodingStats.Builder builder = new EncodingStats.Builder(); + for (PageEncodingStats stat : stats) { + switch (stat.getPage_type()) { + case DATA_PAGE_V2: + builder.withV2Pages(); + // fall through + case DATA_PAGE: + builder.addDataEncoding(getEncoding(stat.getEncoding()), stat.getCount()); + break; + case DICTIONARY_PAGE: + builder.addDictEncoding(getEncoding(stat.getEncoding()), stat.getCount()); + break; + default: + // ignore + } + } + return builder.build(); + } + + public static org.apache.parquet.internal.column.columnindex.ColumnIndex fromParquetColumnIndex(PrimitiveType type, + ColumnIndex parquetColumnIndex) + { + if (!isMinMaxStatsSupported(type)) { + return null; + } + return ColumnIndexBuilder.build(type, + fromParquetBoundaryOrder(parquetColumnIndex.getBoundary_order()), + parquetColumnIndex.getNull_pages(), + parquetColumnIndex.getNull_counts(), + parquetColumnIndex.getMin_values(), + parquetColumnIndex.getMax_values()); + } + + public static org.apache.parquet.internal.column.columnindex.OffsetIndex fromParquetOffsetIndex(OffsetIndex parquetOffsetIndex) + { + OffsetIndexBuilder builder = OffsetIndexBuilder.getBuilder(); + for (PageLocation pageLocation : parquetOffsetIndex.getPage_locations()) { + builder.add(pageLocation.getOffset(), pageLocation.getCompressed_page_size(), pageLocation.getFirst_row_index()); + } + return builder.build(); + } + + public static boolean isMinMaxStatsSupported(PrimitiveType type) + { + return type.columnOrder().getColumnOrderName() == ColumnOrderName.TYPE_DEFINED_ORDER; + } + + public static Statistics toParquetStatistics(org.apache.parquet.column.statistics.Statistics stats, int truncateLength) + { + Statistics formatStats = new Statistics(); + if (!stats.isEmpty() && withinLimit(stats, truncateLength)) { + formatStats.setNull_count(stats.getNumNulls()); + if (stats.hasNonNullValue()) { + byte[] min; + byte[] max; + boolean isMinValueExact = true; + boolean isMaxValueExact = true; + + if (stats instanceof BinaryStatistics && truncateLength != Integer.MAX_VALUE) { + BinaryTruncator truncator = BinaryTruncator.getTruncator(stats.type()); + byte[] originalMin = stats.getMinBytes(); + byte[] originalMax = stats.getMaxBytes(); + min = truncateMin(truncator, truncateLength, originalMin); + max = truncateMax(truncator, truncateLength, originalMax); + isMinValueExact = originalMin.length == min.length; + isMaxValueExact = originalMax.length == max.length; + } + else { + min = stats.getMinBytes(); + max = stats.getMaxBytes(); + } + // Fill the former min-max statistics only if the comparison logic is + // signed so the logic of V1 and V2 stats are the same (which is + // trivially true for equal min-max 
values) + if (sortOrder(stats.type()) == SortOrder.SIGNED || Arrays.equals(min, max)) { + formatStats.setMin(min); + formatStats.setMax(max); + } + + if (isMinMaxStatsSupported(stats.type()) || Arrays.equals(min, max)) { + formatStats.setMin_value(min); + formatStats.setMax_value(max); + formatStats.setIs_min_value_exact(isMinValueExact); + formatStats.setIs_max_value_exact(isMaxValueExact); + } + } + } + return formatStats; + } + + public static org.apache.parquet.column.statistics.Statistics fromParquetStatistics(String createdBy, Statistics statistics, PrimitiveType type) + { + org.apache.parquet.column.statistics.Statistics.Builder statsBuilder = + org.apache.parquet.column.statistics.Statistics.getBuilderForReading(type); + if (statistics != null) { + if (statistics.isSetMin_value() && statistics.isSetMax_value()) { + byte[] min = statistics.min_value.array(); + byte[] max = statistics.max_value.array(); + if (isMinMaxStatsSupported(type) || Arrays.equals(min, max)) { + statsBuilder.withMin(min); + statsBuilder.withMax(max); + } + } + else { + boolean isSet = statistics.isSetMax() && statistics.isSetMin(); + boolean maxEqualsMin = isSet && Arrays.equals(statistics.getMin(), statistics.getMax()); + boolean sortOrdersMatch = SortOrder.SIGNED == sortOrder(type); + if (isSet && !shouldIgnoreStatistics(createdBy, type.getPrimitiveTypeName()) && (sortOrdersMatch || maxEqualsMin)) { + statsBuilder.withMin(statistics.min.array()); + statsBuilder.withMax(statistics.max.array()); + } + } + + if (statistics.isSetNull_count()) { + statsBuilder.withNumNulls(statistics.null_count); + } + } + return statsBuilder.build(); + } + + public static IndexReference toColumnIndexReference(ColumnChunk columnChunk) + { + if (columnChunk.isSetColumn_index_offset() && columnChunk.isSetColumn_index_length()) { + return new IndexReference(columnChunk.getColumn_index_offset(), columnChunk.getColumn_index_length()); + } + return null; + } + + public static IndexReference toOffsetIndexReference(ColumnChunk columnChunk) + { + if (columnChunk.isSetOffset_index_offset() && columnChunk.isSetOffset_index_length()) { + return new IndexReference(columnChunk.getOffset_index_offset(), columnChunk.getOffset_index_length()); + } + return null; + } + + public enum SortOrder + { + SIGNED, + UNSIGNED, + UNKNOWN + } + + private static SortOrder sortOrder(PrimitiveType primitive) + { + LogicalTypeAnnotation annotation = primitive.getLogicalTypeAnnotation(); + if (annotation == null) { + return defaultSortOrder(primitive.getPrimitiveTypeName()); + } + return annotation.accept(new SortOrderVisitor()) + .orElse(defaultSortOrder(primitive.getPrimitiveTypeName())); + } + + private static SortOrder defaultSortOrder(PrimitiveTypeName primitive) + { + return switch (primitive) { + case BOOLEAN, INT32, INT64, FLOAT, DOUBLE -> SortOrder.SIGNED; + case BINARY, FIXED_LEN_BYTE_ARRAY -> SortOrder.UNSIGNED; + default -> SortOrder.UNKNOWN; + }; + } + + private static LogicalTypeAnnotation.TimeUnit convertTimeUnit(TimeUnit unit) + { + return switch (unit.getSetField()) { + case MICROS -> LogicalTypeAnnotation.TimeUnit.MICROS; + case MILLIS -> LogicalTypeAnnotation.TimeUnit.MILLIS; + case NANOS -> LogicalTypeAnnotation.TimeUnit.NANOS; + }; + } + + private static org.apache.parquet.internal.column.columnindex.BoundaryOrder fromParquetBoundaryOrder(BoundaryOrder boundaryOrder) + { + return switch (boundaryOrder) { + case ASCENDING -> org.apache.parquet.internal.column.columnindex.BoundaryOrder.ASCENDING; + case DESCENDING -> 
org.apache.parquet.internal.column.columnindex.BoundaryOrder.DESCENDING; + case UNORDERED -> org.apache.parquet.internal.column.columnindex.BoundaryOrder.UNORDERED; + }; + } + + private static boolean withinLimit(org.apache.parquet.column.statistics.Statistics stats, int truncateLength) + { + if (stats.isSmallerThan(MAX_STATS_SIZE)) { + return true; + } + + return (stats instanceof BinaryStatistics binaryStats) && + binaryStats.isSmallerThanWithTruncation(MAX_STATS_SIZE, truncateLength); + } + + private static byte[] truncateMin(BinaryTruncator truncator, int truncateLength, byte[] input) + { + return truncator.truncateMin(Binary.fromConstantByteArray(input), truncateLength).getBytes(); + } + + private static byte[] truncateMax(BinaryTruncator truncator, int truncateLength, byte[] input) + { + return truncator.truncateMax(Binary.fromConstantByteArray(input), truncateLength).getBytes(); + } + + private static class LogicalTypeConverterVisitor + implements LogicalTypeAnnotationVisitor + { + @Override + public Optional visit(StringLogicalTypeAnnotation type) + { + return Optional.of(LogicalType.STRING(new StringType())); + } + + @Override + public Optional visit(MapLogicalTypeAnnotation type) + { + return Optional.of(LogicalType.MAP(new MapType())); + } + + @Override + public Optional visit(ListLogicalTypeAnnotation type) + { + return Optional.of(LogicalType.LIST(new ListType())); + } + + @Override + public Optional visit(EnumLogicalTypeAnnotation type) + { + return Optional.of(LogicalType.ENUM(new EnumType())); + } + + @Override + public Optional visit(DecimalLogicalTypeAnnotation type) + { + return Optional.of(LogicalType.DECIMAL(new DecimalType(type.getScale(), type.getPrecision()))); + } + + @Override + public Optional visit(DateLogicalTypeAnnotation type) + { + return Optional.of(LogicalType.DATE(new DateType())); + } + + @Override + public Optional visit(TimeLogicalTypeAnnotation type) + { + return Optional.of(LogicalType.TIME(new TimeType(type.isAdjustedToUTC(), convertUnit(type.getUnit())))); + } + + @Override + public Optional visit(TimestampLogicalTypeAnnotation type) + { + return Optional.of(LogicalType.TIMESTAMP(new TimestampType(type.isAdjustedToUTC(), convertUnit(type.getUnit())))); + } + + @Override + public Optional visit(IntLogicalTypeAnnotation type) + { + return Optional.of(LogicalType.INTEGER(new IntType((byte) type.getBitWidth(), type.isSigned()))); + } + + @Override + public Optional visit(JsonLogicalTypeAnnotation type) + { + return Optional.of(LogicalType.JSON(new JsonType())); + } + + @Override + public Optional visit(BsonLogicalTypeAnnotation type) + { + return Optional.of(LogicalType.BSON(new BsonType())); + } + + @Override + public Optional visit(UUIDLogicalTypeAnnotation type) + { + return Optional.of(LogicalType.UUID(new UUIDType())); + } + + @Override + public Optional visit(IntervalLogicalTypeAnnotation type) + { + return Optional.of(LogicalType.UNKNOWN(new NullType())); + } + + static TimeUnit convertUnit(LogicalTypeAnnotation.TimeUnit unit) + { + return switch (unit) { + case MICROS -> TimeUnit.MICROS(new MicroSeconds()); + case MILLIS -> TimeUnit.MILLIS(new MilliSeconds()); + case NANOS -> TimeUnit.NANOS(new NanoSeconds()); + }; + } + } + + private static class SortOrderVisitor + implements LogicalTypeAnnotationVisitor + { + @Override + public Optional visit(IntLogicalTypeAnnotation intLogicalType) + { + return Optional.of(intLogicalType.isSigned() ? 
SortOrder.SIGNED : SortOrder.UNSIGNED); + } + + @Override + public Optional visit(IntervalLogicalTypeAnnotation intervalLogicalType) + { + return Optional.of(SortOrder.UNKNOWN); + } + + @Override + public Optional visit(DateLogicalTypeAnnotation dateLogicalType) + { + return Optional.of(SortOrder.SIGNED); + } + + @Override + public Optional visit(EnumLogicalTypeAnnotation enumLogicalType) + { + return Optional.of(SortOrder.UNSIGNED); + } + + @Override + public Optional visit(BsonLogicalTypeAnnotation bsonLogicalType) + { + return Optional.of(SortOrder.UNSIGNED); + } + + @Override + public Optional visit(UUIDLogicalTypeAnnotation uuidLogicalType) + { + return Optional.of(SortOrder.UNSIGNED); + } + + @Override + public Optional visit(JsonLogicalTypeAnnotation jsonLogicalType) + { + return Optional.of(SortOrder.UNSIGNED); + } + + @Override + public Optional visit(StringLogicalTypeAnnotation stringLogicalType) + { + return Optional.of(SortOrder.UNSIGNED); + } + + @Override + public Optional visit(DecimalLogicalTypeAnnotation decimalLogicalType) + { + return Optional.of(SortOrder.UNKNOWN); + } + + @Override + public Optional visit(MapKeyValueTypeAnnotation mapKeyValueLogicalType) + { + return Optional.of(SortOrder.UNKNOWN); + } + + @Override + public Optional visit(MapLogicalTypeAnnotation mapLogicalType) + { + return Optional.of(SortOrder.UNKNOWN); + } + + @Override + public Optional visit(ListLogicalTypeAnnotation listLogicalType) + { + return Optional.of(SortOrder.UNKNOWN); + } + + @Override + public Optional visit(TimeLogicalTypeAnnotation timeLogicalType) + { + return Optional.of(SortOrder.SIGNED); + } + + @Override + public Optional visit(TimestampLogicalTypeAnnotation timestampLogicalType) + { + return Optional.of(SortOrder.SIGNED); + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/ParquetReaderOptions.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/ParquetReaderOptions.java new file mode 100644 index 000000000000..ee3c77ab0ded --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/ParquetReaderOptions.java @@ -0,0 +1,249 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
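The sortOrder checks above exist because legacy writers compared BINARY min/max values as signed bytes, so the converter drops such statistics unless the order is SIGNED or min equals max. A self-contained illustration (not from the patch) of how signed and unsigned byte comparison diverge for UTF-8 data:

import java.nio.charset.StandardCharsets;
import java.util.Arrays;

public class SortOrderExample
{
    public static void main(String[] args)
    {
        byte[] ascii = "abc".getBytes(StandardCharsets.UTF_8);
        byte[] nonAscii = "é".getBytes(StandardCharsets.UTF_8); // 0xC3 0xA9, negative as signed bytes

        // Signed byte comparison orders the non-ASCII value first...
        System.out.println(Arrays.compare(ascii, nonAscii));         // > 0: "abc" sorts after "é"

        // ...while the correct UNSIGNED (lexicographic byte) order puts it last
        System.out.println(Arrays.compareUnsigned(ascii, nonAscii)); // < 0: "abc" sorts before "é"
    }
}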
+ */ +package io.trino.parquet; + +import io.airlift.units.DataSize; + +import static com.google.common.base.Preconditions.checkArgument; +import static io.airlift.units.DataSize.Unit.MEGABYTE; +import static java.util.Objects.requireNonNull; + +public class ParquetReaderOptions +{ + private static final DataSize DEFAULT_MAX_READ_BLOCK_SIZE = DataSize.of(16, MEGABYTE); + private static final int DEFAULT_MAX_READ_BLOCK_ROW_COUNT = 8 * 1024; + private static final DataSize DEFAULT_MAX_MERGE_DISTANCE = DataSize.of(1, MEGABYTE); + private static final DataSize DEFAULT_MAX_BUFFER_SIZE = DataSize.of(8, MEGABYTE); + private static final DataSize DEFAULT_SMALL_FILE_THRESHOLD = DataSize.of(3, MEGABYTE); + private static final DataSize DEFAULT_MAX_FOOTER_READ_SIZE = DataSize.of(15, MEGABYTE); + + private final boolean ignoreStatistics; + private final DataSize maxReadBlockSize; + private final int maxReadBlockRowCount; + private final DataSize maxMergeDistance; + private final DataSize maxBufferSize; + private final boolean useColumnIndex; + private final boolean useBloomFilter; + private final DataSize smallFileThreshold; + private final boolean vectorizedDecodingEnabled; + private final DataSize maxFooterReadSize; + + private ParquetReaderOptions() + { + ignoreStatistics = false; + maxReadBlockSize = DEFAULT_MAX_READ_BLOCK_SIZE; + maxReadBlockRowCount = DEFAULT_MAX_READ_BLOCK_ROW_COUNT; + maxMergeDistance = DEFAULT_MAX_MERGE_DISTANCE; + maxBufferSize = DEFAULT_MAX_BUFFER_SIZE; + useColumnIndex = true; + useBloomFilter = true; + smallFileThreshold = DEFAULT_SMALL_FILE_THRESHOLD; + vectorizedDecodingEnabled = true; + maxFooterReadSize = DEFAULT_MAX_FOOTER_READ_SIZE; + } + + private ParquetReaderOptions( + boolean ignoreStatistics, + DataSize maxReadBlockSize, + int maxReadBlockRowCount, + DataSize maxMergeDistance, + DataSize maxBufferSize, + boolean useColumnIndex, + boolean useBloomFilter, + DataSize smallFileThreshold, + boolean vectorizedDecodingEnabled, + DataSize maxFooterReadSize) + { + this.ignoreStatistics = ignoreStatistics; + this.maxReadBlockSize = requireNonNull(maxReadBlockSize, "maxReadBlockSize is null"); + checkArgument(maxReadBlockRowCount > 0, "maxReadBlockRowCount must be greater than 0"); + this.maxReadBlockRowCount = maxReadBlockRowCount; + this.maxMergeDistance = requireNonNull(maxMergeDistance, "maxMergeDistance is null"); + this.maxBufferSize = requireNonNull(maxBufferSize, "maxBufferSize is null"); + this.useColumnIndex = useColumnIndex; + this.useBloomFilter = useBloomFilter; + this.smallFileThreshold = requireNonNull(smallFileThreshold, "smallFileThreshold is null"); + this.vectorizedDecodingEnabled = vectorizedDecodingEnabled; + this.maxFooterReadSize = requireNonNull(maxFooterReadSize, "maxFooterReadSize is null"); + } + + public static Builder builder() + { + return new Builder(new ParquetReaderOptions()); + } + + public static Builder builder(ParquetReaderOptions parquetReaderOptions) + { + return new Builder(parquetReaderOptions); + } + + public static ParquetReaderOptions defaultOptions() + { + return new ParquetReaderOptions(); + } + + public boolean isIgnoreStatistics() + { + return ignoreStatistics; + } + + public DataSize getMaxReadBlockSize() + { + return maxReadBlockSize; + } + + public DataSize getMaxMergeDistance() + { + return maxMergeDistance; + } + + public boolean isUseColumnIndex() + { + return useColumnIndex; + } + + public boolean useBloomFilter() + { + return useBloomFilter; + } + + public boolean isVectorizedDecodingEnabled() + { + return 
vectorizedDecodingEnabled; + } + + public DataSize getMaxBufferSize() + { + return maxBufferSize; + } + + public int getMaxReadBlockRowCount() + { + return maxReadBlockRowCount; + } + + public DataSize getSmallFileThreshold() + { + return smallFileThreshold; + } + + public DataSize getMaxFooterReadSize() + { + return maxFooterReadSize; + } + + public static class Builder + { + private boolean ignoreStatistics; + private DataSize maxReadBlockSize; + private int maxReadBlockRowCount; + private DataSize maxMergeDistance; + private DataSize maxBufferSize; + private boolean useColumnIndex; + private boolean useBloomFilter; + private DataSize smallFileThreshold; + private boolean vectorizedDecodingEnabled; + private DataSize maxFooterReadSize; + + private Builder(ParquetReaderOptions parquetReaderOptions) + { + requireNonNull(parquetReaderOptions, "parquetReaderOptions is null"); + this.ignoreStatistics = parquetReaderOptions.ignoreStatistics; + this.maxReadBlockSize = parquetReaderOptions.maxReadBlockSize; + this.maxReadBlockRowCount = parquetReaderOptions.maxReadBlockRowCount; + this.maxMergeDistance = parquetReaderOptions.maxMergeDistance; + this.maxBufferSize = parquetReaderOptions.maxBufferSize; + this.useColumnIndex = parquetReaderOptions.useColumnIndex; + this.useBloomFilter = parquetReaderOptions.useBloomFilter; + this.smallFileThreshold = parquetReaderOptions.smallFileThreshold; + this.vectorizedDecodingEnabled = parquetReaderOptions.vectorizedDecodingEnabled; + this.maxFooterReadSize = parquetReaderOptions.maxFooterReadSize; + } + + public Builder withIgnoreStatistics(boolean ignoreStatistics) + { + this.ignoreStatistics = ignoreStatistics; + return this; + } + + public Builder withMaxReadBlockSize(DataSize maxReadBlockSize) + { + this.maxReadBlockSize = requireNonNull(maxReadBlockSize, "maxReadBlockSize is null"); + return this; + } + + public Builder withMaxReadBlockRowCount(int maxReadBlockRowCount) + { + this.maxReadBlockRowCount = maxReadBlockRowCount; + return this; + } + + public Builder withMaxMergeDistance(DataSize maxMergeDistance) + { + this.maxMergeDistance = requireNonNull(maxMergeDistance, "maxMergeDistance is null"); + return this; + } + + public Builder withMaxBufferSize(DataSize maxBufferSize) + { + this.maxBufferSize = requireNonNull(maxBufferSize, "maxBufferSize is null"); + return this; + } + + public Builder withUseColumnIndex(boolean useColumnIndex) + { + this.useColumnIndex = useColumnIndex; + return this; + } + + public Builder withBloomFilter(boolean useBloomFilter) + { + this.useBloomFilter = useBloomFilter; + return this; + } + + public Builder withSmallFileThreshold(DataSize smallFileThreshold) + { + this.smallFileThreshold = requireNonNull(smallFileThreshold, "smallFileThreshold is null"); + return this; + } + + public Builder withVectorizedDecodingEnabled(boolean vectorizedDecodingEnabled) + { + this.vectorizedDecodingEnabled = vectorizedDecodingEnabled; + return this; + } + + public Builder withMaxFooterReadSize(DataSize maxFooterReadSize) + { + this.maxFooterReadSize = requireNonNull(maxFooterReadSize, "maxFooterReadSize is null"); + return this; + } + + public ParquetReaderOptions build() + { + return new ParquetReaderOptions( + ignoreStatistics, + maxReadBlockSize, + maxReadBlockRowCount, + maxMergeDistance, + maxBufferSize, + useColumnIndex, + useBloomFilter, + smallFileThreshold, + vectorizedDecodingEnabled, + maxFooterReadSize); + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/ParquetReaderUtils.java 
b/plugin/trino-iceberg/src/main/java/io/trino/parquet/ParquetReaderUtils.java new file mode 100644 index 000000000000..a608955e9b8d --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/ParquetReaderUtils.java @@ -0,0 +1,293 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet; + +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Sets; +import io.airlift.slice.Slice; +import io.trino.parquet.metadata.ColumnChunkMetadata; +import io.trino.parquet.reader.SimpleSliceInputStream; +import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.column.Encoding; +import org.apache.parquet.column.EncodingStats; + +import java.util.Set; + +import static com.google.common.base.Verify.verify; +import static java.lang.String.format; +import static org.apache.parquet.column.Encoding.BIT_PACKED; +import static org.apache.parquet.column.Encoding.PLAIN_DICTIONARY; +import static org.apache.parquet.column.Encoding.RLE; + +public final class ParquetReaderUtils +{ + private ParquetReaderUtils() {} + + public static ByteBufferInputStream toInputStream(Slice slice) + { + return ByteBufferInputStream.wrap(slice.toByteBuffer()); + } + + public static ByteBufferInputStream toInputStream(DictionaryPage page) + { + return toInputStream(page.getSlice()); + } + + /** + * Reads an integer formatted in ULEB128 variable-width format described in + * ... 
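The ULEB128 layout decoded by readUleb128Int below is seven payload bits per byte, with the high bit acting as a continuation flag. A standalone sketch of that layout (the encoder is illustrative and not part of the patch):

import java.io.ByteArrayOutputStream;

public class Uleb128Example
{
    // Encode an int as ULEB128: low 7 bits per byte, high bit set while more bytes follow
    static byte[] encodeUleb128(int value)
    {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        while ((value & ~0x7F) != 0) {
            out.write((value & 0x7F) | 0x80);
            value >>>= 7;
        }
        out.write(value);
        return out.toByteArray();
    }

    public static void main(String[] args)
    {
        byte[] encoded = encodeUleb128(300);
        System.out.printf("%02X %02X%n", encoded[0] & 0xFF, encoded[1] & 0xFF); // AC 02

        // Manual decode mirroring readUleb128Int: 0x2C | (0x02 << 7) == 300
        int decoded = (encoded[0] & 0x7F) | ((encoded[1] & 0x7F) << 7);
        System.out.println(decoded); // 300
    }
}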
+ */ + public static int readUleb128Int(SimpleSliceInputStream input) + { + byte[] inputBytes = input.getByteArray(); + int offset = input.getByteArrayOffset(); + // Manual loop unrolling shows improvements in BenchmarkReadUleb128Int + int inputByte = inputBytes[offset]; + int value = inputByte & 0x7F; + if ((inputByte & 0x80) == 0) { + input.skip(1); + return value; + } + inputByte = inputBytes[offset + 1]; + value |= (inputByte & 0x7F) << 7; + if ((inputByte & 0x80) == 0) { + input.skip(2); + return value; + } + inputByte = inputBytes[offset + 2]; + value |= (inputByte & 0x7F) << 14; + if ((inputByte & 0x80) == 0) { + input.skip(3); + return value; + } + inputByte = inputBytes[offset + 3]; + value |= (inputByte & 0x7F) << 21; + if ((inputByte & 0x80) == 0) { + input.skip(4); + return value; + } + inputByte = inputBytes[offset + 4]; + verify((inputByte & 0x80) == 0, "ULEB128 variable-width integer should not be longer than 5 bytes"); + input.skip(5); + return value | inputByte << 28; + } + + public static long readUleb128Long(SimpleSliceInputStream input) + { + byte[] inputBytes = input.getByteArray(); + int offset = input.getByteArrayOffset(); + // Manual loop unrolling shows improvements in BenchmarkReadUleb128Long + long inputByte = inputBytes[offset]; + long value = inputByte & 0x7F; + if ((inputByte & 0x80) == 0) { + input.skip(1); + return value; + } + inputByte = inputBytes[offset + 1]; + value |= (inputByte & 0x7F) << 7; + if ((inputByte & 0x80) == 0) { + input.skip(2); + return value; + } + inputByte = inputBytes[offset + 2]; + value |= (inputByte & 0x7F) << 14; + if ((inputByte & 0x80) == 0) { + input.skip(3); + return value; + } + inputByte = inputBytes[offset + 3]; + value |= (inputByte & 0x7F) << 21; + if ((inputByte & 0x80) == 0) { + input.skip(4); + return value; + } + inputByte = inputBytes[offset + 4]; + value |= (inputByte & 0x7F) << 28; + if ((inputByte & 0x80) == 0) { + input.skip(5); + return value; + } + inputByte = inputBytes[offset + 5]; + value |= (inputByte & 0x7F) << 35; + if ((inputByte & 0x80) == 0) { + input.skip(6); + return value; + } + inputByte = inputBytes[offset + 6]; + value |= (inputByte & 0x7F) << 42; + if ((inputByte & 0x80) == 0) { + input.skip(7); + return value; + } + inputByte = inputBytes[offset + 7]; + value |= (inputByte & 0x7F) << 49; + if ((inputByte & 0x80) == 0) { + input.skip(8); + return value; + } + inputByte = inputBytes[offset + 8]; + value |= (inputByte & 0x7F) << 56; + if ((inputByte & 0x80) == 0) { + input.skip(9); + return value; + } + inputByte = inputBytes[offset + 9]; + verify((inputByte & 0x80) == 0, "ULEB128 variable-width long should not be longer than 10 bytes"); + input.skip(10); + return value | inputByte << 63; + } + + public static int readFixedWidthInt(SimpleSliceInputStream input, int bytesWidth) + { + return switch (bytesWidth) { + case 0 -> 0; + case 1 -> input.readByte() & 0xFF; + case 2 -> input.readShort() & 0xFFFF; + case 3 -> { + int value = input.readShort() & 0xFFFF; + yield ((input.readByte() & 0xFF) << 16) | value; + } + case 4 -> input.readInt(); + default -> throw new IllegalArgumentException(format("Encountered bytesWidth (%d) that requires more than 4 bytes", bytesWidth)); + }; + } + + /** + * For storing signed values (not the deltas themselves) in DELTA_BINARY_PACKED encoding, zigzag encoding + * (...) + * is used to map negative values to positive ones and then apply ULEB128 on the result. 
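The zigzag encoder itself is not in this patch; under that assumption, a quick round-trip sketch showing how small magnitudes map to small unsigned codes before ULEB128 is applied:

public class ZigzagExample
{
    // Encoder counterpart of ParquetReaderUtils.zigzagDecode (illustrative, not part of the patch)
    static long zigzagEncode(long value)
    {
        return (value << 1) ^ (value >> 63);
    }

    static long zigzagDecode(long value)
    {
        return (value >>> 1) ^ -(value & 1);
    }

    public static void main(String[] args)
    {
        // Small magnitudes map to small codes: 0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, 2 -> 4, ...
        for (long v : new long[] {0, -1, 1, -2, 2, Long.MIN_VALUE, Long.MAX_VALUE}) {
            long encoded = zigzagEncode(v);
            System.out.println(v + " -> " + encoded + " -> " + zigzagDecode(encoded));
        }
    }
}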
+ */ + public static long zigzagDecode(long value) + { + return (value >>> 1) ^ -(value & 1); + } + + /** + * Returns the result of arguments division rounded up. + * <p>
+ * Works only for positive numbers. + * The sum of dividend and divisor cannot exceed Integer.MAX_VALUE + */ + public static int ceilDiv(int dividend, int divisor) + { + return (dividend + divisor - 1) / divisor; + } + + /** + * Propagate the sign bit in values that are shorter than 8 bytes. + * <p>
+ * When a value of less than 8 bytes is put into a long variable, the padding bytes on the + * left side of the number should be all zeros for a positive number or all ones for negatives. + * This method does this padding using a signed bit shift operator without branches. + * + * @param value Value to trim + * @param bitsToPad Number of bits to pad + * @return Value with correct padding + */ + public static long propagateSignBit(long value, int bitsToPad) + { + return value << bitsToPad >> bitsToPad; + } + + /** + * Method simulates a cast from boolean to byte value. Despite using + * a ternary (?) operator, the just-in-time compiler usually figures out + * that this is a cast and turns that into a no-op. + * <p>
+ * Method may be used to avoid branches that may be CPU costly due to + * branch misprediction. + * The following code: + *

+     *      boolean[] flags = ...
+     *      int sum = 0;
+     *      for (int i = 0; i < length; i++){
+     *          if (flags[i])
+     *              sum++;
+     *      }
+     * 
+ * will perform better when rewritten to + *
+     *      boolean[] flags = ...
+     *      int sum = 0;
+     *      for (int i = 0; i < length; i++){
+     *          sum += castToByte(flags[i]);
+     *      }
+     * 
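// Illustrative check, not part of this patch: propagateSignBit pads a value narrower than 8 bytes back to
// a full long; for a single byte the padding width is Long.SIZE - Byte.SIZE = 56 bits.
long positive = propagateSignBit(0x7FL, Long.SIZE - Byte.SIZE); // 127: high bits padded with zeros
long negative = propagateSignBit(0xFFL, Long.SIZE - Byte.SIZE); // -1: high bits padded with ones
int flagAsInt = castToByte(true);                               // 1, so the loop above sums flags without branching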
+ */ + public static byte castToByte(boolean value) + { + return (byte) (value ? 1 : 0); + } + + /** + * Works the same as {@link io.trino.parquet.ParquetReaderUtils#castToByte(boolean)} and negates the boolean value + */ + public static byte castToByteNegate(boolean value) + { + return (byte) (value ? 0 : 1); + } + + public static short toShortExact(long value) + { + if ((short) value != value) { + throw new ArithmeticException("short overflow"); + } + return (short) value; + } + + public static short toShortExact(int value) + { + if ((short) value != value) { + throw new ArithmeticException(format("Value %d exceeds short range", value)); + } + return (short) value; + } + + public static byte toByteExact(long value) + { + if ((byte) value != value) { + throw new ArithmeticException("byte overflow"); + } + return (byte) value; + } + + public static byte toByteExact(int value) + { + if ((byte) value != value) { + throw new ArithmeticException(format("Value %d exceeds byte range", value)); + } + return (byte) value; + } + + @SuppressWarnings("deprecation") + public static boolean isOnlyDictionaryEncodingPages(ColumnChunkMetadata columnMetaData) + { + // Files written with newer versions of Parquet libraries (e.g. parquet-mr 1.9.0) will have EncodingStats available + // Otherwise, fallback to v1 logic + EncodingStats stats = columnMetaData.getEncodingStats(); + if (stats != null) { + return stats.hasDictionaryPages() && !stats.hasNonDictionaryEncodedPages(); + } + + Set encodings = columnMetaData.getEncodings(); + if (encodings.contains(PLAIN_DICTIONARY)) { + // PLAIN_DICTIONARY was present, which means at least one page was + // dictionary-encoded and 1.0 encodings are used + // The only other allowed encodings are RLE and BIT_PACKED which are used for repetition or definition levels + return Sets.difference(encodings, ImmutableSet.of(PLAIN_DICTIONARY, RLE, BIT_PACKED)).isEmpty(); + } + + return false; + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/ParquetTimestampUtils.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/ParquetTimestampUtils.java new file mode 100644 index 000000000000..117d1e0adb13 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/ParquetTimestampUtils.java @@ -0,0 +1,92 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.parquet; + +import com.google.common.primitives.Ints; +import com.google.common.primitives.Longs; +import io.trino.plugin.base.type.DecodedTimestamp; +import io.trino.spi.TrinoException; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.LogicalTypeAnnotation; + +import static io.trino.spi.StandardErrorCode.NOT_SUPPORTED; +import static io.trino.spi.type.Timestamps.MICROSECONDS_PER_SECOND; +import static io.trino.spi.type.Timestamps.MILLISECONDS_PER_SECOND; +import static io.trino.spi.type.Timestamps.NANOSECONDS_PER_MICROSECOND; +import static io.trino.spi.type.Timestamps.NANOSECONDS_PER_MILLISECOND; +import static io.trino.spi.type.Timestamps.NANOSECONDS_PER_SECOND; +import static io.trino.spi.type.Timestamps.SECONDS_PER_DAY; +import static java.lang.StrictMath.floorDiv; +import static java.lang.StrictMath.floorMod; +import static java.lang.StrictMath.toIntExact; + +/** + * Utility class for decoding INT96 encoded parquet timestamp to timestamp millis in GMT. + */ +public final class ParquetTimestampUtils +{ + public static final int JULIAN_EPOCH_OFFSET_DAYS = 2_440_588; + + private ParquetTimestampUtils() {} + + /** + * Returns GMT timestamp from binary encoded parquet timestamp (12 bytes - julian date + time of day nanos). + * + * @param timestampBinary INT96 parquet timestamp + */ + public static DecodedTimestamp decodeInt96Timestamp(Binary timestampBinary) + { + if (timestampBinary.length() != 12) { + throw new TrinoException(NOT_SUPPORTED, "Parquet timestamp must be 12 bytes, actual " + timestampBinary.length()); + } + byte[] bytes = timestampBinary.getBytes(); + + // little endian encoding - need to invert byte order + long timeOfDayNanos = Longs.fromBytes(bytes[7], bytes[6], bytes[5], bytes[4], bytes[3], bytes[2], bytes[1], bytes[0]); + int julianDay = Ints.fromBytes(bytes[11], bytes[10], bytes[9], bytes[8]); + + return decodeInt96Timestamp(timeOfDayNanos, julianDay); + } + + public static DecodedTimestamp decodeInt96Timestamp(long timeOfDayNanos, int julianDay) + { + long epochSeconds = (julianDay - JULIAN_EPOCH_OFFSET_DAYS) * SECONDS_PER_DAY + floorDiv(timeOfDayNanos, NANOSECONDS_PER_SECOND); + return new DecodedTimestamp(epochSeconds, (int) floorMod(timeOfDayNanos, NANOSECONDS_PER_SECOND)); + } + + public static DecodedTimestamp decodeInt64Timestamp(long timestamp, LogicalTypeAnnotation.TimeUnit precision) + { + long toSecondsConversion; + long toNanosConversion = switch (precision) { + case MILLIS -> { + toSecondsConversion = MILLISECONDS_PER_SECOND; + yield NANOSECONDS_PER_MILLISECOND; + } + case MICROS -> { + toSecondsConversion = MICROSECONDS_PER_SECOND; + yield NANOSECONDS_PER_MICROSECOND; + } + case NANOS -> { + toSecondsConversion = NANOSECONDS_PER_SECOND; + yield 1; + } + default -> throw new TrinoException(NOT_SUPPORTED, "Unsupported Parquet timestamp time unit " + precision); + }; + long epochSeconds = floorDiv(timestamp, toSecondsConversion); + long fractionalSecond = floorMod(timestamp, toSecondsConversion); + int nanosOfSecond = toIntExact(fractionalSecond * toNanosConversion); + + return new DecodedTimestamp(epochSeconds, nanosOfSecond); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/ParquetTypeUtils.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/ParquetTypeUtils.java new file mode 100644 index 000000000000..13e1c754d484 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/ParquetTypeUtils.java @@ -0,0 +1,381 @@ +/* + * Licensed under the Apache License, Version 
2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet; + +import com.google.common.collect.ImmutableList; +import io.trino.spi.TrinoException; +import io.trino.spi.type.ArrayType; +import io.trino.spi.type.DecimalType; +import io.trino.spi.type.MapType; +import io.trino.spi.type.RowType; +import io.trino.spi.type.Type; +import jakarta.annotation.Nullable; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.Encoding; +import org.apache.parquet.io.ColumnIO; +import org.apache.parquet.io.ColumnIOFactory; +import org.apache.parquet.io.GroupColumnIO; +import org.apache.parquet.io.MessageColumnIO; +import org.apache.parquet.io.PrimitiveColumnIO; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.LogicalTypeAnnotation.DecimalLogicalTypeAnnotation; +import org.apache.parquet.schema.MessageType; + +import java.math.BigInteger; +import java.util.Arrays; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Optional; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.collect.ImmutableMap.toImmutableMap; +import static io.trino.spi.StandardErrorCode.NOT_SUPPORTED; +import static io.trino.spi.type.StandardTypes.JSON; +import static io.trino.spi.type.VarbinaryType.VARBINARY; +import static java.lang.String.format; +import static org.apache.parquet.schema.Type.Repetition.OPTIONAL; +import static org.apache.parquet.schema.Type.Repetition.REPEATED; + +public final class ParquetTypeUtils +{ + private ParquetTypeUtils() {} + + public static List getColumns(MessageType fileSchema, MessageType requestedSchema) + { + return ImmutableList.copyOf(new ColumnIOFactory().getColumnIO(requestedSchema, fileSchema, true).getLeaves()); + } + + public static MessageColumnIO getColumnIO(MessageType fileSchema, MessageType requestedSchema) + { + return new ColumnIOFactory().getColumnIO(requestedSchema, fileSchema, true); + } + + public static GroupColumnIO getMapKeyValueColumn(GroupColumnIO groupColumnIO) + { + while (groupColumnIO.getChildrenCount() == 1) { + groupColumnIO = (GroupColumnIO) groupColumnIO.getChild(0); + } + return groupColumnIO; + } + + /* For backward-compatibility, the type of elements in LIST-annotated structures should always be determined by the following rules: + * 1. If the repeated field is not a group, then its type is the element type and elements are required. + * 2. If the repeated field is a group with multiple fields, then its type is the element type and elements are required. + * 3. If the repeated field is a group with one field and is named either array or uses the LIST-annotated group's name with _tuple appended then the repeated type is the element type and elements are required. + * 4. Otherwise, the repeated field's type is the element type with the repeated field's repetition. 
+ * https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists + */ + public static ColumnIO getArrayElementColumn(ColumnIO columnIO) + { + while (columnIO instanceof GroupColumnIO && !columnIO.getType().isRepetition(REPEATED)) { + columnIO = ((GroupColumnIO) columnIO).getChild(0); + } + + /* If array has a standard 3-level structure with middle level repeated group with a single field: + * optional group my_list (LIST) { + * repeated group element { + * required binary str (UTF8); + * }; + * } + */ + if (columnIO instanceof GroupColumnIO groupColumnIO && + columnIO.getType().getLogicalTypeAnnotation() == null && + groupColumnIO.getChildrenCount() == 1 && + !columnIO.getName().equals("array") && + !columnIO.getName().equals(columnIO.getParent().getName() + "_tuple")) { + return groupColumnIO.getChild(0); + } + + /* Backward-compatibility support for 2-level arrays where a repeated field is not a group: + * optional group my_list (LIST) { + * repeated int32 element; + * } + */ + return columnIO; + } + + public static Map, ColumnDescriptor> getDescriptors(MessageType fileSchema, MessageType requestedSchema) + { + // io.trino.parquet.reader.MetadataReader.readFooter performs lower casing of all column names in fileSchema. + // requestedSchema also contains lower cased columns because of being derived from fileSchema. + // io.trino.parquet.ParquetTypeUtils.getParquetTypeByName takes care of case-insensitive matching if needed. + // Therefore, we don't need to repeat case-insensitive matching here. + return getColumns(fileSchema, requestedSchema) + .stream() + .collect(toImmutableMap( + columnIO -> Arrays.asList(columnIO.getFieldPath()), + PrimitiveColumnIO::getColumnDescriptor, + // Same column name may occur more than once when the file is written by case-sensitive tools + (oldValue, ignore) -> oldValue)); + } + + @SuppressWarnings("deprecation") + public static ParquetEncoding getParquetEncoding(Encoding encoding) + { + return switch (encoding) { + case PLAIN -> ParquetEncoding.PLAIN; + case RLE -> ParquetEncoding.RLE; + case BYTE_STREAM_SPLIT -> ParquetEncoding.BYTE_STREAM_SPLIT; + case BIT_PACKED -> ParquetEncoding.BIT_PACKED; + case PLAIN_DICTIONARY -> ParquetEncoding.PLAIN_DICTIONARY; + case DELTA_BINARY_PACKED -> ParquetEncoding.DELTA_BINARY_PACKED; + case DELTA_LENGTH_BYTE_ARRAY -> ParquetEncoding.DELTA_LENGTH_BYTE_ARRAY; + case DELTA_BYTE_ARRAY -> ParquetEncoding.DELTA_BYTE_ARRAY; + case RLE_DICTIONARY -> ParquetEncoding.RLE_DICTIONARY; + }; + } + + public static org.apache.parquet.schema.Type getParquetTypeByName(String columnName, GroupType groupType) + { + if (groupType.containsField(columnName)) { + return groupType.getType(columnName); + } + // parquet is case-sensitive, but hive is not. all hive columns get converted to lowercase + // check for direct match above but if no match found, try case-insensitive match + for (org.apache.parquet.schema.Type type : groupType.getFields()) { + if (type.getName().equalsIgnoreCase(columnName)) { + return type; + } + } + + return null; + } + + /** + * Parquet column names are case-sensitive unlike Hive, which converts all column names to lowercase. + * Therefore, when we look up columns we first check for exact match, and if that fails we look for a case-insensitive match. 
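// Illustrative example, not part of this patch: Hive lower-cases column names while Parquet preserves the
// original case, so a field written as "OrderKey" is still resolved through the case-insensitive fallback.
// "schema" is a hypothetical GroupType containing a field named "OrderKey".
org.apache.parquet.schema.Type exact = getParquetTypeByName("OrderKey", schema);    // direct match
org.apache.parquet.schema.Type fallback = getParquetTypeByName("orderkey", schema); // case-insensitive match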
+ */ + public static ColumnIO lookupColumnByName(GroupColumnIO groupColumnIO, String columnName) + { + ColumnIO columnIO = groupColumnIO.getChild(columnName); + + if (columnIO != null) { + return columnIO; + } + + for (int i = 0; i < groupColumnIO.getChildrenCount(); i++) { + if (groupColumnIO.getChild(i).getName().equalsIgnoreCase(columnName)) { + return groupColumnIO.getChild(i); + } + } + + return null; + } + + @Nullable + public static ColumnIO lookupColumnById(GroupColumnIO groupColumnIO, int columnId) + { + for (int i = 0; i < groupColumnIO.getChildrenCount(); i++) { + ColumnIO child = groupColumnIO.getChild(i); + if (child.getType().getId().intValue() == columnId) { + return child; + } + } + return null; + } + + public static Optional createDecimalType(PrimitiveField field) + { + if (!(field.getDescriptor().getPrimitiveType().getLogicalTypeAnnotation() instanceof DecimalLogicalTypeAnnotation decimalLogicalType)) { + return Optional.empty(); + } + return Optional.of(DecimalType.createDecimalType(decimalLogicalType.getPrecision(), decimalLogicalType.getScale())); + } + + /** + * For optional fields: + *
+     * <ul>
+     * <li>definitionLevel == maxDefinitionLevel => Value is defined</li>
+     * <li>definitionLevel == maxDefinitionLevel - 1 => Value is null</li>
+     * <li>definitionLevel < maxDefinitionLevel - 1 => Value does not exist, because one of its optional parent fields is null</li>
+     * </ul>
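// Illustrative example, not part of this patch: for an optional leaf column nested inside an optional
// struct, maxDefinitionLevel == 2. A definition level of 2 means the value is present, 1 means the leaf
// itself is null, and 0 means the enclosing struct is null, so the leaf does not exist at all.
boolean leafIsNull = isValueNull(false, 1, 2);    // true: definitionLevel == maxDefinitionLevel - 1
boolean valuePresent = !isValueNull(false, 2, 2); // true: definitionLevel == maxDefinitionLevel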
+ */ + public static boolean isValueNull(boolean required, int definitionLevel, int maxDefinitionLevel) + { + return !required && (definitionLevel == maxDefinitionLevel - 1); + } + + public static boolean isOptionalFieldValueNull(int definitionLevel, int maxDefinitionLevel) + { + return definitionLevel == maxDefinitionLevel - 1; + } + + public static long getShortDecimalValue(byte[] bytes) + { + return getShortDecimalValue(bytes, 0, bytes.length); + } + + public static long getShortDecimalValue(byte[] bytes, int startOffset, int length) + { + long value = 0; + switch (length) { + case 8: + value |= bytes[startOffset + 7] & 0xFFL; + // fall through + case 7: + value |= (bytes[startOffset + 6] & 0xFFL) << 8; + // fall through + case 6: + value |= (bytes[startOffset + 5] & 0xFFL) << 16; + // fall through + case 5: + value |= (bytes[startOffset + 4] & 0xFFL) << 24; + // fall through + case 4: + value |= (bytes[startOffset + 3] & 0xFFL) << 32; + // fall through + case 3: + value |= (bytes[startOffset + 2] & 0xFFL) << 40; + // fall through + case 2: + value |= (bytes[startOffset + 1] & 0xFFL) << 48; + // fall through + case 1: + value |= (bytes[startOffset] & 0xFFL) << 56; + } + value = value >> ((8 - length) * 8); + return value; + } + + public static void checkBytesFitInShortDecimal(byte[] bytes, int offset, int length, ColumnDescriptor descriptor) + { + int endOffset = offset + length; + // Equivalent to expectedValue = bytes[endOffset] < 0 ? -1 : 0 + byte expectedValue = (byte) (bytes[endOffset] >> 7); + for (int i = offset; i < endOffset; i++) { + if (bytes[i] != expectedValue) { + throw new TrinoException(NOT_SUPPORTED, format( + "Could not read unscaled value %s into a short decimal from column %s", + new BigInteger(bytes, offset, length + Long.BYTES), + descriptor)); + } + } + } + + public static byte[] paddingBigInteger(BigInteger bigInteger, int numBytes) + { + byte[] bytes = bigInteger.toByteArray(); + if (bytes.length == numBytes) { + return bytes; + } + byte[] result = new byte[numBytes]; + if (bigInteger.signum() < 0) { + Arrays.fill(result, 0, numBytes - bytes.length, (byte) 0xFF); + } + System.arraycopy(bytes, 0, result, numBytes - bytes.length, bytes.length); + return result; + } + + /** + * Assumes the parent of columnIO is a MessageColumnIO, i.e. columnIO should be a top level column in the schema. 
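// Illustrative check, not part of this patch, referring to getShortDecimalValue above: it reads a
// big-endian two's-complement unscaled value of at most 8 bytes, so {0xFF, 0x85} decodes to -123.
long unscaled = getShortDecimalValue(new byte[] {(byte) 0xFF, (byte) 0x85}); // -123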
+ */ + public static Optional constructField(Type type, ColumnIO columnIO) + { + return constructField(type, columnIO, true); + } + + private static Optional constructField(Type type, ColumnIO columnIO, boolean isTopLevel) + { + if (columnIO == null) { + return Optional.empty(); + } + boolean required = columnIO.getType().getRepetition() != OPTIONAL; + int repetitionLevel = columnIO.getRepetitionLevel(); + int definitionLevel = columnIO.getDefinitionLevel(); + if (isVariantType(type, columnIO)) { + checkArgument(type.getTypeParameters().isEmpty(), "Expected type parameters to be empty for variant but got %s", type.getTypeParameters()); + if (!(columnIO instanceof GroupColumnIO groupColumnIo)) { + throw new IllegalStateException("Expected columnIO to be GroupColumnIO but got %s".formatted(columnIO.getClass().getSimpleName())); + } + PrimitiveField valueField = (PrimitiveField) constructField(VARBINARY, groupColumnIo.getChild(0), false).orElseThrow(); + PrimitiveField metadataField = (PrimitiveField) constructField(VARBINARY, groupColumnIo.getChild(1), false).orElseThrow(); + return Optional.of(new VariantField( + type, + repetitionLevel, + definitionLevel, + required, + new PrimitiveField(valueField.getType(), false, valueField.getDescriptor(), valueField.getId()), + // Mark the metadata field as optional, this is because the metadata field is not present when the actual Variant value is null + new PrimitiveField(metadataField.getType(), false, metadataField.getDescriptor(), metadataField.getId()))); + } + if (type instanceof RowType rowType) { + GroupColumnIO groupColumnIO = (GroupColumnIO) columnIO; + ImmutableList.Builder> fieldsBuilder = ImmutableList.builder(); + List fields = rowType.getFields(); + boolean structHasParameters = false; + for (RowType.Field rowField : fields) { + String name = rowField.getName().orElseThrow().toLowerCase(Locale.ENGLISH); + Optional field = constructField(rowField.getType(), lookupColumnByName(groupColumnIO, name), false); + structHasParameters |= field.isPresent(); + fieldsBuilder.add(field); + } + if (structHasParameters) { + return Optional.of(new GroupField(type, repetitionLevel, definitionLevel, required, fieldsBuilder.build())); + } + return Optional.empty(); + } + if (type instanceof MapType mapType) { + GroupColumnIO groupColumnIO = (GroupColumnIO) columnIO; + GroupColumnIO keyValueColumnIO = getMapKeyValueColumn(groupColumnIO); + if (keyValueColumnIO.getChildrenCount() != 2) { + return Optional.empty(); + } + Optional keyField = constructField(mapType.getKeyType(), keyValueColumnIO.getChild(0), false); + Optional valueField = constructField(mapType.getValueType(), keyValueColumnIO.getChild(1), false); + return Optional.of(new GroupField(type, repetitionLevel, definitionLevel, required, ImmutableList.of(keyField, valueField))); + } + if (type instanceof ArrayType arrayType) { + // Per the parquet spec (https://github.com/apache/parquet-format/blob/master/LogicalTypes.md): + // `A repeated field that is neither contained by a LIST- or MAP-annotated group nor annotated by LIST or MAP should be interpreted as a required list of required elements + // where the element type is the type of the field.` + // + // A parquet encoding for a required list of strings can be expressed in two ways, however for backwards compatibility they should be handled the same, so here we need + // to adjust repetition and definition levels when converting ColumnIOs to Fields. + // 1. 
required group colors (LIST) { + // repeated group list { + // required string element; + // } + // } + // 2. repeated binary colors (STRING); + if (columnIO instanceof PrimitiveColumnIO primitiveColumnIO) { + if (columnIO.getType().getRepetition() != REPEATED || repetitionLevel == 0 || definitionLevel == 0) { + throw new TrinoException(NOT_SUPPORTED, format("Unsupported schema for Parquet column (%s)", primitiveColumnIO.getColumnDescriptor())); + } + PrimitiveField primitiveFieldElement = new PrimitiveField(arrayType.getElementType(), true, primitiveColumnIO.getColumnDescriptor(), primitiveColumnIO.getId()); + return Optional.of(new GroupField(type, repetitionLevel - 1, definitionLevel - 1, true, ImmutableList.of(Optional.of(primitiveFieldElement)))); + } + GroupColumnIO groupColumnIO = (GroupColumnIO) columnIO; + if (groupColumnIO.getChildrenCount() != 1) { + return Optional.empty(); + } + Optional field = constructField(arrayType.getElementType(), getArrayElementColumn(groupColumnIO.getChild(0)), false); + return Optional.of(new GroupField(type, repetitionLevel, definitionLevel, required, ImmutableList.of(field))); + } + PrimitiveColumnIO primitiveColumnIO = (PrimitiveColumnIO) columnIO; + if (primitiveColumnIO.getType().getRepetition() == REPEATED && isTopLevel) { + throw new TrinoException(NOT_SUPPORTED, format("Unsupported Trino column type (%s) for Parquet column (%s)", type, primitiveColumnIO.getColumnDescriptor())); + } + return Optional.of(new PrimitiveField(type, required, primitiveColumnIO.getColumnDescriptor(), primitiveColumnIO.getId())); + } + + private static boolean isVariantType(Type type, ColumnIO columnIO) + { + return type.getTypeSignature().getBase().equals(JSON) && + columnIO instanceof GroupColumnIO groupColumnIo && + groupColumnIo.getChildrenCount() == 2 && + groupColumnIo.getChild("value") != null && + groupColumnIo.getChild("metadata") != null; + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/ParquetValidationUtils.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/ParquetValidationUtils.java new file mode 100644 index 000000000000..5e31a08f704e --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/ParquetValidationUtils.java @@ -0,0 +1,30 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet; + +import com.google.errorprone.annotations.FormatMethod; + +public final class ParquetValidationUtils +{ + private ParquetValidationUtils() {} + + @FormatMethod + public static void validateParquet(boolean condition, ParquetDataSourceId dataSourceId, String formatString, Object... 
args) + throws ParquetCorruptionException + { + if (!condition) { + throw new ParquetCorruptionException(dataSourceId, formatString, args); + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/ParquetWriteValidation.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/ParquetWriteValidation.java new file mode 100644 index 000000000000..f35bfa8f8c3b --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/ParquetWriteValidation.java @@ -0,0 +1,663 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet; + +import com.google.common.collect.ImmutableList; +import io.airlift.slice.SizeOf; +import io.airlift.slice.Slice; +import io.airlift.slice.Slices; +import io.airlift.slice.XxHash64; +import io.trino.parquet.metadata.ColumnChunkMetadata; +import io.trino.parquet.metadata.IndexReference; +import io.trino.parquet.metadata.PrunedBlockMetadata; +import io.trino.parquet.reader.RowGroupInfo; +import io.trino.spi.Page; +import io.trino.spi.block.Block; +import io.trino.spi.type.Type; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.format.ColumnChunk; +import org.apache.parquet.format.ColumnMetaData; +import org.apache.parquet.format.RowGroup; +import org.apache.parquet.hadoop.metadata.ColumnPath; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType; + +import java.util.Arrays; +import java.util.List; +import java.util.Locale; +import java.util.Objects; +import java.util.Optional; +import java.util.Set; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.collect.ImmutableList.toImmutableList; +import static com.google.common.collect.ImmutableSet.toImmutableSet; +import static io.airlift.slice.SizeOf.SIZE_OF_INT; +import static io.airlift.slice.SizeOf.estimatedSizeOf; +import static io.airlift.slice.SizeOf.instanceSize; +import static io.airlift.slice.SizeOf.sizeOf; +import static io.trino.parquet.ColumnStatisticsValidation.ColumnStatistics; +import static io.trino.parquet.ParquetMetadataConverter.getPrimitive; +import static io.trino.parquet.ParquetValidationUtils.validateParquet; +import static io.trino.parquet.ParquetWriteValidation.IndexReferenceValidation.fromIndexReference; +import static java.util.Objects.requireNonNull; + +public class ParquetWriteValidation +{ + private final String createdBy; + private final Optional timeZoneId; + private final List columns; + private final List rowGroups; + private final WriteChecksum checksum; + private final List types; + private final List columnNames; + + private ParquetWriteValidation( + String createdBy, + Optional timeZoneId, + List columns, + List rowGroups, + WriteChecksum checksum, + List types, + List columnNames) + { + this.createdBy = requireNonNull(createdBy, "createdBy is null"); + checkArgument(!createdBy.isEmpty(), 
"createdBy is empty"); + this.timeZoneId = requireNonNull(timeZoneId, "timeZoneId is null"); + this.columns = requireNonNull(columns, "columnPaths is null"); + this.rowGroups = requireNonNull(rowGroups, "rowGroups is null"); + this.checksum = requireNonNull(checksum, "checksum is null"); + this.types = requireNonNull(types, "types is null"); + this.columnNames = requireNonNull(columnNames, "columnNames is null"); + } + + public String getCreatedBy() + { + return createdBy; + } + + public List getTypes() + { + return types; + } + + public List getColumnNames() + { + return columnNames; + } + + public void validateTimeZone(ParquetDataSourceId dataSourceId, Optional actualTimeZoneId) + throws ParquetCorruptionException + { + validateParquet( + timeZoneId.equals(actualTimeZoneId), + dataSourceId, + "Found unexpected time zone %s, expected %s", + actualTimeZoneId, + timeZoneId); + } + + public void validateColumns(ParquetDataSourceId dataSourceId, MessageType schema) + throws ParquetCorruptionException + { + List actualColumns = schema.getColumns(); + validateParquet( + actualColumns.size() == columns.size(), + dataSourceId, + "Found columns %s, expected %s", + actualColumns, + columns); + for (int columnIndex = 0; columnIndex < columns.size(); columnIndex++) { + validateColumnDescriptorsSame(actualColumns.get(columnIndex), columns.get(columnIndex), dataSourceId); + } + } + + public void validateBlocksMetadata(ParquetDataSourceId dataSourceId, List rowGroupInfos) + throws ParquetCorruptionException + { + validateParquet( + rowGroupInfos.size() == rowGroups.size(), + dataSourceId, + "Number of row groups %d did not match %d", + rowGroupInfos.size(), + rowGroups.size()); + for (int rowGroupIndex = 0; rowGroupIndex < rowGroupInfos.size(); rowGroupIndex++) { + PrunedBlockMetadata block = rowGroupInfos.get(rowGroupIndex).prunedBlockMetadata(); + RowGroup rowGroup = rowGroups.get(rowGroupIndex); + validateParquet( + block.getRowCount() == rowGroup.getNum_rows(), + dataSourceId, + "Number of rows %d in row group %d did not match %d", + block.getRowCount(), + rowGroupIndex, + rowGroup.getNum_rows()); + + List columnChunkMetaData = block.getColumns(); + validateParquet( + columnChunkMetaData.size() == rowGroup.getColumnsSize(), + dataSourceId, + "Number of columns %d in row group %d did not match %d", + columnChunkMetaData.size(), + rowGroupIndex, + rowGroup.getColumnsSize()); + + for (int columnIndex = 0; columnIndex < columnChunkMetaData.size(); columnIndex++) { + ColumnChunkMetadata actualColumnMetadata = columnChunkMetaData.get(columnIndex); + ColumnChunk columnChunk = rowGroup.getColumns().get(columnIndex); + ColumnMetaData expectedColumnMetadata = columnChunk.getMeta_data(); + verifyColumnMetadataMatch( + actualColumnMetadata.getCodec().getParquetCompressionCodec().equals(expectedColumnMetadata.getCodec()), + "Compression codec", + actualColumnMetadata.getCodec(), + actualColumnMetadata.getPath(), + rowGroupIndex, + dataSourceId, + expectedColumnMetadata.getCodec()); + + verifyColumnMetadataMatch( + actualColumnMetadata.getPrimitiveType().getPrimitiveTypeName().equals(getPrimitive(expectedColumnMetadata.getType())), + "Type", + actualColumnMetadata.getPrimitiveType().getPrimitiveTypeName(), + actualColumnMetadata.getPath(), + rowGroupIndex, + dataSourceId, + expectedColumnMetadata.getType()); + + verifyColumnMetadataMatch( + areEncodingsSame(actualColumnMetadata.getEncodings(), expectedColumnMetadata.getEncodings()), + "Encodings", + actualColumnMetadata.getEncodings(), + 
actualColumnMetadata.getPath(), + rowGroupIndex, + dataSourceId, + expectedColumnMetadata.getEncodings()); + + verifyColumnMetadataMatch( + areStatisticsSame(actualColumnMetadata.getStatistics(), expectedColumnMetadata.getStatistics()), + "Statistics", + actualColumnMetadata.getStatistics(), + actualColumnMetadata.getPath(), + rowGroupIndex, + dataSourceId, + expectedColumnMetadata.getStatistics()); + + verifyColumnMetadataMatch( + actualColumnMetadata.getFirstDataPageOffset() == expectedColumnMetadata.getData_page_offset(), + "Data page offset", + actualColumnMetadata.getFirstDataPageOffset(), + actualColumnMetadata.getPath(), + rowGroupIndex, + dataSourceId, + expectedColumnMetadata.getData_page_offset()); + + verifyColumnMetadataMatch( + actualColumnMetadata.getDictionaryPageOffset() == expectedColumnMetadata.getDictionary_page_offset(), + "Dictionary page offset", + actualColumnMetadata.getDictionaryPageOffset(), + actualColumnMetadata.getPath(), + rowGroupIndex, + dataSourceId, + expectedColumnMetadata.getDictionary_page_offset()); + + verifyColumnMetadataMatch( + actualColumnMetadata.getValueCount() == expectedColumnMetadata.getNum_values(), + "Value count", + actualColumnMetadata.getValueCount(), + actualColumnMetadata.getPath(), + rowGroupIndex, + dataSourceId, + expectedColumnMetadata.getNum_values()); + + verifyColumnMetadataMatch( + actualColumnMetadata.getTotalUncompressedSize() == expectedColumnMetadata.getTotal_uncompressed_size(), + "Total uncompressed size", + actualColumnMetadata.getTotalUncompressedSize(), + actualColumnMetadata.getPath(), + rowGroupIndex, + dataSourceId, + expectedColumnMetadata.getTotal_uncompressed_size()); + + verifyColumnMetadataMatch( + actualColumnMetadata.getTotalSize() == expectedColumnMetadata.getTotal_compressed_size(), + "Total size", + actualColumnMetadata.getTotalSize(), + actualColumnMetadata.getPath(), + rowGroupIndex, + dataSourceId, + expectedColumnMetadata.getTotal_compressed_size()); + + IndexReferenceValidation expectedColumnIndexReference = new IndexReferenceValidation(columnChunk.getColumn_index_offset(), columnChunk.getColumn_index_length()); + IndexReference actualColumnIndexReference = actualColumnMetadata.getColumnIndexReference(); + verifyColumnMetadataMatch( + actualColumnIndexReference == null || fromIndexReference(actualColumnMetadata.getColumnIndexReference()).equals(expectedColumnIndexReference), + "Column index reference", + actualColumnIndexReference, + actualColumnMetadata.getPath(), + rowGroupIndex, + dataSourceId, + expectedColumnIndexReference); + + IndexReferenceValidation expectedOffsetIndexReference = new IndexReferenceValidation(columnChunk.getOffset_index_offset(), columnChunk.getOffset_index_length()); + IndexReference actualOffsetIndexReference = actualColumnMetadata.getOffsetIndexReference(); + verifyColumnMetadataMatch( + actualOffsetIndexReference == null || fromIndexReference(actualOffsetIndexReference).equals(expectedOffsetIndexReference), + "Offset index reference", + actualOffsetIndexReference, + actualColumnMetadata.getPath(), + rowGroupIndex, + dataSourceId, + expectedOffsetIndexReference); + } + } + } + + public void validateChecksum(ParquetDataSourceId dataSourceId, WriteChecksum actualChecksum) + throws ParquetCorruptionException + { + validateParquet( + checksum.totalRowCount() == actualChecksum.totalRowCount(), + dataSourceId, + "Write validation failed: Expected row count %d, found %d", + checksum.totalRowCount(), + actualChecksum.totalRowCount()); + + List columnHashes = 
actualChecksum.columnHashes(); + for (int columnIndex = 0; columnIndex < columnHashes.size(); columnIndex++) { + long expectedHash = checksum.columnHashes().get(columnIndex); + validateParquet( + expectedHash == columnHashes.get(columnIndex), + dataSourceId, + "Invalid checksum for column %s: Expected hash %d, found %d", + columnIndex, + expectedHash, + columnHashes.get(columnIndex)); + } + } + + public record WriteChecksum(long totalRowCount, List columnHashes) + { + public WriteChecksum(long totalRowCount, List columnHashes) + { + this.totalRowCount = totalRowCount; + this.columnHashes = ImmutableList.copyOf(requireNonNull(columnHashes, "columnHashes is null")); + } + } + + public static class WriteChecksumBuilder + { + private final List validationHashes; + private final List columnHashes; + private final byte[] longBuffer = new byte[Long.BYTES]; + private final Slice longSlice = Slices.wrappedBuffer(longBuffer); + + private long totalRowCount; + + private WriteChecksumBuilder(List types) + { + this.validationHashes = requireNonNull(types, "types is null").stream() + .map(ValidationHash::createValidationHash) + .collect(toImmutableList()); + + ImmutableList.Builder columnHashes = ImmutableList.builder(); + for (Type ignored : types) { + columnHashes.add(new XxHash64()); + } + this.columnHashes = columnHashes.build(); + } + + public static WriteChecksumBuilder createWriteChecksumBuilder(List readTypes) + { + return new WriteChecksumBuilder(readTypes); + } + + public void addPage(Page page) + { + requireNonNull(page, "page is null"); + checkArgument( + page.getChannelCount() == columnHashes.size(), + "Invalid page: page channels count %s did not match columns count %s", + page.getChannelCount(), + columnHashes.size()); + + for (int channel = 0; channel < columnHashes.size(); channel++) { + ValidationHash validationHash = validationHashes.get(channel); + Block block = page.getBlock(channel); + XxHash64 xxHash64 = columnHashes.get(channel); + for (int position = 0; position < block.getPositionCount(); position++) { + long hash = validationHash.hash(block, position); + longSlice.setLong(0, hash); + xxHash64.update(longBuffer); + } + } + totalRowCount += page.getPositionCount(); + } + + public WriteChecksum build() + { + return new WriteChecksum( + totalRowCount, + columnHashes.stream() + .map(XxHash64::hash) + .collect(toImmutableList())); + } + } + + public void validateRowGroupStatistics(ParquetDataSourceId dataSourceId, PrunedBlockMetadata blockMetaData, List actualColumnStatistics) + throws ParquetCorruptionException + { + List columnChunks = blockMetaData.getColumns(); + checkArgument( + columnChunks.size() == actualColumnStatistics.size(), + "Column chunk metadata count %s did not match column fields count %s", + columnChunks.size(), + actualColumnStatistics.size()); + + for (int columnIndex = 0; columnIndex < columnChunks.size(); columnIndex++) { + ColumnChunkMetadata columnMetaData = columnChunks.get(columnIndex); + ColumnStatistics columnStatistics = actualColumnStatistics.get(columnIndex); + long expectedValuesCount = columnMetaData.getValueCount(); + validateParquet( + expectedValuesCount == columnStatistics.valuesCount(), + dataSourceId, + "Invalid values count for column %s: Expected %d, found %d", + columnIndex, + expectedValuesCount, + columnStatistics.valuesCount()); + + Statistics parquetStatistics = columnMetaData.getStatistics(); + if (parquetStatistics.isNumNullsSet()) { + long expectedNullsCount = parquetStatistics.getNumNulls(); + validateParquet( + expectedNullsCount 
== columnStatistics.nonLeafValuesCount(), + dataSourceId, + "Invalid nulls count for column %s: Expected %d, found %d", + columnIndex, + expectedNullsCount, + columnStatistics.nonLeafValuesCount()); + } + } + } + + public static class StatisticsValidation + { + private final List types; + private List columnStatisticsValidations; + + private StatisticsValidation(List types) + { + this.types = requireNonNull(types, "types is null"); + this.columnStatisticsValidations = types.stream() + .map(ColumnStatisticsValidation::new) + .collect(toImmutableList()); + } + + public static StatisticsValidation createStatisticsValidationBuilder(List readTypes) + { + return new StatisticsValidation(readTypes); + } + + public void addPage(Page page) + { + requireNonNull(page, "page is null"); + checkArgument( + page.getChannelCount() == columnStatisticsValidations.size(), + "Invalid page: page channels count %s did not match columns count %s", + page.getChannelCount(), + columnStatisticsValidations.size()); + + for (int channel = 0; channel < columnStatisticsValidations.size(); channel++) { + ColumnStatisticsValidation columnStatisticsValidation = columnStatisticsValidations.get(channel); + columnStatisticsValidation.addBlock(page.getBlock(channel)); + } + } + + public void reset() + { + this.columnStatisticsValidations = types.stream() + .map(ColumnStatisticsValidation::new) + .collect(toImmutableList()); + } + + public List build() + { + return this.columnStatisticsValidations.stream() + .flatMap(validation -> validation.build().stream()) + .collect(toImmutableList()); + } + } + + public static class ParquetWriteValidationBuilder + { + private static final int INSTANCE_SIZE = instanceSize(ParquetWriteValidationBuilder.class); + private static final int COLUMN_DESCRIPTOR_INSTANCE_SIZE = instanceSize(ColumnDescriptor.class); + private static final int PRIMITIVE_TYPE_INSTANCE_SIZE = instanceSize(PrimitiveType.class); + + private final List types; + private final List columnNames; + private final WriteChecksumBuilder checksum; + + private String createdBy; + private Optional timeZoneId = Optional.empty(); + private List columns; + private List rowGroups; + private long retainedSize = INSTANCE_SIZE; + + public ParquetWriteValidationBuilder(List types, List columnNames) + { + this.types = ImmutableList.copyOf(requireNonNull(types, "types is null")); + this.columnNames = ImmutableList.copyOf(requireNonNull(columnNames, "columnNames is null")); + checkArgument( + types.size() == columnNames.size(), + "Types count %s did not match column names count %s", + types.size(), + columnNames.size()); + this.checksum = new WriteChecksumBuilder(types); + retainedSize += estimatedSizeOf(types, type -> 0) + + estimatedSizeOf(columnNames, SizeOf::estimatedSizeOf); + } + + public long getRetainedSize() + { + return retainedSize; + } + + public void setCreatedBy(String createdBy) + { + this.createdBy = createdBy; + retainedSize += estimatedSizeOf(createdBy); + } + + public void setTimeZone(Optional timeZoneId) + { + this.timeZoneId = timeZoneId; + timeZoneId.ifPresent(id -> retainedSize += estimatedSizeOf(id)); + } + + public void setColumns(List columns) + { + this.columns = ImmutableList.copyOf(requireNonNull(columns, "columns is null")); + retainedSize += estimatedSizeOf(columns, descriptor -> { + return COLUMN_DESCRIPTOR_INSTANCE_SIZE + + (2 * SIZE_OF_INT) // maxRep, maxDef + + estimatedSizeOfStringArray(descriptor.getPath()) + + PRIMITIVE_TYPE_INSTANCE_SIZE + + (3 * SIZE_OF_INT); // primitive, length, columnOrder + }); + } + 
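// Illustrative usage sketch, not part of this patch: the builder collects file-level metadata and pages
// while writing, and build() produces the ParquetWriteValidation later checked against the file on read-back.
// "types", "columnNames", "fileColumns", "fileRowGroups" and "writtenPage" are hypothetical placeholders.
ParquetWriteValidationBuilder validationBuilder = new ParquetWriteValidationBuilder(types, columnNames);
validationBuilder.setCreatedBy("parquet-writer-version"); // writer identification string, must be non-empty
validationBuilder.setTimeZone(Optional.empty());
validationBuilder.setColumns(fileColumns);                // ColumnDescriptors of the written schema
validationBuilder.setRowGroups(fileRowGroups);            // thrift RowGroup metadata of the written file
validationBuilder.addPage(writtenPage);                   // folds the page into the write checksum
ParquetWriteValidation validation = validationBuilder.build();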
+ public void setRowGroups(List rowGroups) + { + this.rowGroups = ImmutableList.copyOf(requireNonNull(rowGroups, "rowGroups is null")); + } + + public void addPage(Page page) + { + checksum.addPage(page); + } + + public ParquetWriteValidation build() + { + return new ParquetWriteValidation( + createdBy, + timeZoneId, + columns, + rowGroups, + checksum.build(), + types, + columnNames); + } + } + + // parquet-mr IndexReference class lacks equals and toString implementations + static class IndexReferenceValidation + { + private final long offset; + private final int length; + + private IndexReferenceValidation(long offset, int length) + { + this.offset = offset; + this.length = length; + } + + static IndexReferenceValidation fromIndexReference(IndexReference indexReference) + { + return new IndexReferenceValidation(indexReference.getOffset(), indexReference.getLength()); + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + IndexReferenceValidation that = (IndexReferenceValidation) o; + return offset == that.offset && length == that.length; + } + + @Override + public int hashCode() + { + return Objects.hash(offset, length); + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("offset", offset) + .add("length", length) + .toString(); + } + } + + private static void verifyColumnMetadataMatch( + boolean condition, + String name, + T actual, + ColumnPath path, + int rowGroup, + ParquetDataSourceId dataSourceId, + U expected) + throws ParquetCorruptionException + { + if (!condition) { + throw new ParquetCorruptionException( + dataSourceId, + "%s [%s] for column %s in row group %d did not match [%s]", + name, + actual, + path, + rowGroup, + expected); + } + } + + private static boolean areEncodingsSame(Set actual, List expected) + { + return actual.equals(expected.stream().map(ParquetMetadataConverter::getEncoding).collect(toImmutableSet())); + } + + private static boolean areStatisticsSame(org.apache.parquet.column.statistics.Statistics actual, org.apache.parquet.format.Statistics expected) + { + Statistics.Builder expectedStatsBuilder = Statistics.getBuilderForReading(actual.type()); + if (expected.isSetNull_count()) { + expectedStatsBuilder.withNumNulls(expected.getNull_count()); + } + if (expected.isSetMin_value()) { + expectedStatsBuilder.withMin(expected.getMin_value()); + } + if (expected.isSetMax_value()) { + expectedStatsBuilder.withMax(expected.getMax_value()); + } + return actual.equals(expectedStatsBuilder.build()); + } + + private static void validateColumnDescriptorsSame(ColumnDescriptor actual, ColumnDescriptor expected, ParquetDataSourceId dataSourceId) + throws ParquetCorruptionException + { + // Column names are lower-cased by MetadataReader#readFooter + validateParquet( + Arrays.equals(actual.getPath(), Arrays.stream(expected.getPath()).map(field -> field.toLowerCase(Locale.ENGLISH)).toArray()), + dataSourceId, + "Column path %s did not match expected column path %s", + actual.getPath(), + expected.getPath()); + + validateParquet( + actual.getMaxDefinitionLevel() == expected.getMaxDefinitionLevel(), + dataSourceId, + "Column %s max definition level %d did not match expected max definition level %d", + actual.getPath(), + actual.getMaxDefinitionLevel(), + expected.getMaxDefinitionLevel()); + + validateParquet( + actual.getMaxRepetitionLevel() == expected.getMaxRepetitionLevel(), + dataSourceId, + "Column %s max repetition level %d did not 
match expected max repetition level %d", + actual.getPath(), + actual.getMaxRepetitionLevel(), + expected.getMaxRepetitionLevel()); + + PrimitiveType actualPrimitiveType = actual.getPrimitiveType(); + PrimitiveType expectedPrimitiveType = expected.getPrimitiveType(); + // We don't use PrimitiveType#equals directly because column names are lower-cased by MetadataReader#readFooter + validateParquet( + actualPrimitiveType.getPrimitiveTypeName().equals(expectedPrimitiveType.getPrimitiveTypeName()) + && actualPrimitiveType.getTypeLength() == expectedPrimitiveType.getTypeLength() + && actualPrimitiveType.getRepetition().equals(expectedPrimitiveType.getRepetition()) + && actualPrimitiveType.getName().equals(expectedPrimitiveType.getName().toLowerCase(Locale.ENGLISH)) + && Objects.equals(actualPrimitiveType.getLogicalTypeAnnotation(), expectedPrimitiveType.getLogicalTypeAnnotation()), + dataSourceId, + "Column %s primitive type %s did not match expected primitive type %s", + actual.getPath(), + actualPrimitiveType, + expectedPrimitiveType); + } + + private static long estimatedSizeOfStringArray(String[] path) + { + long size = sizeOf(path); + for (String field : path) { + size += estimatedSizeOf(field); + } + return size; + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/PrimitiveField.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/PrimitiveField.java new file mode 100644 index 000000000000..4f4f08818c82 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/PrimitiveField.java @@ -0,0 +1,57 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.parquet; + +import io.trino.spi.type.Type; +import org.apache.parquet.column.ColumnDescriptor; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static java.util.Objects.requireNonNull; + +public class PrimitiveField + extends Field +{ + private final ColumnDescriptor descriptor; + private final int id; + + public PrimitiveField(Type type, boolean required, ColumnDescriptor descriptor, int id) + { + super(type, descriptor.getMaxRepetitionLevel(), descriptor.getMaxDefinitionLevel(), required); + this.descriptor = requireNonNull(descriptor, "descriptor is required"); + this.id = id; + } + + public ColumnDescriptor getDescriptor() + { + return descriptor; + } + + public int getId() + { + return id; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("type", getType()) + .add("id", id) + .add("repetitionLevel", getRepetitionLevel()) + .add("definitionLevel", getDefinitionLevel()) + .add("required", isRequired()) + .add("descriptor", descriptor) + .toString(); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/ValidationHash.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/ValidationHash.java new file mode 100644 index 000000000000..9de636d8b8d3 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/ValidationHash.java @@ -0,0 +1,144 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet; + +import io.trino.spi.block.Block; +import io.trino.spi.function.InvocationConvention; +import io.trino.spi.type.Type; +import io.trino.spi.type.TypeOperators; + +import java.lang.invoke.MethodHandle; +import java.lang.invoke.MethodType; + +import static com.google.common.base.Throwables.throwIfUnchecked; +import static io.trino.spi.function.InvocationConvention.InvocationArgumentConvention.BLOCK_POSITION_NOT_NULL; +import static io.trino.spi.function.InvocationConvention.InvocationReturnConvention.FAIL_ON_NULL; +import static io.trino.spi.type.StandardTypes.ARRAY; +import static io.trino.spi.type.StandardTypes.MAP; +import static io.trino.spi.type.StandardTypes.ROW; +import static java.lang.invoke.MethodHandles.lookup; +import static java.util.Objects.requireNonNull; + +/** + * Based on io.trino.rcfile.ValidationHash and io.trino.orc.ValidationHash + * with minor differences in handling of timestamp and map types. 
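// Illustrative usage, not part of this patch: one ValidationHash is derived per read type and each value
// is folded into a per-column XxHash64 position by position; nulls hash to a fixed arbitrary constant.
// "block" is a hypothetical Block holding BIGINT values.
ValidationHash bigintHash = ValidationHash.createValidationHash(BigintType.BIGINT);
long positionHash = bigintHash.hash(block, 0);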
+ */ +class ValidationHash +{ + // This value is a large arbitrary prime + private static final long NULL_HASH_CODE = 0x6e3efbd56c16a0cbL; + + private static final MethodHandle MAP_HASH; + private static final MethodHandle ARRAY_HASH; + private static final MethodHandle ROW_HASH; + + static { + try { + MAP_HASH = lookup().findStatic( + ValidationHash.class, + "mapHash", + MethodType.methodType(long.class, Type.class, ValidationHash.class, ValidationHash.class, Block.class, int.class)); + ARRAY_HASH = lookup().findStatic( + ValidationHash.class, + "arrayHash", + MethodType.methodType(long.class, Type.class, ValidationHash.class, Block.class, int.class)); + ROW_HASH = lookup().findStatic( + ValidationHash.class, + "rowHash", + MethodType.methodType(long.class, Type.class, ValidationHash[].class, Block.class, int.class)); + } + catch (Exception e) { + throw new RuntimeException(e); + } + } + + // This should really come from the environment, but there is not good way to get a value here + private static final TypeOperators VALIDATION_TYPE_OPERATORS_CACHE = new TypeOperators(); + + public static ValidationHash createValidationHash(Type type) + { + requireNonNull(type, "type is null"); + if (type.getTypeSignature().getBase().equals(MAP)) { + ValidationHash keyHash = createValidationHash(type.getTypeParameters().get(0)); + ValidationHash valueHash = createValidationHash(type.getTypeParameters().get(1)); + return new ValidationHash(MAP_HASH.bindTo(type).bindTo(keyHash).bindTo(valueHash)); + } + + if (type.getTypeSignature().getBase().equals(ARRAY)) { + ValidationHash elementHash = createValidationHash(type.getTypeParameters().get(0)); + return new ValidationHash(ARRAY_HASH.bindTo(type).bindTo(elementHash)); + } + + if (type.getTypeSignature().getBase().equals(ROW)) { + ValidationHash[] fieldHashes = type.getTypeParameters().stream() + .map(ValidationHash::createValidationHash) + .toArray(ValidationHash[]::new); + return new ValidationHash(ROW_HASH.bindTo(type).bindTo(fieldHashes)); + } + + return new ValidationHash(VALIDATION_TYPE_OPERATORS_CACHE.getHashCodeOperator(type, InvocationConvention.simpleConvention(FAIL_ON_NULL, BLOCK_POSITION_NOT_NULL))); + } + + private final MethodHandle hashCodeOperator; + + private ValidationHash(MethodHandle hashCodeOperator) + { + this.hashCodeOperator = requireNonNull(hashCodeOperator, "hashCodeOperator is null"); + } + + public long hash(Block block, int position) + { + if (block.isNull(position)) { + return NULL_HASH_CODE; + } + try { + return (long) hashCodeOperator.invokeExact(block, position); + } + catch (Throwable throwable) { + throwIfUnchecked(throwable); + throw new RuntimeException(throwable); + } + } + + private static long mapHash(Type type, ValidationHash keyHash, ValidationHash valueHash, Block block, int position) + { + Block mapBlock = (Block) type.getObject(block, position); + long hash = 0; + for (int i = 0; i < mapBlock.getPositionCount(); i += 2) { + hash = 31 * hash + keyHash.hash(mapBlock, i); + hash = 31 * hash + valueHash.hash(mapBlock, i + 1); + } + return hash; + } + + private static long arrayHash(Type type, ValidationHash elementHash, Block block, int position) + { + Block array = (Block) type.getObject(block, position); + long hash = 0; + for (int i = 0; i < array.getPositionCount(); i++) { + hash = 31 * hash + elementHash.hash(array, i); + } + return hash; + } + + private static long rowHash(Type type, ValidationHash[] fieldHashes, Block block, int position) + { + Block row = (Block) type.getObject(block, position); + long hash = 
0; + for (int i = 0; i < row.getPositionCount(); i++) { + hash = 31 * hash + fieldHashes[i].hash(row, i); + } + return hash; + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/ValuesType.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/ValuesType.java new file mode 100644 index 000000000000..06cdcf7f431b --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/ValuesType.java @@ -0,0 +1,21 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet; + +public enum ValuesType +{ + REPETITION_LEVEL, + DEFINITION_LEVEL, + VALUES +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/VariantField.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/VariantField.java new file mode 100644 index 000000000000..f9b65baf4ca9 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/VariantField.java @@ -0,0 +1,56 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet; + +import io.trino.spi.type.Type; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static java.util.Objects.requireNonNull; + +public class VariantField + extends Field +{ + private final Field value; + private final Field metadata; + + public VariantField(Type type, int repetitionLevel, int definitionLevel, boolean required, Field value, Field metadata) + { + super(type, repetitionLevel, definitionLevel, required); + this.value = requireNonNull(value, "value is null"); + this.metadata = requireNonNull(metadata, "metadata is null"); + } + + public Field getValue() + { + return value; + } + + public Field getMetadata() + { + return metadata; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("type", getType()) + .add("repetitionLevel", getRepetitionLevel()) + .add("definitionLevel", getDefinitionLevel()) + .add("required", isRequired()) + .add("value", value) + .add("metadata", getMetadata()) + .toString(); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/dictionary/BinaryDictionary.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/dictionary/BinaryDictionary.java new file mode 100644 index 000000000000..4a0f20257b1c --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/dictionary/BinaryDictionary.java @@ -0,0 +1,69 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.dictionary; + +import io.airlift.slice.Slice; +import io.trino.parquet.DictionaryPage; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkArgument; + +public class BinaryDictionary + implements Dictionary +{ + private final Slice[] content; + + public BinaryDictionary(DictionaryPage dictionaryPage) + { + this(dictionaryPage, null); + } + + public BinaryDictionary(DictionaryPage dictionaryPage, Integer length) + { + content = new Slice[dictionaryPage.getDictionarySize()]; + + Slice dictionarySlice = dictionaryPage.getSlice(); + + int currentInputOffset = 0; + if (length == null) { + for (int i = 0; i < content.length; i++) { + int positionLength = dictionarySlice.getInt(currentInputOffset); + currentInputOffset += Integer.BYTES; + content[i] = dictionarySlice.slice(currentInputOffset, positionLength); + currentInputOffset += positionLength; + } + } + else { + checkArgument(length > 0, "Invalid byte array length: %s", length); + for (int i = 0; i < content.length; i++) { + content[i] = dictionarySlice.slice(currentInputOffset, length); + currentInputOffset += length; + } + } + } + + @Override + public Slice decodeToSlice(int id) + { + return content[id]; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("content", content) + .toString(); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/dictionary/Dictionary.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/dictionary/Dictionary.java new file mode 100644 index 000000000000..29720538e62e --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/dictionary/Dictionary.java @@ -0,0 +1,44 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
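BinaryDictionary above slices the dictionary page either as length-prefixed entries (a 4-byte length read via Slice.getInt, followed by that many bytes) or as fixed-length entries when a length is supplied. A small self-contained sketch of the length-prefixed layout using ByteBuffer instead of airlift's Slice; the buffer handling and names are illustrative, not the real reader code.

import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.charset.StandardCharsets;

public final class PlainBinaryDictionarySketch
{
    private PlainBinaryDictionarySketch() {}

    // Decode `count` length-prefixed entries: [int32 length][bytes] repeated
    static byte[][] decode(byte[] page, int count)
    {
        ByteBuffer buffer = ByteBuffer.wrap(page).order(ByteOrder.LITTLE_ENDIAN); // little-endian, as with Slice.getInt
        byte[][] entries = new byte[count][];
        for (int i = 0; i < count; i++) {
            int length = buffer.getInt();
            entries[i] = new byte[length];
            buffer.get(entries[i]);
        }
        return entries;
    }

    public static void main(String[] args)
    {
        byte[] first = "abc".getBytes(StandardCharsets.UTF_8);
        byte[] second = "de".getBytes(StandardCharsets.UTF_8);
        ByteBuffer page = ByteBuffer.allocate(Integer.BYTES * 2 + first.length + second.length).order(ByteOrder.LITTLE_ENDIAN);
        page.putInt(first.length).put(first).putInt(second.length).put(second);

        byte[][] entries = decode(page.array(), 2);
        System.out.println(new String(entries[0], StandardCharsets.UTF_8)); // abc
        System.out.println(new String(entries[1], StandardCharsets.UTF_8)); // de
    }
}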
+ */ +package io.trino.parquet.dictionary; + +import io.airlift.slice.Slice; + +public interface Dictionary +{ + default Slice decodeToSlice(int id) + { + throw new UnsupportedOperationException(); + } + + default int decodeToInt(int id) + { + throw new UnsupportedOperationException(); + } + + default long decodeToLong(int id) + { + throw new UnsupportedOperationException(); + } + + default float decodeToFloat(int id) + { + throw new UnsupportedOperationException(); + } + + default double decodeToDouble(int id) + { + throw new UnsupportedOperationException(); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/dictionary/DoubleDictionary.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/dictionary/DoubleDictionary.java new file mode 100644 index 000000000000..3d67ba12ff17 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/dictionary/DoubleDictionary.java @@ -0,0 +1,55 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.dictionary; + +import io.trino.parquet.DictionaryPage; +import io.trino.parquet.reader.SimpleSliceInputStream; +import io.trino.parquet.reader.decoders.ValueDecoder; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static io.trino.parquet.reader.decoders.PlainValueDecoders.LongPlainValueDecoder; + +public class DoubleDictionary + implements Dictionary +{ + private final double[] content; + + public DoubleDictionary(DictionaryPage dictionaryPage) + { + int length = dictionaryPage.getDictionarySize(); + long[] buffer = new long[length]; + ValueDecoder doubleReader = new LongPlainValueDecoder(); + doubleReader.init(new SimpleSliceInputStream(dictionaryPage.getSlice())); + doubleReader.read(buffer, 0, length); + + content = new double[length]; + for (int i = 0; i < length; i++) { + content[i] = Double.longBitsToDouble(buffer[i]); + } + } + + @Override + public double decodeToDouble(int id) + { + return content[id]; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("content", content) + .toString(); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/dictionary/FloatDictionary.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/dictionary/FloatDictionary.java new file mode 100644 index 000000000000..d113f67b3cfa --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/dictionary/FloatDictionary.java @@ -0,0 +1,55 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
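The Dictionary interface above gives every decode method a default body that throws UnsupportedOperationException, so each concrete dictionary overrides only the accessor matching its physical type. A compact sketch of that pattern; the interface and classes here are illustrative stand-ins, not the real types.

public final class DefaultMethodSketch
{
    interface Decoder
    {
        // Unsupported by default; implementations override only what they can decode
        default long decodeToLong(int id)
        {
            throw new UnsupportedOperationException();
        }

        default double decodeToDouble(int id)
        {
            throw new UnsupportedOperationException();
        }
    }

    static final class LongDecoder
            implements Decoder
    {
        private final long[] values = {10L, 20L};

        @Override
        public long decodeToLong(int id)
        {
            return values[id];
        }
    }

    public static void main(String[] args)
    {
        Decoder decoder = new LongDecoder();
        System.out.println(decoder.decodeToLong(1)); // 20
        try {
            decoder.decodeToDouble(0); // falls through to the default and throws
        }
        catch (UnsupportedOperationException e) {
            System.out.println("decodeToDouble unsupported for LongDecoder");
        }
    }

    private DefaultMethodSketch() {}
}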
+ */ +package io.trino.parquet.dictionary; + +import io.trino.parquet.DictionaryPage; +import io.trino.parquet.reader.SimpleSliceInputStream; +import io.trino.parquet.reader.decoders.ValueDecoder; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static io.trino.parquet.reader.decoders.PlainValueDecoders.IntPlainValueDecoder; + +public class FloatDictionary + implements Dictionary +{ + private final float[] content; + + public FloatDictionary(DictionaryPage dictionaryPage) + { + int length = dictionaryPage.getDictionarySize(); + int[] buffer = new int[length]; + ValueDecoder floatReader = new IntPlainValueDecoder(); + floatReader.init(new SimpleSliceInputStream(dictionaryPage.getSlice())); + floatReader.read(buffer, 0, length); + + content = new float[length]; + for (int i = 0; i < length; i++) { + content[i] = Float.intBitsToFloat(buffer[i]); + } + } + + @Override + public float decodeToFloat(int id) + { + return content[id]; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("content", content) + .toString(); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/dictionary/IntegerDictionary.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/dictionary/IntegerDictionary.java new file mode 100644 index 000000000000..3aa8d95ffe3d --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/dictionary/IntegerDictionary.java @@ -0,0 +1,49 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.dictionary; + +import io.trino.parquet.DictionaryPage; +import io.trino.parquet.reader.SimpleSliceInputStream; +import io.trino.parquet.reader.decoders.ValueDecoder; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static io.trino.parquet.reader.decoders.PlainValueDecoders.IntPlainValueDecoder; + +public class IntegerDictionary + implements Dictionary +{ + private final int[] content; + + public IntegerDictionary(DictionaryPage dictionaryPage) + { + content = new int[dictionaryPage.getDictionarySize()]; + ValueDecoder intReader = new IntPlainValueDecoder(); + intReader.init(new SimpleSliceInputStream(dictionaryPage.getSlice())); + intReader.read(content, 0, dictionaryPage.getDictionarySize()); + } + + @Override + public int decodeToInt(int id) + { + return content[id]; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("content", content) + .toString(); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/dictionary/LongDictionary.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/dictionary/LongDictionary.java new file mode 100644 index 000000000000..e27cda0bc5ee --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/dictionary/LongDictionary.java @@ -0,0 +1,49 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
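DoubleDictionary and FloatDictionary above first read the raw 64-bit and 32-bit words with the plain long/int decoders and then reinterpret them via Double.longBitsToDouble and Float.intBitsToFloat. A minimal sketch of that reinterpretation step on a little-endian buffer; the buffer handling is illustrative, the real code goes through SimpleSliceInputStream and the plain value decoders.

import java.nio.ByteBuffer;
import java.nio.ByteOrder;

public final class FloatingPointDictionarySketch
{
    private FloatingPointDictionarySketch() {}

    // Decode `count` plain doubles: each entry is an IEEE 754 bit pattern stored as a little-endian long
    static double[] decodeDoubles(byte[] page, int count)
    {
        ByteBuffer buffer = ByteBuffer.wrap(page).order(ByteOrder.LITTLE_ENDIAN);
        double[] values = new double[count];
        for (int i = 0; i < count; i++) {
            values[i] = Double.longBitsToDouble(buffer.getLong());
        }
        return values;
    }

    public static void main(String[] args)
    {
        ByteBuffer page = ByteBuffer.allocate(2 * Long.BYTES).order(ByteOrder.LITTLE_ENDIAN);
        page.putLong(Double.doubleToLongBits(1.5));
        page.putLong(Double.doubleToLongBits(-0.25));

        double[] values = decodeDoubles(page.array(), 2);
        System.out.println(values[0]); // 1.5
        System.out.println(values[1]); // -0.25
    }
}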
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.dictionary; + +import io.trino.parquet.DictionaryPage; +import io.trino.parquet.reader.SimpleSliceInputStream; +import io.trino.parquet.reader.decoders.ValueDecoder; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static io.trino.parquet.reader.decoders.PlainValueDecoders.LongPlainValueDecoder; + +public class LongDictionary + implements Dictionary +{ + private final long[] content; + + public LongDictionary(DictionaryPage dictionaryPage) + { + content = new long[dictionaryPage.getDictionarySize()]; + ValueDecoder longReader = new LongPlainValueDecoder(); + longReader.init(new SimpleSliceInputStream(dictionaryPage.getSlice())); + longReader.read(content, 0, dictionaryPage.getDictionarySize()); + } + + @Override + public long decodeToLong(int id) + { + return content[id]; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("content", content) + .toString(); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/metadata/BlockMetadata.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/metadata/BlockMetadata.java new file mode 100644 index 000000000000..bbfcb6eaf0ce --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/metadata/BlockMetadata.java @@ -0,0 +1,24 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.metadata; + +import java.util.List; + +public record BlockMetadata(long fileRowCountOffset, long rowCount, List columns) +{ + public long getStartingPos() + { + return columns().get(0).getStartingPos(); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/metadata/ColumnChunkMetadata.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/metadata/ColumnChunkMetadata.java new file mode 100644 index 000000000000..381260829869 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/metadata/ColumnChunkMetadata.java @@ -0,0 +1,203 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.parquet.metadata; + +import org.apache.parquet.column.Encoding; +import org.apache.parquet.column.EncodingStats; +import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.hadoop.metadata.ColumnPath; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; + +import java.util.Set; + +public abstract class ColumnChunkMetadata +{ + protected int rowGroupOrdinal = -1; + + public static ColumnChunkMetadata get( + ColumnPath path, + PrimitiveType type, + CompressionCodecName codec, + EncodingStats encodingStats, + Set encodings, + Statistics statistics, + long firstDataPage, + long dictionaryPageOffset, + long valueCount, + long totalSize, + long totalUncompressedSize) + { + if (positiveLongFitsInAnInt(firstDataPage) + && positiveLongFitsInAnInt(dictionaryPageOffset) + && positiveLongFitsInAnInt(valueCount) + && positiveLongFitsInAnInt(totalSize) + && positiveLongFitsInAnInt(totalUncompressedSize)) { + return new IntColumnChunkMetadata( + path, type, codec, + encodingStats, encodings, + statistics, + firstDataPage, + dictionaryPageOffset, + valueCount, + totalSize, + totalUncompressedSize); + } + return new LongColumnChunkMetadata( + path, type, codec, + encodingStats, encodings, + statistics, + firstDataPage, + dictionaryPageOffset, + valueCount, + totalSize, + totalUncompressedSize); + } + + public void setRowGroupOrdinal(int rowGroupOrdinal) + { + this.rowGroupOrdinal = rowGroupOrdinal; + } + + public int getRowGroupOrdinal() + { + return rowGroupOrdinal; + } + + public long getStartingPos() + { + decryptIfNeeded(); + long dictionaryPageOffset = getDictionaryPageOffset(); + long firstDataPageOffset = getFirstDataPageOffset(); + if (dictionaryPageOffset > 0 && dictionaryPageOffset < firstDataPageOffset) { + return dictionaryPageOffset; + } + return firstDataPageOffset; + } + + protected static boolean positiveLongFitsInAnInt(long value) + { + return (value >= 0) && (value + Integer.MIN_VALUE <= Integer.MAX_VALUE); + } + + EncodingStats encodingStats; + + ColumnChunkProperties properties; + + private IndexReference columnIndexReference; + private IndexReference offsetIndexReference; + + private long bloomFilterOffset = -1; + + protected ColumnChunkMetadata(ColumnChunkProperties columnChunkProperties) + { + this(null, columnChunkProperties); + } + + protected ColumnChunkMetadata(EncodingStats encodingStats, ColumnChunkProperties columnChunkProperties) + { + this.encodingStats = encodingStats; + this.properties = columnChunkProperties; + } + + protected void decryptIfNeeded() {} + + public CompressionCodecName getCodec() + { + decryptIfNeeded(); + return properties.codec(); + } + + public ColumnPath getPath() + { + return properties.path(); + } + + public PrimitiveTypeName getType() + { + decryptIfNeeded(); + return properties.type().getPrimitiveTypeName(); + } + + public PrimitiveType getPrimitiveType() + { + decryptIfNeeded(); + return properties.type(); + } + + public abstract long getFirstDataPageOffset(); + + public abstract long getDictionaryPageOffset(); + + public abstract long getValueCount(); + + public abstract long getTotalUncompressedSize(); + + public abstract long getTotalSize(); + + public abstract Statistics getStatistics(); + + public IndexReference getColumnIndexReference() + { + decryptIfNeeded(); + return columnIndexReference; + } + + public void setColumnIndexReference(IndexReference indexReference) + { + 
this.columnIndexReference = indexReference; + } + + public IndexReference getOffsetIndexReference() + { + decryptIfNeeded(); + return offsetIndexReference; + } + + public void setOffsetIndexReference(IndexReference offsetIndexReference) + { + this.offsetIndexReference = offsetIndexReference; + } + + public void setBloomFilterOffset(long bloomFilterOffset) + { + this.bloomFilterOffset = bloomFilterOffset; + } + + public long getBloomFilterOffset() + { + decryptIfNeeded(); + return bloomFilterOffset; + } + + public Set getEncodings() + { + decryptIfNeeded(); + return properties.encodings(); + } + + public EncodingStats getEncodingStats() + { + decryptIfNeeded(); + return encodingStats; + } + + @Override + public String toString() + { + decryptIfNeeded(); + return "ColumnMetaData{" + properties.toString() + ", " + getFirstDataPageOffset() + "}"; + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/metadata/ColumnChunkProperties.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/metadata/ColumnChunkProperties.java new file mode 100644 index 000000000000..37e94eed03f5 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/metadata/ColumnChunkProperties.java @@ -0,0 +1,23 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.metadata; + +import org.apache.parquet.column.Encoding; +import org.apache.parquet.hadoop.metadata.ColumnPath; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.apache.parquet.schema.PrimitiveType; + +import java.util.Set; + +public record ColumnChunkProperties(ColumnPath path, PrimitiveType type, CompressionCodecName codec, Set encodings) {} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/metadata/FileMetadata.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/metadata/FileMetadata.java new file mode 100644 index 000000000000..3950d1c2dd07 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/metadata/FileMetadata.java @@ -0,0 +1,56 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.parquet.metadata; + +import org.apache.parquet.schema.MessageType; + +import java.util.Map; + +import static java.util.Collections.unmodifiableMap; +import static java.util.Objects.requireNonNull; + +public final class FileMetadata +{ + private final MessageType schema; + private final Map keyValueMetaData; + private final String createdBy; + + public FileMetadata(MessageType schema, Map keyValueMetaData, String createdBy) + { + this.schema = requireNonNull(schema, "schema cannot be null"); + this.keyValueMetaData = unmodifiableMap(requireNonNull(keyValueMetaData, "keyValueMetaData cannot be null")); + this.createdBy = createdBy; + } + + public MessageType getSchema() + { + return schema; + } + + @Override + public String toString() + { + return "FileMetaData{schema: " + schema + ", metadata: " + keyValueMetaData + "}"; + } + + public Map getKeyValueMetaData() + { + return keyValueMetaData; + } + + public String getCreatedBy() + { + return createdBy; + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/metadata/IndexReference.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/metadata/IndexReference.java new file mode 100644 index 000000000000..a2918fc03171 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/metadata/IndexReference.java @@ -0,0 +1,36 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.metadata; + +public class IndexReference +{ + private final long offset; + private final int length; + + public IndexReference(long offset, int length) + { + this.offset = offset; + this.length = length; + } + + public long getOffset() + { + return offset; + } + + public int getLength() + { + return length; + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/metadata/IntColumnChunkMetadata.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/metadata/IntColumnChunkMetadata.java new file mode 100644 index 000000000000..046d34c174bc --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/metadata/IntColumnChunkMetadata.java @@ -0,0 +1,105 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.parquet.metadata; + +import org.apache.parquet.column.Encoding; +import org.apache.parquet.column.EncodingStats; +import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.hadoop.metadata.ColumnPath; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.apache.parquet.schema.PrimitiveType; + +import java.util.Set; + +class IntColumnChunkMetadata + extends ColumnChunkMetadata +{ + private final int firstDataPage; + private final int dictionaryPageOffset; + private final int valueCount; + private final int totalSize; + private final int totalUncompressedSize; + private final Statistics statistics; + + IntColumnChunkMetadata( + ColumnPath path, + PrimitiveType type, + CompressionCodecName codec, + EncodingStats encodingStats, + Set encodings, + Statistics statistics, + long firstDataPage, + long dictionaryPageOffset, + long valueCount, + long totalSize, + long totalUncompressedSize) + { + super(encodingStats, new ColumnChunkProperties(path, type, codec, encodings)); + this.firstDataPage = positiveLongToInt(firstDataPage); + this.dictionaryPageOffset = positiveLongToInt(dictionaryPageOffset); + this.valueCount = positiveLongToInt(valueCount); + this.totalSize = positiveLongToInt(totalSize); + this.totalUncompressedSize = positiveLongToInt(totalUncompressedSize); + this.statistics = statistics; + } + + private int positiveLongToInt(long value) + { + if (!ColumnChunkMetadata.positiveLongFitsInAnInt(value)) { + throw new IllegalArgumentException("value should be positive and fit in an int: " + value); + } + return (int) (value + Integer.MIN_VALUE); + } + + private long intToPositiveLong(int value) + { + return (long) value - Integer.MIN_VALUE; + } + + @Override + public long getFirstDataPageOffset() + { + return intToPositiveLong(firstDataPage); + } + + @Override + public long getDictionaryPageOffset() + { + return intToPositiveLong(dictionaryPageOffset); + } + + @Override + public long getValueCount() + { + return intToPositiveLong(valueCount); + } + + @Override + public long getTotalUncompressedSize() + { + return intToPositiveLong(totalUncompressedSize); + } + + @Override + public long getTotalSize() + { + return intToPositiveLong(totalSize); + } + + @Override + public Statistics getStatistics() + { + return statistics; + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/metadata/LongColumnChunkMetadata.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/metadata/LongColumnChunkMetadata.java new file mode 100644 index 000000000000..d0ca0089b3ca --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/metadata/LongColumnChunkMetadata.java @@ -0,0 +1,92 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
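IntColumnChunkMetadata above packs non-negative longs below 2^32 into int fields by shifting them with Integer.MIN_VALUE, so the common case stores five ints instead of five longs; LongColumnChunkMetadata is the fallback when any value does not fit. A standalone worked sketch of that round trip, mirroring positiveLongFitsInAnInt, positiveLongToInt and intToPositiveLong.

public final class PackedOffsetSketch
{
    private PackedOffsetSketch() {}

    // A non-negative long fits if it is at most 2^32 - 1, i.e. Integer.MAX_VALUE - Integer.MIN_VALUE
    static boolean fitsInAnInt(long value)
    {
        return value >= 0 && value + Integer.MIN_VALUE <= Integer.MAX_VALUE;
    }

    // Map [0, 2^32) onto the full int range by shifting down by 2^31
    static int pack(long value)
    {
        if (!fitsInAnInt(value)) {
            throw new IllegalArgumentException("value should be positive and fit in an int: " + value);
        }
        return (int) (value + Integer.MIN_VALUE);
    }

    static long unpack(int packed)
    {
        return (long) packed - Integer.MIN_VALUE;
    }

    public static void main(String[] args)
    {
        long offset = 3_000_000_000L; // larger than Integer.MAX_VALUE but still packable
        int packed = pack(offset);
        System.out.println(packed);         // 852516352 (the offset shifted by Integer.MIN_VALUE)
        System.out.println(unpack(packed)); // 3000000000
    }
}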
+ */ +package io.trino.parquet.metadata; + +import org.apache.parquet.column.Encoding; +import org.apache.parquet.column.EncodingStats; +import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.hadoop.metadata.ColumnPath; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.apache.parquet.schema.PrimitiveType; + +import java.util.Set; + +class LongColumnChunkMetadata + extends ColumnChunkMetadata +{ + private final long firstDataPageOffset; + private final long dictionaryPageOffset; + private final long valueCount; + private final long totalSize; + private final long totalUncompressedSize; + private final Statistics statistics; + + LongColumnChunkMetadata( + ColumnPath path, + PrimitiveType type, + CompressionCodecName codec, + EncodingStats encodingStats, + Set encodings, + Statistics statistics, + long firstDataPageOffset, + long dictionaryPageOffset, + long valueCount, + long totalSize, + long totalUncompressedSize) + { + super(encodingStats, new ColumnChunkProperties(path, type, codec, encodings)); + this.firstDataPageOffset = firstDataPageOffset; + this.dictionaryPageOffset = dictionaryPageOffset; + this.valueCount = valueCount; + this.totalSize = totalSize; + this.totalUncompressedSize = totalUncompressedSize; + this.statistics = statistics; + } + + @Override + public long getFirstDataPageOffset() + { + return firstDataPageOffset; + } + + @Override + public long getDictionaryPageOffset() + { + return dictionaryPageOffset; + } + + @Override + public long getValueCount() + { + return valueCount; + } + + @Override + public long getTotalUncompressedSize() + { + return totalUncompressedSize; + } + + @Override + public long getTotalSize() + { + return totalSize; + } + + @Override + public Statistics getStatistics() + { + return statistics; + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/metadata/ParquetMetadata.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/metadata/ParquetMetadata.java new file mode 100644 index 000000000000..7a34dfe92754 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/metadata/ParquetMetadata.java @@ -0,0 +1,282 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.parquet.metadata; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import io.airlift.log.Logger; +import io.trino.parquet.ParquetCorruptionException; +import io.trino.parquet.ParquetDataSourceId; +import io.trino.parquet.reader.MetadataReader; +import org.apache.parquet.column.Encoding; +import org.apache.parquet.format.ColumnChunk; +import org.apache.parquet.format.ColumnMetaData; +import org.apache.parquet.format.FileMetaData; +import org.apache.parquet.format.KeyValue; +import org.apache.parquet.format.RowGroup; +import org.apache.parquet.format.SchemaElement; +import org.apache.parquet.hadoop.metadata.ColumnPath; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Optional; +import java.util.Set; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.collect.ImmutableMap.toImmutableMap; +import static io.trino.parquet.ParquetMetadataConverter.convertEncodingStats; +import static io.trino.parquet.ParquetMetadataConverter.getEncoding; +import static io.trino.parquet.ParquetMetadataConverter.getLogicalTypeAnnotation; +import static io.trino.parquet.ParquetMetadataConverter.getPrimitive; +import static io.trino.parquet.ParquetMetadataConverter.toColumnIndexReference; +import static io.trino.parquet.ParquetMetadataConverter.toOffsetIndexReference; +import static io.trino.parquet.ParquetValidationUtils.validateParquet; +import static java.util.Objects.requireNonNull; + +public class ParquetMetadata +{ + private static final Logger log = Logger.get(ParquetMetadata.class); + + private final FileMetaData parquetMetadata; + private final ParquetDataSourceId dataSourceId; + private final FileMetadata fileMetadata; + + public ParquetMetadata(FileMetaData parquetMetadata, ParquetDataSourceId dataSourceId) + throws ParquetCorruptionException + { + this.fileMetadata = new FileMetadata( + readMessageType(parquetMetadata, dataSourceId), + keyValueMetaData(parquetMetadata), + parquetMetadata.getCreated_by()); + this.parquetMetadata = parquetMetadata; + this.dataSourceId = requireNonNull(dataSourceId, "dataSourceId is null"); + } + + public FileMetadata getFileMetaData() + { + return fileMetadata; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("parquetMetadata", parquetMetadata) + .toString(); + } + + public List getBlocks() + throws ParquetCorruptionException + { + return getBlocks(0, Long.MAX_VALUE); + } + + public List getBlocks(long splitStart, long splitLength) + throws ParquetCorruptionException + { + List schema = parquetMetadata.getSchema(); + validateParquet(!schema.isEmpty(), dataSourceId, "Schema is empty"); + + MessageType messageType = readParquetSchema(schema); + List blocks = new ArrayList<>(); + List rowGroups = parquetMetadata.getRow_groups(); + + long fileRowCount = 0; + + if (rowGroups != null) { + for (RowGroup rowGroup : rowGroups) { + long fileRowCountOffset = fileRowCount; + fileRowCount += rowGroup.getNum_rows(); // Update 
fileRowCount for all row groups + + List columns = rowGroup.getColumns(); + validateParquet(!columns.isEmpty(), dataSourceId, "No columns in row group: %s", rowGroup); + String filePath = columns.get(0).getFile_path(); + long rowGroupStart = getRowGroupStart(columns, messageType); + boolean splitContainsRowGroup = splitStart <= rowGroupStart && rowGroupStart < splitStart + splitLength; + if (!splitContainsRowGroup) { + continue; + } + + ImmutableList.Builder columnMetadataBuilder = ImmutableList.builderWithExpectedSize(columns.size()); + for (ColumnChunk columnChunk : columns) { + validateParquet( + (filePath == null && columnChunk.getFile_path() == null) + || (filePath != null && filePath.equals(columnChunk.getFile_path())), + dataSourceId, + "all column chunks of the same row group must be in the same file"); + ColumnChunkMetadata column = toColumnChunkMetadata(columnChunk, parquetMetadata.getCreated_by(), messageType); + columnMetadataBuilder.add(column); + } + blocks.add(new BlockMetadata(fileRowCountOffset, rowGroup.getNum_rows(), columnMetadataBuilder.build())); + } + } + + return blocks; + } + + @VisibleForTesting + public FileMetaData getParquetMetadata() + { + return parquetMetadata; + } + + private static long getRowGroupStart(List columns, MessageType messageType) + { + // Note: Do not rely on org.apache.parquet.format.RowGroup.getFile_offset or org.apache.parquet.format.ColumnChunk.getFile_offset + // because some versions of parquet-cpp-arrow (and potentially other writers) set it incorrectly + ColumnChunkMetadata columnChunkMetadata = toColumnChunkMetadata(columns.get(0), null, messageType); + return columnChunkMetadata.getStartingPos(); + } + + private static ColumnChunkMetadata toColumnChunkMetadata(ColumnChunk columnChunk, String createdBy, MessageType messageType) + { + ColumnMetaData metaData = columnChunk.meta_data; + String[] path = metaData.path_in_schema.stream() + .map(value -> value.toLowerCase(Locale.ENGLISH)) + .toArray(String[]::new); + ColumnPath columnPath = ColumnPath.get(path); + PrimitiveType primitiveType = messageType.getType(columnPath.toArray()).asPrimitiveType(); + ColumnChunkMetadata column = ColumnChunkMetadata.get( + columnPath, + primitiveType, + CompressionCodecName.fromParquet(metaData.codec), + convertEncodingStats(metaData.encoding_stats), + readEncodings(metaData.encodings), + MetadataReader.readStats(Optional.ofNullable(createdBy), Optional.ofNullable(metaData.statistics), primitiveType), + metaData.data_page_offset, + metaData.dictionary_page_offset, + metaData.num_values, + metaData.total_compressed_size, + metaData.total_uncompressed_size); + column.setColumnIndexReference(toColumnIndexReference(columnChunk)); + column.setOffsetIndexReference(toOffsetIndexReference(columnChunk)); + column.setBloomFilterOffset(metaData.bloom_filter_offset); + return column; + } + + private static MessageType readParquetSchema(List schema) + { + Iterator schemaIterator = schema.iterator(); + SchemaElement rootSchema = schemaIterator.next(); + Types.MessageTypeBuilder builder = Types.buildMessage(); + readTypeSchema(builder, schemaIterator, rootSchema.getNum_children()); + return builder.named(rootSchema.name); + } + + private static void readTypeSchema(Types.GroupBuilder builder, Iterator schemaIterator, int typeCount) + { + for (int i = 0; i < typeCount; i++) { + SchemaElement element = schemaIterator.next(); + Types.Builder typeBuilder; + if (element.type == null) { + typeBuilder = builder.group(Type.Repetition.valueOf(element.repetition_type.name())); + 
readTypeSchema((Types.GroupBuilder) typeBuilder, schemaIterator, element.num_children); + } + else { + Types.PrimitiveBuilder primitiveBuilder = builder.primitive(getPrimitive(element.type), Type.Repetition.valueOf(element.repetition_type.name())); + if (element.isSetType_length()) { + primitiveBuilder.length(element.type_length); + } + if (element.isSetPrecision()) { + primitiveBuilder.precision(element.precision); + } + if (element.isSetScale()) { + primitiveBuilder.scale(element.scale); + } + typeBuilder = primitiveBuilder; + } + + // Reading of element.logicalType and element.converted_type corresponds to parquet-mr's code at + // https://github.com/apache/parquet-mr/blob/apache-parquet-1.12.0/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java#L1568-L1582 + LogicalTypeAnnotation annotationFromLogicalType = null; + if (element.isSetLogicalType()) { + annotationFromLogicalType = getLogicalTypeAnnotation(element.logicalType); + typeBuilder.as(annotationFromLogicalType); + } + if (element.isSetConverted_type()) { + LogicalTypeAnnotation annotationFromConvertedType = getLogicalTypeAnnotation(element.converted_type, element); + if (annotationFromLogicalType != null) { + // Both element.logicalType and element.converted_type set + if (annotationFromLogicalType.toOriginalType() == annotationFromConvertedType.toOriginalType()) { + // element.converted_type matches element.logicalType, even though annotationFromLogicalType may differ from annotationFromConvertedType + // Following parquet-mr behavior, we favor LogicalTypeAnnotation derived from element.logicalType, as potentially containing more information. + } + else { + // Following parquet-mr behavior, issue warning and let converted_type take precedence. + log.warn("Converted type and logical type metadata map to different OriginalType (convertedType: %s, logical type: %s). Using value in converted type.", + element.converted_type, element.logicalType); + // parquet-mr reads only OriginalType from converted_type. We retain full LogicalTypeAnnotation + // 1. for compatibility, as previous Trino reader code would read LogicalTypeAnnotation from element.converted_type and some additional fields. + // 2. so that we override LogicalTypeAnnotation annotation read from element.logicalType in case of mismatch detected. + typeBuilder.as(annotationFromConvertedType); + } + } + else { + // parquet-mr reads only OriginalType from converted_type. We retain full LogicalTypeAnnotation for compatibility, as previous + // Trino reader code would read LogicalTypeAnnotation from element.converted_type and some additional fields. 
+ typeBuilder.as(annotationFromConvertedType); + } + } + + if (element.isSetField_id()) { + typeBuilder.id(element.field_id); + } + typeBuilder.named(element.name.toLowerCase(Locale.ENGLISH)); + } + } + + private static Set readEncodings(List encodings) + { + Set columnEncodings = new HashSet<>(); + for (org.apache.parquet.format.Encoding encoding : encodings) { + columnEncodings.add(getEncoding(encoding)); + } + return Collections.unmodifiableSet(columnEncodings); + } + + private static MessageType readMessageType(FileMetaData parquetMetadata, ParquetDataSourceId dataSourceId) + throws ParquetCorruptionException + { + List schema = parquetMetadata.getSchema(); + validateParquet(!schema.isEmpty(), dataSourceId, "Schema is empty"); + + Iterator schemaIterator = schema.iterator(); + SchemaElement rootSchema = schemaIterator.next(); + Types.MessageTypeBuilder builder = Types.buildMessage(); + readTypeSchema(builder, schemaIterator, rootSchema.getNum_children()); + return builder.named(rootSchema.name); + } + + private static Map keyValueMetaData(FileMetaData parquetMetadata) + { + if (parquetMetadata.getKey_value_metadata() == null) { + return ImmutableMap.of(); + } + return parquetMetadata.getKey_value_metadata() + .stream() + .collect(toImmutableMap(KeyValue::getKey, KeyValue::getValue, (ignore, second) -> second)); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/metadata/PrunedBlockMetadata.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/metadata/PrunedBlockMetadata.java new file mode 100644 index 000000000000..4ceeadf9686f --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/metadata/PrunedBlockMetadata.java @@ -0,0 +1,98 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
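ParquetMetadata.getBlocks above keeps a row group only when its starting byte offset falls inside [splitStart, splitStart + splitLength), so a row group is processed by the split that contains its first byte even when the row group spans a split boundary. A tiny sketch of that containment rule; the offsets below are made up.

import java.util.List;

public final class SplitAssignmentSketch
{
    private SplitAssignmentSketch() {}

    // Mirrors the check in getBlocks: splitStart <= rowGroupStart && rowGroupStart < splitStart + splitLength
    static boolean splitContainsRowGroup(long splitStart, long splitLength, long rowGroupStart)
    {
        return splitStart <= rowGroupStart && rowGroupStart < splitStart + splitLength;
    }

    public static void main(String[] args)
    {
        List<Long> rowGroupStarts = List.of(4L, 96L, 210L); // hypothetical starting offsets
        long splitStart = 0;
        long splitLength = 128;
        for (long start : rowGroupStarts) {
            System.out.println(start + " -> " + splitContainsRowGroup(splitStart, splitLength, start));
        }
        // Prints true, true, false: the third row group belongs to the next split
    }
}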
+ */ +package io.trino.parquet.metadata; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import io.trino.parquet.ParquetCorruptionException; +import io.trino.parquet.ParquetDataSourceId; +import org.apache.parquet.column.ColumnDescriptor; + +import java.util.List; +import java.util.Map; +import java.util.Set; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.collect.ImmutableMap.toImmutableMap; +import static java.util.Arrays.asList; +import static java.util.function.Function.identity; + +public final class PrunedBlockMetadata +{ + /** + * Stores only the required columns' metadata from BlockMetadata and indexes it by path for efficient look-ups + */ + public static PrunedBlockMetadata createPrunedColumnsMetadata(BlockMetadata blockMetadata, ParquetDataSourceId dataSourceId, Map<List<String>, ColumnDescriptor> descriptorsByPath) + throws ParquetCorruptionException + { + Set<List<String>> requiredPaths = descriptorsByPath.keySet(); + Map<List<String>, ColumnChunkMetadata> columnMetadataByPath = blockMetadata.columns().stream() + .collect(toImmutableMap( + column -> asList(column.getPath().toArray()), + identity(), + // Same column name may occur more than once when the file is written by case-sensitive tools + (oldValue, ignore) -> oldValue)); + ImmutableMap.Builder<List<String>, ColumnChunkMetadata> columnMetadataByPathBuilder = ImmutableMap.builderWithExpectedSize(requiredPaths.size()); + for (Map.Entry<List<String>, ColumnDescriptor> entry : descriptorsByPath.entrySet()) { + List<String> requiredPath = entry.getKey(); + ColumnDescriptor columnDescriptor = entry.getValue(); + ColumnChunkMetadata columnChunkMetadata = columnMetadataByPath.get(requiredPath); + if (columnChunkMetadata == null) { + throw new ParquetCorruptionException(dataSourceId, "Metadata is missing for column: %s", columnDescriptor); + } + columnMetadataByPathBuilder.put(requiredPath, columnChunkMetadata); + } + return new PrunedBlockMetadata(blockMetadata.rowCount(), dataSourceId, columnMetadataByPathBuilder.buildOrThrow()); + } + + private final long rowCount; + private final ParquetDataSourceId dataSourceId; + private final Map<List<String>, ColumnChunkMetadata> columnMetadataByPath; + + private PrunedBlockMetadata(long rowCount, ParquetDataSourceId dataSourceId, Map<List<String>, ColumnChunkMetadata> columnMetadataByPath) + { + this.rowCount = rowCount; + this.dataSourceId = dataSourceId; + this.columnMetadataByPath = columnMetadataByPath; + } + + public long getRowCount() + { + return rowCount; + } + + public List<ColumnChunkMetadata> getColumns() + { + return ImmutableList.copyOf(columnMetadataByPath.values()); + } + + public ColumnChunkMetadata getColumnChunkMetaData(ColumnDescriptor columnDescriptor) + throws ParquetCorruptionException + { + ColumnChunkMetadata columnChunkMetadata = columnMetadataByPath.get(asList(columnDescriptor.getPath())); + if (columnChunkMetadata == null) { + throw new ParquetCorruptionException(dataSourceId, "Metadata is missing for column: %s", columnDescriptor); + } + return columnChunkMetadata; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("rowCount", rowCount) + .add("columnMetadataByPath", columnMetadataByPath) + .toString(); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/predicate/DictionaryDescriptor.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/predicate/DictionaryDescriptor.java new file mode 100644 index 000000000000..ea5ccef5591a --- /dev/null +++ 
b/plugin/trino-iceberg/src/main/java/io/trino/parquet/predicate/DictionaryDescriptor.java @@ -0,0 +1,48 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.predicate; + +import io.trino.parquet.DictionaryPage; +import org.apache.parquet.column.ColumnDescriptor; + +import java.util.Optional; + +public class DictionaryDescriptor +{ + private final ColumnDescriptor columnDescriptor; + private final boolean nullAllowed; + private final Optional dictionaryPage; + + public DictionaryDescriptor(ColumnDescriptor columnDescriptor, boolean nullAllowed, Optional dictionaryPage) + { + this.columnDescriptor = columnDescriptor; + this.nullAllowed = nullAllowed; + this.dictionaryPage = dictionaryPage; + } + + public ColumnDescriptor getColumnDescriptor() + { + return columnDescriptor; + } + + public boolean isNullAllowed() + { + return nullAllowed; + } + + public Optional getDictionaryPage() + { + return dictionaryPage; + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/predicate/PredicateUtils.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/predicate/PredicateUtils.java new file mode 100644 index 000000000000..3230c7190a0a --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/predicate/PredicateUtils.java @@ -0,0 +1,366 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
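PrunedBlockMetadata above indexes the column chunk metadata by its schema path (a List of path components) so the reader can look up only the columns it needs, keeping the first entry when a file written by case-sensitive tools repeats a name. A small sketch of that keyed pruning with plain collections; the value type and sample paths are placeholders.

import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

public final class PathKeyedLookupSketch
{
    private PathKeyedLookupSketch() {}

    // Index entries by path, keeping the first occurrence of a duplicated path
    static Map<List<String>, String> indexByPath(List<Map.Entry<List<String>, String>> entries)
    {
        Map<List<String>, String> byPath = new LinkedHashMap<>();
        for (Map.Entry<List<String>, String> entry : entries) {
            byPath.putIfAbsent(entry.getKey(), entry.getValue());
        }
        return byPath;
    }

    public static void main(String[] args)
    {
        Map<List<String>, String> byPath = indexByPath(List.of(
                Map.entry(List.of("address", "city"), "chunk-0"),
                Map.entry(List.of("address", "zip"), "chunk-1"),
                Map.entry(List.of("address", "zip"), "chunk-1-duplicate"))); // duplicate path: first one wins

        // Only the requested path is looked up; a missing path would signal incomplete metadata
        System.out.println(byPath.get(List.of("address", "zip"))); // chunk-1
        System.out.println(byPath.containsKey(List.of("id")));     // false
    }
}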
+ */ +package io.trino.parquet.predicate; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import io.airlift.slice.Slice; +import io.airlift.slice.SliceInput; +import io.trino.parquet.BloomFilterStore; +import io.trino.parquet.DictionaryPage; +import io.trino.parquet.ParquetCorruptionException; +import io.trino.parquet.ParquetDataSource; +import io.trino.parquet.ParquetDataSourceId; +import io.trino.parquet.ParquetEncoding; +import io.trino.parquet.ParquetReaderOptions; +import io.trino.parquet.metadata.BlockMetadata; +import io.trino.parquet.metadata.ColumnChunkMetadata; +import io.trino.parquet.metadata.ParquetMetadata; +import io.trino.parquet.metadata.PrunedBlockMetadata; +import io.trino.parquet.reader.RowGroupInfo; +import io.trino.spi.predicate.TupleDomain; +import io.trino.spi.type.DecimalType; +import io.trino.spi.type.Type; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.Encoding; +import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.format.DictionaryPageHeader; +import org.apache.parquet.format.PageHeader; +import org.apache.parquet.format.PageType; +import org.apache.parquet.format.Util; +import org.apache.parquet.internal.column.columnindex.OffsetIndex; +import org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore; +import org.apache.parquet.io.ParquetDecodingException; +import org.apache.parquet.schema.MessageType; +import org.joda.time.DateTimeZone; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.math.BigDecimal; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; + +import static io.trino.parquet.BloomFilterStore.getBloomFilterStore; +import static io.trino.parquet.ParquetCompressionUtils.decompress; +import static io.trino.parquet.ParquetReaderUtils.isOnlyDictionaryEncodingPages; +import static io.trino.parquet.ParquetTypeUtils.getParquetEncoding; +import static io.trino.parquet.metadata.PrunedBlockMetadata.createPrunedColumnsMetadata; +import static io.trino.parquet.reader.TrinoColumnIndexStore.getColumnIndexStore; +import static io.trino.spi.type.BigintType.BIGINT; +import static io.trino.spi.type.DateType.DATE; +import static io.trino.spi.type.IntegerType.INTEGER; +import static io.trino.spi.type.SmallintType.SMALLINT; +import static io.trino.spi.type.TinyintType.TINYINT; +import static java.lang.Math.toIntExact; +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; + +public final class PredicateUtils +{ + // Maximum size of dictionary that we will read for row-group pruning. + // Reading larger dictionaries is typically not beneficial. Before checking + // the dictionary, the row-group, page indexes and bloomfilters have already been checked + // and when the dictionary does not eliminate a row-group, the work done to + // decode the dictionary and match it with predicates is wasted. 
+ private static final int MAX_DICTIONARY_SIZE = 8096; + + private PredicateUtils() {} + + public static boolean isStatisticsOverflow(Type type, long min, long max) + { + if (type == TINYINT) { + return min < Byte.MIN_VALUE || max > Byte.MAX_VALUE; + } + if (type == SMALLINT) { + return min < Short.MIN_VALUE || max > Short.MAX_VALUE; + } + if (type == INTEGER || type == DATE) { + return min < Integer.MIN_VALUE || max > Integer.MAX_VALUE; + } + if (type == BIGINT) { + return false; + } + if (type instanceof DecimalType decimalType) { + if (!decimalType.isShort()) { + // Smallest long decimal type with 0 scale has broader range than representable in long, as used in ParquetLongStatistics + return false; + } + return BigDecimal.valueOf(min, decimalType.getScale()).compareTo(minimalValue(decimalType)) < 0 || + BigDecimal.valueOf(max, decimalType.getScale()).compareTo(maximalValue(decimalType)) > 0; + } + + throw new IllegalArgumentException("Unsupported type: " + type); + } + + private static BigDecimal minimalValue(DecimalType decimalType) + { + return new BigDecimal(format("-%s.%s", "9".repeat(decimalType.getPrecision() - decimalType.getScale()), "9".repeat(decimalType.getScale()))); + } + + private static BigDecimal maximalValue(DecimalType decimalType) + { + return new BigDecimal(format("+%s.%s", "9".repeat(decimalType.getPrecision() - decimalType.getScale()), "9".repeat(decimalType.getScale()))); + } + + public static TupleDomainParquetPredicate buildPredicate( + MessageType requestedSchema, + TupleDomain parquetTupleDomain, + Map, ColumnDescriptor> descriptorsByPath, + DateTimeZone timeZone) + { + ImmutableList.Builder columnReferences = ImmutableList.builder(); + for (String[] paths : requestedSchema.getPaths()) { + ColumnDescriptor descriptor = descriptorsByPath.get(Arrays.asList(paths)); + if (descriptor != null) { + columnReferences.add(descriptor); + } + } + return new TupleDomainParquetPredicate(parquetTupleDomain, columnReferences.build(), timeZone); + } + + public static boolean predicateMatches( + TupleDomainParquetPredicate parquetPredicate, + PrunedBlockMetadata columnsMetadata, + ParquetDataSource dataSource, + Map, ColumnDescriptor> descriptorsByPath, + TupleDomain parquetTupleDomain, + Optional columnIndexStore, + Optional bloomFilterStore, + DateTimeZone timeZone, + int domainCompactionThreshold) + throws IOException + { + if (columnsMetadata.getRowCount() == 0) { + return false; + } + Map> columnStatistics = getStatistics(columnsMetadata, descriptorsByPath); + Map columnValueCounts = getColumnValueCounts(columnsMetadata, descriptorsByPath); + Optional> candidateColumns = parquetPredicate.getIndexLookupCandidates(columnValueCounts, columnStatistics, dataSource.getId()); + if (candidateColumns.isEmpty()) { + return false; + } + if (candidateColumns.get().isEmpty()) { + return true; + } + // Perform column index, bloom filter checks and dictionary lookups only for the subset of columns where it can be useful. + // This prevents unnecessary filesystem reads and decoding work when the predicate on a column comes from + // file-level min/max stats or more generally when the predicate selects a range equal to or wider than row-group min/max. + TupleDomainParquetPredicate indexPredicate = new TupleDomainParquetPredicate(parquetTupleDomain, candidateColumns.get(), timeZone); + + // Page stats is finer grained but relatively more expensive, so we do the filtering after above block filtering. 
+ if (columnIndexStore.isPresent() && !indexPredicate.matches(columnValueCounts, columnIndexStore.get(), dataSource.getId())) { + return false; + } + + if (bloomFilterStore.isPresent() && !indexPredicate.matches(bloomFilterStore.get(), domainCompactionThreshold)) { + return false; + } + + return dictionaryPredicatesMatch( + indexPredicate, + columnsMetadata, + dataSource, + descriptorsByPath, + ImmutableSet.copyOf(candidateColumns.get()), + columnIndexStore); + } + + public static List getFilteredRowGroups( + long splitStart, + long splitLength, + ParquetDataSource dataSource, + ParquetMetadata parquetMetadata, + List> parquetTupleDomains, + List parquetPredicates, + Map, ColumnDescriptor> descriptorsByPath, + DateTimeZone timeZone, + int domainCompactionThreshold, + ParquetReaderOptions options) + throws IOException + { + ImmutableList.Builder rowGroupInfoBuilder = ImmutableList.builder(); + for (BlockMetadata block : parquetMetadata.getBlocks(splitStart, splitLength)) { + for (int i = 0; i < parquetTupleDomains.size(); i++) { + TupleDomain parquetTupleDomain = parquetTupleDomains.get(i); + TupleDomainParquetPredicate parquetPredicate = parquetPredicates.get(i); + Optional columnIndex = getColumnIndexStore(dataSource, block, descriptorsByPath, parquetTupleDomain, options); + Optional bloomFilterStore = getBloomFilterStore(dataSource, block, parquetTupleDomain, options); + PrunedBlockMetadata columnsMetadata = createPrunedColumnsMetadata(block, dataSource.getId(), descriptorsByPath); + if (predicateMatches( + parquetPredicate, + columnsMetadata, + dataSource, + descriptorsByPath, + parquetTupleDomain, + columnIndex, + bloomFilterStore, + timeZone, + domainCompactionThreshold)) { + rowGroupInfoBuilder.add(new RowGroupInfo(columnsMetadata, block.fileRowCountOffset(), columnIndex)); + break; + } + } + } + return rowGroupInfoBuilder.build(); + } + + private static Map> getStatistics(PrunedBlockMetadata columnsMetadata, Map, ColumnDescriptor> descriptorsByPath) + throws ParquetCorruptionException + { + ImmutableMap.Builder> statistics = ImmutableMap.builderWithExpectedSize(descriptorsByPath.size()); + for (ColumnDescriptor descriptor : descriptorsByPath.values()) { + ColumnChunkMetadata columnMetaData = columnsMetadata.getColumnChunkMetaData(descriptor); + Statistics columnStatistics = columnMetaData.getStatistics(); + if (columnStatistics != null) { + statistics.put(descriptor, columnStatistics); + } + } + return statistics.buildOrThrow(); + } + + private static Map getColumnValueCounts(PrunedBlockMetadata columnsMetadata, Map, ColumnDescriptor> descriptorsByPath) + throws ParquetCorruptionException + { + ImmutableMap.Builder columnValueCounts = ImmutableMap.builderWithExpectedSize(descriptorsByPath.size()); + for (ColumnDescriptor descriptor : descriptorsByPath.values()) { + ColumnChunkMetadata columnMetaData = columnsMetadata.getColumnChunkMetaData(descriptor); + columnValueCounts.put(descriptor, columnMetaData.getValueCount()); + } + return columnValueCounts.buildOrThrow(); + } + + private static boolean dictionaryPredicatesMatch( + TupleDomainParquetPredicate parquetPredicate, + PrunedBlockMetadata columnsMetadata, + ParquetDataSource dataSource, + Map, ColumnDescriptor> descriptorsByPath, + Set candidateColumns, + Optional columnIndexStore) + throws IOException + { + for (ColumnDescriptor descriptor : descriptorsByPath.values()) { + ColumnChunkMetadata columnMetaData = columnsMetadata.getColumnChunkMetaData(descriptor); + if (!candidateColumns.contains(descriptor)) { + continue; + } + if 
(isOnlyDictionaryEncodingPages(columnMetaData)) { + Statistics columnStatistics = columnMetaData.getStatistics(); + boolean nullAllowed = columnStatistics == null || columnStatistics.getNumNulls() != 0; + // Early abort, predicate already filters block so no more dictionaries need be read + if (!parquetPredicate.matches(new DictionaryDescriptor( + descriptor, + nullAllowed, + readDictionaryPage(dataSource, columnMetaData, columnIndexStore)))) { + return false; + } + } + } + return true; + } + + private static Optional readDictionaryPage( + ParquetDataSource dataSource, + ColumnChunkMetadata columnMetaData, + Optional columnIndexStore) + throws IOException + { + int dictionaryPageSize; + if (columnMetaData.getDictionaryPageOffset() == 0 || columnMetaData.getFirstDataPageOffset() <= columnMetaData.getDictionaryPageOffset()) { + /* + * See org.apache.parquet.hadoop.Offsets for reference. + * The offsets might not contain the proper values in the below cases: + * - The dictionaryPageOffset might not be set; in this case 0 is returned + * (0 cannot be a valid offset because of the MAGIC bytes) + * - The firstDataPageOffset might point to the dictionary page + * + * Such parquet files may have been produced by parquet-mr writers before PARQUET-1977. + * We find the dictionary page size from OffsetIndex if that exists, + * otherwise fallback to reading the full column chunk. + */ + dictionaryPageSize = columnIndexStore.flatMap(index -> getDictionaryPageSize(index, columnMetaData)) + .orElseGet(() -> toIntExact(columnMetaData.getTotalSize())); + } + else { + dictionaryPageSize = toIntExact(columnMetaData.getFirstDataPageOffset() - columnMetaData.getDictionaryPageOffset()); + } + // Get the dictionary page header and the dictionary in single read + Slice buffer = dataSource.readFully(columnMetaData.getStartingPos(), dictionaryPageSize); + return readPageHeaderWithData(buffer.getInput()).map(data -> decodeDictionaryPage(dataSource.getId(), data, columnMetaData)); + } + + private static Optional getDictionaryPageSize(ColumnIndexStore columnIndexStore, ColumnChunkMetadata columnMetaData) + { + OffsetIndex offsetIndex = columnIndexStore.getOffsetIndex(columnMetaData.getPath()); + if (offsetIndex == null) { + return Optional.empty(); + } + long rowGroupOffset = columnMetaData.getStartingPos(); + long firstPageOffset = offsetIndex.getOffset(0); + if (rowGroupOffset < firstPageOffset) { + return Optional.of(toIntExact(firstPageOffset - rowGroupOffset)); + } + return Optional.empty(); + } + + private static Optional readPageHeaderWithData(SliceInput inputStream) + { + PageHeader pageHeader; + try { + pageHeader = Util.readPageHeader(inputStream); + } + catch (IOException e) { + throw new UncheckedIOException(e); + } + + if (pageHeader.type != PageType.DICTIONARY_PAGE) { + return Optional.empty(); + } + DictionaryPageHeader dictionaryHeader = pageHeader.getDictionary_page_header(); + if (dictionaryHeader.getNum_values() > MAX_DICTIONARY_SIZE) { + return Optional.empty(); + } + return Optional.of(new PageHeaderWithData( + pageHeader, + inputStream.readSlice(pageHeader.getCompressed_page_size()))); + } + + private static DictionaryPage decodeDictionaryPage(ParquetDataSourceId dataSourceId, PageHeaderWithData pageHeaderWithData, ColumnChunkMetadata chunkMetaData) + { + PageHeader pageHeader = pageHeaderWithData.pageHeader(); + DictionaryPageHeader dicHeader = pageHeader.getDictionary_page_header(); + ParquetEncoding encoding = getParquetEncoding(Encoding.valueOf(dicHeader.getEncoding().name())); + int 
dictionarySize = dicHeader.getNum_values(); + + Slice compressedData = pageHeaderWithData.compressedData(); + try { + return new DictionaryPage(decompress(dataSourceId, chunkMetaData.getCodec().getParquetCompressionCodec(), compressedData, pageHeader.getUncompressed_page_size()), dictionarySize, encoding); + } + catch (IOException e) { + throw new ParquetDecodingException("Could not decode the dictionary for " + chunkMetaData.getPath(), e); + } + } + + private record PageHeaderWithData(PageHeader pageHeader, Slice compressedData) + { + private PageHeaderWithData(PageHeader pageHeader, Slice compressedData) + { + this.pageHeader = requireNonNull(pageHeader, "pageHeader is null"); + this.compressedData = requireNonNull(compressedData, "compressedData is null"); + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/predicate/TupleDomainParquetPredicate.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/predicate/TupleDomainParquetPredicate.java new file mode 100644 index 000000000000..949cb4253475 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/predicate/TupleDomainParquetPredicate.java @@ -0,0 +1,886 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.predicate; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.VerifyException; +import com.google.common.collect.ImmutableList; +import io.airlift.slice.Slice; +import io.airlift.slice.Slices; +import io.trino.parquet.BloomFilterStore; +import io.trino.parquet.DictionaryPage; +import io.trino.parquet.ParquetCorruptionException; +import io.trino.parquet.ParquetDataSourceId; +import io.trino.parquet.dictionary.Dictionary; +import io.trino.plugin.base.type.TrinoTimestampEncoder; +import io.trino.spi.predicate.Domain; +import io.trino.spi.predicate.SortedRangeSet; +import io.trino.spi.predicate.TupleDomain; +import io.trino.spi.predicate.ValueSet; +import io.trino.spi.type.DecimalConversions; +import io.trino.spi.type.DecimalType; +import io.trino.spi.type.Decimals; +import io.trino.spi.type.Int128; +import io.trino.spi.type.TimestampType; +import io.trino.spi.type.Type; +import io.trino.spi.type.UuidType; +import io.trino.spi.type.VarbinaryType; +import io.trino.spi.type.VarcharType; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.column.values.bloomfilter.BloomFilter; +import org.apache.parquet.filter2.predicate.FilterApi; +import org.apache.parquet.filter2.predicate.FilterPredicate; +import org.apache.parquet.filter2.predicate.Operators; +import org.apache.parquet.filter2.predicate.UserDefinedPredicate; +import org.apache.parquet.hadoop.metadata.ColumnPath; +import org.apache.parquet.internal.column.columnindex.ColumnIndex; +import org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore; +import org.apache.parquet.io.ParquetDecodingException; +import org.apache.parquet.io.api.Binary; +import 
org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.LogicalTypeAnnotation.DecimalLogicalTypeAnnotation; +import org.apache.parquet.schema.LogicalTypeAnnotation.TimestampLogicalTypeAnnotation; +import org.apache.parquet.schema.PrimitiveType; +import org.joda.time.DateTimeZone; + +import java.io.Serializable; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.function.Function; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkArgument; +import static io.trino.parquet.ParquetMetadataConverter.isMinMaxStatsSupported; +import static io.trino.parquet.ParquetTimestampUtils.decodeInt64Timestamp; +import static io.trino.parquet.ParquetTimestampUtils.decodeInt96Timestamp; +import static io.trino.parquet.ParquetTypeUtils.getShortDecimalValue; +import static io.trino.parquet.predicate.PredicateUtils.isStatisticsOverflow; +import static io.trino.parquet.reader.ColumnReaderFactory.isDecimalRescaled; +import static io.trino.plugin.base.type.TrinoTimestampEncoderFactory.createTimestampEncoder; +import static io.trino.spi.type.BigintType.BIGINT; +import static io.trino.spi.type.BooleanType.BOOLEAN; +import static io.trino.spi.type.DateType.DATE; +import static io.trino.spi.type.Decimals.longTenToNth; +import static io.trino.spi.type.DoubleType.DOUBLE; +import static io.trino.spi.type.IntegerType.INTEGER; +import static io.trino.spi.type.RealType.REAL; +import static io.trino.spi.type.SmallintType.SMALLINT; +import static io.trino.spi.type.TinyintType.TINYINT; +import static java.lang.Float.floatToRawIntBits; +import static java.lang.Float.intBitsToFloat; +import static java.lang.Math.toIntExact; +import static java.lang.String.format; +import static java.nio.ByteOrder.LITTLE_ENDIAN; +import static java.util.Objects.requireNonNull; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT96; + +public class TupleDomainParquetPredicate +{ + private final TupleDomain effectivePredicate; + private final List columns; + private final DateTimeZone timeZone; + + public TupleDomainParquetPredicate(TupleDomain effectivePredicate, List columns, DateTimeZone timeZone) + { + this.effectivePredicate = requireNonNull(effectivePredicate, "effectivePredicate is null"); + this.columns = ImmutableList.copyOf(requireNonNull(columns, "columns is null")); + this.timeZone = requireNonNull(timeZone, "timeZone is null"); + } + + /** + * Should the Parquet Reader process a file section with the specified statistics, + * and if it should, then return the columns are candidates for further inspection of more + * granular statistics from column index and dictionary. + * + * @param valueCounts the number of values for a column in the segment; this can be used with + * Statistics to determine if a column is only null + * @param statistics column statistics + * @param id Parquet file name + * + * @return Optional.empty() if statistics were sufficient to eliminate the file section. + * Otherwise, a list of columns for which page-level indices and dictionary could be consulted + * to potentially eliminate the file section. An optional with empty list is returned if there is + * going to be no benefit in looking at column index or dictionary for any column. 
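The three outcomes described in this Javadoc map directly onto Trino's Domain operations. A minimal sketch of that decision for a single column, using a hypothetical helper name and assuming the column's statistics have already been converted into a Domain:

import io.trino.spi.predicate.Domain;

import java.util.Optional;

public final class IndexCandidateSketch
{
    private IndexCandidateSketch() {}

    // Optional.empty()   -> statistics alone eliminate the row group
    // Optional.of(true)  -> column index / dictionary may still eliminate it, inspect further
    // Optional.of(false) -> keep the row group; finer statistics cannot help for this column
    public static Optional<Boolean> inspectFurther(Domain predicateDomain, Domain statisticsDomain)
    {
        if (!predicateDomain.overlaps(statisticsDomain)) {
            return Optional.empty();
        }
        // When the predicate already covers everything the statistics allow,
        // page-level statistics or a dictionary cannot narrow things down any further
        return Optional.of(!predicateDomain.contains(statisticsDomain));
    }
}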
+ */ + public Optional> getIndexLookupCandidates( + Map valueCounts, + Map> statistics, + ParquetDataSourceId id) + throws ParquetCorruptionException + { + if (effectivePredicate.isNone()) { + return Optional.empty(); + } + Map effectivePredicateDomains = effectivePredicate.getDomains() + .orElseThrow(() -> new IllegalStateException("Effective predicate other than none should have domains")); + + ImmutableList.Builder candidateColumns = ImmutableList.builder(); + for (ColumnDescriptor column : columns) { + Domain effectivePredicateDomain = effectivePredicateDomains.get(column); + if (effectivePredicateDomain == null) { + continue; + } + + Statistics columnStatistics = statistics.get(column); + if (columnStatistics == null || columnStatistics.isEmpty()) { + // no stats for column + candidateColumns.add(column); + continue; + } + + Long columnValueCount = valueCounts.get(column); + if (columnValueCount == null) { + throw new IllegalArgumentException(format("Missing columnValueCount for column %s in %s", column, id)); + } + Domain domain = getDomain( + column, + effectivePredicateDomain.getType(), + columnValueCount, + columnStatistics, + id, + timeZone); + if (!effectivePredicateDomain.overlaps(domain)) { + return Optional.empty(); + } + // If the predicate domain on a column includes the entire domain from column row-group statistics, + // then more granular statistics from page stats or dictionary for this column will not help to eliminate the row-group. + if (!effectivePredicateDomain.contains(domain)) { + candidateColumns.add(column); + } + } + return Optional.of(candidateColumns.build()); + } + + /** + * Should the Parquet Reader process a file section with the specified dictionary based on that + * single dictionary. This is safe to check repeatedly to avoid loading more parquet dictionaries + * if the section can already be eliminated. + * + * @param dictionary The single column dictionary + */ + public boolean matches(DictionaryDescriptor dictionary) + { + requireNonNull(dictionary, "dictionary is null"); + if (effectivePredicate.isNone()) { + return false; + } + Map effectivePredicateDomains = effectivePredicate.getDomains() + .orElseThrow(() -> new IllegalStateException("Effective predicate other than none should have domains")); + + Domain effectivePredicateDomain = effectivePredicateDomains.get(dictionary.getColumnDescriptor()); + + return effectivePredicateDomain == null || effectivePredicateMatches(effectivePredicateDomain, dictionary); + } + + /** + * Should the Parquet Reader process a file section with the specified statistics. 
+ * + * @param valueCounts the number of values for a column in the segment; this can be used with + * Statistics to determine if a column is only null + * @param columnIndexStore column index (statistics) store + * @param id Parquet file name + */ + public boolean matches(Map valueCounts, ColumnIndexStore columnIndexStore, ParquetDataSourceId id) + throws ParquetCorruptionException + { + requireNonNull(columnIndexStore, "columnIndexStore is null"); + if (effectivePredicate.isNone()) { + return false; + } + + Map effectivePredicateDomains = effectivePredicate.getDomains() + .orElseThrow(() -> new IllegalStateException("Effective predicate other than none should have domains")); + + for (ColumnDescriptor column : columns) { + Domain effectivePredicateDomain = effectivePredicateDomains.get(column); + if (effectivePredicateDomain == null) { + continue; + } + + // ParquetMetadataConverter#fromParquetColumnIndex returns null if the parquet primitive type does not support min/max stats + if (!isMinMaxStatsSupported(column.getPrimitiveType())) { + continue; + } + ColumnIndex columnIndex = columnIndexStore.getColumnIndex(ColumnPath.get(column.getPath())); + if (columnIndex == null) { + continue; + } + + Long columnValueCount = valueCounts.get(column); + if (columnValueCount == null) { + throw new IllegalArgumentException(format("Missing columnValueCount for column %s in %s", column, id)); + } + Domain domain = getDomain(effectivePredicateDomain.getType(), columnValueCount, columnIndex, id, column, timeZone); + if (!effectivePredicateDomain.overlaps(domain)) { + return false; + } + } + + return true; + } + + /** + * Should the Parquet Reader process a file section with the specified bloomfilter Store + * + * @param bloomFilterStore bloomfilter Store + */ + public boolean matches(BloomFilterStore bloomFilterStore, int domainCompactionThreshold) + { + requireNonNull(bloomFilterStore, "bloomFilterStore is null"); + + if (effectivePredicate.isNone()) { + return false; + } + Map effectivePredicateDomains = effectivePredicate.getDomains() + .orElseThrow(() -> new IllegalStateException("Effective predicate other than none should have domains")); + + for (ColumnDescriptor column : columns) { + Domain effectivePredicateDomain = effectivePredicateDomains.get(column); + + // the bloom filter bitset contains only non-null values so isn't helpful + if (effectivePredicateDomain == null || effectivePredicateDomain.isNullAllowed()) { + continue; + } + + Optional> discreteValues = extractDiscreteValues(domainCompactionThreshold, effectivePredicateDomain.getValues()); + // values are not discrete, so bloom filter isn't helpful + if (discreteValues.isEmpty()) { + continue; + } + + Optional bloomFilterOptional = bloomFilterStore.getBloomFilter(ColumnPath.get(column.getPath())); + if (bloomFilterOptional.isEmpty()) { + continue; + } + BloomFilter bloomFilter = bloomFilterOptional.get(); + if (discreteValues.get().stream().noneMatch(value -> checkInBloomFilter(bloomFilter, value, effectivePredicateDomain.getType()))) { + return false; + } + } + return true; + } + + /** + * Convert Predicate to Parquet filter if possible. 
+ * + * @param timeZone current Parquet timezone + * @return Converted Parquet filter or null if conversion not possible + */ + public Optional toParquetFilter(DateTimeZone timeZone) + { + return Optional.ofNullable(convertToParquetFilter(timeZone)); + } + + private boolean effectivePredicateMatches(Domain effectivePredicateDomain, DictionaryDescriptor dictionary) + { + return effectivePredicateDomain.overlaps(getDomain(effectivePredicateDomain.getType(), dictionary, timeZone)); + } + + @VisibleForTesting + public static Domain getDomain( + ColumnDescriptor column, + Type type, + long columnValuesCount, + Statistics statistics, + ParquetDataSourceId id, + DateTimeZone timeZone) + throws ParquetCorruptionException + { + if (statistics == null || statistics.isEmpty()) { + return Domain.all(type); + } + + if (statistics.isNumNullsSet() && statistics.getNumNulls() == columnValuesCount) { + return Domain.onlyNull(type); + } + + boolean hasNullValue = !statistics.isNumNullsSet() || statistics.getNumNulls() != 0L; + + if (!statistics.hasNonNullValue() || statistics.genericGetMin() == null || statistics.genericGetMax() == null) { + return Domain.create(ValueSet.all(type), hasNullValue); + } + + try { + Object min = statistics.genericGetMin(); + Object max = statistics.genericGetMax(); + return getDomain( + column, + type, + ImmutableList.of(min instanceof Binary minValue ? Slices.wrappedBuffer(minValue.getBytes()) : min), + ImmutableList.of(max instanceof Binary maxValue ? Slices.wrappedBuffer(maxValue.getBytes()) : max), + hasNullValue, + timeZone); + } + catch (Exception e) { + throw corruptionException(column.toString(), id, statistics, e); + } + } + + /** + * Get a domain for the ranges defined by each pair of elements from {@code minimums} and {@code maximums}. + * Both arrays must have the same length. 
+ */ + private static Domain getDomain( + ColumnDescriptor column, + Type type, + List minimums, + List maximums, + boolean hasNullValue, + DateTimeZone timeZone) + { + checkArgument(minimums.size() == maximums.size(), "Expected minimums and maximums to have the same size"); + + if (type.equals(BOOLEAN)) { + boolean hasTrueValues = minimums.stream().anyMatch(value -> (boolean) value) || maximums.stream().anyMatch(value -> (boolean) value); + boolean hasFalseValues = minimums.stream().anyMatch(value -> !(boolean) value) || maximums.stream().anyMatch(value -> !(boolean) value); + if (hasTrueValues && hasFalseValues) { + return Domain.all(type); + } + if (hasTrueValues) { + return Domain.create(ValueSet.of(type, true), hasNullValue); + } + if (hasFalseValues) { + return Domain.create(ValueSet.of(type, false), hasNullValue); + } + // All nulls case is handled earlier + throw new VerifyException("Impossible boolean statistics"); + } + + if (type.equals(BIGINT) || type.equals(INTEGER) || type.equals(DATE) || type.equals(SMALLINT) || type.equals(TINYINT)) { + SortedRangeSet.Builder rangesBuilder = SortedRangeSet.builder(type, minimums.size()); + for (int i = 0; i < minimums.size(); i++) { + long min = asLong(minimums.get(i)); + long max = asLong(maximums.get(i)); + if (isStatisticsOverflow(type, min, max)) { + return Domain.create(ValueSet.all(type), hasNullValue); + } + + rangesBuilder.addRangeInclusive(min, max); + } + + return Domain.create(rangesBuilder.build(), hasNullValue); + } + + if (type instanceof DecimalType decimalType) { + SortedRangeSet.Builder rangesBuilder = SortedRangeSet.builder(type, minimums.size()); + if (decimalType.isShort()) { + for (int i = 0; i < minimums.size(); i++) { + long minValue = getShortDecimal(minimums.get(i), decimalType, column); + long maxValue = getShortDecimal(maximums.get(i), decimalType, column); + + if (isStatisticsOverflow(type, minValue, maxValue)) { + return Domain.create(ValueSet.all(type), hasNullValue); + } + + rangesBuilder.addRangeInclusive(minValue, maxValue); + } + } + else { + for (int i = 0; i < minimums.size(); i++) { + Int128 minValue = getLongDecimal(minimums.get(i), decimalType, column); + Int128 maxValue = getLongDecimal(maximums.get(i), decimalType, column); + + rangesBuilder.addRangeInclusive(minValue, maxValue); + } + } + + return Domain.create(rangesBuilder.build(), hasNullValue); + } + + if (type.equals(REAL)) { + SortedRangeSet.Builder rangesBuilder = SortedRangeSet.builder(type, minimums.size()); + for (int i = 0; i < minimums.size(); i++) { + Float min = (Float) minimums.get(i); + Float max = (Float) maximums.get(i); + + if (min.isNaN() || max.isNaN()) { + return Domain.create(ValueSet.all(type), hasNullValue); + } + + rangesBuilder.addRangeInclusive((long) floatToRawIntBits(min), (long) floatToRawIntBits(max)); + } + return Domain.create(rangesBuilder.build(), hasNullValue); + } + + if (type.equals(DOUBLE)) { + SortedRangeSet.Builder rangesBuilder = SortedRangeSet.builder(type, minimums.size()); + for (int i = 0; i < minimums.size(); i++) { + Double min = (Double) minimums.get(i); + Double max = (Double) maximums.get(i); + + if (min.isNaN() || max.isNaN()) { + return Domain.create(ValueSet.all(type), hasNullValue); + } + + rangesBuilder.addRangeInclusive(min, max); + } + return Domain.create(rangesBuilder.build(), hasNullValue); + } + + if (type instanceof VarcharType) { + SortedRangeSet.Builder rangesBuilder = SortedRangeSet.builder(type, minimums.size()); + for (int i = 0; i < minimums.size(); i++) { + Slice min = (Slice) 
minimums.get(i); + Slice max = (Slice) maximums.get(i); + rangesBuilder.addRangeInclusive(min, max); + } + return Domain.create(rangesBuilder.build(), hasNullValue); + } + + if (type instanceof TimestampType timestampType) { + if (column.getPrimitiveType().getPrimitiveTypeName().equals(INT96)) { + TrinoTimestampEncoder timestampEncoder = createTimestampEncoder(timestampType, timeZone); + SortedRangeSet.Builder rangesBuilder = SortedRangeSet.builder(type, minimums.size()); + for (int i = 0; i < minimums.size(); i++) { + Object min = minimums.get(i); + Object max = maximums.get(i); + + // Parquet INT96 timestamp values were compared incorrectly for the purposes of producing statistics by older parquet writers, so + // PARQUET-1065 deprecated them. The result is that any writer that produced stats was producing unusable incorrect values, except + // the special case where min == max and an incorrect ordering would not be material to the result. PARQUET-1026 made binary stats + // available and valid in that special case + if (!(min instanceof Slice minSlice) || !(max instanceof Slice) || !min.equals(max)) { + return Domain.create(ValueSet.all(type), hasNullValue); + } + + rangesBuilder.addValue(timestampEncoder.getTimestamp(decodeInt96Timestamp(Binary.fromConstantByteArray(minSlice.getBytes())))); + } + return Domain.create(rangesBuilder.build(), hasNullValue); + } + if (column.getPrimitiveType().getPrimitiveTypeName().equals(INT64)) { + LogicalTypeAnnotation logicalTypeAnnotation = column.getPrimitiveType().getLogicalTypeAnnotation(); + if (!(logicalTypeAnnotation instanceof TimestampLogicalTypeAnnotation timestampTypeAnnotation)) { + // Invalid statistics. Unit and UTC adjustment are not known + return Domain.create(ValueSet.all(type), hasNullValue); + } + + // Bail out if the precision is not known + if (timestampTypeAnnotation.getUnit() == null) { + return Domain.create(ValueSet.all(type), hasNullValue); + } + TrinoTimestampEncoder timestampEncoder = createTimestampEncoder(timestampType, DateTimeZone.UTC); + + SortedRangeSet.Builder rangesBuilder = SortedRangeSet.builder(type, minimums.size()); + for (int i = 0; i < minimums.size(); i++) { + long min = (long) minimums.get(i); + long max = (long) maximums.get(i); + + rangesBuilder.addRangeInclusive( + timestampEncoder.getTimestamp(decodeInt64Timestamp(min, timestampTypeAnnotation.getUnit())), + timestampEncoder.getTimestamp(decodeInt64Timestamp(max, timestampTypeAnnotation.getUnit()))); + } + return Domain.create(rangesBuilder.build(), hasNullValue); + } + } + + return Domain.create(ValueSet.all(type), hasNullValue); + } + + private static long getShortDecimal(Object value, DecimalType columnType, ColumnDescriptor column) + { + LogicalTypeAnnotation annotation = column.getPrimitiveType().getLogicalTypeAnnotation(); + + if (annotation instanceof DecimalLogicalTypeAnnotation decimalAnnotation) { + if (isDecimalRescaled(decimalAnnotation, columnType)) { + if (decimalAnnotation.getPrecision() <= Decimals.MAX_SHORT_PRECISION) { + long rescale = longTenToNth(Math.abs(columnType.getScale() - decimalAnnotation.getScale())); + return DecimalConversions.shortToShortCast( + value instanceof Slice slice ? getShortDecimalValue(slice.getBytes()) : asLong(value), + decimalAnnotation.getPrecision(), + decimalAnnotation.getScale(), + columnType.getPrecision(), + columnType.getScale(), + rescale, + rescale / 2); + } + Int128 int128Representation = value instanceof Slice minSlice ? 
Int128.fromBigEndian(minSlice.getBytes()) : Int128.valueOf(asLong(value)); + return DecimalConversions.longToShortCast( + int128Representation, + decimalAnnotation.getPrecision(), + decimalAnnotation.getScale(), + columnType.getPrecision(), + columnType.getScale()); + } + } + return value instanceof Slice slice ? getShortDecimalValue(slice.getBytes()) : asLong(value); + } + + private static Int128 getLongDecimal(Object value, DecimalType columnType, ColumnDescriptor column) + { + LogicalTypeAnnotation annotation = column.getPrimitiveType().getLogicalTypeAnnotation(); + + if (annotation instanceof DecimalLogicalTypeAnnotation decimalAnnotation) { + if (isDecimalRescaled(decimalAnnotation, columnType)) { + if (decimalAnnotation.getPrecision() <= Decimals.MAX_SHORT_PRECISION) { + return DecimalConversions.shortToLongCast( + value instanceof Slice slice ? getShortDecimalValue(slice.getBytes()) : asLong(value), + decimalAnnotation.getPrecision(), + decimalAnnotation.getScale(), + columnType.getPrecision(), + columnType.getScale()); + } + Int128 int128Representation = value instanceof Slice slice ? Int128.fromBigEndian(slice.getBytes()) : Int128.valueOf(asLong(value)); + return DecimalConversions.longToLongCast( + int128Representation, + decimalAnnotation.getPrecision(), + decimalAnnotation.getScale(), + columnType.getPrecision(), + columnType.getScale()); + } + } + return value instanceof Slice slice ? Int128.fromBigEndian(slice.getBytes()) : Int128.valueOf(asLong(value)); + } + + @VisibleForTesting + public static Domain getDomain( + Type type, + long columnValuesCount, + ColumnIndex columnIndex, + ParquetDataSourceId id, + ColumnDescriptor descriptor, + DateTimeZone timeZone) + throws ParquetCorruptionException + { + if (columnIndex == null) { + return Domain.all(type); + } + + List maxValues = columnIndex.getMaxValues(); + List minValues = columnIndex.getMinValues(); + // Null counts is optional in the format, see org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder for reference + Optional> nullCounts = Optional.ofNullable(columnIndex.getNullCounts()); + List nullPages = columnIndex.getNullPages(); + + String columnName = descriptor.getPrimitiveType().getName(); + if (isCorruptedColumnIndex(minValues, maxValues, nullCounts, nullPages)) { + throw corruptionException(columnName, id, columnIndex, null); + } + if (maxValues.isEmpty()) { + return Domain.all(type); + } + + boolean hasNullValue = true; + if (nullCounts.isPresent()) { + long totalNullCount = nullCounts.orElseThrow().stream() + .mapToLong(value -> value) + .sum(); + if (totalNullCount == columnValuesCount) { + return Domain.onlyNull(type); + } + hasNullValue = totalNullCount > 0; + } + + try { + int pageCount = minValues.size(); + ColumnIndexValueConverter converter = new ColumnIndexValueConverter(); + Function converterFunction = converter.getConverter(descriptor.getPrimitiveType()); + List min = new ArrayList<>(pageCount); + List max = new ArrayList<>(pageCount); + for (int i = 0; i < pageCount; i++) { + if (nullPages.get(i)) { + continue; + } + min.add(converterFunction.apply(minValues.get(i))); + max.add(converterFunction.apply(maxValues.get(i))); + } + + return getDomain(descriptor, type, min, max, hasNullValue, timeZone); + } + catch (Exception e) { + throw corruptionException(columnName, id, columnIndex, e); + } + } + + @VisibleForTesting + public static Domain getDomain(Type type, DictionaryDescriptor dictionaryDescriptor) + { + return getDomain(type, dictionaryDescriptor, DateTimeZone.getDefault()); + } + + 
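Before the column-index variant of getDomain below, a standalone sketch of the statistics-to-domain idea that the conversions above feed into: build an inclusive [min, max] domain for the column and prune the row group when the predicate cannot overlap it. The class name and the restriction to a single BIGINT column are illustrative only:

import io.trino.spi.predicate.Domain;
import io.trino.spi.predicate.Range;
import io.trino.spi.predicate.ValueSet;

import static io.trino.spi.type.BigintType.BIGINT;

public final class RowGroupPruningSketch
{
    private RowGroupPruningSketch() {}

    public static boolean canSkipRowGroup(Domain predicateDomain, long statisticsMin, long statisticsMax, boolean mayContainNulls)
    {
        // Row-group statistics only bound the stored values: every value lies in [min, max],
        // plus NULL when the null count is non-zero or unknown
        Domain statisticsDomain = Domain.create(
                ValueSet.ofRanges(Range.range(BIGINT, statisticsMin, true, statisticsMax, true)),
                mayContainNulls);
        // No overlap means no row in the group can satisfy the predicate
        return !predicateDomain.overlaps(statisticsDomain);
    }
}

For example, a predicate such as x > 100, expressed as Domain.create(ValueSet.ofRanges(Range.greaterThan(BIGINT, 100L)), false), prunes a row group whose statistics report max = 50.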
private static Domain getDomain(Type type, DictionaryDescriptor dictionaryDescriptor, DateTimeZone timeZone) + { + if (dictionaryDescriptor == null) { + return Domain.all(type); + } + + ColumnDescriptor columnDescriptor = dictionaryDescriptor.getColumnDescriptor(); + Optional dictionaryPage = dictionaryDescriptor.getDictionaryPage(); + if (dictionaryPage.isEmpty()) { + return Domain.all(type); + } + + Dictionary dictionary; + try { + dictionary = dictionaryPage.get().getEncoding().initDictionary(columnDescriptor, dictionaryPage.get()); + } + catch (Exception e) { + // In case of exception, just continue reading the data, not using dictionary page at all + // OK to ignore exception when reading dictionaries + return Domain.all(type); + } + + int dictionarySize = dictionaryPage.get().getDictionarySize(); + + if (dictionarySize == 0) { + if (dictionaryDescriptor.isNullAllowed()) { + return Domain.onlyNull(type); + } + return Domain.none(type); + } + + DictionaryValueConverter converter = new DictionaryValueConverter(dictionary); + Function convertFunction = converter.getConverter(columnDescriptor.getPrimitiveType()); + List values = new ArrayList<>(dictionarySize); + for (int i = 0; i < dictionarySize; i++) { + values.add(convertFunction.apply(i)); + } + + // TODO: when min == max (i.e., singleton ranges, the construction of Domains can be done more efficiently + return getDomain(columnDescriptor, type, values, values, dictionaryDescriptor.isNullAllowed(), timeZone); + } + + private static ParquetCorruptionException corruptionException(String column, ParquetDataSourceId id, Statistics statistics, Exception cause) + { + return new ParquetCorruptionException(cause, id, "Corrupted statistics for column \"%s\": [%s]", column, statistics); + } + + private static ParquetCorruptionException corruptionException(String column, ParquetDataSourceId id, ColumnIndex columnIndex, Exception cause) + { + return new ParquetCorruptionException(cause, id, "Corrupted statistics for column \"%s\". Corrupted column index: [%s]", column, columnIndex); + } + + private static boolean isCorruptedColumnIndex( + List minValues, + List maxValues, + Optional> nullCounts, + List nullPages) + { + if (maxValues == null || minValues == null || nullPages == null) { + return true; + } + + int pageCount = nullPages.size(); + return (nullCounts.isPresent() && nullCounts.get().size() != pageCount) + || minValues.size() != pageCount + || maxValues.size() != pageCount; + } + + public static long asLong(Object value) + { + if (value instanceof Byte || value instanceof Short || value instanceof Integer || value instanceof Long) { + return ((Number) value).longValue(); + } + + throw new IllegalArgumentException("Can't convert value to long: " + value.getClass().getName()); + } + + /** + * Check if the predicateValue might be in the bloomfilter + * + * @param bloomFilter parquet bloomfilter. + * @param predicateValue effective discrete predicate value. 
+ * @param sqlType Type that contains information about the type schema from connector's metadata + * @return true if the predicateValue might be in the bloomfilter, false if the predicateValue absolutely is not in the bloomfilter + */ + @VisibleForTesting + public static boolean checkInBloomFilter(BloomFilter bloomFilter, Object predicateValue, Type sqlType) + { + // TODO: Support TIMESTAMP, CHAR and DECIMAL + if (sqlType == TINYINT || sqlType == SMALLINT || sqlType == INTEGER || sqlType == DATE) { + return bloomFilter.findHash(bloomFilter.hash(toIntExact(((Number) predicateValue).longValue()))); + } + if (sqlType == BIGINT) { + return bloomFilter.findHash(bloomFilter.hash(((Number) predicateValue).longValue())); + } + else if (sqlType == DOUBLE) { + return bloomFilter.findHash(bloomFilter.hash(((Double) predicateValue).doubleValue())); + } + else if (sqlType == REAL) { + return bloomFilter.findHash(bloomFilter.hash(intBitsToFloat(toIntExact(((Number) predicateValue).longValue())))); + } + else if (sqlType instanceof VarcharType || sqlType instanceof VarbinaryType) { + return bloomFilter.findHash(bloomFilter.hash(Binary.fromConstantByteBuffer(((Slice) predicateValue).toByteBuffer()))); + } + else if (sqlType instanceof UuidType) { + return bloomFilter.findHash(bloomFilter.hash(Binary.fromConstantByteArray(((Slice) predicateValue).getBytes()))); + } + + return true; + } + + private static Optional> extractDiscreteValues(int domainCompactionThreshold, ValueSet valueSet) + { + if (!valueSet.isDiscreteSet()) { + return valueSet.tryExpandRanges(domainCompactionThreshold); + } + + return Optional.of(valueSet.getDiscreteSet()); + } + + private FilterPredicate convertToParquetFilter(DateTimeZone timeZone) + { + FilterPredicate filter = null; + + for (ColumnDescriptor column : columns) { + Domain domain = effectivePredicate.getDomains().get().get(column); + if (domain == null || domain.isNone()) { + continue; + } + + if (domain.isAll()) { + continue; + } + + // ParquetMetadataConverter#fromParquetColumnIndex returns null if the parquet primitive type does not support min/max stats + if (!isMinMaxStatsSupported(column.getPrimitiveType())) { + continue; + } + + FilterPredicate columnFilter = FilterApi.userDefined( + new TrinoIntColumn(ColumnPath.get(column.getPath())), + new DomainUserDefinedPredicate<>(column, domain, timeZone)); + if (filter == null) { + filter = columnFilter; + } + else { + filter = FilterApi.and(filter, columnFilter); + } + } + + return filter; + } + + /** + * This class implements methods defined in UserDefinedPredicate based on the page statistic and tuple domain(for a column). 
+ */ + static class DomainUserDefinedPredicate> + extends UserDefinedPredicate + implements Serializable // Required by argument of FilterApi.userDefined call + { + private final ColumnDescriptor columnDescriptor; + private final Domain columnDomain; + private final DateTimeZone timeZone; + + public DomainUserDefinedPredicate(ColumnDescriptor columnDescriptor, Domain domain, DateTimeZone timeZone) + { + this.columnDescriptor = requireNonNull(columnDescriptor, "columnDescriptor is null"); + this.columnDomain = domain; + this.timeZone = timeZone; + } + + @Override + public boolean keep(T value) + { + if (value == null && !columnDomain.isNullAllowed()) { + return false; + } + + return true; + } + + @Override + public boolean canDrop(org.apache.parquet.filter2.predicate.Statistics statistic) + { + if (statistic == null) { + return false; + } + + T min = statistic.getMin(); + T max = statistic.getMax(); + Domain domain = getDomain( + columnDescriptor, + columnDomain.getType(), + ImmutableList.of(min instanceof Binary minBinary ? Slices.wrappedBuffer(minBinary.getBytes()) : min), + ImmutableList.of(max instanceof Binary maxBinary ? Slices.wrappedBuffer(maxBinary.getBytes()) : max), + true, + timeZone); + return !columnDomain.overlaps(domain); + } + + @Override + public boolean inverseCanDrop(org.apache.parquet.filter2.predicate.Statistics statistics) + { + // Since we don't use LogicalNotUserDefined, this method is not called. + // To be safe, we just keep the record by returning false. + return false; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("columnDescriptor", columnDescriptor) + .add("columnDomain", columnDomain) + .toString(); + } + } + + private static class ColumnIndexValueConverter + { + private ColumnIndexValueConverter() {} + + private Function getConverter(PrimitiveType primitiveType) + { + return switch (primitiveType.getPrimitiveTypeName()) { + case BOOLEAN -> buffer -> buffer.get(0) != 0; + case INT32 -> buffer -> buffer.order(LITTLE_ENDIAN).getInt(0); + case INT64 -> buffer -> buffer.order(LITTLE_ENDIAN).getLong(0); + case FLOAT -> buffer -> buffer.order(LITTLE_ENDIAN).getFloat(0); + case DOUBLE -> buffer -> buffer.order(LITTLE_ENDIAN).getDouble(0); + case FIXED_LEN_BYTE_ARRAY, BINARY, INT96 -> Slices::wrappedHeapBuffer; + }; + } + } + + private static class DictionaryValueConverter + { + private final Dictionary dictionary; + + private DictionaryValueConverter(Dictionary dictionary) + { + this.dictionary = dictionary; + } + + private Function getConverter(PrimitiveType primitiveType) + { + return switch (primitiveType.getPrimitiveTypeName()) { + case BOOLEAN -> throw new ParquetDecodingException("Dictionary encoding does not support: " + primitiveType.getPrimitiveTypeName()); + case INT32 -> dictionary::decodeToInt; + case INT64 -> dictionary::decodeToLong; + case FLOAT -> dictionary::decodeToFloat; + case DOUBLE -> dictionary::decodeToDouble; + case FIXED_LEN_BYTE_ARRAY, BINARY, INT96 -> dictionary::decodeToSlice; + }; + } + } + + // FilterApi#intColumn splits column name on ".". If column name contains a "." this leads to + // ColumnIndexFilter#calculateRowRanges failing to detect that column as part of the projection + // and treating it like a column with only NULL values. 
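To make the problem behind TrinoIntColumn (defined next) concrete: parquet-mr's FilterApi#intColumn parses its argument as a dot-separated path, while ColumnPath.get treats it as a single path element. A small sketch of the difference, with an illustrative class name and assuming the parquet-mr behaviour described in the comment above:

import org.apache.parquet.filter2.predicate.FilterApi;
import org.apache.parquet.filter2.predicate.Operators.IntColumn;
import org.apache.parquet.hadoop.metadata.ColumnPath;

import java.util.Arrays;

public final class ColumnPathSketch
{
    private ColumnPathSketch() {}

    public static void main(String[] args)
    {
        // A physical column literally named "a.b": kept as one path element
        ColumnPath literal = ColumnPath.get("a.b");
        // FilterApi#intColumn splits on "." and yields the two-element path ["a", "b"],
        // which no longer matches the projected column and ends up treated as all-NULL
        IntColumn filterColumn = FilterApi.intColumn("a.b");

        System.out.println(Arrays.toString(literal.toArray()));                      // [a.b]
        System.out.println(Arrays.toString(filterColumn.getColumnPath().toArray())); // [a, b]
    }
}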
+ private static final class TrinoIntColumn + extends Operators.Column + implements Operators.SupportsLtGt + { + TrinoIntColumn(ColumnPath columnPath) + { + super(columnPath, Integer.class); + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/AbstractColumnReader.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/AbstractColumnReader.java new file mode 100644 index 000000000000..42e3ecac432e --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/AbstractColumnReader.java @@ -0,0 +1,180 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.reader; + +import io.airlift.log.Logger; +import io.airlift.slice.Slice; +import io.trino.parquet.DictionaryPage; +import io.trino.parquet.ParquetEncoding; +import io.trino.parquet.PrimitiveField; +import io.trino.parquet.reader.decoders.ValueDecoder; +import io.trino.parquet.reader.flat.ColumnAdapter; +import io.trino.parquet.reader.flat.DictionaryDecoder; +import io.trino.parquet.reader.flat.RowRangesIterator; +import io.trino.spi.block.Block; +import io.trino.spi.block.DictionaryBlock; +import io.trino.spi.type.AbstractVariableWidthType; +import io.trino.spi.type.DateType; +import io.trino.spi.type.Type; +import jakarta.annotation.Nullable; +import org.apache.parquet.io.ParquetDecodingException; + +import java.util.Optional; +import java.util.OptionalLong; + +import static io.trino.parquet.ParquetEncoding.PLAIN_DICTIONARY; +import static io.trino.parquet.ParquetEncoding.RLE_DICTIONARY; +import static io.trino.parquet.reader.decoders.ValueDecoder.ValueDecodersProvider; +import static io.trino.parquet.reader.flat.DictionaryDecoder.DictionaryDecoderProvider; +import static io.trino.parquet.reader.flat.RowRangesIterator.createRowRangesIterator; +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; + +public abstract class AbstractColumnReader + implements ColumnReader +{ + private static final Logger log = Logger.get(AbstractColumnReader.class); + + protected final PrimitiveField field; + protected final ValueDecodersProvider decodersProvider; + protected final ColumnAdapter columnAdapter; + private final DictionaryDecoderProvider dictionaryDecoderProvider; + + protected PageReader pageReader; + protected RowRangesIterator rowRanges; + @Nullable + protected DictionaryDecoder dictionaryDecoder; + private boolean produceDictionaryBlock; + + public AbstractColumnReader( + PrimitiveField field, + ValueDecodersProvider decodersProvider, + DictionaryDecoderProvider dictionaryDecoderProvider, + ColumnAdapter columnAdapter) + { + this.field = requireNonNull(field, "field is null"); + this.decodersProvider = requireNonNull(decodersProvider, "decoders is null"); + this.dictionaryDecoderProvider = requireNonNull(dictionaryDecoderProvider, "dictionaryDecoderProvider is null"); + this.columnAdapter = requireNonNull(columnAdapter, "columnAdapter is null"); + } + + @Override + public void setPageReader(PageReader pageReader, 
Optional rowRanges) + { + this.pageReader = requireNonNull(pageReader, "pageReader"); + // The dictionary page must be placed at the first position of the column chunk + // if it is partly or completely dictionary encoded. At most one dictionary page + // can be placed in a column chunk. + DictionaryPage dictionaryPage = pageReader.readDictionaryPage(); + + // For dictionary based encodings - https://github.com/apache/parquet-format/blob/master/Encodings.md + if (dictionaryPage != null) { + log.debug("field %s, readDictionaryPage %s", field, dictionaryPage); + dictionaryDecoder = dictionaryDecoderProvider.create(dictionaryPage, isNonNull()); + produceDictionaryBlock = shouldProduceDictionaryBlock(rowRanges); + } + this.rowRanges = createRowRangesIterator(rowRanges); + } + + protected abstract boolean isNonNull(); + + protected boolean produceDictionaryBlock() + { + return produceDictionaryBlock; + } + + protected ValueDecoder createValueDecoder(ValueDecodersProvider decodersProvider, ParquetEncoding encoding, Slice data) + { + ValueDecoder valueDecoder; + if (encoding == PLAIN_DICTIONARY || encoding == RLE_DICTIONARY) { + if (dictionaryDecoder == null) { + throw new ParquetDecodingException(format("Dictionary is missing for %s", field)); + } + valueDecoder = dictionaryDecoder; + } + else { + valueDecoder = decodersProvider.create(encoding); + } + valueDecoder.init(new SimpleSliceInputStream(data)); + return valueDecoder; + } + + protected static void throwEndOfBatchException(int remainingInBatch) + { + throw new ParquetDecodingException(format("Corrupted Parquet file: extra %d values to be consumed when scanning current batch", remainingInBatch)); + } + + protected static void unpackDictionaryNullId( + int[] source, + int[] destination, + boolean[] isNull, + int destOffset, + int chunkSize, + int nullId) + { + int srcOffset = 0; + for (int i = destOffset; i < destOffset + chunkSize; i++) { + if (isNull[i]) { + destination[i] = nullId; + } + else { + destination[i] = source[srcOffset++]; + } + } + } + + protected static ColumnChunk createDictionaryBlock(int[] dictionaryIds, Block dictionary, int[] definitions, int[] repetitions) + { + int positionsCount = dictionaryIds.length; + return new ColumnChunk( + DictionaryBlock.create(positionsCount, dictionary, dictionaryIds), + definitions, + repetitions, + OptionalLong.of(getMaxDictionaryBlockSize(dictionary, positionsCount))); + } + + private boolean shouldProduceDictionaryBlock(Optional filteredRowRanges) + { + // Parquet writer may choose to fall back to a non-dictionary encoding after starting with dictionary encoding if + // 1. If the size of the dictionary exceeds a threshold (1MB for parquet-mr by default). + // 2. Number of dictionary entries exceeds a threshold (Integer.MAX_VALUE for parquet-mr by default). 
+ // Trino dictionary blocks are produced only when the entire column chunk is dictionary encoded + if (pageReader.hasOnlyDictionaryEncodedPages()) { + if (!shouldProduceDictionaryForType(field.getType())) { + return false; + } + requireNonNull(dictionaryDecoder, "dictionaryDecoder is null"); + // Filtering of parquet pages using column indexes may result in the total number of values read from the + // column chunk being lower than the size of the dictionary + return filteredRowRanges.map(rowRanges -> rowRanges.getRowCount() > dictionaryDecoder.getDictionarySize()) + .orElse(true); + } + return false; + } + + static boolean shouldProduceDictionaryForType(Type type) + { + // TODO: DictionaryBlocks are currently restricted to variable width and date types where dictionary processing is most beneficial. + // Dictionary processing for other data types can be enabled after validating improvements on benchmarks. + return type instanceof AbstractVariableWidthType || type instanceof DateType; + } + + private static long getMaxDictionaryBlockSize(Block dictionary, long batchSize) + { + // An approximate upper bound on size of DictionaryBlock is derived here instead of using + // DictionaryBlock#getSizeInBytes directly because that method is expensive + double maxDictionaryFractionUsed = Math.min((double) batchSize / dictionary.getPositionCount(), 1.0); + return (long) (batchSize * Integer.BYTES + dictionary.getSizeInBytes() * maxDictionaryFractionUsed); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/ChunkedInputStream.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/ChunkedInputStream.java new file mode 100644 index 000000000000..7d6a31dc423b --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/ChunkedInputStream.java @@ -0,0 +1,154 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.reader; + +import io.airlift.slice.BasicSliceInput; +import io.airlift.slice.Slice; +import io.airlift.slice.Slices; +import io.trino.parquet.ChunkReader; +import io.trino.parquet.ParquetReaderOptions; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Collection; +import java.util.Iterator; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkPositionIndexes; +import static com.google.common.base.Verify.verify; +import static io.airlift.slice.Slices.EMPTY_SLICE; +import static java.util.Objects.requireNonNull; + +/** + * A single continuous {@link InputStream} over multiple {@link Slice}s read on demand using given collection of {@link ChunkReader}s. + * It is used to read parquet column chunk in limited (small) byte chunks (8MB by default, controlled by {@link ParquetReaderOptions#getMaxReadBlockSize()}). + * Column chunks consists of multiple pages. 
+ * This abstraction is used because the page size is unknown until the page header is read + * and page header and page data can be split between two or more byte chunks. + */ +public final class ChunkedInputStream + extends InputStream +{ + private final Iterator chunks; + private ChunkReader currentChunkReader; + // current is explicitly initialized to EMPTY_SLICE as this field is set to null when the stream is closed + private BasicSliceInput current = EMPTY_SLICE.getInput(); + + public ChunkedInputStream(Collection chunks) + { + requireNonNull(chunks, "chunks is null"); + checkArgument(!chunks.isEmpty(), "At least one chunk is expected but got none"); + this.chunks = chunks.iterator(); + } + + public Slice getSlice(int length) + throws IOException + { + if (length == 0) { + return EMPTY_SLICE; + } + ensureOpen(); + while (!current.isReadable()) { + checkArgument(chunks.hasNext(), "Requested %s bytes but 0 was available", length); + readNextChunk(); + } + if (current.available() >= length) { + return current.readSlice(length); + } + // requested length crosses the slice boundary + byte[] bytes = new byte[length]; + try { + int read = this.readNBytes(bytes, 0, bytes.length); + verify(read == length, "expected to read %s bytes but got %s", length, read); + } + catch (IOException e) { + throw new RuntimeException("Failed to read " + length + " bytes", e); + } + return Slices.wrappedBuffer(bytes); + } + + @Override + public int read(byte[] b, int off, int len) + throws IOException + { + checkPositionIndexes(off, off + len, b.length); + if (len == 0) { + return 0; + } + ensureOpen(); + while (!current.isReadable()) { + if (!chunks.hasNext()) { + return -1; + } + readNextChunk(); + } + + return current.read(b, off, len); + } + + @Override + public int read() + throws IOException + { + ensureOpen(); + while (!current.isReadable() && chunks.hasNext()) { + readNextChunk(); + } + + return current.read(); + } + + @Override + public int available() + throws IOException + { + ensureOpen(); + return current.available(); + } + + @Override + public void close() + { + if (current == null) { + // already closed + return; + } + if (currentChunkReader != null) { + currentChunkReader.free(); + } + while (chunks.hasNext()) { + chunks.next().free(); + } + current = null; + } + + private void ensureOpen() + throws IOException + { + if (current == null) { + throw new IOException("Stream closed"); + } + } + + private void readNextChunk() + { + if (currentChunkReader != null) { + currentChunkReader.free(); + } + currentChunkReader = chunks.next(); + Slice slice = currentChunkReader.readUnchecked(); + checkArgument(slice.length() > 0, "all chunks have to be not empty"); + current = slice.getInput(); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/ColumnChunk.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/ColumnChunk.java new file mode 100644 index 000000000000..6c2e9468db49 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/ColumnChunk.java @@ -0,0 +1,64 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.reader; + +import io.trino.spi.block.Block; + +import java.util.OptionalLong; + +import static java.util.Objects.requireNonNull; + +public class ColumnChunk +{ + private final Block block; + private final int[] definitionLevels; + private final int[] repetitionLevels; + private OptionalLong maxBlockSize; + + public ColumnChunk(Block block, int[] definitionLevels, int[] repetitionLevels) + { + this(block, definitionLevels, repetitionLevels, OptionalLong.empty()); + } + + public ColumnChunk(Block block, int[] definitionLevels, int[] repetitionLevels, OptionalLong maxBlockSize) + { + this.block = requireNonNull(block, "block is null"); + this.definitionLevels = requireNonNull(definitionLevels, "definitionLevels is null"); + this.repetitionLevels = requireNonNull(repetitionLevels, "repetitionLevels is null"); + this.maxBlockSize = maxBlockSize; + } + + public Block getBlock() + { + return block; + } + + public int[] getDefinitionLevels() + { + return definitionLevels; + } + + public int[] getRepetitionLevels() + { + return repetitionLevels; + } + + public long getMaxBlockSize() + { + if (maxBlockSize.isEmpty()) { + maxBlockSize = OptionalLong.of(block.getSizeInBytes()); + } + return maxBlockSize.getAsLong(); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/ColumnReader.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/ColumnReader.java new file mode 100644 index 000000000000..e06686ba7ec6 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/ColumnReader.java @@ -0,0 +1,27 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.reader; + +import java.util.Optional; + +public interface ColumnReader +{ + boolean hasPageReader(); + + void setPageReader(PageReader pageReader, Optional rowRanges); + + void prepareNextRead(int batchSize); + + ColumnChunk readPrimitive(); +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/ColumnReaderFactory.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/ColumnReaderFactory.java new file mode 100644 index 000000000000..95b51247cb25 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/ColumnReaderFactory.java @@ -0,0 +1,387 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.parquet.reader; + +import io.trino.memory.context.AggregatedMemoryContext; +import io.trino.memory.context.LocalMemoryContext; +import io.trino.parquet.ParquetReaderOptions; +import io.trino.parquet.PrimitiveField; +import io.trino.parquet.reader.decoders.ValueDecoders; +import io.trino.parquet.reader.flat.ColumnAdapter; +import io.trino.parquet.reader.flat.FlatColumnReader; +import io.trino.spi.TrinoException; +import io.trino.spi.type.AbstractIntType; +import io.trino.spi.type.AbstractLongType; +import io.trino.spi.type.AbstractVariableWidthType; +import io.trino.spi.type.CharType; +import io.trino.spi.type.DecimalType; +import io.trino.spi.type.Decimals; +import io.trino.spi.type.TimeType; +import io.trino.spi.type.TimestampType; +import io.trino.spi.type.TimestampWithTimeZoneType; +import io.trino.spi.type.Type; +import io.trino.spi.type.VarcharType; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.LogicalTypeAnnotation.DateLogicalTypeAnnotation; +import org.apache.parquet.schema.LogicalTypeAnnotation.DecimalLogicalTypeAnnotation; +import org.apache.parquet.schema.LogicalTypeAnnotation.IntLogicalTypeAnnotation; +import org.apache.parquet.schema.LogicalTypeAnnotation.LogicalTypeAnnotationVisitor; +import org.apache.parquet.schema.LogicalTypeAnnotation.TimeLogicalTypeAnnotation; +import org.apache.parquet.schema.LogicalTypeAnnotation.TimestampLogicalTypeAnnotation; +import org.apache.parquet.schema.LogicalTypeAnnotation.UUIDLogicalTypeAnnotation; +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; +import org.joda.time.DateTimeZone; + +import java.util.Optional; + +import static io.trino.parquet.ParquetEncoding.PLAIN; +import static io.trino.parquet.reader.decoders.ValueDecoder.ValueDecodersProvider; +import static io.trino.parquet.reader.decoders.ValueDecoder.createLevelsDecoder; +import static io.trino.parquet.reader.flat.BinaryColumnAdapter.BINARY_ADAPTER; +import static io.trino.parquet.reader.flat.ByteColumnAdapter.BYTE_ADAPTER; +import static io.trino.parquet.reader.flat.DictionaryDecoder.DictionaryDecoderProvider; +import static io.trino.parquet.reader.flat.DictionaryDecoder.getDictionaryDecoder; +import static io.trino.parquet.reader.flat.Fixed12ColumnAdapter.FIXED12_ADAPTER; +import static io.trino.parquet.reader.flat.FlatDefinitionLevelDecoder.getFlatDefinitionLevelDecoder; +import static io.trino.parquet.reader.flat.Int128ColumnAdapter.INT128_ADAPTER; +import static io.trino.parquet.reader.flat.IntColumnAdapter.INT_ADAPTER; +import static io.trino.parquet.reader.flat.LongColumnAdapter.LONG_ADAPTER; +import static io.trino.parquet.reader.flat.ShortColumnAdapter.SHORT_ADAPTER; +import static io.trino.spi.StandardErrorCode.NOT_SUPPORTED; +import static io.trino.spi.type.BigintType.BIGINT; +import static io.trino.spi.type.BooleanType.BOOLEAN; +import static io.trino.spi.type.DateType.DATE; +import static io.trino.spi.type.DoubleType.DOUBLE; +import static io.trino.spi.type.RealType.REAL; +import static io.trino.spi.type.SmallintType.SMALLINT; +import static io.trino.spi.type.TinyintType.TINYINT; +import static io.trino.spi.type.UuidType.UUID; +import static io.trino.spi.type.VarbinaryType.VARBINARY; +import static io.trino.spi.type.VarcharType.VARCHAR; +import static java.lang.Boolean.FALSE; +import static java.lang.Boolean.TRUE; +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; +import static org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.MICROS; 
+import static org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.MILLIS; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FLOAT; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT96; + +public final class ColumnReaderFactory +{ + private static final int PREFERRED_BIT_WIDTH = getVectorBitSize(); + + private final DateTimeZone timeZone; + private final boolean vectorizedDecodingEnabled; + + public ColumnReaderFactory(DateTimeZone timeZone, ParquetReaderOptions readerOptions) + { + this.timeZone = requireNonNull(timeZone, "dateTimeZone is null"); + this.vectorizedDecodingEnabled = readerOptions.isVectorizedDecodingEnabled() && isVectorizedDecodingSupported(); + } + + public ColumnReader create(PrimitiveField field, AggregatedMemoryContext aggregatedMemoryContext) + { + Type type = field.getType(); + PrimitiveTypeName primitiveType = field.getDescriptor().getPrimitiveType().getPrimitiveTypeName(); + LogicalTypeAnnotation annotation = field.getDescriptor().getPrimitiveType().getLogicalTypeAnnotation(); + LocalMemoryContext memoryContext = aggregatedMemoryContext.newLocalMemoryContext(ColumnReader.class.getSimpleName()); + ValueDecoders valueDecoders = new ValueDecoders(field, vectorizedDecodingEnabled); + if (BOOLEAN.equals(type) && primitiveType == PrimitiveTypeName.BOOLEAN) { + return createColumnReader(field, valueDecoders::getBooleanDecoder, BYTE_ADAPTER, memoryContext); + } + if (TINYINT.equals(type) && isIntegerOrDecimalPrimitive(primitiveType)) { + if (isZeroScaleShortDecimalAnnotation(annotation)) { + return createColumnReader(field, valueDecoders::getShortDecimalToByteDecoder, BYTE_ADAPTER, memoryContext); + } + if (!isIntegerAnnotationAndPrimitive(annotation, primitiveType)) { + throw unsupportedException(type, field); + } + return createColumnReader(field, valueDecoders::getByteDecoder, BYTE_ADAPTER, memoryContext); + } + if (SMALLINT.equals(type) && isIntegerOrDecimalPrimitive(primitiveType)) { + if (isZeroScaleShortDecimalAnnotation(annotation)) { + return createColumnReader(field, valueDecoders::getShortDecimalToShortDecoder, SHORT_ADAPTER, memoryContext); + } + if (!isIntegerAnnotationAndPrimitive(annotation, primitiveType)) { + throw unsupportedException(type, field); + } + return createColumnReader(field, valueDecoders::getShortDecoder, SHORT_ADAPTER, memoryContext); + } + if (DATE.equals(type) && primitiveType == INT32) { + if (isIntegerAnnotation(annotation) || annotation instanceof DateLogicalTypeAnnotation) { + return createColumnReader(field, valueDecoders::getIntDecoder, INT_ADAPTER, memoryContext); + } + throw unsupportedException(type, field); + } + if (type instanceof AbstractIntType && isIntegerOrDecimalPrimitive(primitiveType)) { + if (isZeroScaleShortDecimalAnnotation(annotation)) { + return createColumnReader(field, valueDecoders::getShortDecimalToIntDecoder, INT_ADAPTER, memoryContext); + } + if (!isIntegerAnnotationAndPrimitive(annotation, primitiveType)) { + throw unsupportedException(type, field); + } + return createColumnReader(field, valueDecoders::getIntDecoder, INT_ADAPTER, memoryContext); + } + if (type instanceof TimeType) { + if (!(annotation instanceof 
TimeLogicalTypeAnnotation timeAnnotation)) { + throw unsupportedException(type, field); + } + if (primitiveType == INT64 && timeAnnotation.getUnit() == MICROS) { + return createColumnReader(field, valueDecoders::getTimeMicrosDecoder, LONG_ADAPTER, memoryContext); + } + if (primitiveType == INT32 && timeAnnotation.getUnit() == MILLIS) { + return createColumnReader(field, valueDecoders::getTimeMillisDecoder, LONG_ADAPTER, memoryContext); + } + throw unsupportedException(type, field); + } + if (BIGINT.equals(type) && primitiveType == INT64 + && (annotation instanceof TimestampLogicalTypeAnnotation || annotation instanceof TimeLogicalTypeAnnotation)) { + return createColumnReader(field, valueDecoders::getLongDecoder, LONG_ADAPTER, memoryContext); + } + if (type instanceof AbstractLongType && isIntegerOrDecimalPrimitive(primitiveType)) { + if (isZeroScaleShortDecimalAnnotation(annotation)) { + return createColumnReader(field, valueDecoders::getShortDecimalDecoder, LONG_ADAPTER, memoryContext); + } + if (!isIntegerAnnotationAndPrimitive(annotation, primitiveType)) { + throw unsupportedException(type, field); + } + if (primitiveType == INT32) { + return createColumnReader(field, valueDecoders::getInt32ToLongDecoder, LONG_ADAPTER, memoryContext); + } + if (primitiveType == INT64) { + return createColumnReader(field, valueDecoders::getLongDecoder, LONG_ADAPTER, memoryContext); + } + } + if (REAL.equals(type) && primitiveType == FLOAT) { + return createColumnReader(field, valueDecoders::getRealDecoder, INT_ADAPTER, memoryContext); + } + if (DOUBLE.equals(type)) { + if (primitiveType == PrimitiveTypeName.DOUBLE) { + return createColumnReader(field, valueDecoders::getDoubleDecoder, LONG_ADAPTER, memoryContext); + } + if (primitiveType == FLOAT) { + return createColumnReader(field, valueDecoders::getFloatToDoubleDecoder, LONG_ADAPTER, memoryContext); + } + } + if (type instanceof TimestampType timestampType && primitiveType == INT96) { + if (timestampType.isShort()) { + return createColumnReader( + field, + (encoding) -> valueDecoders.getInt96ToShortTimestampDecoder(encoding, timeZone), + LONG_ADAPTER, + memoryContext); + } + return createColumnReader( + field, + (encoding) -> valueDecoders.getInt96ToLongTimestampDecoder(encoding, timeZone), + FIXED12_ADAPTER, + memoryContext); + } + if (type instanceof TimestampWithTimeZoneType timestampWithTimeZoneType && primitiveType == INT96) { + if (timestampWithTimeZoneType.isShort()) { + return createColumnReader(field, valueDecoders::getInt96ToShortTimestampWithTimeZoneDecoder, LONG_ADAPTER, memoryContext); + } + return createColumnReader(field, valueDecoders::getInt96ToLongTimestampWithTimeZoneDecoder, FIXED12_ADAPTER, memoryContext); + } + if (type instanceof TimestampType timestampType && primitiveType == INT64) { + if (!(annotation instanceof TimestampLogicalTypeAnnotation timestampAnnotation)) { + throw unsupportedException(type, field); + } + DateTimeZone readTimeZone = timestampAnnotation.isAdjustedToUTC() ? 
timeZone : DateTimeZone.UTC; + if (timestampType.isShort()) { + return switch (timestampAnnotation.getUnit()) { + case MILLIS -> createColumnReader(field, encoding -> valueDecoders.getInt64TimestampMillisToShortTimestampDecoder(encoding, readTimeZone), LONG_ADAPTER, memoryContext); + case MICROS -> createColumnReader(field, encoding -> valueDecoders.getInt64TimestampMicrosToShortTimestampDecoder(encoding, readTimeZone), LONG_ADAPTER, memoryContext); + case NANOS -> createColumnReader(field, encoding -> valueDecoders.getInt64TimestampNanosToShortTimestampDecoder(encoding, readTimeZone), LONG_ADAPTER, memoryContext); + }; + } + return switch (timestampAnnotation.getUnit()) { + case MILLIS -> createColumnReader(field, encoding -> valueDecoders.getInt64TimestampMillisToLongTimestampDecoder(encoding, readTimeZone), FIXED12_ADAPTER, memoryContext); + case MICROS -> createColumnReader(field, encoding -> valueDecoders.getInt64TimestampMicrosToLongTimestampDecoder(encoding, readTimeZone), FIXED12_ADAPTER, memoryContext); + case NANOS -> createColumnReader(field, encoding -> valueDecoders.getInt64TimestampNanosToLongTimestampDecoder(encoding, readTimeZone), FIXED12_ADAPTER, memoryContext); + }; + } + if (type instanceof TimestampWithTimeZoneType timestampWithTimeZoneType && primitiveType == INT64) { + if (!(annotation instanceof TimestampLogicalTypeAnnotation timestampAnnotation)) { + throw unsupportedException(type, field); + } + if (timestampWithTimeZoneType.isShort()) { + return switch (timestampAnnotation.getUnit()) { + case MILLIS -> createColumnReader(field, valueDecoders::getInt64TimestampMillsToShortTimestampWithTimeZoneDecoder, LONG_ADAPTER, memoryContext); + case MICROS -> createColumnReader(field, valueDecoders::getInt64TimestampMicrosToShortTimestampWithTimeZoneDecoder, LONG_ADAPTER, memoryContext); + case NANOS -> throw unsupportedException(type, field); + }; + } + return switch (timestampAnnotation.getUnit()) { + case MILLIS, NANOS -> throw unsupportedException(type, field); + case MICROS -> createColumnReader(field, valueDecoders::getInt64TimestampMicrosToLongTimestampWithTimeZoneDecoder, FIXED12_ADAPTER, memoryContext); + }; + } + if (type instanceof DecimalType decimalType && decimalType.isShort() + && isIntegerOrDecimalPrimitive(primitiveType)) { + if (primitiveType == INT32 && isIntegerAnnotation(annotation)) { + return createColumnReader(field, valueDecoders::getInt32ToShortDecimalDecoder, LONG_ADAPTER, memoryContext); + } + if (!(annotation instanceof DecimalLogicalTypeAnnotation decimalAnnotation)) { + throw unsupportedException(type, field); + } + if (isDecimalRescaled(decimalAnnotation, decimalType)) { + return createColumnReader(field, valueDecoders::getRescaledShortDecimalDecoder, LONG_ADAPTER, memoryContext); + } + return createColumnReader(field, valueDecoders::getShortDecimalDecoder, LONG_ADAPTER, memoryContext); + } + if (type instanceof DecimalType decimalType && !decimalType.isShort() + && isIntegerOrDecimalPrimitive(primitiveType)) { + if (!(annotation instanceof DecimalLogicalTypeAnnotation decimalAnnotation)) { + throw unsupportedException(type, field); + } + if (isDecimalRescaled(decimalAnnotation, decimalType)) { + return createColumnReader(field, valueDecoders::getRescaledLongDecimalDecoder, INT128_ADAPTER, memoryContext); + } + return createColumnReader(field, valueDecoders::getLongDecimalDecoder, INT128_ADAPTER, memoryContext); + } + if (type instanceof VarcharType varcharType && !varcharType.isUnbounded() && primitiveType == BINARY) { + return 
createColumnReader(field, valueDecoders::getBoundedVarcharBinaryDecoder, BINARY_ADAPTER, memoryContext); + } + if (type instanceof CharType && primitiveType == BINARY) { + return createColumnReader(field, valueDecoders::getCharBinaryDecoder, BINARY_ADAPTER, memoryContext); + } + if (type instanceof AbstractVariableWidthType && primitiveType == BINARY) { + return createColumnReader(field, valueDecoders::getBinaryDecoder, BINARY_ADAPTER, memoryContext); + } + if ((VARBINARY.equals(type) || VARCHAR.equals(type)) && primitiveType == FIXED_LEN_BYTE_ARRAY) { + if (annotation instanceof DecimalLogicalTypeAnnotation) { + throw unsupportedException(type, field); + } + return createColumnReader(field, valueDecoders::getFixedWidthBinaryDecoder, BINARY_ADAPTER, memoryContext); + } + if (UUID.equals(type) && primitiveType == FIXED_LEN_BYTE_ARRAY) { + // Iceberg 0.11.1 writes UUID as FIXED_LEN_BYTE_ARRAY without logical type annotation (see https://github.com/apache/iceberg/pull/2913) + // To support such files, we bet on the logical type to be UUID based on the Trino UUID type check. + if (annotation == null || isLogicalUuid(annotation)) { + return createColumnReader(field, valueDecoders::getUuidDecoder, INT128_ADAPTER, memoryContext); + } + } + throw unsupportedException(type, field); + } + + private ColumnReader createColumnReader( + PrimitiveField field, + ValueDecodersProvider decodersProvider, + ColumnAdapter columnAdapter, + LocalMemoryContext memoryContext) + { + DictionaryDecoderProvider dictionaryDecoderProvider = (dictionaryPage, isNonNull) -> getDictionaryDecoder( + dictionaryPage, + columnAdapter, + decodersProvider.create(PLAIN), + isNonNull, + vectorizedDecodingEnabled); + if (isFlatColumn(field)) { + return new FlatColumnReader<>( + field, + decodersProvider, + maxDefinitionLevel -> getFlatDefinitionLevelDecoder(maxDefinitionLevel, vectorizedDecodingEnabled), + dictionaryDecoderProvider, + columnAdapter, + memoryContext); + } + return new NestedColumnReader<>( + field, + decodersProvider, + maxLevel -> createLevelsDecoder(maxLevel, vectorizedDecodingEnabled), + dictionaryDecoderProvider, + columnAdapter, + memoryContext); + } + + private static boolean isFlatColumn(PrimitiveField field) + { + return field.getDescriptor().getPath().length == 1 && field.getRepetitionLevel() == 0; + } + + private static boolean isLogicalUuid(LogicalTypeAnnotation annotation) + { + return Optional.ofNullable(annotation) + .flatMap(logicalTypeAnnotation -> logicalTypeAnnotation.accept(new LogicalTypeAnnotationVisitor() + { + @Override + public Optional visit(UUIDLogicalTypeAnnotation uuidLogicalType) + { + return Optional.of(TRUE); + } + })) + .orElse(FALSE); + } + + public static boolean isDecimalRescaled(DecimalLogicalTypeAnnotation decimalAnnotation, DecimalType trinoType) + { + return decimalAnnotation.getPrecision() != trinoType.getPrecision() + || decimalAnnotation.getScale() != trinoType.getScale(); + } + + private static boolean isIntegerAnnotation(LogicalTypeAnnotation typeAnnotation) + { + return typeAnnotation == null || typeAnnotation instanceof IntLogicalTypeAnnotation; + } + + private static boolean isZeroScaleShortDecimalAnnotation(LogicalTypeAnnotation typeAnnotation) + { + return typeAnnotation instanceof DecimalLogicalTypeAnnotation decimalAnnotation + && decimalAnnotation.getScale() == 0 + && decimalAnnotation.getPrecision() <= Decimals.MAX_SHORT_PRECISION; + } + + private static boolean isIntegerOrDecimalPrimitive(PrimitiveTypeName primitiveType) + { + // Integers may be stored in INT32 or 
INT64 + // Decimals may be stored as INT32, INT64, BINARY or FIXED_LEN_BYTE_ARRAY + // Short decimals with zero scale in parquet files may be read as integers in Trino + return primitiveType == INT32 || primitiveType == INT64 || primitiveType == BINARY || primitiveType == FIXED_LEN_BYTE_ARRAY; + } + + public static boolean isIntegerAnnotationAndPrimitive(LogicalTypeAnnotation typeAnnotation, PrimitiveTypeName primitiveType) + { + return isIntegerAnnotation(typeAnnotation) && (primitiveType == INT32 || primitiveType == INT64); + } + + private static TrinoException unsupportedException(Type type, PrimitiveField field) + { + return new TrinoException(NOT_SUPPORTED, format("Unsupported Trino column type (%s) for Parquet column (%s)", type, field.getDescriptor())); + } + + private static boolean isVectorizedDecodingSupported() + { + // Performance gains with vectorized decoding are validated only when the hardware platform provides at least 256 bit width registers + // Graviton 2 machines return false here, whereas x86 and Graviton 3 machines return true + return PREFERRED_BIT_WIDTH >= 256; + } + + // get VectorShape bit size via reflection to avoid requiring the preview feature is enabled + private static int getVectorBitSize() + { + try { + Class clazz = Class.forName("jdk.incubator.vector.VectorShape"); + return (int) clazz.getMethod("vectorBitSize").invoke(clazz.getMethod("preferredShape").invoke(null)); + } + catch (Throwable e) { + return -1; + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/FilteredOffsetIndex.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/FilteredOffsetIndex.java new file mode 100644 index 000000000000..8ae54646e414 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/FilteredOffsetIndex.java @@ -0,0 +1,156 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.reader; + +import it.unimi.dsi.fastutil.ints.IntArrayList; +import it.unimi.dsi.fastutil.ints.IntList; +import org.apache.parquet.internal.column.columnindex.OffsetIndex; +import org.apache.parquet.internal.filter2.columnindex.RowRanges; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Formatter; +import java.util.List; + +class FilteredOffsetIndex + implements OffsetIndex +{ + /* + * Returns the filtered offset index containing only the pages which are overlapping with rowRanges. 
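+     * For example, assuming an offset index whose pages start at rows 0, 100 and 200 of a
+     * 300-row row group, a rowRanges covering rows 150-250 keeps only the pages at original
+     * indexes 1 and 2, i.e. indexMap = [1, 2] (illustrative numbers only).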
+ */ + public static FilteredOffsetIndex filterOffsetIndex(OffsetIndex offsetIndex, RowRanges rowRanges, long totalRowCount) + { + IntList indexMap = new IntArrayList(); + for (int i = 0, n = offsetIndex.getPageCount(); i < n; ++i) { + long from = offsetIndex.getFirstRowIndex(i); + if (rowRanges.isOverlapping(from, offsetIndex.getLastRowIndex(i, totalRowCount))) { + indexMap.add(i); + } + } + return new FilteredOffsetIndex(offsetIndex, indexMap.toIntArray()); + } + + private final OffsetIndex offsetIndex; + private final int[] indexMap; + + private FilteredOffsetIndex(OffsetIndex offsetIndex, int[] indexMap) + { + this.offsetIndex = offsetIndex; + this.indexMap = indexMap; + } + + @Override + public int getPageCount() + { + return indexMap.length; + } + + @Override + public long getOffset(int pageIndex) + { + return offsetIndex.getOffset(indexMap[pageIndex]); + } + + @Override + public int getCompressedPageSize(int pageIndex) + { + return offsetIndex.getCompressedPageSize(indexMap[pageIndex]); + } + + @Override + public long getFirstRowIndex(int pageIndex) + { + return offsetIndex.getFirstRowIndex(indexMap[pageIndex]); + } + + @Override + public long getLastRowIndex(int pageIndex, long totalRowCount) + { + int nextIndex = indexMap[pageIndex] + 1; + return (nextIndex >= offsetIndex.getPageCount() ? totalRowCount : offsetIndex.getFirstRowIndex(nextIndex)) - 1; + } + + @Override + public String toString() + { + try (Formatter formatter = new Formatter()) { + formatter.format("%-12s %20s %16s %20s\n", "", "offset", "compressed size", "first row index"); + for (int i = 0, n = offsetIndex.getPageCount(); i < n; ++i) { + int index = Arrays.binarySearch(indexMap, i); + boolean isHidden = index < 0; + formatter.format("%spage-%-5d %20d %16d %20d\n", + isHidden ? "- " : " ", + isHidden ? 
i : index, + offsetIndex.getOffset(i), + offsetIndex.getCompressedPageSize(i), + offsetIndex.getFirstRowIndex(i)); + } + return formatter.toString(); + } + } + + public List calculateOffsetRanges(long rowGroupOffset) + { + List ranges = new ArrayList<>(); + int pageCount = getPageCount(); + if (pageCount > 0) { + long firstPageOffset = offsetIndex.getOffset(0); + // Add a range for the dictionary page if required + if (rowGroupOffset < firstPageOffset) { + // We need to adjust the offset by startingPosition for Trino because dataSource.readFully() started at startingPosition + ranges.add(new OffsetRange(rowGroupOffset, firstPageOffset - rowGroupOffset)); + } + + long currentOffset = getOffset(0); + long currentLength = getCompressedPageSize(0); + for (int i = 1; i < pageCount; ++i) { + long offset = getOffset(i); + int length = getCompressedPageSize(i); + + if (currentOffset + currentLength == offset) { + currentLength += length; + } + else { + ranges.add(new OffsetRange(currentOffset, currentLength)); + currentOffset = offset; + currentLength = length; + } + } + ranges.add(new OffsetRange(currentOffset, currentLength)); + } + return ranges; + } + + public static class OffsetRange + { + private final long offset; + private final long length; + + public OffsetRange(long offset, long length) + { + this.offset = offset; + this.length = length; + } + + long getOffset() + { + return offset; + } + + long getLength() + { + return length; + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/FilteredRowRanges.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/FilteredRowRanges.java new file mode 100644 index 000000000000..0cf729c18f81 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/FilteredRowRanges.java @@ -0,0 +1,80 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.parquet.reader; + +import org.apache.parquet.internal.filter2.columnindex.RowRanges; + +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.collect.ImmutableList.toImmutableList; +import static java.util.Objects.requireNonNull; + +// PrimitiveColumnReader iterates over RowRanges value-by-value +// FlatColumnReader iterates over range of values using FilteredRowRangesIterator +public class FilteredRowRanges +{ + private final RowRanges parquetRowRanges; + private final List rowRanges; + private final long rowCount; + + public FilteredRowRanges(RowRanges parquetRowRanges) + { + this.parquetRowRanges = requireNonNull(parquetRowRanges, "parquetRowRanges is null"); + this.rowRanges = constructRanges(parquetRowRanges); + this.rowCount = parquetRowRanges.rowCount(); + } + + public RowRanges getParquetRowRanges() + { + return parquetRowRanges; + } + + public List getRowRanges() + { + return rowRanges; + } + + public long getRowCount() + { + return rowCount; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("parquetRowRanges", parquetRowRanges) + .add("rowRanges", rowRanges) + .add("rowCount", rowCount) + .toString(); + } + + /** + * Construct a list of row ranges from the given `rowRanges`. For example, suppose the + * `rowRanges` are `[0, 1, 2, 4, 5, 7, 8, 9]`, it will be converted into 3 row ranges: + * `[0-2], [4-5], [7-9]`. + */ + private static List constructRanges(RowRanges rowRanges) + { + return rowRanges.getRanges().stream() + .map(range -> new RowRange(range.from, range.to)) + .collect(toImmutableList()); + } + + /** + * Helper struct to represent a range of row indexes `[start, end]`. + */ + public record RowRange(long start, long end) {} +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/ListColumnReader.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/ListColumnReader.java new file mode 100644 index 000000000000..b6a68e55bcbb --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/ListColumnReader.java @@ -0,0 +1,117 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.reader; + +import io.trino.parquet.Field; +import it.unimi.dsi.fastutil.booleans.BooleanArrayList; +import it.unimi.dsi.fastutil.ints.IntArrayList; + +import java.util.Optional; + +import static io.trino.parquet.ParquetTypeUtils.isOptionalFieldValueNull; + +public final class ListColumnReader +{ + private ListColumnReader() {} + + /** + * Each collection (Array or Map) has four variants of presence: + * 1) Collection is not defined, because one of it's optional parent fields is null + * 2) Collection is null + * 3) Collection is defined but empty + * 4) Collection is defined and not empty. 
In this case offset value is increased by the number of elements in that collection + */ + public static BlockPositions calculateCollectionOffsets(Field field, int[] definitionLevels, int[] repetitionLevels) + { + int maxDefinitionLevel = field.getDefinitionLevel(); + int maxElementRepetitionLevel = field.getRepetitionLevel() + 1; + boolean required = field.isRequired(); + int offset = 0; + IntArrayList offsets = new IntArrayList(); + offsets.add(offset); + if (required) { + for (int i = 0; i < definitionLevels.length; i = getNextCollectionStartIndex(repetitionLevels, maxElementRepetitionLevel, i)) { + if (definitionLevels[i] == maxDefinitionLevel) { + // Collection is defined but empty + offsets.add(offset); + } + else if (definitionLevels[i] > maxDefinitionLevel) { + // Collection is defined and not empty + offset += getCollectionSize(repetitionLevels, maxElementRepetitionLevel, i + 1); + offsets.add(offset); + } + } + return new BlockPositions(Optional.empty(), offsets.toIntArray()); + } + + BooleanArrayList collectionIsNull = new BooleanArrayList(); + int nullValuesCount = 0; + for (int i = 0; i < definitionLevels.length; i = getNextCollectionStartIndex(repetitionLevels, maxElementRepetitionLevel, i)) { + if (definitionLevels[i] >= maxDefinitionLevel - 1) { + boolean isNull = isOptionalFieldValueNull(definitionLevels[i], maxDefinitionLevel); + collectionIsNull.add(isNull); + nullValuesCount += isNull ? 1 : 0; + // definitionLevels[i] == maxDefinitionLevel - 1 => Collection is null + // definitionLevels[i] == maxDefinitionLevel => Collection is defined but empty + if (definitionLevels[i] > maxDefinitionLevel) { + // Collection is defined and not empty + offset += getCollectionSize(repetitionLevels, maxElementRepetitionLevel, i + 1); + } + offsets.add(offset); + } + } + if (nullValuesCount == 0) { + return new BlockPositions(Optional.empty(), offsets.toIntArray()); + } + return new BlockPositions(Optional.of(collectionIsNull.elements()), offsets.toIntArray()); + } + + public record BlockPositions(Optional isNull, int[] offsets) {} + + private static int getNextCollectionStartIndex(int[] repetitionLevels, int maxRepetitionLevel, int elementIndex) + { + do { + elementIndex++; + } + while (hasMoreElements(repetitionLevels, elementIndex) && !isCollectionBeginningMarker(repetitionLevels, maxRepetitionLevel, elementIndex)); + return elementIndex; + } + + /** + * This method is only called for non-empty collections + */ + private static int getCollectionSize(int[] repetitionLevels, int maxRepetitionLevel, int nextIndex) + { + int size = 1; + while (hasMoreElements(repetitionLevels, nextIndex) && !isCollectionBeginningMarker(repetitionLevels, maxRepetitionLevel, nextIndex)) { + // Collection elements cannot only be primitive, but also can have nested structure + // Counting only elements which belong to current collection, skipping inner elements of nested collections/structs + if (repetitionLevels[nextIndex] <= maxRepetitionLevel) { + size++; + } + nextIndex++; + } + return size; + } + + private static boolean isCollectionBeginningMarker(int[] repetitionLevels, int maxRepetitionLevel, int nextIndex) + { + return repetitionLevels[nextIndex] < maxRepetitionLevel; + } + + private static boolean hasMoreElements(int[] repetitionLevels, int nextIndex) + { + return nextIndex < repetitionLevels.length; + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/MetadataReader.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/MetadataReader.java new file mode 100644 
index 000000000000..369ce467e131 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/MetadataReader.java @@ -0,0 +1,222 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.reader; + +import io.airlift.slice.Slice; +import io.airlift.slice.Slices; +import io.airlift.units.DataSize; +import io.trino.parquet.ParquetCorruptionException; +import io.trino.parquet.ParquetDataSource; +import io.trino.parquet.ParquetDataSourceId; +import io.trino.parquet.ParquetWriteValidation; +import io.trino.parquet.metadata.FileMetadata; +import io.trino.parquet.metadata.ParquetMetadata; +import org.apache.parquet.CorruptStatistics; +import org.apache.parquet.column.statistics.BinaryStatistics; +import org.apache.parquet.format.FileMetaData; +import org.apache.parquet.format.Statistics; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.PrimitiveType; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Arrays; +import java.util.Optional; + +import static io.trino.parquet.ParquetMetadataConverter.fromParquetStatistics; +import static io.trino.parquet.ParquetValidationUtils.validateParquet; +import static java.lang.Boolean.FALSE; +import static java.lang.Boolean.TRUE; +import static java.lang.Math.min; +import static java.lang.Math.toIntExact; +import static org.apache.parquet.format.Util.readFileMetaData; + +public final class MetadataReader +{ + private static final Slice MAGIC = Slices.utf8Slice("PAR1"); + private static final int POST_SCRIPT_SIZE = Integer.BYTES + MAGIC.length(); + // Typical 1GB files produced by Trino were found to have footer size between 30-40KB + private static final int EXPECTED_FOOTER_SIZE = 48 * 1024; + + private MetadataReader() {} + + public static ParquetMetadata readFooter(ParquetDataSource dataSource) + throws IOException + { + return readFooter(dataSource, Optional.empty(), Optional.empty()); + } + + public static ParquetMetadata readFooter(ParquetDataSource dataSource, DataSize maxFooterReadSize) + throws IOException + { + return readFooter(dataSource, Optional.of(maxFooterReadSize), Optional.empty()); + } + + public static ParquetMetadata readFooter(ParquetDataSource dataSource, Optional maxFooterReadSize, Optional parquetWriteValidation) + throws IOException + { + // Parquet File Layout: + // + // MAGIC + // variable: Data + // variable: Metadata + // 4 bytes: MetadataLength + // MAGIC + + validateParquet(dataSource.getEstimatedSize() >= MAGIC.length() + POST_SCRIPT_SIZE, dataSource.getId(), "%s is not a valid Parquet File", dataSource.getId()); + + // Read the tail of the file + long estimatedFileSize = dataSource.getEstimatedSize(); + long expectedReadSize = min(estimatedFileSize, EXPECTED_FOOTER_SIZE); + Slice buffer = dataSource.readTail(toIntExact(expectedReadSize)); + + Slice magic = buffer.slice(buffer.length() - MAGIC.length(), MAGIC.length()); + validateParquet(MAGIC.equals(magic), dataSource.getId(), "Expected magic number: %s got: %s", 
MAGIC.toStringUtf8(), magic.toStringUtf8()); + + int metadataLength = buffer.getInt(buffer.length() - POST_SCRIPT_SIZE); + long metadataIndex = estimatedFileSize - POST_SCRIPT_SIZE - metadataLength; + validateParquet( + metadataIndex >= MAGIC.length() && metadataIndex < estimatedFileSize - POST_SCRIPT_SIZE, + dataSource.getId(), + "Metadata index: %s out of range", + metadataIndex); + + int completeFooterSize = metadataLength + POST_SCRIPT_SIZE; + if (maxFooterReadSize.isPresent() && completeFooterSize > maxFooterReadSize.get().toBytes()) { + throw new ParquetCorruptionException( + dataSource.getId(), + "Parquet footer size %s exceeds maximum allowed size %s", + DataSize.ofBytes(completeFooterSize).succinct(), + maxFooterReadSize.get().succinct()); + } + if (completeFooterSize > buffer.length()) { + // initial read was not large enough, so just read again with the correct size + buffer = dataSource.readTail(completeFooterSize); + } + InputStream metadataStream = buffer.slice(buffer.length() - completeFooterSize, metadataLength).getInput(); + + FileMetaData fileMetaData = readFileMetaData(metadataStream); + ParquetMetadata parquetMetadata = new ParquetMetadata(fileMetaData, dataSource.getId()); + validateFileMetadata(dataSource.getId(), parquetMetadata.getFileMetaData(), parquetWriteValidation); + return parquetMetadata; + } + + public static org.apache.parquet.column.statistics.Statistics readStats(Optional fileCreatedBy, Optional statisticsFromFile, PrimitiveType type) + { + Statistics statistics = statisticsFromFile.orElse(null); + org.apache.parquet.column.statistics.Statistics columnStatistics = fromParquetStatistics(fileCreatedBy.orElse(null), statistics, type); + + if (isStringType(type) + && statistics != null + && !statistics.isSetMin_value() && !statistics.isSetMax_value() // the min,max fields used for UTF8 since Parquet PARQUET-1025 + && statistics.isSetMin() && statistics.isSetMax() // the min,max fields used for UTF8 before Parquet PARQUET-1025 + && columnStatistics.genericGetMin() == null && columnStatistics.genericGetMax() == null + && !CorruptStatistics.shouldIgnoreStatistics(fileCreatedBy.orElse(null), type.getPrimitiveTypeName())) { + columnStatistics = tryReadOldUtf8Stats(statistics, (BinaryStatistics) columnStatistics); + } + + return columnStatistics; + } + + private static boolean isStringType(PrimitiveType type) + { + if (type.getLogicalTypeAnnotation() == null) { + return false; + } + + return type.getLogicalTypeAnnotation() + .accept(new LogicalTypeAnnotation.LogicalTypeAnnotationVisitor() + { + @Override + public Optional visit(LogicalTypeAnnotation.StringLogicalTypeAnnotation stringLogicalType) + { + return Optional.of(TRUE); + } + }) + .orElse(FALSE); + } + + private static org.apache.parquet.column.statistics.Statistics tryReadOldUtf8Stats(Statistics statistics, BinaryStatistics columnStatistics) + { + byte[] min = statistics.getMin(); + byte[] max = statistics.getMax(); + + if (Arrays.equals(min, max)) { + // If min=max, then there is single value only + min = min.clone(); + max = min; + } + else { + int commonPrefix = commonPrefix(min, max); + + // For min we can retain all-ASCII, because this produces a strictly lower value. + int minGoodLength = commonPrefix; + while (minGoodLength < min.length && isAscii(min[minGoodLength])) { + minGoodLength++; + } + + // For max we can be sure only of the part matching the min. 
When they differ, we can consider only one next, and only if both are ASCII + int maxGoodLength = commonPrefix; + if (maxGoodLength < max.length && maxGoodLength < min.length && isAscii(min[maxGoodLength]) && isAscii(max[maxGoodLength])) { + maxGoodLength++; + } + // Incrementing 127 would overflow. Incrementing within non-ASCII can have side-effects. + while (maxGoodLength > 0 && (max[maxGoodLength - 1] == 127 || !isAscii(max[maxGoodLength - 1]))) { + maxGoodLength--; + } + if (maxGoodLength == 0) { + // We can return just min bound, but code downstream likely expects both are present or both are absent. + return columnStatistics; + } + + min = Arrays.copyOf(min, minGoodLength); + max = Arrays.copyOf(max, maxGoodLength); + max[maxGoodLength - 1]++; + } + + return org.apache.parquet.column.statistics.Statistics + .getBuilderForReading(columnStatistics.type()) + .withMin(min) + .withMax(max) + .withNumNulls(!columnStatistics.isNumNullsSet() && statistics.isSetNull_count() ? statistics.getNull_count() : columnStatistics.getNumNulls()) + .build(); + } + + private static boolean isAscii(byte b) + { + return 0 <= b; + } + + private static int commonPrefix(byte[] a, byte[] b) + { + int commonPrefixLength = 0; + while (commonPrefixLength < a.length && commonPrefixLength < b.length && a[commonPrefixLength] == b[commonPrefixLength]) { + commonPrefixLength++; + } + return commonPrefixLength; + } + + private static void validateFileMetadata(ParquetDataSourceId dataSourceId, FileMetadata fileMetaData, Optional parquetWriteValidation) + throws ParquetCorruptionException + { + if (parquetWriteValidation.isEmpty()) { + return; + } + ParquetWriteValidation writeValidation = parquetWriteValidation.get(); + writeValidation.validateTimeZone( + dataSourceId, + Optional.ofNullable(fileMetaData.getKeyValueMetaData().get("writer.time.zone"))); + writeValidation.validateColumns(dataSourceId, fileMetaData.getSchema()); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/NestedColumnReader.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/NestedColumnReader.java new file mode 100644 index 000000000000..3a79207a231f --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/NestedColumnReader.java @@ -0,0 +1,780 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package io.trino.parquet.reader;
+
+import com.google.common.primitives.Booleans;
+import com.google.common.primitives.Ints;
+import io.airlift.log.Logger;
+import io.airlift.slice.Slice;
+import io.trino.memory.context.LocalMemoryContext;
+import io.trino.parquet.DataPage;
+import io.trino.parquet.DataPageV1;
+import io.trino.parquet.DataPageV2;
+import io.trino.parquet.ParquetEncoding;
+import io.trino.parquet.PrimitiveField;
+import io.trino.parquet.reader.decoders.ValueDecoder;
+import io.trino.parquet.reader.flat.ColumnAdapter;
+import io.trino.parquet.reader.flat.DictionaryDecoder;
+import io.trino.spi.block.RunLengthEncodedBlock;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkState;
+import static io.airlift.slice.SizeOf.sizeOf;
+import static io.trino.parquet.ParquetEncoding.RLE;
+import static io.trino.parquet.ParquetReaderUtils.castToByte;
+import static io.trino.parquet.reader.decoders.ValueDecoder.LevelsDecoderProvider;
+import static io.trino.parquet.reader.decoders.ValueDecoder.ValueDecodersProvider;
+import static io.trino.parquet.reader.flat.DictionaryDecoder.DictionaryDecoderProvider;
+import static java.lang.Math.toIntExact;
+import static java.util.Objects.requireNonNull;
+
+/**
+ * This class works similarly to FlatColumnReader. The difference is that the resulting number
+ * of values might be (and usually is) different from the number of chunks. Therefore the output buffers
+ * are dynamically sized and some repetition/definition levels logic is added.
+ * This reader is universal, i.e. it will properly read flat data as well, yet flat readers are preferred
+ * due to better performance.
+ *

+ * Brief explanation of reading repetition and definition levels:
+ * A repetition level equal to 0 means that we should start a new row, i.e. a new set of values.
+ * Any other value means that we continue adding values to the current row.
+ * The following data (containing 3 rows):
+ * repetition levels: 0,1,1,0,0,1,[0] (last 0 implicit)
+ * values: 1,2,3,4,5,6
+ * will result in the sets (1,2,3), (4), (5,6).
+ *
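+ * In code form, row boundaries are detected by counting zeros in the repetition levels, roughly
+ * {@code rowCount += (repetitionLevels[i] == 0) ? 1 : 0;} (see getNextPositions below).
+ *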

+ * The biggest complication here is that in order to know whether the n-th value is the last one in a row
+ * we need to check the (n+1)-th repetition level. So if a page holds n values, we need to wait for the
+ * beginning of the next page to figure out whether the row is finished or contains additional values.
+ * The above example split into 3 pages would look like:
+ * repetition levels: 0,1 1,0 0,1
+ * values: 1,2 3,4 5,6
+ * Reading the first page only tells us that the first row starts with the values (1,2), but
+ * we need to wait for the next page to figure out that it contains another value (3). After reading another
+ * row from page 2 we still need to read page 3 just to find out that its first repetition level is '0' and
+ * the row is already over.
+ *
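+ * This is why the reader keeps the pageLastRowUnfinished flag and, on the next page, readUnfinishedRow()
+ * scans the repetition levels for the values that still belong to the previous row (i.e. up to the first 0).
+ *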

+ * Definition levels encode one of 3 cases:
+ * -the value exists and is non-null (level = maxDef)
+ * -the value is null (level = maxDef - 1)
+ * -there is no value (level < maxDef - 1)
+ * For non-nullable (REQUIRED) fields the (level = maxDef - 1) condition means a non-existing value as well.
+ *
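+ * In getNulls() this translates, per value, to {@code isValueNull = (def == maxDef - 1)} and
+ * {@code isValueNonNull = (def == maxDef)}; values with lower levels are not materialized at all.
+ *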

+ * Quick example (maxDef level is 2): + * Read 3 rows out of: + * repetition levels: 0,1,1,0,0,1,0,... + * definition levels: 0,1,2,1,0,2,... + * values: 1,2,3,4,5,6,... + * Resulting buffer: n,3,n, 6 + * that is later translated to (n,3),(n),(6) + * where n = null + */ +public class NestedColumnReader + extends AbstractColumnReader +{ + private static final Logger log = Logger.get(NestedColumnReader.class); + + private final LevelsDecoderProvider levelsDecoderProvider; + private final LocalMemoryContext memoryContext; + + private ValueDecoder definitionLevelDecoder; + private ValueDecoder repetitionLevelDecoder; + private ValueDecoder valueDecoder; + private int[] repetitionBuffer; + private int readOffset; + private int nextBatchSize; + private boolean pageLastRowUnfinished; + // True if the last row of the previously read page was skipped, instead of read. + // This way the remaining part of this row that may be stored in the next page + // will be skipped as well + private boolean pageLastRowSkipped; + + private int remainingPageValueCount; + private int pageValueCount; + + public NestedColumnReader( + PrimitiveField field, + ValueDecodersProvider decodersProvider, + LevelsDecoderProvider levelsDecoderProvider, + DictionaryDecoderProvider dictionaryDecoderProvider, + ColumnAdapter columnAdapter, + LocalMemoryContext memoryContext) + { + super(field, decodersProvider, dictionaryDecoderProvider, columnAdapter); + this.levelsDecoderProvider = requireNonNull(levelsDecoderProvider, "levelsDecoderProvider is null"); + this.memoryContext = requireNonNull(memoryContext, "memoryContext is null"); + } + + @Override + public boolean hasPageReader() + { + return pageReader != null; + } + + @Override + protected boolean isNonNull() + { + return field.isRequired(); + } + + @Override + public ColumnChunk readPrimitive() + { + seek(); + ColumnChunk columnChunk; + if (isNonNull()) { + columnChunk = readNonNull(); + } + else { + columnChunk = readNullable(); + } + + readOffset = 0; + nextBatchSize = 0; + return columnChunk; + } + + @Override + public void prepareNextRead(int batchSize) + { + readOffset += nextBatchSize; + nextBatchSize = batchSize; + } + + private ColumnChunk readNullable() + { + log.debug("readNullable field %s, nextBatchSize %d", field, nextBatchSize); + NullableValuesBuffer data = createNullableValuesBuffer(); + BooleansBuffer isNull = new BooleansBuffer(); + IntegersBuffer outputRepetitionLevels = new IntegersBuffer(); + IntegersBuffer outputDefinitionLevels = new IntegersBuffer(); + int remainingInBatch = nextBatchSize; + + while (remainingInBatch > 0) { + if (remainingPageValueCount == 0) { + if (!readNextPage()) { + if (pageLastRowUnfinished && remainingInBatch == 1) { + break; // No more data so the last row is closed + } + throwEndOfBatchException(remainingInBatch); + } + + if (pageLastRowUnfinished) { + int pageIndex = readUnfinishedRow(); + if (pageLastRowSkipped) { + remainingPageValueCount -= pageIndex; + int[] definitionLevels = new int[pageIndex]; + definitionLevelDecoder.read(definitionLevels, 0, definitionLevels.length); + int existingValueCount = countExistingValues(field.getDefinitionLevel(), definitionLevels); + valueDecoder.skip(existingValueCount); + } + else { + if (pageIndex > 0) { + int[] definitionLevels = new int[pageIndex]; + int existingValueCount = readDefinitionLevels(outputDefinitionLevels, definitionLevels); + readNullableValues(data, isNull, outputRepetitionLevels, 0, pageIndex, definitionLevels, existingValueCount); + } + if (pageIndex == 
pageValueCount) { // Current row spans more than two Parquet pages + checkState(pageLastRowUnfinished, "pageLastRowUnfinished not set when run out of values to read"); + continue; + } + remainingInBatch--; + } + } + } + + if (skip(field.getDefinitionLevel())) { + continue; + } + ValueCount valueCount = getNextPositions(Math.min(rowRanges.getRowsLeftInCurrentRange(), remainingInBatch)); + rowRanges.advanceRange(valueCount.rows + (pageLastRowUnfinished ? 1 : 0)); + int pageValuesIndex = pageValueCount - remainingPageValueCount; + + int[] definitionLevels = new int[valueCount.values]; + int existingValueCount = readDefinitionLevels(outputDefinitionLevels, definitionLevels); + readNullableValues(data, isNull, outputRepetitionLevels, pageValuesIndex, valueCount.values, definitionLevels, existingValueCount); + + remainingInBatch -= valueCount.rows; + } + + return data.createNullableBlock(isNull, outputDefinitionLevels.getMergedBuffer(), outputRepetitionLevels.getMergedBuffer()); + } + + private ColumnChunk readNonNull() + { + log.debug("readNonNull field %s, nextBatchSize %d", field, nextBatchSize); + NonNullValuesBuffer data = createNonNullValuesBuffer(); + IntegersBuffer outputRepetitionLevels = new IntegersBuffer(); + IntegersBuffer outputDefinitionLevels = new IntegersBuffer(); + int remainingInBatch = nextBatchSize; + while (remainingInBatch > 0) { + if (remainingPageValueCount == 0) { + if (!readNextPage()) { + if (pageLastRowUnfinished && remainingInBatch == 1) { + break; // No more data so the last row is closed + } + throwEndOfBatchException(remainingInBatch); + } + + if (pageLastRowUnfinished) { + int pageIndex = readUnfinishedRow(); + if (pageLastRowSkipped) { + remainingPageValueCount -= pageIndex; + int[] definitionLevels = new int[pageIndex]; + definitionLevelDecoder.read(definitionLevels, 0, definitionLevels.length); + int existingValueCount = countExistingValues(field.getDefinitionLevel(), definitionLevels); + valueDecoder.skip(existingValueCount); + if (pageIndex == pageValueCount) { // Current row spans more than two Parquet pages + continue; + } + } + else { + if (pageIndex > 0) { + readNonNullValues(data, outputRepetitionLevels, outputDefinitionLevels, 0, pageIndex); + } + if (pageIndex == pageValueCount) { // Current row spans more than two Parquet pages + continue; + } + remainingInBatch--; + } + } + } + + if (skip(field.getDefinitionLevel())) { + continue; + } + ValueCount valueCount = getNextPositions(Math.min(rowRanges.getRowsLeftInCurrentRange(), remainingInBatch)); + rowRanges.advanceRange(valueCount.rows + (pageLastRowUnfinished ? 
1 : 0)); + int pageValuesIndex = pageValueCount - remainingPageValueCount; + readNonNullValues(data, outputRepetitionLevels, outputDefinitionLevels, pageValuesIndex, valueCount.values); + + remainingInBatch -= valueCount.rows; + } + + return data.createNonNullBlock(outputDefinitionLevels.getMergedBuffer(), outputRepetitionLevels.getMergedBuffer()); + } + + private void seek() + { + if (readOffset > 0) { + log.debug("seek field %s, readOffset %d, remainingPageValueCount %d, pageLastRowUnfinished %b", field, readOffset, remainingPageValueCount, pageLastRowUnfinished); + } + int remainingInBatch = readOffset; + while (remainingInBatch > 0) { + if (remainingPageValueCount == 0) { + if (!readNextPage()) { + break; + } + if (pageLastRowUnfinished) { + int pageIndex = readUnfinishedRow(); + if (pageIndex > 0) { + seek(pageIndex); + } + if (remainingPageValueCount == 0) { // The row spans more than two Parquet pages + checkState(pageLastRowUnfinished, "pageLastRowUnfinished not set when run out of values to skip"); + checkState(pageLastRowSkipped, "pageLastRowSkipped not set when run out of values to skip"); + continue; + } + remainingInBatch--; + } + } + + int chunkSize = Math.min(remainingPageValueCount, remainingInBatch); + ValueCount valueCount = getNextPositions(toIntExact(chunkSize)); + + seek(valueCount.values); + + int seek = rowRanges.seekForward(valueCount.rows + (pageLastRowUnfinished ? 1 : 0)); + remainingInBatch -= seek - (pageLastRowUnfinished ? 1 : 0); + } + } + + private int readDefinitionLevels(IntegersBuffer outputDefinitionLevels, int[] definitionLevels) + { + definitionLevelDecoder.read(definitionLevels, 0, definitionLevels.length); + outputDefinitionLevels.add(definitionLevels); + return countExistingValues(field.getDefinitionLevel() - 1, definitionLevels); + } + + private void readNullableValues( + NullableValuesBuffer data, + BooleansBuffer isNull, + IntegersBuffer outputRepetitionLevels, + int pageValuesIndex, + int valueCount, + int[] definitionLevels, + int existingValueCount) + { + boolean[] isNullChunk = new boolean[existingValueCount]; + isNull.add(isNullChunk); + int nonNullCount = getNulls(definitionLevels, field.getDefinitionLevel(), isNullChunk); + checkState( + nonNullCount <= existingValueCount, + "nonNullCount %s cannot be greater than existingValueCount %s, field %s", + nonNullCount, + existingValueCount, + field); + + outputRepetitionLevels.add(Arrays.copyOfRange(repetitionBuffer, pageValuesIndex, pageValuesIndex + valueCount)); + + data.readNullableValues(valueDecoder, isNullChunk, nonNullCount, existingValueCount); + remainingPageValueCount -= valueCount; + } + + private void readNonNullValues(NonNullValuesBuffer data, IntegersBuffer outputRepetitionLevels, IntegersBuffer outputDefinitionLevels, int pageValuesIndex, int valueCount) + { + int[] definitionLevels = new int[valueCount]; + definitionLevelDecoder.read(definitionLevels, 0, definitionLevels.length); + int existingValueCount = countExistingValues(field.getDefinitionLevel(), definitionLevels); + + outputRepetitionLevels.add(Arrays.copyOfRange(repetitionBuffer, pageValuesIndex, pageValuesIndex + valueCount)); + outputDefinitionLevels.add(definitionLevels); + + if (existingValueCount > 0) { + data.readNonNullValues(valueDecoder, existingValueCount); + } + remainingPageValueCount -= valueCount; + } + + private boolean skip(int minDefinitionLevel) + { + long skipCount = rowRanges.skipToRangeStart(); + if (skipCount > 0) { + log.debug("skipCount %d, remainingPageValueCount %d", skipCount, 
remainingPageValueCount); + } + if (skipCount >= remainingPageValueCount) { + remainingPageValueCount = 0; + pageLastRowUnfinished = true; + pageLastRowSkipped = true; + return true; + } + if (skipCount > 0) { + ValueCount toSkip = getNextPositions(toIntExact(skipCount)); + if (toSkip.values == remainingPageValueCount) { + remainingPageValueCount = 0; + pageLastRowSkipped = true; + return true; + } + int[] definitionLevels = new int[toSkip.values]; + definitionLevelDecoder.read(definitionLevels, 0, toSkip.values); + int valuesToSkip = countExistingValues(minDefinitionLevel, definitionLevels); + valueDecoder.skip(valuesToSkip); + remainingPageValueCount -= toSkip.values; + } + return false; + } + + private int getNulls(int[] definitionLevels, int maxDefinitionLevel, boolean[] localIsNull) + { + // Value is null if its definition level is equal to (max def level - 1) + int outputIndex = 0; + int nonNullCount = 0; + for (int definitionLevel : definitionLevels) { + boolean isValueNull = definitionLevel == maxDefinitionLevel - 1; + boolean isValueNonNull = definitionLevel == maxDefinitionLevel; + if (isValueNull) { + localIsNull[outputIndex] = true; + } + outputIndex += castToByte(isValueNull | isValueNonNull); + nonNullCount += castToByte(isValueNonNull); + } + return nonNullCount; + } + + private int countExistingValues(int minDefinitionLevel, int[] definitionLevels) + { + int valueCount = 0; + for (int definitionLevel : definitionLevels) { + valueCount += castToByte(definitionLevel >= minDefinitionLevel); + } + return valueCount; + } + + private int readUnfinishedRow() + { + int pageIndex = 0; + while (pageIndex < remainingPageValueCount && repetitionBuffer[pageIndex] != 0) { + pageIndex++; + } + return pageIndex; + } + + /** + * Calculates number of values upto a desired number of rows or the end of the page, whichever comes first. + * Return two values: + * -number of rows read fully. If the end of page is reached the number of rows is always lower by 1, since + * the information whether the row is finished is stored in repetition levels of the next page. In that case + * `pageLastRowUnfinished` is set. 
+ * -number of values read + */ + private ValueCount getNextPositions(int desiredRowCount) + { + int valueCount = 0; + int rowCount = 0; + int pageValuesIndex = pageValueCount - remainingPageValueCount; + for (; rowCount < desiredRowCount && valueCount < remainingPageValueCount - 1; valueCount++) { + rowCount += castToByte(repetitionBuffer[pageValuesIndex + valueCount + 1] == 0); + } + + boolean pageReadUptoLastValue = rowCount != desiredRowCount; + if (pageReadUptoLastValue) { + valueCount++; + pageLastRowUnfinished = true; + pageLastRowSkipped = false; + } + else { + pageLastRowUnfinished = false; + } + + return new ValueCount(rowCount, valueCount); + } + + /** + * Skip first `valueCount` values as a result of seek operation before reading the data + */ + private void seek(int valueCount) + { + int[] definitionLevels = new int[valueCount]; + definitionLevelDecoder.read(definitionLevels, 0, definitionLevels.length); + int maxDefinitionLevel = field.getDefinitionLevel(); + int valuesToSkip = 0; + for (int definitionLevel : definitionLevels) { + valuesToSkip += castToByte(definitionLevel == maxDefinitionLevel); + } + + valueDecoder.skip(valuesToSkip); + remainingPageValueCount -= valueCount; + if (remainingPageValueCount == 0) { + pageLastRowUnfinished = true; + pageLastRowSkipped = true; + } + } + + private boolean readNextPage() + { + DataPage page = pageReader.readPage(); + if (page == null) { + return false; + } + + log.debug("readNextPage field %s, page %s, pageLastRowUnfinished %b", field, page, pageLastRowUnfinished); + if (page instanceof DataPageV1 dataPageV1) { + readFlatPageV1(dataPageV1); + } + else if (page instanceof DataPageV2 dataPageV2) { + readFlatPageV2(dataPageV2); + } + + pageValueCount = page.getValueCount(); + // We don't know how much values we will get in one batch so we read the whole page. + // This makes decoding faster unless major parts of the page are skipped + repetitionBuffer = new int[pageValueCount]; + repetitionLevelDecoder.read(repetitionBuffer, 0, pageValueCount); + remainingPageValueCount = pageValueCount; + rowRanges.resetForNewPage(page.getFirstRowIndex()); + + // For a compressed data page, the memory used by the decompressed values data needs to be accounted + // for separately as ParquetCompressionUtils#decompress allocates a new byte array for the decompressed result. + // For an uncompressed data page, we read directly from input Slices whose memory usage is already accounted + // for in AbstractParquetDataSource#ReferenceCountedReader. + int dataPageSizeInBytes = pageReader.arePagesCompressed() ? page.getUncompressedSize() : 0; + long dictionarySizeInBytes = dictionaryDecoder == null ? 
0 : dictionaryDecoder.getRetainedSizeInBytes(); + long repetitionBufferSizeInBytes = sizeOf(repetitionBuffer); + memoryContext.setBytes(dataPageSizeInBytes + dictionarySizeInBytes + repetitionBufferSizeInBytes); + return true; + } + + private void readFlatPageV1(DataPageV1 page) + { + Slice buffer = page.getSlice(); + ParquetEncoding definitionEncoding = page.getDefinitionLevelEncoding(); + ParquetEncoding repetitionEncoding = page.getRepetitionLevelEncoding(); + int maxDefinitionLevel = field.getDefinitionLevel(); + int maxRepetitionLevel = field.getRepetitionLevel(); + + checkArgument(maxDefinitionLevel == 0 || definitionEncoding == RLE, "Invalid definition level encoding: %s", definitionEncoding); + checkArgument(maxRepetitionLevel == 0 || repetitionEncoding == RLE, "Invalid repetition level encoding: %s", repetitionEncoding); + + repetitionLevelDecoder = levelsDecoderProvider.create(maxRepetitionLevel); + if (maxRepetitionLevel > 0) { + int bufferSize = buffer.getInt(0); // We need to read the size even if there is no repetition data + repetitionLevelDecoder.init(new SimpleSliceInputStream(buffer.slice(Integer.BYTES, bufferSize))); + buffer = buffer.slice(bufferSize + Integer.BYTES, buffer.length() - bufferSize - Integer.BYTES); + } + + definitionLevelDecoder = levelsDecoderProvider.create(maxDefinitionLevel); + if (maxDefinitionLevel > 0) { + int bufferSize = buffer.getInt(0); // We need to read the size even if there is no definition + definitionLevelDecoder.init(new SimpleSliceInputStream(buffer.slice(Integer.BYTES, bufferSize))); + buffer = buffer.slice(bufferSize + Integer.BYTES, buffer.length() - bufferSize - Integer.BYTES); + } + + valueDecoder = createValueDecoder(decodersProvider, page.getValueEncoding(), buffer); + } + + private void readFlatPageV2(DataPageV2 page) + { + int maxDefinitionLevel = field.getDefinitionLevel(); + int maxRepetitionLevel = field.getRepetitionLevel(); + + definitionLevelDecoder = levelsDecoderProvider.create(maxDefinitionLevel); + definitionLevelDecoder.init(new SimpleSliceInputStream(page.getDefinitionLevels())); + + repetitionLevelDecoder = levelsDecoderProvider.create(maxRepetitionLevel); + repetitionLevelDecoder.init(new SimpleSliceInputStream(page.getRepetitionLevels())); + + valueDecoder = createValueDecoder(decodersProvider, page.getDataEncoding(), page.getSlice()); + } + + /** + * The reader will attempt to produce dictionary Trino pages using shared dictionary. + * In case of data not being dictionary-encoded it falls back to normal decoding. 
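+ * Concretely, when produceDictionaryBlock() returns true a DictionaryValuesBuffer collecting dictionary ids
+ * is used, otherwise a DataValuesBuffer decoding the values directly is used.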
+ */ + private NonNullValuesBuffer createNonNullValuesBuffer() + { + if (produceDictionaryBlock()) { + return new DictionaryValuesBuffer<>(field, dictionaryDecoder); + } + return new DataValuesBuffer<>(field, columnAdapter); + } + + private NullableValuesBuffer createNullableValuesBuffer() + { + if (produceDictionaryBlock()) { + return new DictionaryValuesBuffer<>(field, dictionaryDecoder); + } + return new DataValuesBuffer<>(field, columnAdapter); + } + + private interface NonNullValuesBuffer + { + void readNonNullValues(ValueDecoder valueDecoder, int existingValueCount); + + ColumnChunk createNonNullBlock(int[] definitions, int[] repetitions); + } + + private interface NullableValuesBuffer + { + void readNullableValues(ValueDecoder valueDecoder, boolean[] isNull, int nonNullCount, int existingValueCount); + + ColumnChunk createNullableBlock(BooleansBuffer isNull, int[] definitions, int[] repetitions); + } + + private static final class DataValuesBuffer + implements NonNullValuesBuffer, NullableValuesBuffer + { + private final PrimitiveField field; + private final ColumnAdapter columnAdapter; + private final List valueBuffers = new ArrayList<>(); + private int totalExistingValueCount; + private int totalNonNullsCount; + + private DataValuesBuffer(PrimitiveField field, ColumnAdapter columnAdapter) + { + this.field = field; + this.columnAdapter = columnAdapter; + } + + @Override + public void readNonNullValues(ValueDecoder valueDecoder, int existingValueCount) + { + T valueBuffer = columnAdapter.createBuffer(existingValueCount); + valueDecoder.read(valueBuffer, 0, existingValueCount); + valueBuffers.add(valueBuffer); + totalNonNullsCount += existingValueCount; + totalExistingValueCount += existingValueCount; + } + + @Override + public void readNullableValues(ValueDecoder valueDecoder, boolean[] isNull, int nonNullCount, int existingValueCount) + { + // No nulls + if (nonNullCount > 0 && nonNullCount == existingValueCount) { + readNonNullValues(valueDecoder, existingValueCount); + return; + } + + // Only nulls + if (nonNullCount == 0) { + valueBuffers.add(columnAdapter.createBuffer(existingValueCount)); + } + else { + // Read data values to a temporary array and unpack the nulls to the actual destination + T outBuffer = columnAdapter.createBuffer(existingValueCount); + T tmpBuffer = columnAdapter.createTemporaryBuffer(0, nonNullCount, outBuffer); + valueDecoder.read(tmpBuffer, 0, nonNullCount); + columnAdapter.unpackNullValues(tmpBuffer, outBuffer, isNull, 0, nonNullCount, existingValueCount); + valueBuffers.add(outBuffer); + } + totalNonNullsCount += nonNullCount; + totalExistingValueCount += existingValueCount; + } + + @Override + public ColumnChunk createNonNullBlock(int[] definitions, int[] repetitions) + { + checkState( + totalNonNullsCount == totalExistingValueCount, + "totalNonNullsCount %s should be equal to totalExistingValueCount %s when creating non-null block", + totalNonNullsCount, + totalExistingValueCount); + log.debug("DataValuesBuffer createNonNullBlock field %s, totalNonNullsCount %d, totalExistingValueCount %d", field, totalNonNullsCount, totalExistingValueCount); + return new ColumnChunk(columnAdapter.createNonNullBlock(getMergedValues()), definitions, repetitions); + } + + @Override + public ColumnChunk createNullableBlock(BooleansBuffer isNull, int[] definitions, int[] repetitions) + { + log.debug("DataValuesBuffer createNullableBlock field %s, totalNonNullsCount %d, totalExistingValueCount %d", field, totalNonNullsCount, totalExistingValueCount); + if 
(totalNonNullsCount == 0) { + return new ColumnChunk(RunLengthEncodedBlock.create(field.getType(), null, totalExistingValueCount), definitions, repetitions); + } + if (totalNonNullsCount == totalExistingValueCount) { + return new ColumnChunk(columnAdapter.createNonNullBlock(getMergedValues()), definitions, repetitions); + } + return new ColumnChunk(columnAdapter.createNullableBlock(isNull.getMergedBuffer(), getMergedValues()), definitions, repetitions); + } + + private T getMergedValues() + { + if (valueBuffers.size() == 1) { + return valueBuffers.get(0); + } + return columnAdapter.merge(valueBuffers); + } + } + + private static final class DictionaryValuesBuffer + implements NonNullValuesBuffer, NullableValuesBuffer + { + private final PrimitiveField field; + private final DictionaryDecoder dictionaryDecoder; + private final IntegersBuffer ids; + private int totalExistingValueCount; + private int totalNonNullsCount; + + private DictionaryValuesBuffer(PrimitiveField field, DictionaryDecoder dictionaryDecoder) + { + this.ids = new IntegersBuffer(); + this.field = field; + this.dictionaryDecoder = dictionaryDecoder; + } + + @Override + public void readNonNullValues(ValueDecoder valueDecoder, int existingValueCount) + { + int[] positionsBuffer = new int[existingValueCount]; + dictionaryDecoder.readDictionaryIds(positionsBuffer, 0, existingValueCount); + ids.add(positionsBuffer); + totalNonNullsCount += existingValueCount; + totalExistingValueCount += existingValueCount; + } + + @Override + public void readNullableValues(ValueDecoder valueDecoder, boolean[] isNull, int nonNullCount, int existingValueCount) + { + // No nulls + if (nonNullCount > 0 && nonNullCount == existingValueCount) { + readNonNullValues(valueDecoder, existingValueCount); + return; + } + + // Parquet dictionary encodes only non-null values + // Dictionary size is used as the id to denote nulls for Trino dictionary block + if (nonNullCount == 0) { + // Only nulls were encountered in existingValueCount, add empty values for nulls + int[] dummy = new int[existingValueCount]; + Arrays.fill(dummy, dictionaryDecoder.getDictionarySize()); + ids.add(dummy); + } + else { + // Read data values to a temporary array and unpack the nulls to the actual destination + int[] tmpBuffer = new int[nonNullCount]; + dictionaryDecoder.readDictionaryIds(tmpBuffer, 0, nonNullCount); + int[] positionsBuffer = new int[existingValueCount]; + unpackDictionaryNullId(tmpBuffer, positionsBuffer, isNull, 0, existingValueCount, dictionaryDecoder.getDictionarySize()); + ids.add(positionsBuffer); + } + totalNonNullsCount += nonNullCount; + totalExistingValueCount += existingValueCount; + } + + @Override + public ColumnChunk createNonNullBlock(int[] definitions, int[] repetitions) + { + // This will return a nullable dictionary even if we are returning a batch of non-null values + // for a nullable column. We avoid creating a new non-nullable dictionary to allow the engine + // to optimize for the unchanged dictionary case. 
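Two details of DictionaryValuesBuffer deserve a concrete illustration. Nulls are represented by reserving the id equal to the dictionary size, as the readNullableValues comment above notes; a minimal sketch of that expansion step follows (a hypothetical helper mirroring the unpackDictionaryNullId call shape, not the actual implementation).

    // Sketch only: expand the decoded non-null ids (source) into one id per output position
    // (destination), using nullId == dictionary size as the sentinel for null positions.
    static void expandIdsWithNullSentinel(int[] source, int[] destination, boolean[] isNull, int offset, int length, int nullId)
    {
        int sourceIndex = 0;
        for (int i = offset; i < offset + length; i++) {
            destination[i] = isNull[i] ? nullId : source[sourceIndex++];
        }
    }

Reusing dictionaryDecoder.getDictionaryBlock() unchanged across batches, as the comment above explains, also lets downstream operators detect that consecutive batches share one dictionary instance and avoid repeating dictionary-level work.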
+ checkState( + totalNonNullsCount == totalExistingValueCount, + "totalNonNullsCount %s should be equal to totalExistingValueCount %s when creating non-null block", + totalNonNullsCount, + totalExistingValueCount); + log.debug("DictionaryValuesBuffer createNonNullBlock field %s, totalNonNullsCount %d, totalExistingValueCount %d", field, totalNonNullsCount, totalExistingValueCount); + return createDictionaryBlock(ids.getMergedBuffer(), dictionaryDecoder.getDictionaryBlock(), definitions, repetitions); + } + + @Override + public ColumnChunk createNullableBlock(BooleansBuffer isNull, int[] definitions, int[] repetitions) + { + log.debug("DictionaryValuesBuffer createNullableBlock field %s, totalNonNullsCount %d, totalExistingValueCount %d", field, totalNonNullsCount, totalExistingValueCount); + if (totalNonNullsCount == 0) { + return new ColumnChunk(RunLengthEncodedBlock.create(field.getType(), null, totalExistingValueCount), definitions, repetitions); + } + return createDictionaryBlock(ids.getMergedBuffer(), dictionaryDecoder.getDictionaryBlock(), definitions, repetitions); + } + } + + private static class BooleansBuffer + { + private final List buffers = new ArrayList<>(); + + private void add(boolean[] buffer) + { + buffers.add(buffer); + } + + private boolean[] getMergedBuffer() + { + if (buffers.size() == 1) { + return buffers.get(0); + } + return Booleans.concat(buffers.toArray(boolean[][]::new)); + } + } + + private static class IntegersBuffer + { + private final List buffers = new ArrayList<>(); + + private void add(int[] buffer) + { + buffers.add(buffer); + } + + private int[] getMergedBuffer() + { + if (buffers.size() == 1) { + return buffers.get(0); + } + return Ints.concat(buffers.toArray(int[][]::new)); + } + } + + record ValueCount(int rows, int values) {} +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/PageReader.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/PageReader.java new file mode 100644 index 000000000000..d8ec35c52fbe --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/PageReader.java @@ -0,0 +1,202 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.parquet.reader; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.Iterators; +import com.google.common.collect.PeekingIterator; +import io.trino.parquet.DataPage; +import io.trino.parquet.DataPageV1; +import io.trino.parquet.DataPageV2; +import io.trino.parquet.DictionaryPage; +import io.trino.parquet.Page; +import io.trino.parquet.ParquetDataSourceId; +import io.trino.parquet.metadata.ColumnChunkMetadata; +import jakarta.annotation.Nullable; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.format.CompressionCodec; +import org.apache.parquet.internal.column.columnindex.OffsetIndex; + +import java.io.IOException; +import java.util.Iterator; +import java.util.Optional; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkState; +import static io.trino.parquet.ParquetCompressionUtils.decompress; +import static io.trino.parquet.ParquetReaderUtils.isOnlyDictionaryEncodingPages; +import static java.util.Objects.requireNonNull; + +public final class PageReader +{ + private final ParquetDataSourceId dataSourceId; + private final CompressionCodec codec; + private final boolean hasOnlyDictionaryEncodedPages; + private final boolean hasNoNulls; + private final PeekingIterator compressedPages; + + private boolean dictionaryAlreadyRead; + private int dataPageReadCount; + + public static PageReader createPageReader( + ParquetDataSourceId dataSourceId, + ChunkedInputStream columnChunk, + ColumnChunkMetadata metadata, + ColumnDescriptor columnDescriptor, + @Nullable OffsetIndex offsetIndex, + Optional fileCreatedBy) + { + // Parquet schema may specify a column definition as OPTIONAL even though there are no nulls in the actual data. + // Row-group column statistics can be used to identify such cases and switch to faster non-nullable read + // paths in FlatColumnReader. 
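The comment above is the key idea behind the hasNoNulls flag: writers commonly declare columns OPTIONAL even when every value is present, and row-group statistics can prove the absence of nulls. A hedged usage sketch (the flag and factory are from this file; the branch bodies are placeholders, not patch code):

    // Illustration only: a column reader holding this PageReader can choose a faster path
    // when the statistics guarantee that the row group contains no nulls.
    if (pageReader.hasNoNulls()) {
        // non-nullable path: no isNull vector has to be materialized and no null unpacking is needed
    }
    else {
        // nullable path: decode definition levels and unpack nulls into the output buffers
    }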
+ Statistics columnStatistics = metadata.getStatistics(); + boolean hasNoNulls = columnStatistics != null && columnStatistics.getNumNulls() == 0; + boolean hasOnlyDictionaryEncodedPages = isOnlyDictionaryEncodingPages(metadata); + ParquetColumnChunkIterator compressedPages = new ParquetColumnChunkIterator( + dataSourceId, + fileCreatedBy, + columnDescriptor, + metadata, + columnChunk, + offsetIndex); + + return new PageReader( + dataSourceId, + metadata.getCodec().getParquetCompressionCodec(), + compressedPages, + hasOnlyDictionaryEncodedPages, + hasNoNulls); + } + + @VisibleForTesting + public PageReader( + ParquetDataSourceId dataSourceId, + CompressionCodec codec, + Iterator compressedPages, + boolean hasOnlyDictionaryEncodedPages, + boolean hasNoNulls) + { + this.dataSourceId = requireNonNull(dataSourceId, "dataSourceId is null"); + this.codec = codec; + this.compressedPages = Iterators.peekingIterator(compressedPages); + this.hasOnlyDictionaryEncodedPages = hasOnlyDictionaryEncodedPages; + this.hasNoNulls = hasNoNulls; + } + + public boolean hasNoNulls() + { + return hasNoNulls; + } + + public boolean hasOnlyDictionaryEncodedPages() + { + return hasOnlyDictionaryEncodedPages; + } + + public DataPage readPage() + { + if (!compressedPages.hasNext()) { + return null; + } + Page compressedPage = compressedPages.next(); + checkState(compressedPage instanceof DataPage, "Found page %s instead of a DataPage", compressedPage); + dataPageReadCount++; + try { + if (compressedPage instanceof DataPageV1 dataPageV1) { + if (!arePagesCompressed()) { + return dataPageV1; + } + return new DataPageV1( + decompress(dataSourceId, codec, dataPageV1.getSlice(), dataPageV1.getUncompressedSize()), + dataPageV1.getValueCount(), + dataPageV1.getUncompressedSize(), + dataPageV1.getFirstRowIndex(), + dataPageV1.getRepetitionLevelEncoding(), + dataPageV1.getDefinitionLevelEncoding(), + dataPageV1.getValueEncoding()); + } + DataPageV2 dataPageV2 = (DataPageV2) compressedPage; + if (!dataPageV2.isCompressed()) { + return dataPageV2; + } + int uncompressedSize = dataPageV2.getUncompressedSize() + - dataPageV2.getDefinitionLevels().length() + - dataPageV2.getRepetitionLevels().length(); + return new DataPageV2( + dataPageV2.getRowCount(), + dataPageV2.getNullCount(), + dataPageV2.getValueCount(), + dataPageV2.getRepetitionLevels(), + dataPageV2.getDefinitionLevels(), + dataPageV2.getDataEncoding(), + decompress(dataSourceId, codec, dataPageV2.getSlice(), uncompressedSize), + dataPageV2.getUncompressedSize(), + dataPageV2.getFirstRowIndex(), + dataPageV2.getStatistics(), + false); + } + catch (IOException e) { + throw new RuntimeException("Could not decompress page", e); + } + } + + public DictionaryPage readDictionaryPage() + { + checkState(!dictionaryAlreadyRead, "Dictionary was already read"); + checkState(dataPageReadCount == 0, "Dictionary has to be read first but " + dataPageReadCount + " was read already"); + dictionaryAlreadyRead = true; + if (!(compressedPages.peek() instanceof DictionaryPage)) { + return null; + } + try { + DictionaryPage compressedDictionaryPage = (DictionaryPage) compressedPages.next(); + return new DictionaryPage( + decompress(dataSourceId, codec, compressedDictionaryPage.getSlice(), compressedDictionaryPage.getUncompressedSize()), + compressedDictionaryPage.getDictionarySize(), + compressedDictionaryPage.getEncoding()); + } + catch (IOException e) { + throw new RuntimeException("Error reading dictionary page", e); + } + } + + public boolean hasNext() + { + return 
compressedPages.hasNext(); + } + + public DataPage getNextPage() + { + verifyDictionaryPageRead(); + + return (DataPage) compressedPages.peek(); + } + + public void skipNextPage() + { + verifyDictionaryPageRead(); + compressedPages.next(); + } + + public boolean arePagesCompressed() + { + return codec != CompressionCodec.UNCOMPRESSED; + } + + private void verifyDictionaryPageRead() + { + checkArgument(dictionaryAlreadyRead, "Dictionary has to be read first"); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/ParquetColumnChunkIterator.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/ParquetColumnChunkIterator.java new file mode 100644 index 000000000000..235c1b2d3d76 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/ParquetColumnChunkIterator.java @@ -0,0 +1,188 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.reader; + +import io.trino.parquet.DataPageV1; +import io.trino.parquet.DataPageV2; +import io.trino.parquet.DictionaryPage; +import io.trino.parquet.Page; +import io.trino.parquet.ParquetCorruptionException; +import io.trino.parquet.ParquetDataSourceId; +import io.trino.parquet.metadata.ColumnChunkMetadata; +import jakarta.annotation.Nullable; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.Encoding; +import org.apache.parquet.format.DataPageHeader; +import org.apache.parquet.format.DataPageHeaderV2; +import org.apache.parquet.format.DictionaryPageHeader; +import org.apache.parquet.format.PageHeader; +import org.apache.parquet.format.Util; +import org.apache.parquet.internal.column.columnindex.OffsetIndex; + +import java.io.IOException; +import java.util.Iterator; +import java.util.Optional; +import java.util.OptionalLong; + +import static com.google.common.base.Preconditions.checkState; +import static io.trino.parquet.ParquetTypeUtils.getParquetEncoding; +import static java.util.Objects.requireNonNull; + +public final class ParquetColumnChunkIterator + implements Iterator +{ + private final ParquetDataSourceId dataSourceId; + private final Optional fileCreatedBy; + private final ColumnDescriptor descriptor; + private final ColumnChunkMetadata metadata; + private final ChunkedInputStream input; + private final OffsetIndex offsetIndex; + + private long valueCount; + private int dataPageCount; + + public ParquetColumnChunkIterator( + ParquetDataSourceId dataSourceId, + Optional fileCreatedBy, + ColumnDescriptor descriptor, + ColumnChunkMetadata metadata, + ChunkedInputStream input, + @Nullable OffsetIndex offsetIndex) + { + this.dataSourceId = requireNonNull(dataSourceId, "dataSourceId is null"); + this.fileCreatedBy = requireNonNull(fileCreatedBy, "fileCreatedBy is null"); + this.descriptor = requireNonNull(descriptor, "descriptor is null"); + this.metadata = requireNonNull(metadata, "metadata is null"); + this.input = requireNonNull(input, "input is null"); + this.offsetIndex = offsetIndex; + } + + @Override + public boolean 
hasNext() + { + return hasMorePages(valueCount, dataPageCount); + } + + @Override + public Page next() + { + checkState(hasNext(), "No more data left to read in column (%s), metadata (%s), valueCount %s, dataPageCount %s", descriptor, metadata, valueCount, dataPageCount); + + try { + PageHeader pageHeader = readPageHeader(); + int uncompressedPageSize = pageHeader.getUncompressed_page_size(); + int compressedPageSize = pageHeader.getCompressed_page_size(); + Page result = null; + switch (pageHeader.type) { + case DICTIONARY_PAGE: + if (dataPageCount != 0) { + throw new ParquetCorruptionException(dataSourceId, "Column (%s) has a dictionary page after the first position in column chunk", descriptor); + } + result = readDictionaryPage(pageHeader, pageHeader.getUncompressed_page_size(), pageHeader.getCompressed_page_size()); + break; + case DATA_PAGE: + result = readDataPageV1(pageHeader, uncompressedPageSize, compressedPageSize, getFirstRowIndex(dataPageCount, offsetIndex)); + ++dataPageCount; + break; + case DATA_PAGE_V2: + result = readDataPageV2(pageHeader, uncompressedPageSize, compressedPageSize, getFirstRowIndex(dataPageCount, offsetIndex)); + ++dataPageCount; + break; + default: + input.skip(compressedPageSize); + break; + } + return result; + } + catch (IOException e) { + throw new RuntimeException(e); + } + } + + private PageHeader readPageHeader() + throws IOException + { + return Util.readPageHeader(input); + } + + private boolean hasMorePages(long valuesCountReadSoFar, int dataPageCountReadSoFar) + { + if (offsetIndex == null) { + return valuesCountReadSoFar < metadata.getValueCount(); + } + return dataPageCountReadSoFar < offsetIndex.getPageCount(); + } + + private DictionaryPage readDictionaryPage(PageHeader pageHeader, int uncompressedPageSize, int compressedPageSize) + throws IOException + { + DictionaryPageHeader dicHeader = pageHeader.getDictionary_page_header(); + return new DictionaryPage( + input.getSlice(compressedPageSize), + uncompressedPageSize, + dicHeader.getNum_values(), + getParquetEncoding(Encoding.valueOf(dicHeader.getEncoding().name()))); + } + + private DataPageV1 readDataPageV1( + PageHeader pageHeader, + int uncompressedPageSize, + int compressedPageSize, + OptionalLong firstRowIndex) + throws IOException + { + DataPageHeader dataHeaderV1 = pageHeader.getData_page_header(); + valueCount += dataHeaderV1.getNum_values(); + return new DataPageV1( + input.getSlice(compressedPageSize), + dataHeaderV1.getNum_values(), + uncompressedPageSize, + firstRowIndex, + getParquetEncoding(Encoding.valueOf(dataHeaderV1.getRepetition_level_encoding().name())), + getParquetEncoding(Encoding.valueOf(dataHeaderV1.getDefinition_level_encoding().name())), + getParquetEncoding(Encoding.valueOf(dataHeaderV1.getEncoding().name()))); + } + + private DataPageV2 readDataPageV2( + PageHeader pageHeader, + int uncompressedPageSize, + int compressedPageSize, + OptionalLong firstRowIndex) + throws IOException + { + DataPageHeaderV2 dataHeaderV2 = pageHeader.getData_page_header_v2(); + int dataSize = compressedPageSize - dataHeaderV2.getRepetition_levels_byte_length() - dataHeaderV2.getDefinition_levels_byte_length(); + valueCount += dataHeaderV2.getNum_values(); + return new DataPageV2( + dataHeaderV2.getNum_rows(), + dataHeaderV2.getNum_nulls(), + dataHeaderV2.getNum_values(), + input.getSlice(dataHeaderV2.getRepetition_levels_byte_length()), + input.getSlice(dataHeaderV2.getDefinition_levels_byte_length()), + getParquetEncoding(Encoding.valueOf(dataHeaderV2.getEncoding().name())), + 
input.getSlice(dataSize), + uncompressedPageSize, + firstRowIndex, + MetadataReader.readStats( + fileCreatedBy, + Optional.ofNullable(dataHeaderV2.getStatistics()), + descriptor.getPrimitiveType()), + dataHeaderV2.isIs_compressed()); + } + + private static OptionalLong getFirstRowIndex(int pageIndex, OffsetIndex offsetIndex) + { + return offsetIndex == null ? OptionalLong.empty() : OptionalLong.of(offsetIndex.getFirstRowIndex(pageIndex)); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/ParquetReader.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/ParquetReader.java new file mode 100644 index 000000000000..5287190921c3 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/ParquetReader.java @@ -0,0 +1,879 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.reader; + +import com.google.common.base.Throwables; +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ListMultimap; +import com.google.errorprone.annotations.CheckReturnValue; +import com.google.errorprone.annotations.FormatMethod; +import io.airlift.log.Logger; +import io.airlift.slice.Slice; +import io.trino.memory.context.AggregatedMemoryContext; +import io.trino.parquet.ChunkKey; +import io.trino.parquet.Column; +import io.trino.parquet.DiskRange; +import io.trino.parquet.Field; +import io.trino.parquet.GroupField; +import io.trino.parquet.ParquetCorruptionException; +import io.trino.parquet.ParquetDataSource; +import io.trino.parquet.ParquetReaderOptions; +import io.trino.parquet.ParquetWriteValidation; +import io.trino.parquet.ParquetWriteValidation.StatisticsValidation; +import io.trino.parquet.ParquetWriteValidation.WriteChecksumBuilder; +import io.trino.parquet.PrimitiveField; +import io.trino.parquet.VariantField; +import io.trino.parquet.metadata.ColumnChunkMetadata; +import io.trino.parquet.metadata.PrunedBlockMetadata; +import io.trino.parquet.predicate.TupleDomainParquetPredicate; +import io.trino.parquet.reader.FilteredOffsetIndex.OffsetRange; +import io.trino.parquet.spark.Variant; +import io.trino.plugin.base.metrics.LongCount; +import io.trino.spi.Page; +import io.trino.spi.block.ArrayBlock; +import io.trino.spi.block.Block; +import io.trino.spi.block.BlockBuilder; +import io.trino.spi.block.DictionaryBlock; +import io.trino.spi.block.LongArrayBlock; +import io.trino.spi.block.RowBlock; +import io.trino.spi.block.RunLengthEncodedBlock; +import io.trino.spi.metrics.Metric; +import io.trino.spi.metrics.Metrics; +import io.trino.spi.type.ArrayType; +import io.trino.spi.type.MapType; +import io.trino.spi.type.RowType; +import io.trino.spi.type.Type; +import io.trino.util.Reflection; +import jakarta.annotation.Nullable; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.filter2.compat.FilterCompat; +import 
org.apache.parquet.filter2.predicate.FilterPredicate; +import org.apache.parquet.hadoop.metadata.ColumnPath; +import org.apache.parquet.internal.column.columnindex.OffsetIndex; +import org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter; +import org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore; +import org.joda.time.DateTimeZone; + +import java.io.Closeable; +import java.io.IOException; +import java.lang.invoke.MethodHandle; +import java.time.ZoneId; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.function.Function; +import java.util.function.ObjLongConsumer; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkState; +import static com.google.common.collect.ImmutableList.toImmutableList; +import static com.google.common.collect.ImmutableSet.toImmutableSet; +import static io.airlift.slice.SizeOf.instanceSize; +import static io.airlift.slice.SizeOf.sizeOf; +import static io.airlift.slice.Slices.utf8Slice; +import static io.trino.parquet.ParquetValidationUtils.validateParquet; +import static io.trino.parquet.ParquetWriteValidation.StatisticsValidation.createStatisticsValidationBuilder; +import static io.trino.parquet.ParquetWriteValidation.WriteChecksumBuilder.createWriteChecksumBuilder; +import static io.trino.parquet.reader.ListColumnReader.calculateCollectionOffsets; +import static io.trino.parquet.reader.PageReader.createPageReader; +import static io.trino.spi.type.VarbinaryType.VARBINARY; +import static io.trino.spi.type.VarcharType.VARCHAR; +import static java.lang.Math.max; +import static java.lang.Math.min; +import static java.lang.Math.toIntExact; +import static java.lang.String.format; +import static java.util.Objects.checkIndex; +import static java.util.Objects.requireNonNull; + +public class ParquetReader + implements Closeable +{ + private static final Logger log = Logger.get(ParquetReader.class); + + private static final int INITIAL_BATCH_SIZE = 1; + private static final int BATCH_SIZE_GROWTH_FACTOR = 2; + public static final String PARQUET_CODEC_METRIC_PREFIX = "ParquetReaderCompressionFormat_"; + public static final String COLUMN_INDEX_ROWS_FILTERED = "ParquetColumnIndexRowsFiltered"; + + private final Optional fileCreatedBy; + private final List rowGroups; + private final List columnFields; + private final boolean appendRowNumberColumn; + private final List primitiveFields; + private final ParquetDataSource dataSource; + private final ZoneId zoneId; + private final ColumnReaderFactory columnReaderFactory; + private final AggregatedMemoryContext memoryContext; + + private int currentRowGroup = -1; + private PrunedBlockMetadata currentBlockMetadata; + private long currentGroupRowCount; + /** + * Index in the Parquet file of the first row of the current group + */ + private long firstRowIndexInGroup; + /** + * Index in the current group of the next row + */ + private long nextRowInGroup; + private int batchSize; + private int nextBatchSize = INITIAL_BATCH_SIZE; + private final Map columnReaders; + private final Map maxBytesPerCell; + private double maxCombinedBytesPerRow; + private final ParquetReaderOptions options; + private int maxBatchSize; + + private AggregatedMemoryContext currentRowGroupMemoryContext; + private final Map chunkReaders; + private final Optional writeValidation; + private final Optional writeChecksumBuilder; + private final Optional 
rowGroupStatisticsValidation; + private final FilteredRowRanges[] blockRowRanges; + private final Function exceptionTransform; + private final Map> codecMetrics; + + private int currentPageId; + + private long columnIndexRowsFiltered = -1; + + public ParquetReader( + Optional fileCreatedBy, + List columnFields, + boolean appendRowNumberColumn, + List rowGroups, + ParquetDataSource dataSource, + DateTimeZone timeZone, + AggregatedMemoryContext memoryContext, + ParquetReaderOptions options, + Function exceptionTransform, + Optional parquetPredicate, + Optional writeValidation) + throws IOException + { + this.fileCreatedBy = requireNonNull(fileCreatedBy, "fileCreatedBy is null"); + requireNonNull(columnFields, "columnFields is null"); + this.columnFields = ImmutableList.copyOf(columnFields); + this.appendRowNumberColumn = appendRowNumberColumn; + this.primitiveFields = getPrimitiveFields(columnFields.stream().map(Column::field).collect(toImmutableList())); + this.rowGroups = requireNonNull(rowGroups, "rowGroups is null"); + this.dataSource = requireNonNull(dataSource, "dataSource is null"); + this.zoneId = requireNonNull(timeZone, "timeZone is null").toTimeZone().toZoneId(); + this.columnReaderFactory = new ColumnReaderFactory(timeZone, options); + this.memoryContext = requireNonNull(memoryContext, "memoryContext is null"); + this.currentRowGroupMemoryContext = memoryContext.newAggregatedMemoryContext(); + this.options = requireNonNull(options, "options is null"); + this.maxBatchSize = options.getMaxReadBlockRowCount(); + this.columnReaders = new HashMap<>(); + this.maxBytesPerCell = new HashMap<>(); + + this.writeValidation = requireNonNull(writeValidation, "writeValidation is null"); + validateWrite( + validation -> fileCreatedBy.equals(Optional.of(validation.getCreatedBy())), + "Expected created by %s, found %s", + writeValidation.map(ParquetWriteValidation::getCreatedBy), + fileCreatedBy); + validateBlockMetadata(rowGroups); + this.writeChecksumBuilder = writeValidation.map(validation -> createWriteChecksumBuilder(validation.getTypes())); + this.rowGroupStatisticsValidation = writeValidation.map(validation -> createStatisticsValidationBuilder(validation.getTypes())); + + requireNonNull(parquetPredicate, "parquetPredicate is null"); + Optional filter = Optional.empty(); + if (parquetPredicate.isPresent() && options.isUseColumnIndex()) { + filter = parquetPredicate.get().toParquetFilter(timeZone); + } + this.blockRowRanges = calculateFilteredRowRanges(rowGroups, filter, primitiveFields); + + this.exceptionTransform = exceptionTransform; + ListMultimap ranges = ArrayListMultimap.create(); + Map codecMetrics = new HashMap<>(); + for (int rowGroup = 0; rowGroup < rowGroups.size(); rowGroup++) { + PrunedBlockMetadata blockMetadata = rowGroups.get(rowGroup).prunedBlockMetadata(); + long rowGroupRowCount = blockMetadata.getRowCount(); + for (PrimitiveField field : primitiveFields) { + int columnId = field.getId(); + ColumnChunkMetadata chunkMetadata = blockMetadata.getColumnChunkMetaData(field.getDescriptor()); + ColumnPath columnPath = chunkMetadata.getPath(); + + long startingPosition = chunkMetadata.getStartingPos(); + long totalLength = chunkMetadata.getTotalSize(); + long totalDataSize = 0; + FilteredOffsetIndex filteredOffsetIndex = null; + if (blockRowRanges[rowGroup] != null) { + filteredOffsetIndex = getFilteredOffsetIndex(blockRowRanges[rowGroup], rowGroup, rowGroupRowCount, columnPath); + } + if (filteredOffsetIndex == null) { + DiskRange range = new DiskRange(startingPosition, 
totalLength); + totalDataSize = range.getLength(); + ranges.put(new ChunkKey(columnId, rowGroup), range); + } + else { + List offsetRanges = filteredOffsetIndex.calculateOffsetRanges(startingPosition); + for (OffsetRange offsetRange : offsetRanges) { + DiskRange range = new DiskRange(offsetRange.getOffset(), offsetRange.getLength()); + totalDataSize += range.getLength(); + ranges.put(new ChunkKey(columnId, rowGroup), range); + } + // Initialize columnIndexRowsFiltered only when column indexes are found and used + columnIndexRowsFiltered = 0; + } + // Update the metrics which records the codecs used along with data size + codecMetrics.merge( + PARQUET_CODEC_METRIC_PREFIX + chunkMetadata.getCodec().name(), + new LongCount(totalDataSize), + LongCount::mergeWith); + } + } + this.codecMetrics = ImmutableMap.copyOf(codecMetrics); + this.chunkReaders = dataSource.planRead(ranges, memoryContext); + } + + @Override + public void close() + throws IOException + { + // Release memory usage from column readers + columnReaders.clear(); + currentRowGroupMemoryContext.close(); + + for (ChunkedInputStream chunkedInputStream : chunkReaders.values()) { + chunkedInputStream.close(); + } + dataSource.close(); + + if (writeChecksumBuilder.isPresent()) { + ParquetWriteValidation parquetWriteValidation = writeValidation.orElseThrow(); + parquetWriteValidation.validateChecksum(dataSource.getId(), writeChecksumBuilder.get().build()); + } + } + + public Page nextPage() + throws IOException + { + int batchSize = nextBatch(); + if (batchSize <= 0) { + return null; + } + // create a lazy page + currentPageId++; + Page page = new ParquetSourcePage(batchSize).getPage(); + validateWritePageChecksum(page); + return page; + } + + private class ParquetSourcePage + { + private static final long INSTANCE_SIZE = instanceSize(ParquetSourcePage.class); + + private final int expectedPageId = currentPageId; + private final Block[] blocks = new Block[columnFields.size() + (appendRowNumberColumn ? 1 : 0)]; + private final int rowNumberColumnIndex = appendRowNumberColumn ? 
columnFields.size() : -1; + private SelectedPositions selectedPositions; + + private long sizeInBytes; + private long retainedSizeInBytes; + + public ParquetSourcePage(int positionCount) + { + selectedPositions = new SelectedPositions(positionCount, null); + retainedSizeInBytes = shallowRetainedSizeInBytes(); + } + + public int getPositionCount() + { + return selectedPositions.positionCount(); + } + + public long getSizeInBytes() + { + return sizeInBytes; + } + + public long getRetainedSizeInBytes() + { + return retainedSizeInBytes; + } + + private long shallowRetainedSizeInBytes() + { + return INSTANCE_SIZE + + sizeOf(blocks) + + selectedPositions.retainedSizeInBytes(); + } + + public void retainedBytesForEachPart(ObjLongConsumer consumer) + { + consumer.accept(this, INSTANCE_SIZE); + consumer.accept(blocks, sizeOf(blocks)); + consumer.accept(selectedPositions, selectedPositions.retainedSizeInBytes()); + for (Block block : blocks) { + if (block != null) { + block.retainedBytesForEachPart(consumer); + } + } + } + + public int getChannelCount() + { + return blocks.length; + } + + public Block getBlock(int channel) + { + checkState(currentPageId == expectedPageId, "Parquet reader has been advanced beyond block"); + Block block = blocks[channel]; + if (block == null) { + if (channel == rowNumberColumnIndex) { + block = selectedPositions.createRowNumberBlock(lastBatchStartRow()); + } + else { + try { + // todo use selected positions to improve read performance + block = readBlock(columnFields.get(channel).field()); + } + catch (IOException e) { + throw exceptionTransform.apply(e); + } + block = selectedPositions.apply(block); + } + blocks[channel] = block; + sizeInBytes += block.getSizeInBytes(); + retainedSizeInBytes += block.getRetainedSizeInBytes(); + } + return block; + } + + public Page getPage() + { + // ensure all blocks are loaded + for (int i = 0; i < blocks.length; i++) { + getBlock(i); + } + return new Page(selectedPositions.positionCount(), blocks); + } + + public void selectPositions(int[] positions, int offset, int size) + { + selectedPositions = selectedPositions.selectPositions(positions, offset, size); + retainedSizeInBytes = shallowRetainedSizeInBytes(); + for (int i = 0; i < blocks.length; i++) { + Block block = blocks[i]; + if (block != null) { + block = selectedPositions.apply(block); + retainedSizeInBytes += block.getRetainedSizeInBytes(); + blocks[i] = block; + } + } + } + } + + private record SelectedPositions(int positionCount, @Nullable int[] positions) + { + private static final long INSTANCE_SIZE = instanceSize(SelectedPositions.class); + + public long retainedSizeInBytes() + { + return INSTANCE_SIZE + sizeOf(positions); + } + + @CheckReturnValue + public Block apply(Block block) + { + if (positions == null) { + return block; + } + return block.getPositions(positions, 0, positionCount); + } + + public Block createRowNumberBlock(long startRowNumber) + { + long[] rowNumbers = new long[positionCount]; + for (int i = 0; i < positionCount; i++) { + int position = positions == null ? 
i : positions[i]; + rowNumbers[i] = startRowNumber + position; + } + return new LongArrayBlock(positionCount, Optional.empty(), rowNumbers); + } + + @CheckReturnValue + public SelectedPositions selectPositions(int[] positions, int offset, int size) + { + if (this.positions == null) { + for (int i = 0; i < size; i++) { + checkIndex(offset + i, positionCount); + } + return new SelectedPositions(size, Arrays.copyOfRange(positions, offset, offset + size)); + } + + int[] newPositions = new int[size]; + for (int i = 0; i < size; i++) { + newPositions[i] = this.positions[positions[offset + i]]; + } + return new SelectedPositions(size, newPositions); + } + } + + /** + * Get the global row index of the first row in the last batch. + */ + public long lastBatchStartRow() + { + return firstRowIndexInGroup + nextRowInGroup - batchSize; + } + + private int nextBatch() + throws IOException + { + if (nextRowInGroup >= currentGroupRowCount && !advanceToNextRowGroup()) { + return -1; + } + + batchSize = min(nextBatchSize, maxBatchSize); + nextBatchSize = min(batchSize * BATCH_SIZE_GROWTH_FACTOR, options.getMaxReadBlockRowCount()); + batchSize = toIntExact(min(batchSize, currentGroupRowCount - nextRowInGroup)); + + nextRowInGroup += batchSize; + columnReaders.values().forEach(reader -> reader.prepareNextRead(batchSize)); + return batchSize; + } + + private boolean advanceToNextRowGroup() + throws IOException + { + currentRowGroupMemoryContext.close(); + currentRowGroupMemoryContext = memoryContext.newAggregatedMemoryContext(); + freeCurrentRowGroupBuffers(); + + if (currentRowGroup >= 0 && rowGroupStatisticsValidation.isPresent()) { + StatisticsValidation statisticsValidation = rowGroupStatisticsValidation.get(); + writeValidation.orElseThrow().validateRowGroupStatistics(dataSource.getId(), currentBlockMetadata, statisticsValidation.build()); + statisticsValidation.reset(); + } + + currentRowGroup++; + if (currentRowGroup == rowGroups.size()) { + return false; + } + RowGroupInfo rowGroupInfo = rowGroups.get(currentRowGroup); + currentBlockMetadata = rowGroupInfo.prunedBlockMetadata(); + firstRowIndexInGroup = rowGroupInfo.fileRowOffset(); + currentGroupRowCount = currentBlockMetadata.getRowCount(); + FilteredRowRanges currentGroupRowRanges = blockRowRanges[currentRowGroup]; + log.debug("advanceToNextRowGroup dataSource %s, currentRowGroup %d, rowRanges %s, currentBlockMetadata %s", dataSource.getId(), currentRowGroup, currentGroupRowRanges, currentBlockMetadata); + if (currentGroupRowRanges != null) { + long rowCount = currentGroupRowRanges.getRowCount(); + columnIndexRowsFiltered += currentGroupRowCount - rowCount; + if (rowCount == 0) { + // Filters on multiple columns with page indexes may yield non-overlapping row ranges and eliminate the entire row group. 
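    // A concrete (invented) example of the situation described above, added for illustration:
    // a page-index filter on column a may match rows [0, 1_000) while the filter on column b
    // matches rows [50_000, 51_000) of the same row group. The intersection is empty, so
    // currentGroupRowRanges.getRowCount() returns 0 and the branch below skips the whole
    // row group by recursing into advanceToNextRowGroup().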
+ // Advance to next row group to ensure that we don't return a null Page and close the page source before all row groups are processed + return advanceToNextRowGroup(); + } + currentGroupRowCount = rowCount; + } + nextRowInGroup = 0L; + initializeColumnReaders(); + return true; + } + + private void freeCurrentRowGroupBuffers() + { + if (currentRowGroup < 0) { + return; + } + + for (PrimitiveField field : primitiveFields) { + ChunkedInputStream chunkedStream = chunkReaders.get(new ChunkKey(field.getId(), currentRowGroup)); + if (chunkedStream != null) { + chunkedStream.close(); + } + } + } + + private ColumnChunk readVariant(VariantField field) + throws IOException + { + ColumnChunk metadataChunk = readColumnChunk(field.getMetadata()); + + int positionCount = metadataChunk.getBlock().getPositionCount(); + BlockBuilder variantBlock = VARCHAR.createBlockBuilder(null, max(1, positionCount)); + if (positionCount == 0) { + variantBlock.appendNull(); + } + else { + ColumnChunk valueChunk = readColumnChunk(field.getValue()); + for (int position = 0; position < positionCount; position++) { + Slice metadata = VARBINARY.getSlice(metadataChunk.getBlock(), position); + if (metadata.length() == 0) { + variantBlock.appendNull(); + continue; + } + Slice value = VARBINARY.getSlice(valueChunk.getBlock(), position); + Variant variant = new Variant(value.getBytes(), metadata.getBytes()); + VARCHAR.writeSlice(variantBlock, utf8Slice(variant.toJson(zoneId))); + } + } + return new ColumnChunk(variantBlock.build(), metadataChunk.getDefinitionLevels(), metadataChunk.getRepetitionLevels()); + } + + private ColumnChunk readArray(GroupField field) + throws IOException + { + List parameters = field.getType().getTypeParameters(); + checkArgument(parameters.size() == 1, "Arrays must have a single type parameter, found %s", parameters.size()); + Optional children = field.getChildren().get(0); + if (children.isEmpty()) { + return new ColumnChunk(createNullBlock(field.getType()), new int[] {}, new int[] {}); + } + Field elementField = children.get(); + ColumnChunk columnChunk = readColumnChunk(elementField); + + ListColumnReader.BlockPositions collectionPositions = calculateCollectionOffsets(field, columnChunk.getDefinitionLevels(), columnChunk.getRepetitionLevels()); + int positionsCount = collectionPositions.offsets().length - 1; + Block arrayBlock = ArrayBlock.fromElementBlock(positionsCount, collectionPositions.isNull(), collectionPositions.offsets(), columnChunk.getBlock()); + return new ColumnChunk(arrayBlock, columnChunk.getDefinitionLevels(), columnChunk.getRepetitionLevels()); + } + + private ColumnChunk readMap(GroupField field) + throws IOException + { + List parameters = field.getType().getTypeParameters(); + checkArgument(parameters.size() == 2, "Maps must have two type parameters, found %s", parameters.size()); + Block[] blocks = new Block[parameters.size()]; + + ColumnChunk columnChunk = readColumnChunk(field.getChildren().get(0).get()); + blocks[0] = columnChunk.getBlock(); + Optional valueField = field.getChildren().get(1); + blocks[1] = valueField.isPresent() ? 
readColumnChunk(valueField.get()).getBlock() : createNullBlock(parameters.get(1)); + ListColumnReader.BlockPositions collectionPositions = calculateCollectionOffsets(field, columnChunk.getDefinitionLevels(), columnChunk.getRepetitionLevels()); + Block mapBlock = ((MapType) field.getType()).createBlockFromKeyValue(collectionPositions.isNull(), collectionPositions.offsets(), blocks[0], blocks[1]); + return new ColumnChunk(mapBlock, columnChunk.getDefinitionLevels(), columnChunk.getRepetitionLevels()); + } + + private ColumnChunk readStruct(GroupField field) + throws IOException + { + Block[] blocks = new Block[field.getType().getTypeParameters().size()]; + ColumnChunk columnChunk = null; + List> parameters = field.getChildren(); + for (int i = 0; i < blocks.length; i++) { + Optional parameter = parameters.get(i); + if (parameter.isPresent()) { + columnChunk = readColumnChunk(parameter.get()); + blocks[i] = columnChunk.getBlock(); + } + } + + if (columnChunk == null) { + throw new ParquetCorruptionException(dataSource.getId(), "Struct field does not have any children: %s", field); + } + + StructColumnReader.RowBlockPositions structIsNull = StructColumnReader.calculateStructOffsets(field, columnChunk.getDefinitionLevels(), columnChunk.getRepetitionLevels()); + Optional isNull = structIsNull.isNull(); + for (int i = 0; i < blocks.length; i++) { + if (blocks[i] == null) { + blocks[i] = RunLengthEncodedBlock.create(field.getType().getTypeParameters().get(i), null, structIsNull.positionsCount()); + } + else if (isNull.isPresent()) { + blocks[i] = toNotNullSupressedBlock(structIsNull.positionsCount(), isNull.get(), blocks[i]); + } + } + Block rowBlock = fromNotNullSuppressedFieldBlocks(structIsNull.positionsCount(), structIsNull.isNull(), blocks); + return new ColumnChunk(rowBlock, columnChunk.getDefinitionLevels(), columnChunk.getRepetitionLevels()); + } + + private static Block toNotNullSupressedBlock(int positionCount, boolean[] rowIsNull, Block fieldBlock) + { + // find a existing position in the block that is null + int nullIndex = -1; + if (fieldBlock.mayHaveNull()) { + for (int position = 0; position < fieldBlock.getPositionCount(); position++) { + if (fieldBlock.isNull(position)) { + nullIndex = position; + break; + } + } + } + // if there are no null positions, append a null to the end of the block + if (nullIndex == -1) { + nullIndex = fieldBlock.getPositionCount(); + fieldBlock = fieldBlock.copyWithAppendedNull(); + } + + // create a dictionary that maps null positions to the null index + int[] dictionaryIds = new int[positionCount]; + int nullSuppressedPosition = 0; + for (int position = 0; position < positionCount; position++) { + if (rowIsNull[position]) { + dictionaryIds[position] = nullIndex; + } + else { + dictionaryIds[position] = nullSuppressedPosition; + nullSuppressedPosition++; + } + } + return DictionaryBlock.create(positionCount, fieldBlock, dictionaryIds); + } + + @Nullable + private FilteredOffsetIndex getFilteredOffsetIndex(FilteredRowRanges rowRanges, int rowGroup, long rowGroupRowCount, ColumnPath columnPath) + { + Optional rowGroupColumnIndexStore = this.rowGroups.get(rowGroup).columnIndexStore(); + if (rowGroupColumnIndexStore.isEmpty()) { + return null; + } + // We have a selective rowRanges for the rowGroup, every column must have a valid offset index + // to figure out which rows need to be read from the required parquet pages + OffsetIndex offsetIndex = requireNonNull( + rowGroupColumnIndexStore.get().getOffsetIndex(columnPath), + format("Missing OffsetIndex for 
column %s", columnPath)); + return FilteredOffsetIndex.filterOffsetIndex(offsetIndex, rowRanges.getParquetRowRanges(), rowGroupRowCount); + } + + private ColumnChunk readPrimitive(PrimitiveField field) + throws IOException + { + ColumnDescriptor columnDescriptor = field.getDescriptor(); + int fieldId = field.getId(); + ColumnReader columnReader = columnReaders.get(fieldId); + if (!columnReader.hasPageReader()) { + validateParquet(currentBlockMetadata.getRowCount() > 0, dataSource.getId(), "Row group has 0 rows"); + ColumnChunkMetadata metadata = currentBlockMetadata.getColumnChunkMetaData(columnDescriptor); + FilteredRowRanges rowRanges = blockRowRanges[currentRowGroup]; + OffsetIndex offsetIndex = null; + if (rowRanges != null) { + offsetIndex = getFilteredOffsetIndex(rowRanges, currentRowGroup, currentBlockMetadata.getRowCount(), metadata.getPath()); + } + ChunkedInputStream columnChunkInputStream = chunkReaders.get(new ChunkKey(fieldId, currentRowGroup)); + columnReader.setPageReader( + createPageReader(dataSource.getId(), columnChunkInputStream, metadata, columnDescriptor, offsetIndex, fileCreatedBy), + Optional.ofNullable(rowRanges)); + } + ColumnChunk columnChunk = columnReader.readPrimitive(); + + // update max size per primitive column chunk + double bytesPerCell = ((double) columnChunk.getMaxBlockSize()) / batchSize; + double bytesPerCellDelta = bytesPerCell - maxBytesPerCell.getOrDefault(fieldId, 0.0); + if (bytesPerCellDelta > 0) { + // update batch size + maxCombinedBytesPerRow += bytesPerCellDelta; + maxBatchSize = toIntExact(min(maxBatchSize, max(1, (long) (options.getMaxReadBlockSize().toBytes() / maxCombinedBytesPerRow)))); + maxBytesPerCell.put(fieldId, bytesPerCell); + } + return columnChunk; + } + + public List getColumnFields() + { + return columnFields; + } + + public Metrics getMetrics() + { + ImmutableMap.Builder> metrics = ImmutableMap.>builder() + .putAll(codecMetrics); + if (columnIndexRowsFiltered >= 0) { + metrics.put(COLUMN_INDEX_ROWS_FILTERED, new LongCount(columnIndexRowsFiltered)); + } + + return new Metrics(metrics.buildOrThrow()); + } + + private void initializeColumnReaders() + { + for (PrimitiveField field : primitiveFields) { + columnReaders.put( + field.getId(), + columnReaderFactory.create(field, currentRowGroupMemoryContext)); + } + } + + public static List getPrimitiveFields(List fields) + { + Map primitiveFields = new HashMap<>(); + fields.forEach(field -> parseField(field, primitiveFields)); + + return ImmutableList.copyOf(primitiveFields.values()); + } + + private static void parseField(Field field, Map primitiveFields) + { + if (field instanceof PrimitiveField primitiveField) { + primitiveFields.put(primitiveField.getId(), primitiveField); + } + else if (field instanceof GroupField groupField) { + groupField.getChildren().stream() + .flatMap(Optional::stream) + .forEach(child -> parseField(child, primitiveFields)); + } + else if (field instanceof VariantField variantField) { + parseField(variantField.getValue(), primitiveFields); + parseField(variantField.getMetadata(), primitiveFields); + } + } + + public Block readBlock(Field field) + throws IOException + { + return readColumnChunk(field).getBlock(); + } + + private ColumnChunk readColumnChunk(Field field) + throws IOException + { + ColumnChunk columnChunk; + if (field instanceof VariantField variantField) { + columnChunk = readVariant(variantField); + } + else if (field.getType() instanceof RowType) { + columnChunk = readStruct((GroupField) field); + } + else if (field.getType() instanceof 
MapType) { + columnChunk = readMap((GroupField) field); + } + else if (field.getType() instanceof ArrayType) { + columnChunk = readArray((GroupField) field); + } + else { + columnChunk = readPrimitive((PrimitiveField) field); + } + return columnChunk; + } + + public ParquetDataSource getDataSource() + { + return dataSource; + } + + public AggregatedMemoryContext getMemoryContext() + { + return memoryContext; + } + + private static FilteredRowRanges[] calculateFilteredRowRanges( + List rowGroups, + Optional filter, + List primitiveFields) + { + FilteredRowRanges[] blockRowRanges = new FilteredRowRanges[rowGroups.size()]; + if (filter.isEmpty()) { + return blockRowRanges; + } + Set paths = primitiveFields.stream() + .map(field -> ColumnPath.get(field.getDescriptor().getPath())) + .collect(toImmutableSet()); + for (int rowGroup = 0; rowGroup < rowGroups.size(); rowGroup++) { + RowGroupInfo rowGroupInfo = rowGroups.get(rowGroup); + Optional rowGroupColumnIndexStore = rowGroupInfo.columnIndexStore(); + if (rowGroupColumnIndexStore.isEmpty()) { + continue; + } + long rowGroupRowCount = rowGroupInfo.prunedBlockMetadata().getRowCount(); + FilteredRowRanges rowRanges = new FilteredRowRanges(ColumnIndexFilter.calculateRowRanges( + FilterCompat.get(filter.get()), + rowGroupColumnIndexStore.get(), + paths, + rowGroupRowCount)); + if (rowRanges.getRowCount() < rowGroupRowCount) { + blockRowRanges[rowGroup] = rowRanges; + } + } + return blockRowRanges; + } + + private void validateWritePageChecksum(Page sourcePage) + { + if (writeChecksumBuilder.isPresent()) { + Page page = sourcePage; + writeChecksumBuilder.get().addPage(page); + rowGroupStatisticsValidation.orElseThrow().addPage(page); + } + } + + private void validateBlockMetadata(List rowGroups) + throws ParquetCorruptionException + { + if (writeValidation.isPresent()) { + writeValidation.get().validateBlocksMetadata(dataSource.getId(), rowGroups); + } + } + + @SuppressWarnings("FormatStringAnnotation") + @FormatMethod + private void validateWrite(java.util.function.Predicate test, String messageFormat, Object... args) + throws ParquetCorruptionException + { + if (writeValidation.isPresent() && !test.test(writeValidation.get())) { + throw new ParquetCorruptionException(dataSource.getId(), "Write validation failed: " + messageFormat, args); + } + } + + public static Block createNullBlock(Type type) + { + return type.createBlockBuilder(null, 1, 0) + .appendNull() + .build(); + } + + public static RowBlock fromFieldBlocks(int positionCount, Block[] fieldBlocks) + { + return createRowBlockInternal(positionCount, null, fieldBlocks); + } + + private static RowBlock createRowBlockInternal(int positionCount, boolean[] rowIsNull, Block[] fieldBlocks) + { + MethodHandle createRowBlockInternalMethod = Reflection.methodHandle(RowBlock.class, "createRowBlockInternal", int.class, boolean[].class, Block[].class); + try { + return (RowBlock) createRowBlockInternalMethod.invoke(null, positionCount, rowIsNull, fieldBlocks); + } + catch (Throwable e) { + Throwables.throwIfUnchecked(e); + throw new RuntimeException(e); + } + } + + /** + * Create a row block directly from field blocks that are not null-suppressed. The field value of a null row must be null. 
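To make the "not null-suppressed" precondition concrete, here is a small illustration with invented data (not part of the patch): for three rows where the first row is null, each field block must still carry three positions, with position 0 marked null; toNotNullSupressedBlock above produces exactly this shape by routing null rows to a null dictionary entry. A null-suppressed block, by contrast, would hold only the two positions for the non-null rows.

    // Invented example: a not-null-suppressed BIGINT field block for rows [null, {x = 1}, {x = 2}].
    Block field = new LongArrayBlock(
            3,
            Optional.of(new boolean[] {true, false, false}),    // position 0 belongs to the null row
            new long[] {0L, 1L, 2L});                            // the value at a null position is ignored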
+ */ + public static RowBlock fromNotNullSuppressedFieldBlocks(int positionCount, Optional rowIsNullOptional, Block[] fieldBlocks) + { + // verify that field values for null rows are null + if (rowIsNullOptional.isPresent()) { + boolean[] rowIsNull = rowIsNullOptional.get(); + checkArrayRange(rowIsNull, 0, positionCount); + + for (int fieldIndex = 0; fieldIndex < fieldBlocks.length; fieldIndex++) { + Block field = fieldBlocks[fieldIndex]; + for (int position = 0; position < positionCount; position++) { + if (rowIsNull[position] && !field.isNull(position)) { + throw new IllegalArgumentException(format("Field value for null row must be null: field %s, position %s", fieldIndex, position)); + } + } + } + } + return createRowBlockInternal(positionCount, null, fieldBlocks); + } + + static void checkArrayRange(boolean[] array, int offset, int length) + { + requireNonNull(array, "array is null"); + if (offset < 0 || length < 0 || offset + length > array.length) { + throw new IndexOutOfBoundsException(format("Invalid offset %s and length %s in array with %s elements", offset, length, array.length)); + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/ParquetReaderNew.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/ParquetReaderNew.java new file mode 100644 index 000000000000..02b1b78bc2c8 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/ParquetReaderNew.java @@ -0,0 +1,823 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.parquet.reader; + +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ListMultimap; +import com.google.errorprone.annotations.CheckReturnValue; +import com.google.errorprone.annotations.FormatMethod; +import io.airlift.log.Logger; +import io.airlift.slice.Slice; +import io.trino.memory.context.AggregatedMemoryContext; +import io.trino.parquet.ChunkKey; +import io.trino.parquet.Column; +import io.trino.parquet.DiskRange; +import io.trino.parquet.Field; +import io.trino.parquet.GroupField; +import io.trino.parquet.ParquetCorruptionException; +import io.trino.parquet.ParquetDataSource; +import io.trino.parquet.ParquetReaderOptions; +import io.trino.parquet.ParquetWriteValidation; +import io.trino.parquet.PrimitiveField; +import io.trino.parquet.VariantField; +import io.trino.parquet.metadata.ColumnChunkMetadata; +import io.trino.parquet.metadata.PrunedBlockMetadata; +import io.trino.parquet.predicate.TupleDomainParquetPredicate; +import io.trino.parquet.reader.FilteredOffsetIndex.OffsetRange; +import io.trino.parquet.spark.Variant; +import io.trino.plugin.base.metrics.LongCount; +import io.trino.spi.Page; +import io.trino.spi.block.ArrayBlock; +import io.trino.spi.block.Block; +import io.trino.spi.block.BlockBuilder; +import io.trino.spi.block.DictionaryBlock; +import io.trino.spi.block.LongArrayBlock; +import io.trino.spi.block.RunLengthEncodedBlock; +import io.trino.spi.metrics.Metric; +import io.trino.spi.metrics.Metrics; +import io.trino.spi.type.ArrayType; +import io.trino.spi.type.MapType; +import io.trino.spi.type.RowType; +import io.trino.spi.type.Type; +import jakarta.annotation.Nullable; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.filter2.compat.FilterCompat; +import org.apache.parquet.filter2.predicate.FilterPredicate; +import org.apache.parquet.hadoop.metadata.ColumnPath; +import org.apache.parquet.internal.column.columnindex.OffsetIndex; +import org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter; +import org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore; +import org.joda.time.DateTimeZone; + +import java.io.Closeable; +import java.io.IOException; +import java.time.ZoneId; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.function.Function; +import java.util.function.ObjLongConsumer; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkState; +import static com.google.common.collect.ImmutableList.toImmutableList; +import static com.google.common.collect.ImmutableSet.toImmutableSet; +import static io.airlift.slice.SizeOf.instanceSize; +import static io.airlift.slice.SizeOf.sizeOf; +import static io.airlift.slice.Slices.utf8Slice; +import static io.trino.parquet.ParquetValidationUtils.validateParquet; +import static io.trino.parquet.ParquetWriteValidation.StatisticsValidation; +import static io.trino.parquet.ParquetWriteValidation.StatisticsValidation.createStatisticsValidationBuilder; +import static io.trino.parquet.ParquetWriteValidation.WriteChecksumBuilder; +import static io.trino.parquet.ParquetWriteValidation.WriteChecksumBuilder.createWriteChecksumBuilder; +import static io.trino.parquet.reader.ListColumnReader.calculateCollectionOffsets; +import static 
io.trino.parquet.reader.PageReader.createPageReader; +import static io.trino.plugin.iceberg.IcebergUtil.createNullBlock; +import static io.trino.plugin.iceberg.IcebergUtil.fromNotNullSuppressedFieldBlocks; +import static io.trino.spi.type.VarbinaryType.VARBINARY; +import static io.trino.spi.type.VarcharType.VARCHAR; +import static java.lang.Math.max; +import static java.lang.Math.min; +import static java.lang.Math.toIntExact; +import static java.lang.String.format; +import static java.util.Objects.checkIndex; +import static java.util.Objects.requireNonNull; + +public class ParquetReaderNew + implements Closeable +{ + private static final Logger log = Logger.get(ParquetReader.class); + + private static final int INITIAL_BATCH_SIZE = 1; + private static final int BATCH_SIZE_GROWTH_FACTOR = 2; + public static final String PARQUET_CODEC_METRIC_PREFIX = "ParquetReaderCompressionFormat_"; + public static final String COLUMN_INDEX_ROWS_FILTERED = "ParquetColumnIndexRowsFiltered"; + + private final Optional fileCreatedBy; + private final List rowGroups; + private final List columnFields; + private final boolean appendRowNumberColumn; + private final List primitiveFields; + private final ParquetDataSource dataSource; + private final ZoneId zoneId; + private final ColumnReaderFactory columnReaderFactory; + private final AggregatedMemoryContext memoryContext; + + private int currentRowGroup = -1; + private PrunedBlockMetadata currentBlockMetadata; + private long currentGroupRowCount; + /** + * Index in the Parquet file of the first row of the current group + */ + private long firstRowIndexInGroup; + /** + * Index in the current group of the next row + */ + private long nextRowInGroup; + private int batchSize; + private int nextBatchSize = INITIAL_BATCH_SIZE; + private final Map columnReaders; + private final Map maxBytesPerCell; + private double maxCombinedBytesPerRow; + private final ParquetReaderOptions options; + private int maxBatchSize; + + private AggregatedMemoryContext currentRowGroupMemoryContext; + private final Map chunkReaders; + private final Optional writeValidation; + private final Optional writeChecksumBuilder; + private final Optional rowGroupStatisticsValidation; + private final FilteredRowRanges[] blockRowRanges; + private final Function exceptionTransform; + private final Map> codecMetrics; + + private int currentPageId; + + private long columnIndexRowsFiltered = -1; + + public ParquetReaderNew( + Optional fileCreatedBy, + List columnFields, + boolean appendRowNumberColumn, + List rowGroups, + ParquetDataSource dataSource, + DateTimeZone timeZone, + AggregatedMemoryContext memoryContext, + ParquetReaderOptions options, + Function exceptionTransform, + Optional parquetPredicate, + Optional writeValidation) + throws IOException + { + this.fileCreatedBy = requireNonNull(fileCreatedBy, "fileCreatedBy is null"); + requireNonNull(columnFields, "columnFields is null"); + this.columnFields = ImmutableList.copyOf(columnFields); + this.appendRowNumberColumn = appendRowNumberColumn; + this.primitiveFields = getPrimitiveFields(columnFields.stream().map(Column::field).collect(toImmutableList())); + this.rowGroups = requireNonNull(rowGroups, "rowGroups is null"); + this.dataSource = requireNonNull(dataSource, "dataSource is null"); + this.zoneId = requireNonNull(timeZone, "timeZone is null").toTimeZone().toZoneId(); + this.columnReaderFactory = new ColumnReaderFactory(timeZone, options); + this.memoryContext = requireNonNull(memoryContext, "memoryContext is null"); + 
this.currentRowGroupMemoryContext = memoryContext.newAggregatedMemoryContext(); + this.options = requireNonNull(options, "options is null"); + this.maxBatchSize = options.getMaxReadBlockRowCount(); + this.columnReaders = new HashMap<>(); + this.maxBytesPerCell = new HashMap<>(); + + this.writeValidation = requireNonNull(writeValidation, "writeValidation is null"); + validateWrite( + validation -> fileCreatedBy.equals(Optional.of(validation.getCreatedBy())), + "Expected created by %s, found %s", + writeValidation.map(ParquetWriteValidation::getCreatedBy), + fileCreatedBy); + validateBlockMetadata(rowGroups); + this.writeChecksumBuilder = writeValidation.map(validation -> createWriteChecksumBuilder(validation.getTypes())); + this.rowGroupStatisticsValidation = writeValidation.map(validation -> createStatisticsValidationBuilder(validation.getTypes())); + + requireNonNull(parquetPredicate, "parquetPredicate is null"); + Optional filter = Optional.empty(); + if (parquetPredicate.isPresent() && options.isUseColumnIndex()) { + filter = parquetPredicate.get().toParquetFilter(timeZone); + } + this.blockRowRanges = calculateFilteredRowRanges(rowGroups, filter, primitiveFields); + + this.exceptionTransform = exceptionTransform; + ListMultimap ranges = ArrayListMultimap.create(); + Map codecMetrics = new HashMap<>(); + for (int rowGroup = 0; rowGroup < rowGroups.size(); rowGroup++) { + PrunedBlockMetadata blockMetadata = rowGroups.get(rowGroup).prunedBlockMetadata(); + long rowGroupRowCount = blockMetadata.getRowCount(); + for (PrimitiveField field : primitiveFields) { + int columnId = field.getId(); + ColumnChunkMetadata chunkMetadata = blockMetadata.getColumnChunkMetaData(field.getDescriptor()); + ColumnPath columnPath = chunkMetadata.getPath(); + + long startingPosition = chunkMetadata.getStartingPos(); + long totalLength = chunkMetadata.getTotalSize(); + long totalDataSize = 0; + FilteredOffsetIndex filteredOffsetIndex = null; + if (blockRowRanges[rowGroup] != null) { + filteredOffsetIndex = getFilteredOffsetIndex(blockRowRanges[rowGroup], rowGroup, rowGroupRowCount, columnPath); + } + if (filteredOffsetIndex == null) { + DiskRange range = new DiskRange(startingPosition, totalLength); + totalDataSize = range.getLength(); + ranges.put(new ChunkKey(columnId, rowGroup), range); + } + else { + List offsetRanges = filteredOffsetIndex.calculateOffsetRanges(startingPosition); + for (OffsetRange offsetRange : offsetRanges) { + DiskRange range = new DiskRange(offsetRange.getOffset(), offsetRange.getLength()); + totalDataSize += range.getLength(); + ranges.put(new ChunkKey(columnId, rowGroup), range); + } + // Initialize columnIndexRowsFiltered only when column indexes are found and used + columnIndexRowsFiltered = 0; + } + // Update the metrics which records the codecs used along with data size + codecMetrics.merge( + PARQUET_CODEC_METRIC_PREFIX + chunkMetadata.getCodec().name(), + new LongCount(totalDataSize), + LongCount::mergeWith); + } + } + this.codecMetrics = ImmutableMap.copyOf(codecMetrics); + this.chunkReaders = dataSource.planRead(ranges, memoryContext); + } + + @Override + public void close() + throws IOException + { + // Release memory usage from column readers + columnReaders.clear(); + currentRowGroupMemoryContext.close(); + + for (ChunkedInputStream chunkedInputStream : chunkReaders.values()) { + chunkedInputStream.close(); + } + dataSource.close(); + + if (writeChecksumBuilder.isPresent()) { + ParquetWriteValidation parquetWriteValidation = writeValidation.orElseThrow(); + 
parquetWriteValidation.validateChecksum(dataSource.getId(), writeChecksumBuilder.get().build()); + } + } + + public Page nextPage() + throws IOException + { + int batchSize = nextBatch(); + if (batchSize <= 0) { + return null; + } + // create a lazy page + currentPageId++; + Page page = new ParquetSourcePage(batchSize).getPage(); + validateWritePageChecksum(page); + return page; + } + + private class ParquetSourcePage + { + private static final long INSTANCE_SIZE = instanceSize(ParquetSourcePage.class); + + private final int expectedPageId = currentPageId; + private final Block[] blocks = new Block[columnFields.size() + (appendRowNumberColumn ? 1 : 0)]; + private final int rowNumberColumnIndex = appendRowNumberColumn ? columnFields.size() : -1; + private SelectedPositions selectedPositions; + + private long sizeInBytes; + private long retainedSizeInBytes; + + public ParquetSourcePage(int positionCount) + { + selectedPositions = new SelectedPositions(positionCount, null); + retainedSizeInBytes = shallowRetainedSizeInBytes(); + } + + public int getPositionCount() + { + return selectedPositions.positionCount(); + } + + public long getSizeInBytes() + { + return sizeInBytes; + } + + public long getRetainedSizeInBytes() + { + return retainedSizeInBytes; + } + + private long shallowRetainedSizeInBytes() + { + return INSTANCE_SIZE + + sizeOf(blocks) + + selectedPositions.retainedSizeInBytes(); + } + + public void retainedBytesForEachPart(ObjLongConsumer consumer) + { + consumer.accept(this, INSTANCE_SIZE); + consumer.accept(blocks, sizeOf(blocks)); + consumer.accept(selectedPositions, selectedPositions.retainedSizeInBytes()); + for (Block block : blocks) { + if (block != null) { + block.retainedBytesForEachPart(consumer); + } + } + } + + public int getChannelCount() + { + return blocks.length; + } + + public Block getBlock(int channel) + { + checkState(currentPageId == expectedPageId, "Parquet reader has been advanced beyond block"); + Block block = blocks[channel]; + if (block == null) { + if (channel == rowNumberColumnIndex) { + block = selectedPositions.createRowNumberBlock(lastBatchStartRow()); + } + else { + try { + // todo use selected positions to improve read performance + block = readBlock(columnFields.get(channel).field()); + } + catch (IOException e) { + throw exceptionTransform.apply(e); + } + block = selectedPositions.apply(block); + } + blocks[channel] = block; + sizeInBytes += block.getSizeInBytes(); + retainedSizeInBytes += block.getRetainedSizeInBytes(); + } + return block; + } + + public Page getPage() + { + // ensure all blocks are loaded + for (int i = 0; i < blocks.length; i++) { + getBlock(i); + } + return new Page(selectedPositions.positionCount(), blocks); + } + + public void selectPositions(int[] positions, int offset, int size) + { + selectedPositions = selectedPositions.selectPositions(positions, offset, size); + retainedSizeInBytes = shallowRetainedSizeInBytes(); + for (int i = 0; i < blocks.length; i++) { + Block block = blocks[i]; + if (block != null) { + block = selectedPositions.apply(block); + retainedSizeInBytes += block.getRetainedSizeInBytes(); + blocks[i] = block; + } + } + } + } + + private record SelectedPositions(int positionCount, @Nullable int[] positions) + { + private static final long INSTANCE_SIZE = instanceSize(SelectedPositions.class); + + public long retainedSizeInBytes() + { + return INSTANCE_SIZE + sizeOf(positions); + } + + @CheckReturnValue + public Block apply(Block block) + { + if (positions == null) { + return block; + } + return 
block.getPositions(positions, 0, positionCount); + } + + public Block createRowNumberBlock(long startRowNumber) + { + long[] rowNumbers = new long[positionCount]; + for (int i = 0; i < positionCount; i++) { + int position = positions == null ? i : positions[i]; + rowNumbers[i] = startRowNumber + position; + } + return new LongArrayBlock(positionCount, Optional.empty(), rowNumbers); + } + + @CheckReturnValue + public SelectedPositions selectPositions(int[] positions, int offset, int size) + { + if (this.positions == null) { + for (int i = 0; i < size; i++) { + checkIndex(offset + i, positionCount); + } + return new SelectedPositions(size, Arrays.copyOfRange(positions, offset, offset + size)); + } + + int[] newPositions = new int[size]; + for (int i = 0; i < size; i++) { + newPositions[i] = this.positions[positions[offset + i]]; + } + return new SelectedPositions(size, newPositions); + } + } + + /** + * Get the global row index of the first row in the last batch. + */ + public long lastBatchStartRow() + { + return firstRowIndexInGroup + nextRowInGroup - batchSize; + } + + private int nextBatch() + throws IOException + { + if (nextRowInGroup >= currentGroupRowCount && !advanceToNextRowGroup()) { + return -1; + } + + batchSize = min(nextBatchSize, maxBatchSize); + nextBatchSize = min(batchSize * BATCH_SIZE_GROWTH_FACTOR, options.getMaxReadBlockRowCount()); + batchSize = toIntExact(min(batchSize, currentGroupRowCount - nextRowInGroup)); + + nextRowInGroup += batchSize; + columnReaders.values().forEach(reader -> reader.prepareNextRead(batchSize)); + return batchSize; + } + + private boolean advanceToNextRowGroup() + throws IOException + { + currentRowGroupMemoryContext.close(); + currentRowGroupMemoryContext = memoryContext.newAggregatedMemoryContext(); + freeCurrentRowGroupBuffers(); + + if (currentRowGroup >= 0 && rowGroupStatisticsValidation.isPresent()) { + StatisticsValidation statisticsValidation = rowGroupStatisticsValidation.get(); + //writeValidation.orElseThrow().validateRowGroupStatistics(dataSource.getId(), currentBlockMetadata, statisticsValidation.build()); + statisticsValidation.reset(); + } + + currentRowGroup++; + if (currentRowGroup == rowGroups.size()) { + return false; + } + RowGroupInfo rowGroupInfo = rowGroups.get(currentRowGroup); + currentBlockMetadata = rowGroupInfo.prunedBlockMetadata(); + firstRowIndexInGroup = rowGroupInfo.fileRowOffset(); + currentGroupRowCount = currentBlockMetadata.getRowCount(); + FilteredRowRanges currentGroupRowRanges = blockRowRanges[currentRowGroup]; + log.debug("advanceToNextRowGroup dataSource %s, currentRowGroup %d, rowRanges %s, currentBlockMetadata %s", dataSource.getId(), currentRowGroup, currentGroupRowRanges, currentBlockMetadata); + if (currentGroupRowRanges != null) { + long rowCount = currentGroupRowRanges.getRowCount(); + columnIndexRowsFiltered += currentGroupRowCount - rowCount; + if (rowCount == 0) { + // Filters on multiple columns with page indexes may yield non-overlapping row ranges and eliminate the entire row group. 
+ // Advance to next row group to ensure that we don't return a null Page and close the page source before all row groups are processed + return advanceToNextRowGroup(); + } + currentGroupRowCount = rowCount; + } + nextRowInGroup = 0L; + initializeColumnReaders(); + return true; + } + + private void freeCurrentRowGroupBuffers() + { + if (currentRowGroup < 0) { + return; + } + + for (PrimitiveField field : primitiveFields) { + ChunkedInputStream chunkedStream = chunkReaders.get(new ChunkKey(field.getId(), currentRowGroup)); + if (chunkedStream != null) { + chunkedStream.close(); + } + } + } + + private ColumnChunk readVariant(VariantField field) + throws IOException + { + ColumnChunk metadataChunk = readColumnChunk(field.getMetadata()); + + int positionCount = metadataChunk.getBlock().getPositionCount(); + BlockBuilder variantBlock = VARCHAR.createBlockBuilder(null, max(1, positionCount)); + if (positionCount == 0) { + variantBlock.appendNull(); + } + else { + ColumnChunk valueChunk = readColumnChunk(field.getValue()); + for (int position = 0; position < positionCount; position++) { + Slice metadata = VARBINARY.getSlice(metadataChunk.getBlock(), position); + if (metadata.length() == 0) { + variantBlock.appendNull(); + continue; + } + Slice value = VARBINARY.getSlice(valueChunk.getBlock(), position); + Variant variant = new Variant(value.getBytes(), metadata.getBytes()); + VARCHAR.writeSlice(variantBlock, utf8Slice(variant.toJson(zoneId))); + } + } + return new ColumnChunk(variantBlock.build(), metadataChunk.getDefinitionLevels(), metadataChunk.getRepetitionLevels()); + } + + private ColumnChunk readArray(GroupField field) + throws IOException + { + List parameters = field.getType().getTypeParameters(); + checkArgument(parameters.size() == 1, "Arrays must have a single type parameter, found %s", parameters.size()); + Optional children = field.getChildren().get(0); + if (children.isEmpty()) { + return new ColumnChunk(createNullBlock(field.getType()), new int[] {}, new int[] {}); + } + Field elementField = children.get(); + ColumnChunk columnChunk = readColumnChunk(elementField); + + ListColumnReader.BlockPositions collectionPositions = calculateCollectionOffsets(field, columnChunk.getDefinitionLevels(), columnChunk.getRepetitionLevels()); + int positionsCount = collectionPositions.offsets().length - 1; + Block arrayBlock = ArrayBlock.fromElementBlock(positionsCount, collectionPositions.isNull(), collectionPositions.offsets(), columnChunk.getBlock()); + return new ColumnChunk(arrayBlock, columnChunk.getDefinitionLevels(), columnChunk.getRepetitionLevels()); + } + + private ColumnChunk readMap(GroupField field) + throws IOException + { + List parameters = field.getType().getTypeParameters(); + checkArgument(parameters.size() == 2, "Maps must have two type parameters, found %s", parameters.size()); + Block[] blocks = new Block[parameters.size()]; + + ColumnChunk columnChunk = readColumnChunk(field.getChildren().get(0).get()); + blocks[0] = columnChunk.getBlock(); + Optional valueField = field.getChildren().get(1); + blocks[1] = valueField.isPresent() ? 
readColumnChunk(valueField.get()).getBlock() : createNullBlock(parameters.get(1)); + ListColumnReader.BlockPositions collectionPositions = calculateCollectionOffsets(field, columnChunk.getDefinitionLevels(), columnChunk.getRepetitionLevels()); + Block mapBlock = ((MapType) field.getType()).createBlockFromKeyValue(collectionPositions.isNull(), collectionPositions.offsets(), blocks[0], blocks[1]); + return new ColumnChunk(mapBlock, columnChunk.getDefinitionLevels(), columnChunk.getRepetitionLevels()); + } + + private ColumnChunk readStruct(GroupField field) + throws IOException + { + Block[] blocks = new Block[field.getType().getTypeParameters().size()]; + ColumnChunk columnChunk = null; + List> parameters = field.getChildren(); + for (int i = 0; i < blocks.length; i++) { + Optional parameter = parameters.get(i); + if (parameter.isPresent()) { + columnChunk = readColumnChunk(parameter.get()); + blocks[i] = columnChunk.getBlock(); + } + } + + if (columnChunk == null) { + throw new ParquetCorruptionException(dataSource.getId(), "Struct field does not have any children: %s", field); + } + + StructColumnReader.RowBlockPositions structIsNull = StructColumnReader.calculateStructOffsets(field, columnChunk.getDefinitionLevels(), columnChunk.getRepetitionLevels()); + Optional isNull = structIsNull.isNull(); + for (int i = 0; i < blocks.length; i++) { + if (blocks[i] == null) { + blocks[i] = RunLengthEncodedBlock.create(field.getType().getTypeParameters().get(i), null, structIsNull.positionsCount()); + } + else if (isNull.isPresent()) { + blocks[i] = toNotNullSuppressedBlock(structIsNull.positionsCount(), isNull.get(), blocks[i]); + } + } + Block rowBlock = fromNotNullSuppressedFieldBlocks(structIsNull.positionsCount(), structIsNull.isNull(), blocks); + return new ColumnChunk(rowBlock, columnChunk.getDefinitionLevels(), columnChunk.getRepetitionLevels()); + } + + private static Block toNotNullSuppressedBlock(int positionCount, boolean[] rowIsNull, Block fieldBlock) + { + // find an existing position in the block that is null + int nullIndex = -1; + if (fieldBlock.mayHaveNull()) { + for (int position = 0; position < fieldBlock.getPositionCount(); position++) { + if (fieldBlock.isNull(position)) { + nullIndex = position; + break; + } + } + } + // if there are no null positions, append a null to the end of the block + if (nullIndex == -1) { + nullIndex = fieldBlock.getPositionCount(); + fieldBlock = fieldBlock.copyWithAppendedNull(); + } + + // create dictionary ids that map null rows to the null index and non-null rows to their position in the null-suppressed block + int[] dictionaryIds = new int[positionCount]; + int nullSuppressedPosition = 0; + for (int position = 0; position < positionCount; position++) { + if (rowIsNull[position]) { + dictionaryIds[position] = nullIndex; + } + else { + dictionaryIds[position] = nullSuppressedPosition; + nullSuppressedPosition++; + } + } + return DictionaryBlock.create(positionCount, fieldBlock, dictionaryIds); + } + + @Nullable + private FilteredOffsetIndex getFilteredOffsetIndex(FilteredRowRanges rowRanges, int rowGroup, long rowGroupRowCount, ColumnPath columnPath) + { + Optional rowGroupColumnIndexStore = this.rowGroups.get(rowGroup).columnIndexStore(); + if (rowGroupColumnIndexStore.isEmpty()) { + return null; + } + // We have selective rowRanges for the rowGroup, so every column must have a valid offset index + // to figure out which rows need to be read from the required parquet pages + OffsetIndex offsetIndex = requireNonNull( + rowGroupColumnIndexStore.get().getOffsetIndex(columnPath), + format("Missing OffsetIndex for 
column %s", columnPath)); + return FilteredOffsetIndex.filterOffsetIndex(offsetIndex, rowRanges.getParquetRowRanges(), rowGroupRowCount); + } + + private ColumnChunk readPrimitive(PrimitiveField field) + throws IOException + { + ColumnDescriptor columnDescriptor = field.getDescriptor(); + int fieldId = field.getId(); + ColumnReader columnReader = columnReaders.get(fieldId); + if (!columnReader.hasPageReader()) { + validateParquet(currentBlockMetadata.getRowCount() > 0, dataSource.getId(), "Row group has 0 rows"); + ColumnChunkMetadata metadata = currentBlockMetadata.getColumnChunkMetaData(columnDescriptor); + FilteredRowRanges rowRanges = blockRowRanges[currentRowGroup]; + OffsetIndex offsetIndex = null; + if (rowRanges != null) { + offsetIndex = getFilteredOffsetIndex(rowRanges, currentRowGroup, currentBlockMetadata.getRowCount(), metadata.getPath()); + } + ChunkedInputStream columnChunkInputStream = chunkReaders.get(new ChunkKey(fieldId, currentRowGroup)); + columnReader.setPageReader( + createPageReader(dataSource.getId(), columnChunkInputStream, metadata, columnDescriptor, offsetIndex, fileCreatedBy), + Optional.ofNullable(rowRanges)); + } + ColumnChunk columnChunk = columnReader.readPrimitive(); + + // update max size per primitive column chunk + double bytesPerCell = ((double) columnChunk.getMaxBlockSize()) / batchSize; + double bytesPerCellDelta = bytesPerCell - maxBytesPerCell.getOrDefault(fieldId, 0.0); + if (bytesPerCellDelta > 0) { + // update batch size + maxCombinedBytesPerRow += bytesPerCellDelta; + maxBatchSize = toIntExact(min(maxBatchSize, max(1, (long) (options.getMaxReadBlockSize().toBytes() / maxCombinedBytesPerRow)))); + maxBytesPerCell.put(fieldId, bytesPerCell); + } + return columnChunk; + } + + public List getColumnFields() + { + return columnFields; + } + + public Metrics getMetrics() + { + ImmutableMap.Builder> metrics = ImmutableMap.>builder() + .putAll(codecMetrics); + if (columnIndexRowsFiltered >= 0) { + metrics.put(COLUMN_INDEX_ROWS_FILTERED, new LongCount(columnIndexRowsFiltered)); + } + + return new Metrics(metrics.buildOrThrow()); + } + + private void initializeColumnReaders() + { + for (PrimitiveField field : primitiveFields) { + columnReaders.put( + field.getId(), + columnReaderFactory.create(field, currentRowGroupMemoryContext)); + } + } + + public static List getPrimitiveFields(List fields) + { + Map primitiveFields = new HashMap<>(); + fields.forEach(field -> parseField(field, primitiveFields)); + + return ImmutableList.copyOf(primitiveFields.values()); + } + + private static void parseField(Field field, Map primitiveFields) + { + if (field instanceof PrimitiveField primitiveField) { + primitiveFields.put(primitiveField.getId(), primitiveField); + } + else if (field instanceof GroupField groupField) { + groupField.getChildren().stream() + .flatMap(Optional::stream) + .forEach(child -> parseField(child, primitiveFields)); + } + else if (field instanceof VariantField variantField) { + parseField(variantField.getValue(), primitiveFields); + parseField(variantField.getMetadata(), primitiveFields); + } + } + + public Block readBlock(Field field) + throws IOException + { + return readColumnChunk(field).getBlock(); + } + + private ColumnChunk readColumnChunk(Field field) + throws IOException + { + ColumnChunk columnChunk; + if (field instanceof VariantField variantField) { + columnChunk = readVariant(variantField); + } + else if (field.getType() instanceof RowType) { + columnChunk = readStruct((GroupField) field); + } + else if (field.getType() instanceof 
MapType) { + columnChunk = readMap((GroupField) field); + } + else if (field.getType() instanceof ArrayType) { + columnChunk = readArray((GroupField) field); + } + else { + columnChunk = readPrimitive((PrimitiveField) field); + } + return columnChunk; + } + + public ParquetDataSource getDataSource() + { + return dataSource; + } + + public AggregatedMemoryContext getMemoryContext() + { + return memoryContext; + } + + private static FilteredRowRanges[] calculateFilteredRowRanges( + List rowGroups, + Optional filter, + List primitiveFields) + { + FilteredRowRanges[] blockRowRanges = new FilteredRowRanges[rowGroups.size()]; + if (filter.isEmpty()) { + return blockRowRanges; + } + Set paths = primitiveFields.stream() + .map(field -> ColumnPath.get(field.getDescriptor().getPath())) + .collect(toImmutableSet()); + for (int rowGroup = 0; rowGroup < rowGroups.size(); rowGroup++) { + RowGroupInfo rowGroupInfo = rowGroups.get(rowGroup); + Optional rowGroupColumnIndexStore = rowGroupInfo.columnIndexStore(); + if (rowGroupColumnIndexStore.isEmpty()) { + continue; + } + long rowGroupRowCount = rowGroupInfo.prunedBlockMetadata().getRowCount(); + FilteredRowRanges rowRanges = new FilteredRowRanges(ColumnIndexFilter.calculateRowRanges( + FilterCompat.get(filter.get()), + rowGroupColumnIndexStore.get(), + paths, + rowGroupRowCount)); + if (rowRanges.getRowCount() < rowGroupRowCount) { + blockRowRanges[rowGroup] = rowRanges; + } + } + return blockRowRanges; + } + + private void validateWritePageChecksum(Page sourcePage) + { + if (writeChecksumBuilder.isPresent()) { + Page page = sourcePage; + writeChecksumBuilder.get().addPage(page); + rowGroupStatisticsValidation.orElseThrow().addPage(page); + } + } + + private void validateBlockMetadata(List rowGroups) + throws ParquetCorruptionException + { + if (writeValidation.isPresent()) { + //writeValidation.get().validateBlocksMetadata(dataSource.getId(), rowGroups); + } + } + + @SuppressWarnings("FormatStringAnnotation") + @FormatMethod + private void validateWrite(java.util.function.Predicate test, String messageFormat, Object... args) + throws ParquetCorruptionException + { + if (writeValidation.isPresent() && !test.test(writeValidation.get())) { + throw new ParquetCorruptionException(dataSource.getId(), "Write validation failed: " + messageFormat, args); + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/RowGroupInfo.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/RowGroupInfo.java new file mode 100644 index 000000000000..a35ecf379394 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/RowGroupInfo.java @@ -0,0 +1,21 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.parquet.reader; + +import io.trino.parquet.metadata.PrunedBlockMetadata; +import org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore; + +import java.util.Optional; + +public record RowGroupInfo(PrunedBlockMetadata prunedBlockMetadata, long fileRowOffset, Optional columnIndexStore) {} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/SimpleSliceInputStream.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/SimpleSliceInputStream.java new file mode 100644 index 000000000000..e453b0eabd35 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/SimpleSliceInputStream.java @@ -0,0 +1,165 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.reader; + +import com.google.common.primitives.Shorts; +import io.airlift.slice.Slice; + +import static java.util.Objects.requireNonNull; + +/** + * Basic input stream based on a given Slice object. + * This is a simpler version of BasicSliceInput with a few additional methods. + *

+ * Note that methods starting with 'read' modify the underlying offset, while 'get' methods return + * value without modifying the state + */ +public final class SimpleSliceInputStream +{ + private final Slice slice; + private int offset; + + public SimpleSliceInputStream(Slice slice) + { + this(slice, 0); + } + + public SimpleSliceInputStream(Slice slice, int offset) + { + this.slice = requireNonNull(slice, "slice is null"); + this.offset = offset; + } + + public byte readByte() + { + return slice.getByte(offset++); + } + + public short readShort() + { + short value = slice.getShort(offset); + offset += Short.BYTES; + return value; + } + + public int readInt() + { + int value = slice.getInt(offset); + offset += Integer.BYTES; + return value; + } + + public long readLong() + { + long value = slice.getLong(offset); + offset += Long.BYTES; + return value; + } + + public byte[] readBytes() + { + byte[] bytes = slice.getBytes(); + offset = slice.length(); + return bytes; + } + + public void readBytes(byte[] output, int outputOffset, int length) + { + slice.getBytes(offset, output, outputOffset, length); + offset += length; + } + + public void readShorts(short[] output, int outputOffset, int length) + { + slice.getShorts(offset, output, outputOffset, length); + offset += length * Shorts.BYTES; + } + + public void readInts(int[] output, int outputOffset, int length) + { + slice.getInts(offset, output, outputOffset, length); + offset += length * Integer.BYTES; + } + + public void readLongs(long[] output, int outputOffset, int length) + { + slice.getLongs(offset, output, outputOffset, length); + offset += length * Long.BYTES; + } + + public void readBytes(Slice destination, int destinationIndex, int length) + { + slice.getBytes(offset, destination, destinationIndex, length); + offset += length; + } + + public Slice readSlice(int length) + { + Slice result = slice.slice(offset, length); + offset += length; + return result; + } + + public void skip(int n) + { + offset += n; + } + + public Slice asSlice() + { + return slice.slice(offset, slice.length() - offset); + } + + /** + * Returns the byte array wrapped by this Slice. + * Callers should take care to use {@link SimpleSliceInputStream#getByteArrayOffset()} + * since the contents of this Slice may not start at array index 0. + */ + public byte[] getByteArray() + { + return slice.byteArray(); + } + + /** + * Returns the start index the content of this slice within the byte array wrapped by this slice. + */ + public int getByteArrayOffset() + { + return offset + slice.byteArrayOffset(); + } + + public int readIntUnchecked() + { + int value = slice.getIntUnchecked(offset); + offset += Integer.BYTES; + return value; + } + + public long readLongUnchecked() + { + long value = slice.getLongUnchecked(offset); + offset += Long.BYTES; + return value; + } + + public byte getByteUnchecked(int index) + { + return slice.getByteUnchecked(offset + index); + } + + public int getIntUnchecked(int index) + { + return slice.getIntUnchecked(offset + index); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/StructColumnReader.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/StructColumnReader.java new file mode 100644 index 000000000000..8a5c6cf3ac19 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/StructColumnReader.java @@ -0,0 +1,76 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
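The read/get split documented on SimpleSliceInputStream above can be seen in a minimal sketch (the byte values and class name are illustrative, assuming io.airlift.slice.Slices is available): 'read' methods consume bytes and advance the internal offset, while the 'get...Unchecked' methods peek relative to the current offset without moving it.

import io.airlift.slice.Slices;
import io.trino.parquet.reader.SimpleSliceInputStream;

final class SimpleSliceInputStreamSketch
{
    private SimpleSliceInputStreamSketch() {}

    static void demonstrate()
    {
        SimpleSliceInputStream input = new SimpleSliceInputStream(
                Slices.wrappedBuffer(new byte[] {1, 2, 3, 4, 5, 6, 7, 8}));
        int first = input.readInt();           // consumes bytes 0-3, offset moves to 4
        int peeked = input.getIntUnchecked(0); // reads bytes 4-7 relative to the offset, offset stays at 4
        int second = input.readInt();          // same bytes as peeked, offset moves to 8
        assert peeked == second;
        assert first != second;
    }
}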
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.reader; + +import io.trino.parquet.Field; +import it.unimi.dsi.fastutil.booleans.BooleanArrayList; + +import java.util.Optional; + +import static io.trino.parquet.ParquetTypeUtils.isOptionalFieldValueNull; + +public final class StructColumnReader +{ + private StructColumnReader() {} + + /** + * Each struct has three variants of presence: + * 1) Struct is not defined, because one of it's optional parent fields is null + * 2) Struct is null + * 3) Struct is defined and not empty. + */ + public static RowBlockPositions calculateStructOffsets( + Field field, + int[] fieldDefinitionLevels, + int[] fieldRepetitionLevels) + { + int maxDefinitionLevel = field.getDefinitionLevel(); + int maxRepetitionLevel = field.getRepetitionLevel(); + boolean required = field.isRequired(); + if (required) { + int definedValuesCount = 0; + for (int i = 0; i < fieldDefinitionLevels.length; i++) { + if (fieldRepetitionLevels[i] <= maxRepetitionLevel) { + if (fieldDefinitionLevels[i] >= maxDefinitionLevel) { + // Struct is defined and not empty + definedValuesCount++; + } + } + } + return new RowBlockPositions(Optional.empty(), definedValuesCount); + } + + int nullValuesCount = 0; + BooleanArrayList structIsNull = new BooleanArrayList(); + for (int i = 0; i < fieldDefinitionLevels.length; i++) { + if (fieldRepetitionLevels[i] <= maxRepetitionLevel) { + if (isOptionalFieldValueNull(fieldDefinitionLevels[i], maxDefinitionLevel)) { + // Struct is null + structIsNull.add(true); + nullValuesCount++; + } + else if (fieldDefinitionLevels[i] >= maxDefinitionLevel) { + // Struct is defined and not empty + structIsNull.add(false); + } + } + } + if (nullValuesCount == 0) { + return new RowBlockPositions(Optional.empty(), structIsNull.size()); + } + return new RowBlockPositions(Optional.of(structIsNull.elements()), structIsNull.size()); + } + + public record RowBlockPositions(Optional isNull, int positionsCount) {} +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/TrinoColumnIndexStore.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/TrinoColumnIndexStore.java new file mode 100644 index 000000000000..fa9b7ae142d5 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/TrinoColumnIndexStore.java @@ -0,0 +1,229 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.parquet.reader; + +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ListMultimap; +import io.trino.parquet.DiskRange; +import io.trino.parquet.ParquetDataSource; +import io.trino.parquet.ParquetReaderOptions; +import io.trino.parquet.metadata.BlockMetadata; +import io.trino.parquet.metadata.ColumnChunkMetadata; +import io.trino.parquet.metadata.IndexReference; +import io.trino.spi.predicate.Domain; +import io.trino.spi.predicate.TupleDomain; +import jakarta.annotation.Nullable; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.format.Util; +import org.apache.parquet.hadoop.metadata.ColumnPath; +import org.apache.parquet.internal.column.columnindex.ColumnIndex; +import org.apache.parquet.internal.column.columnindex.OffsetIndex; +import org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore; +import org.apache.parquet.schema.PrimitiveType; + +import java.io.IOException; +import java.io.InputStream; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.function.BiFunction; + +import static com.google.common.collect.ImmutableMap.toImmutableMap; +import static com.google.common.collect.ImmutableSet.toImmutableSet; +import static io.trino.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext; +import static io.trino.parquet.ParquetMetadataConverter.fromParquetColumnIndex; +import static io.trino.parquet.ParquetMetadataConverter.fromParquetOffsetIndex; +import static java.util.Objects.requireNonNull; + +/** + * Internal implementation of {@link ColumnIndexStore}. + * Similar to org.apache.parquet.hadoop.ColumnIndexStoreImpl which is not accessible + */ +public class TrinoColumnIndexStore + implements ColumnIndexStore +{ + private final ParquetDataSource dataSource; + private final List columnIndexReferences; + private final List offsetIndexReferences; + + @Nullable + private Map columnIndexStore; + @Nullable + private Map offsetIndexStore; + + /** + * Creates a column index store which lazily reads column/offset indexes for the columns in paths. 
+ * + * @param columnsRead is the set of columns used for projection + * @param columnsFiltered is the set of columns used for filtering + */ + public TrinoColumnIndexStore( + ParquetDataSource dataSource, + BlockMetadata block, + Set columnsRead, + Set columnsFiltered) + { + this.dataSource = requireNonNull(dataSource, "dataSource is null"); + requireNonNull(block, "block is null"); + requireNonNull(columnsRead, "columnsRead is null"); + requireNonNull(columnsFiltered, "columnsFiltered is null"); + + ImmutableList.Builder columnIndexBuilder = ImmutableList.builderWithExpectedSize(columnsFiltered.size()); + ImmutableList.Builder offsetIndexBuilder = ImmutableList.builderWithExpectedSize(columnsRead.size()); + for (ColumnChunkMetadata column : block.columns()) { + ColumnPath path = column.getPath(); + if (column.getColumnIndexReference() != null && columnsFiltered.contains(path)) { + columnIndexBuilder.add(new ColumnIndexMetadata( + column.getColumnIndexReference(), + path, + column.getPrimitiveType())); + } + if (column.getOffsetIndexReference() != null && columnsRead.contains(path)) { + offsetIndexBuilder.add(new ColumnIndexMetadata( + column.getOffsetIndexReference(), + path, + column.getPrimitiveType())); + } + } + this.columnIndexReferences = columnIndexBuilder.build(); + this.offsetIndexReferences = offsetIndexBuilder.build(); + } + + @Override + public ColumnIndex getColumnIndex(ColumnPath column) + { + if (columnIndexStore == null) { + columnIndexStore = loadIndexes(dataSource, columnIndexReferences, (inputStream, columnMetadata) -> { + try { + return fromParquetColumnIndex(columnMetadata.getPrimitiveType(), Util.readColumnIndex(inputStream)); + } + catch (IOException e) { + throw new RuntimeException(e); + } + }); + } + + return columnIndexStore.get(column); + } + + @Override + public OffsetIndex getOffsetIndex(ColumnPath column) + { + if (offsetIndexStore == null) { + offsetIndexStore = loadIndexes(dataSource, offsetIndexReferences, (inputStream, columnMetadata) -> { + try { + return fromParquetOffsetIndex(Util.readOffsetIndex(inputStream)); + } + catch (IOException e) { + throw new RuntimeException(e); + } + }); + } + + return offsetIndexStore.get(column); + } + + public static Optional getColumnIndexStore( + ParquetDataSource dataSource, + BlockMetadata blockMetadata, + Map, ColumnDescriptor> descriptorsByPath, + TupleDomain parquetTupleDomain, + ParquetReaderOptions options) + { + if (!options.isUseColumnIndex() || parquetTupleDomain.isAll() || parquetTupleDomain.isNone()) { + return Optional.empty(); + } + + boolean hasColumnIndex = false; + for (ColumnChunkMetadata column : blockMetadata.columns()) { + if (column.getColumnIndexReference() != null && column.getOffsetIndexReference() != null) { + hasColumnIndex = true; + break; + } + } + + if (!hasColumnIndex) { + return Optional.empty(); + } + + Set columnsReadPaths = new HashSet<>(descriptorsByPath.size()); + for (List path : descriptorsByPath.keySet()) { + columnsReadPaths.add(ColumnPath.get(path.toArray(new String[0]))); + } + + Map parquetDomains = parquetTupleDomain.getDomains() + .orElseThrow(() -> new IllegalStateException("Predicate other than none should have domains")); + Set columnsFilteredPaths = parquetDomains.keySet().stream() + .map(column -> ColumnPath.get(column.getPath())) + .collect(toImmutableSet()); + + return Optional.of(new TrinoColumnIndexStore(dataSource, blockMetadata, columnsReadPaths, columnsFilteredPaths)); + } + + private static Map loadIndexes( + ParquetDataSource dataSource, + List 
indexMetadata, + BiFunction deserializer) + { + // Merge multiple small reads of the file for indexes stored close to each other + ListMultimap ranges = ArrayListMultimap.create(indexMetadata.size(), 1); + for (ColumnIndexMetadata column : indexMetadata) { + ranges.put(column.getPath(), column.getDiskRange()); + } + + Map columnInputStreams = dataSource.planRead(ranges, newSimpleAggregatedMemoryContext()); + try { + return indexMetadata.stream() + .collect(toImmutableMap( + ColumnIndexMetadata::getPath, + column -> deserializer.apply(columnInputStreams.get(column.getPath()), column))); + } + finally { + columnInputStreams.values().forEach(ChunkedInputStream::close); + } + } + + private static class ColumnIndexMetadata + { + private final DiskRange diskRange; + private final ColumnPath path; + private final PrimitiveType primitiveType; + + private ColumnIndexMetadata(IndexReference indexReference, ColumnPath path, PrimitiveType primitiveType) + { + requireNonNull(indexReference, "indexReference is null"); + this.diskRange = new DiskRange(indexReference.getOffset(), indexReference.getLength()); + this.path = requireNonNull(path, "path is null"); + this.primitiveType = requireNonNull(primitiveType, "primitiveType is null"); + } + + private DiskRange getDiskRange() + { + return diskRange; + } + + private ColumnPath getPath() + { + return path; + } + + private PrimitiveType getPrimitiveType() + { + return primitiveType; + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/ApacheParquetValueDecoders.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/ApacheParquetValueDecoders.java new file mode 100644 index 000000000000..417b1422f6a3 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/ApacheParquetValueDecoders.java @@ -0,0 +1,149 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.reader.decoders; + +import io.trino.parquet.reader.SimpleSliceInputStream; +import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.column.values.ValuesReader; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.nio.ByteBuffer; + +import static io.trino.parquet.ParquetReaderUtils.castToByte; +import static java.lang.Double.doubleToLongBits; +import static java.lang.Float.floatToIntBits; +import static java.util.Objects.requireNonNull; + +/** + * This is a set of proxy value decoders that use a delegated value reader from apache lib. 
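A brief usage sketch for the TrinoColumnIndexStore defined above; the helper name and the page index are illustrative. Indexes are read from the data source lazily, on the first getColumnIndex or getOffsetIndex call, and only for the column paths that were registered for filtering or projection.

import org.apache.parquet.hadoop.metadata.ColumnPath;
import org.apache.parquet.internal.column.columnindex.OffsetIndex;
import org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore;

final class ColumnIndexStoreSketch
{
    private ColumnIndexStoreSketch() {}

    static long firstPageOffset(ColumnIndexStore indexStore, String... pathElements)
    {
        // the first call loads and caches all registered offset indexes in one planned read
        OffsetIndex offsetIndex = indexStore.getOffsetIndex(ColumnPath.get(pathElements));
        return offsetIndex.getOffset(0);
    }
}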
+ */ +public class ApacheParquetValueDecoders +{ + private ApacheParquetValueDecoders() {} + + public static final class BooleanApacheParquetValueDecoder + implements ValueDecoder + { + private final ValuesReader delegate; + + public BooleanApacheParquetValueDecoder(ValuesReader delegate) + { + this.delegate = requireNonNull(delegate, "delegate is null"); + } + + @Override + public void init(SimpleSliceInputStream input) + { + byte[] buffer = input.readBytes(); + try { + // Deprecated PLAIN boolean decoder from Apache lib is the only one that actually + // uses the valueCount argument to allocate memory so we simulate it here. + int valueCount = buffer.length * Byte.SIZE; + delegate.initFromPage(valueCount, ByteBufferInputStream.wrap(ByteBuffer.wrap(buffer, 0, buffer.length))); + } + catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + @Override + public void read(byte[] values, int offset, int length) + { + for (int i = offset; i < offset + length; i++) { + values[i] = castToByte(delegate.readBoolean()); + } + } + + @Override + public void skip(int n) + { + delegate.skip(n); + } + } + + public static final class DoubleApacheParquetValueDecoder + implements ValueDecoder + { + private final ValuesReader delegate; + + public DoubleApacheParquetValueDecoder(ValuesReader delegate) + { + this.delegate = requireNonNull(delegate, "delegate is null"); + } + + @Override + public void init(SimpleSliceInputStream input) + { + initialize(input, delegate, Double.BYTES); + } + + @Override + public void read(long[] values, int offset, int length) + { + for (int i = offset; i < offset + length; i++) { + values[i] = doubleToLongBits(delegate.readDouble()); + } + } + + @Override + public void skip(int n) + { + delegate.skip(n); + } + } + + public static final class FloatApacheParquetValueDecoder + implements ValueDecoder + { + private final ValuesReader delegate; + + public FloatApacheParquetValueDecoder(ValuesReader delegate) + { + this.delegate = requireNonNull(delegate, "delegate is null"); + } + + @Override + public void init(SimpleSliceInputStream input) + { + initialize(input, delegate, Float.BYTES); + } + + @Override + public void read(int[] values, int offset, int length) + { + for (int i = offset; i < offset + length; i++) { + values[i] = floatToIntBits(delegate.readFloat()); + } + } + + @Override + public void skip(int n) + { + delegate.skip(n); + } + } + + private static void initialize(SimpleSliceInputStream input, ValuesReader reader, int elementSizeInBytes) + { + byte[] buffer = input.readBytes(); + try { + int valueCount = buffer.length / elementSizeInBytes; + reader.initFromPage(valueCount, ByteBufferInputStream.wrap(ByteBuffer.wrap(buffer, 0, buffer.length))); + } + catch (IOException e) { + throw new UncheckedIOException(e); + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/BooleanPlainValueDecoders.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/BooleanPlainValueDecoders.java new file mode 100644 index 000000000000..21134233a7cb --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/BooleanPlainValueDecoders.java @@ -0,0 +1,142 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
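A minimal sketch of wiring one of the proxy decoders above to an Apache Parquet ValuesReader; PlainValuesReader.DoublePlainValuesReader is assumed here purely as an example delegate. Decoded doubles are surfaced as raw long bits, matching the decoder's read signature.

import io.trino.parquet.reader.SimpleSliceInputStream;
import io.trino.parquet.reader.decoders.ApacheParquetValueDecoders.DoubleApacheParquetValueDecoder;
import org.apache.parquet.column.values.plain.PlainValuesReader;

final class ProxyDecoderSketch
{
    private ProxyDecoderSketch() {}

    static long[] readDoubleBits(SimpleSliceInputStream input, int valueCount)
    {
        DoubleApacheParquetValueDecoder decoder =
                new DoubleApacheParquetValueDecoder(new PlainValuesReader.DoublePlainValuesReader());
        decoder.init(input);                  // hands the page bytes to the Apache reader
        long[] values = new long[valueCount]; // each entry holds doubleToLongBits(...) of a decoded value
        decoder.read(values, 0, valueCount);
        return values;
    }
}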
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.reader.decoders; + +import io.trino.parquet.reader.SimpleSliceInputStream; +import io.trino.parquet.reader.flat.BitPackingUtils; + +import static io.trino.parquet.reader.flat.BitPackingUtils.unpack; +import static io.trino.parquet.reader.flat.VectorBitPackingUtils.vectorUnpack8FromByte; +import static java.lang.Math.min; +import static java.util.Objects.requireNonNull; + +public class BooleanPlainValueDecoders +{ + private BooleanPlainValueDecoders() {} + + public static ValueDecoder createBooleanPlainValueDecoder(boolean vectorizedDecodingEnabled) + { + return vectorizedDecodingEnabled ? new VectorBooleanPlainValueDecoder() : new BooleanPlainValueDecoder(); + } + + private abstract static class AbstractBooleanPlainValueDecoder + implements ValueDecoder + { + protected SimpleSliceInputStream input; + // Number of unread bits in the current byte + protected int alreadyReadBits; + // Partly read byte + protected byte partiallyReadByte; + + @Override + public void init(SimpleSliceInputStream input) + { + this.input = requireNonNull(input, "input is null"); + alreadyReadBits = 0; + } + + @Override + public void skip(int n) + { + if (alreadyReadBits != 0) { // Skip the partially read byte + int chunkSize = min(Byte.SIZE - alreadyReadBits, n); + n -= chunkSize; + alreadyReadBits = (alreadyReadBits + chunkSize) % Byte.SIZE; // Set to 0 when full byte reached + } + + // Skip full bytes + input.skip(n / Byte.SIZE); + + if (n % Byte.SIZE != 0) { // Partially skip the last byte + alreadyReadBits = n % Byte.SIZE; + partiallyReadByte = input.readByte(); + } + } + } + + private static final class BooleanPlainValueDecoder + extends AbstractBooleanPlainValueDecoder + { + @Override + public void read(byte[] values, int offset, int length) + { + if (alreadyReadBits != 0) { // Use partially unpacked byte + int bitsRemaining = Byte.SIZE - alreadyReadBits; + int chunkSize = min(bitsRemaining, length); + unpack(values, offset, partiallyReadByte, alreadyReadBits, alreadyReadBits + chunkSize); + alreadyReadBits = (alreadyReadBits + chunkSize) % Byte.SIZE; // Set to 0 when full byte reached + if (length == chunkSize) { + return; + } + offset += chunkSize; + length -= chunkSize; + } + + // Read full bytes + int bytesToRead = length / Byte.SIZE; + while (bytesToRead > 0) { + byte packedByte = input.readByte(); + BitPackingUtils.unpack8FromByte(values, offset, packedByte); + bytesToRead--; + offset += Byte.SIZE; + } + + // Partially read the last byte + alreadyReadBits = length % Byte.SIZE; + if (alreadyReadBits != 0) { + partiallyReadByte = input.readByte(); + unpack(values, offset, partiallyReadByte, 0, alreadyReadBits); + } + } + } + + private static final class VectorBooleanPlainValueDecoder + extends AbstractBooleanPlainValueDecoder + { + @Override + public void read(byte[] values, int offset, int length) + { + if (alreadyReadBits != 0) { // Use partially unpacked byte + int bitsRemaining = Byte.SIZE - alreadyReadBits; + int chunkSize = min(bitsRemaining, length); + unpack(values, offset, partiallyReadByte, alreadyReadBits, alreadyReadBits + chunkSize); + 
alreadyReadBits = (alreadyReadBits + chunkSize) % Byte.SIZE; // Set to 0 when full byte reached + if (length == chunkSize) { + return; + } + offset += chunkSize; + length -= chunkSize; + } + + // Read full bytes + int bytesToRead = length / Byte.SIZE; + byte[] inputArray = input.getByteArray(); + int inputOffset = input.getByteArrayOffset(); + int inputBytesRead = 0; + while (inputBytesRead < bytesToRead) { + vectorUnpack8FromByte(values, offset, inputArray[inputOffset + inputBytesRead]); + offset += Byte.SIZE; + inputBytesRead++; + } + input.skip(inputBytesRead); + + // Partially read the last byte + alreadyReadBits = length % Byte.SIZE; + if (alreadyReadBits != 0) { + partiallyReadByte = input.readByte(); + unpack(values, offset, partiallyReadByte, 0, alreadyReadBits); + } + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/ByteBitUnpacker.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/ByteBitUnpacker.java new file mode 100644 index 000000000000..c67d8a72018b --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/ByteBitUnpacker.java @@ -0,0 +1,24 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.reader.decoders; + +import io.trino.parquet.reader.SimpleSliceInputStream; + +public interface ByteBitUnpacker +{ + /** + * @param length must be a multiple of 32 + */ + void unpack(byte[] output, int outputOffset, SimpleSliceInputStream input, int length); +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/ByteBitUnpackers.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/ByteBitUnpackers.java new file mode 100644 index 000000000000..3c292fe054ab --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/ByteBitUnpackers.java @@ -0,0 +1,490 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.reader.decoders; + +import io.trino.parquet.reader.SimpleSliceInputStream; + +import static com.google.common.base.Preconditions.checkArgument; + +public final class ByteBitUnpackers +{ + private static final ByteBitUnpacker[] UNPACKERS = { + new Unpacker1(), + new Unpacker2(), + new Unpacker3(), + new Unpacker4(), + new Unpacker5(), + new Unpacker6(), + new Unpacker7(), + new Unpacker8(), + new Unpacker9()}; + + // Byte unpacker also exists for the out-of-range 9 value. 
+ // This unpacker truncates the most significant bit of the resulting values. + // This is due to the fact that deltas may require more than 8 bits to be stored. + // E.g. values -100, 100, -100 are stored as deltas 200, -200, which is a span of 400, + // far exceeding what 8 bits can represent. + // However, since the Trino type is Tinyint, we are certain + // that the resulting value fits into an 8-bit number, and the most significant + // delta bit, when set to '1', would cause positive overflow. Knowing that the value + // fits into 8 bits, we can assume that a negative overflow happened + // previously and the result is correct. + // Example: + // Values: -100, 100, -100 + // First value: -100 + // Deltas: 200, -200 + // Minimum delta: -200 + // Normalized deltas: 400, 0 + // Calculating the second value: -100 + -200 (negative overflow) + 400 (positive overflow) = 100 + // Calculating the second value without the most significant delta bit: -100 + -200 + 144 (400 truncated to 8 bits) = -156 + // And finally the resulting value truncation: (byte) -156 == 100 + public static ByteBitUnpacker getByteBitUnpacker(int bitWidth) + { + checkArgument(bitWidth > 0 && bitWidth <= 9, "bitWidth %s should be in the range 1-9", bitWidth); + return UNPACKERS[bitWidth - 1]; + } + + private ByteBitUnpackers() {} + + private static final class Unpacker1 + implements ByteBitUnpacker + { + @Override + public void unpack(byte[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(byte[] output, int outputOffset, SimpleSliceInputStream input) + { + int v0 = input.readInt(); + output[outputOffset] = (byte) (v0 & 0b1L); + output[outputOffset + 1] = (byte) ((v0 >>> 1) & 0b1L); + output[outputOffset + 2] = (byte) ((v0 >>> 2) & 0b1L); + output[outputOffset + 3] = (byte) ((v0 >>> 3) & 0b1L); + output[outputOffset + 4] = (byte) ((v0 >>> 4) & 0b1L); + output[outputOffset + 5] = (byte) ((v0 >>> 5) & 0b1L); + output[outputOffset + 6] = (byte) ((v0 >>> 6) & 0b1L); + output[outputOffset + 7] = (byte) ((v0 >>> 7) & 0b1L); + output[outputOffset + 8] = (byte) ((v0 >>> 8) & 0b1L); + output[outputOffset + 9] = (byte) ((v0 >>> 9) & 0b1L); + output[outputOffset + 10] = (byte) ((v0 >>> 10) & 0b1L); + output[outputOffset + 11] = (byte) ((v0 >>> 11) & 0b1L); + output[outputOffset + 12] = (byte) ((v0 >>> 12) & 0b1L); + output[outputOffset + 13] = (byte) ((v0 >>> 13) & 0b1L); + output[outputOffset + 14] = (byte) ((v0 >>> 14) & 0b1L); + output[outputOffset + 15] = (byte) ((v0 >>> 15) & 0b1L); + output[outputOffset + 16] = (byte) ((v0 >>> 16) & 0b1L); + output[outputOffset + 17] = (byte) ((v0 >>> 17) & 0b1L); + output[outputOffset + 18] = (byte) ((v0 >>> 18) & 0b1L); + output[outputOffset + 19] = (byte) ((v0 >>> 19) & 0b1L); + output[outputOffset + 20] = (byte) ((v0 >>> 20) & 0b1L); + output[outputOffset + 21] = (byte) ((v0 >>> 21) & 0b1L); + output[outputOffset + 22] = (byte) ((v0 >>> 22) & 0b1L); + output[outputOffset + 23] = (byte) ((v0 >>> 23) & 0b1L); + output[outputOffset + 24] = (byte) ((v0 >>> 24) & 0b1L); + output[outputOffset + 25] = (byte) ((v0 >>> 25) & 0b1L); + output[outputOffset + 26] = (byte) ((v0 >>> 26) & 0b1L); + output[outputOffset + 27] = (byte) ((v0 >>> 27) & 0b1L); + output[outputOffset + 28] = (byte) ((v0 >>> 28) & 0b1L); + output[outputOffset + 29] = (byte) ((v0 >>> 29) & 0b1L); + output[outputOffset + 30] = (byte) ((v0 >>> 30) & 0b1L); + output[outputOffset + 31] = (byte) ((v0 >>> 31) & 
0b1L); + } + } + + private static final class Unpacker2 + implements ByteBitUnpacker + { + @Override + public void unpack(byte[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(byte[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + output[outputOffset] = (byte) (v0 & 0b11L); + output[outputOffset + 1] = (byte) ((v0 >>> 2) & 0b11L); + output[outputOffset + 2] = (byte) ((v0 >>> 4) & 0b11L); + output[outputOffset + 3] = (byte) ((v0 >>> 6) & 0b11L); + output[outputOffset + 4] = (byte) ((v0 >>> 8) & 0b11L); + output[outputOffset + 5] = (byte) ((v0 >>> 10) & 0b11L); + output[outputOffset + 6] = (byte) ((v0 >>> 12) & 0b11L); + output[outputOffset + 7] = (byte) ((v0 >>> 14) & 0b11L); + output[outputOffset + 8] = (byte) ((v0 >>> 16) & 0b11L); + output[outputOffset + 9] = (byte) ((v0 >>> 18) & 0b11L); + output[outputOffset + 10] = (byte) ((v0 >>> 20) & 0b11L); + output[outputOffset + 11] = (byte) ((v0 >>> 22) & 0b11L); + output[outputOffset + 12] = (byte) ((v0 >>> 24) & 0b11L); + output[outputOffset + 13] = (byte) ((v0 >>> 26) & 0b11L); + output[outputOffset + 14] = (byte) ((v0 >>> 28) & 0b11L); + output[outputOffset + 15] = (byte) ((v0 >>> 30) & 0b11L); + output[outputOffset + 16] = (byte) ((v0 >>> 32) & 0b11L); + output[outputOffset + 17] = (byte) ((v0 >>> 34) & 0b11L); + output[outputOffset + 18] = (byte) ((v0 >>> 36) & 0b11L); + output[outputOffset + 19] = (byte) ((v0 >>> 38) & 0b11L); + output[outputOffset + 20] = (byte) ((v0 >>> 40) & 0b11L); + output[outputOffset + 21] = (byte) ((v0 >>> 42) & 0b11L); + output[outputOffset + 22] = (byte) ((v0 >>> 44) & 0b11L); + output[outputOffset + 23] = (byte) ((v0 >>> 46) & 0b11L); + output[outputOffset + 24] = (byte) ((v0 >>> 48) & 0b11L); + output[outputOffset + 25] = (byte) ((v0 >>> 50) & 0b11L); + output[outputOffset + 26] = (byte) ((v0 >>> 52) & 0b11L); + output[outputOffset + 27] = (byte) ((v0 >>> 54) & 0b11L); + output[outputOffset + 28] = (byte) ((v0 >>> 56) & 0b11L); + output[outputOffset + 29] = (byte) ((v0 >>> 58) & 0b11L); + output[outputOffset + 30] = (byte) ((v0 >>> 60) & 0b11L); + output[outputOffset + 31] = (byte) ((v0 >>> 62) & 0b11L); + } + } + + private static final class Unpacker3 + implements ByteBitUnpacker + { + @Override + public void unpack(byte[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(byte[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + int v1 = input.readInt(); + output[outputOffset] = (byte) (v0 & 0b111L); + output[outputOffset + 1] = (byte) ((v0 >>> 3) & 0b111L); + output[outputOffset + 2] = (byte) ((v0 >>> 6) & 0b111L); + output[outputOffset + 3] = (byte) ((v0 >>> 9) & 0b111L); + output[outputOffset + 4] = (byte) ((v0 >>> 12) & 0b111L); + output[outputOffset + 5] = (byte) ((v0 >>> 15) & 0b111L); + output[outputOffset + 6] = (byte) ((v0 >>> 18) & 0b111L); + output[outputOffset + 7] = (byte) ((v0 >>> 21) & 0b111L); + output[outputOffset + 8] = (byte) ((v0 >>> 24) & 0b111L); + output[outputOffset + 9] = (byte) ((v0 >>> 27) & 0b111L); + output[outputOffset + 10] = (byte) ((v0 >>> 30) & 0b111L); + output[outputOffset + 11] = (byte) ((v0 >>> 33) & 0b111L); + output[outputOffset + 12] = (byte) ((v0 >>> 
36) & 0b111L); + output[outputOffset + 13] = (byte) ((v0 >>> 39) & 0b111L); + output[outputOffset + 14] = (byte) ((v0 >>> 42) & 0b111L); + output[outputOffset + 15] = (byte) ((v0 >>> 45) & 0b111L); + output[outputOffset + 16] = (byte) ((v0 >>> 48) & 0b111L); + output[outputOffset + 17] = (byte) ((v0 >>> 51) & 0b111L); + output[outputOffset + 18] = (byte) ((v0 >>> 54) & 0b111L); + output[outputOffset + 19] = (byte) ((v0 >>> 57) & 0b111L); + output[outputOffset + 20] = (byte) ((v0 >>> 60) & 0b111L); + output[outputOffset + 21] = (byte) (((v0 >>> 63) & 0b1L) | ((v1 & 0b11L) << 1)); + output[outputOffset + 22] = (byte) ((v1 >>> 2) & 0b111L); + output[outputOffset + 23] = (byte) ((v1 >>> 5) & 0b111L); + output[outputOffset + 24] = (byte) ((v1 >>> 8) & 0b111L); + output[outputOffset + 25] = (byte) ((v1 >>> 11) & 0b111L); + output[outputOffset + 26] = (byte) ((v1 >>> 14) & 0b111L); + output[outputOffset + 27] = (byte) ((v1 >>> 17) & 0b111L); + output[outputOffset + 28] = (byte) ((v1 >>> 20) & 0b111L); + output[outputOffset + 29] = (byte) ((v1 >>> 23) & 0b111L); + output[outputOffset + 30] = (byte) ((v1 >>> 26) & 0b111L); + output[outputOffset + 31] = (byte) ((v1 >>> 29) & 0b111L); + } + } + + private static final class Unpacker4 + implements ByteBitUnpacker + { + @Override + public void unpack(byte[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(byte[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + output[outputOffset] = (byte) (v0 & 0b1111L); + output[outputOffset + 1] = (byte) ((v0 >>> 4) & 0b1111L); + output[outputOffset + 2] = (byte) ((v0 >>> 8) & 0b1111L); + output[outputOffset + 3] = (byte) ((v0 >>> 12) & 0b1111L); + output[outputOffset + 4] = (byte) ((v0 >>> 16) & 0b1111L); + output[outputOffset + 5] = (byte) ((v0 >>> 20) & 0b1111L); + output[outputOffset + 6] = (byte) ((v0 >>> 24) & 0b1111L); + output[outputOffset + 7] = (byte) ((v0 >>> 28) & 0b1111L); + output[outputOffset + 8] = (byte) ((v0 >>> 32) & 0b1111L); + output[outputOffset + 9] = (byte) ((v0 >>> 36) & 0b1111L); + output[outputOffset + 10] = (byte) ((v0 >>> 40) & 0b1111L); + output[outputOffset + 11] = (byte) ((v0 >>> 44) & 0b1111L); + output[outputOffset + 12] = (byte) ((v0 >>> 48) & 0b1111L); + output[outputOffset + 13] = (byte) ((v0 >>> 52) & 0b1111L); + output[outputOffset + 14] = (byte) ((v0 >>> 56) & 0b1111L); + output[outputOffset + 15] = (byte) ((v0 >>> 60) & 0b1111L); + output[outputOffset + 16] = (byte) (v1 & 0b1111L); + output[outputOffset + 17] = (byte) ((v1 >>> 4) & 0b1111L); + output[outputOffset + 18] = (byte) ((v1 >>> 8) & 0b1111L); + output[outputOffset + 19] = (byte) ((v1 >>> 12) & 0b1111L); + output[outputOffset + 20] = (byte) ((v1 >>> 16) & 0b1111L); + output[outputOffset + 21] = (byte) ((v1 >>> 20) & 0b1111L); + output[outputOffset + 22] = (byte) ((v1 >>> 24) & 0b1111L); + output[outputOffset + 23] = (byte) ((v1 >>> 28) & 0b1111L); + output[outputOffset + 24] = (byte) ((v1 >>> 32) & 0b1111L); + output[outputOffset + 25] = (byte) ((v1 >>> 36) & 0b1111L); + output[outputOffset + 26] = (byte) ((v1 >>> 40) & 0b1111L); + output[outputOffset + 27] = (byte) ((v1 >>> 44) & 0b1111L); + output[outputOffset + 28] = (byte) ((v1 >>> 48) & 0b1111L); + output[outputOffset + 29] = (byte) ((v1 >>> 52) & 0b1111L); + output[outputOffset + 30] = (byte) ((v1 >>> 56) & 0b1111L); + 
output[outputOffset + 31] = (byte) ((v1 >>> 60) & 0b1111L); + } + } + + private static final class Unpacker5 + implements ByteBitUnpacker + { + @Override + public void unpack(byte[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(byte[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + int v2 = input.readInt(); + output[outputOffset] = (byte) (v0 & 0b11111L); + output[outputOffset + 1] = (byte) ((v0 >>> 5) & 0b11111L); + output[outputOffset + 2] = (byte) ((v0 >>> 10) & 0b11111L); + output[outputOffset + 3] = (byte) ((v0 >>> 15) & 0b11111L); + output[outputOffset + 4] = (byte) ((v0 >>> 20) & 0b11111L); + output[outputOffset + 5] = (byte) ((v0 >>> 25) & 0b11111L); + output[outputOffset + 6] = (byte) ((v0 >>> 30) & 0b11111L); + output[outputOffset + 7] = (byte) ((v0 >>> 35) & 0b11111L); + output[outputOffset + 8] = (byte) ((v0 >>> 40) & 0b11111L); + output[outputOffset + 9] = (byte) ((v0 >>> 45) & 0b11111L); + output[outputOffset + 10] = (byte) ((v0 >>> 50) & 0b11111L); + output[outputOffset + 11] = (byte) ((v0 >>> 55) & 0b11111L); + output[outputOffset + 12] = (byte) (((v0 >>> 60) & 0b1111L) | ((v1 & 0b1L) << 4)); + output[outputOffset + 13] = (byte) ((v1 >>> 1) & 0b11111L); + output[outputOffset + 14] = (byte) ((v1 >>> 6) & 0b11111L); + output[outputOffset + 15] = (byte) ((v1 >>> 11) & 0b11111L); + output[outputOffset + 16] = (byte) ((v1 >>> 16) & 0b11111L); + output[outputOffset + 17] = (byte) ((v1 >>> 21) & 0b11111L); + output[outputOffset + 18] = (byte) ((v1 >>> 26) & 0b11111L); + output[outputOffset + 19] = (byte) ((v1 >>> 31) & 0b11111L); + output[outputOffset + 20] = (byte) ((v1 >>> 36) & 0b11111L); + output[outputOffset + 21] = (byte) ((v1 >>> 41) & 0b11111L); + output[outputOffset + 22] = (byte) ((v1 >>> 46) & 0b11111L); + output[outputOffset + 23] = (byte) ((v1 >>> 51) & 0b11111L); + output[outputOffset + 24] = (byte) ((v1 >>> 56) & 0b11111L); + output[outputOffset + 25] = (byte) (((v1 >>> 61) & 0b111L) | ((v2 & 0b11L) << 3)); + output[outputOffset + 26] = (byte) ((v2 >>> 2) & 0b11111L); + output[outputOffset + 27] = (byte) ((v2 >>> 7) & 0b11111L); + output[outputOffset + 28] = (byte) ((v2 >>> 12) & 0b11111L); + output[outputOffset + 29] = (byte) ((v2 >>> 17) & 0b11111L); + output[outputOffset + 30] = (byte) ((v2 >>> 22) & 0b11111L); + output[outputOffset + 31] = (byte) ((v2 >>> 27) & 0b11111L); + } + } + + private static final class Unpacker6 + implements ByteBitUnpacker + { + @Override + public void unpack(byte[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(byte[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + output[outputOffset] = (byte) (v0 & 0b111111L); + output[outputOffset + 1] = (byte) ((v0 >>> 6) & 0b111111L); + output[outputOffset + 2] = (byte) ((v0 >>> 12) & 0b111111L); + output[outputOffset + 3] = (byte) ((v0 >>> 18) & 0b111111L); + output[outputOffset + 4] = (byte) ((v0 >>> 24) & 0b111111L); + output[outputOffset + 5] = (byte) ((v0 >>> 30) & 0b111111L); + output[outputOffset + 6] = (byte) ((v0 >>> 36) & 0b111111L); + output[outputOffset + 7] = (byte) ((v0 >>> 
42) & 0b111111L); + output[outputOffset + 8] = (byte) ((v0 >>> 48) & 0b111111L); + output[outputOffset + 9] = (byte) ((v0 >>> 54) & 0b111111L); + output[outputOffset + 10] = (byte) (((v0 >>> 60) & 0b1111L) | ((v1 & 0b11L) << 4)); + output[outputOffset + 11] = (byte) ((v1 >>> 2) & 0b111111L); + output[outputOffset + 12] = (byte) ((v1 >>> 8) & 0b111111L); + output[outputOffset + 13] = (byte) ((v1 >>> 14) & 0b111111L); + output[outputOffset + 14] = (byte) ((v1 >>> 20) & 0b111111L); + output[outputOffset + 15] = (byte) ((v1 >>> 26) & 0b111111L); + output[outputOffset + 16] = (byte) ((v1 >>> 32) & 0b111111L); + output[outputOffset + 17] = (byte) ((v1 >>> 38) & 0b111111L); + output[outputOffset + 18] = (byte) ((v1 >>> 44) & 0b111111L); + output[outputOffset + 19] = (byte) ((v1 >>> 50) & 0b111111L); + output[outputOffset + 20] = (byte) ((v1 >>> 56) & 0b111111L); + output[outputOffset + 21] = (byte) (((v1 >>> 62) & 0b11L) | ((v2 & 0b1111L) << 2)); + output[outputOffset + 22] = (byte) ((v2 >>> 4) & 0b111111L); + output[outputOffset + 23] = (byte) ((v2 >>> 10) & 0b111111L); + output[outputOffset + 24] = (byte) ((v2 >>> 16) & 0b111111L); + output[outputOffset + 25] = (byte) ((v2 >>> 22) & 0b111111L); + output[outputOffset + 26] = (byte) ((v2 >>> 28) & 0b111111L); + output[outputOffset + 27] = (byte) ((v2 >>> 34) & 0b111111L); + output[outputOffset + 28] = (byte) ((v2 >>> 40) & 0b111111L); + output[outputOffset + 29] = (byte) ((v2 >>> 46) & 0b111111L); + output[outputOffset + 30] = (byte) ((v2 >>> 52) & 0b111111L); + output[outputOffset + 31] = (byte) ((v2 >>> 58) & 0b111111L); + } + } + + private static final class Unpacker7 + implements ByteBitUnpacker + { + @Override + public void unpack(byte[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(byte[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + int v3 = input.readInt(); + output[outputOffset] = (byte) (v0 & 0b1111111L); + output[outputOffset + 1] = (byte) ((v0 >>> 7) & 0b1111111L); + output[outputOffset + 2] = (byte) ((v0 >>> 14) & 0b1111111L); + output[outputOffset + 3] = (byte) ((v0 >>> 21) & 0b1111111L); + output[outputOffset + 4] = (byte) ((v0 >>> 28) & 0b1111111L); + output[outputOffset + 5] = (byte) ((v0 >>> 35) & 0b1111111L); + output[outputOffset + 6] = (byte) ((v0 >>> 42) & 0b1111111L); + output[outputOffset + 7] = (byte) ((v0 >>> 49) & 0b1111111L); + output[outputOffset + 8] = (byte) ((v0 >>> 56) & 0b1111111L); + output[outputOffset + 9] = (byte) (((v0 >>> 63) & 0b1L) | ((v1 & 0b111111L) << 1)); + output[outputOffset + 10] = (byte) ((v1 >>> 6) & 0b1111111L); + output[outputOffset + 11] = (byte) ((v1 >>> 13) & 0b1111111L); + output[outputOffset + 12] = (byte) ((v1 >>> 20) & 0b1111111L); + output[outputOffset + 13] = (byte) ((v1 >>> 27) & 0b1111111L); + output[outputOffset + 14] = (byte) ((v1 >>> 34) & 0b1111111L); + output[outputOffset + 15] = (byte) ((v1 >>> 41) & 0b1111111L); + output[outputOffset + 16] = (byte) ((v1 >>> 48) & 0b1111111L); + output[outputOffset + 17] = (byte) ((v1 >>> 55) & 0b1111111L); + output[outputOffset + 18] = (byte) (((v1 >>> 62) & 0b11L) | ((v2 & 0b11111L) << 2)); + output[outputOffset + 19] = (byte) ((v2 >>> 5) & 0b1111111L); + output[outputOffset + 20] = (byte) ((v2 >>> 12) & 0b1111111L); + output[outputOffset + 21] = (byte) ((v2 >>> 19) & 
0b1111111L); + output[outputOffset + 22] = (byte) ((v2 >>> 26) & 0b1111111L); + output[outputOffset + 23] = (byte) ((v2 >>> 33) & 0b1111111L); + output[outputOffset + 24] = (byte) ((v2 >>> 40) & 0b1111111L); + output[outputOffset + 25] = (byte) ((v2 >>> 47) & 0b1111111L); + output[outputOffset + 26] = (byte) ((v2 >>> 54) & 0b1111111L); + output[outputOffset + 27] = (byte) (((v2 >>> 61) & 0b111L) | ((v3 & 0b1111L) << 3)); + output[outputOffset + 28] = (byte) ((v3 >>> 4) & 0b1111111L); + output[outputOffset + 29] = (byte) ((v3 >>> 11) & 0b1111111L); + output[outputOffset + 30] = (byte) ((v3 >>> 18) & 0b1111111L); + output[outputOffset + 31] = (byte) ((v3 >>> 25) & 0b1111111L); + } + } + + private static final class Unpacker8 + implements ByteBitUnpacker + { + @Override + public void unpack(byte[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + input.readBytes(output, outputOffset, length); + } + } + + private static final class Unpacker9 + implements ByteBitUnpacker + { + @Override + public void unpack(byte[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(byte[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + int v4 = input.readInt(); + output[outputOffset] = (byte) v0; + output[outputOffset + 1] = (byte) ((v0 >>> 9) & 0b111111111L); + output[outputOffset + 2] = (byte) ((v0 >>> 18) & 0b111111111L); + output[outputOffset + 3] = (byte) ((v0 >>> 27) & 0b111111111L); + output[outputOffset + 4] = (byte) ((v0 >>> 36) & 0b111111111L); + output[outputOffset + 5] = (byte) ((v0 >>> 45) & 0b111111111L); + output[outputOffset + 6] = (byte) ((v0 >>> 54) & 0b111111111L); + output[outputOffset + 7] = (byte) ((v0 >>> 63) | ((v1 & 0b11111111L) << 1)); + output[outputOffset + 8] = (byte) ((v1 >>> 8) & 0b111111111L); + output[outputOffset + 9] = (byte) ((v1 >>> 17) & 0b111111111L); + output[outputOffset + 10] = (byte) ((v1 >>> 26) & 0b111111111L); + output[outputOffset + 11] = (byte) ((v1 >>> 35) & 0b111111111L); + output[outputOffset + 12] = (byte) ((v1 >>> 44) & 0b111111111L); + output[outputOffset + 13] = (byte) ((v1 >>> 53) & 0b111111111L); + output[outputOffset + 14] = (byte) ((v1 >>> 62) | ((v2 & 0b1111111L) << 2)); + output[outputOffset + 15] = (byte) ((v2 >>> 7) & 0b111111111L); + output[outputOffset + 16] = (byte) ((v2 >>> 16) & 0b111111111L); + output[outputOffset + 17] = (byte) ((v2 >>> 25) & 0b111111111L); + output[outputOffset + 18] = (byte) ((v2 >>> 34) & 0b111111111L); + output[outputOffset + 19] = (byte) ((v2 >>> 43) & 0b111111111L); + output[outputOffset + 20] = (byte) ((v2 >>> 52) & 0b111111111L); + output[outputOffset + 21] = (byte) ((v2 >>> 61) | ((v3 & 0b111111L) << 3)); + output[outputOffset + 22] = (byte) ((v3 >>> 6) & 0b111111111L); + output[outputOffset + 23] = (byte) ((v3 >>> 15) & 0b111111111L); + output[outputOffset + 24] = (byte) ((v3 >>> 24) & 0b111111111L); + output[outputOffset + 25] = (byte) ((v3 >>> 33) & 0b111111111L); + output[outputOffset + 26] = (byte) ((v3 >>> 42) & 0b111111111L); + output[outputOffset + 27] = (byte) ((v3 >>> 51) & 0b111111111L); + output[outputOffset + 28] = (byte) ((v3 >>> 60) | ((v4 & 0b11111L) << 4)); + output[outputOffset + 29] = (byte) ((v4 >>> 5) & 0b111111111L); + output[outputOffset + 30] = (byte) ((v4 >>> 14) & 
0b111111111L); + output[outputOffset + 31] = (byte) ((v4 >>> 23) & 0b111111111L); + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/DeltaBinaryPackedDecoders.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/DeltaBinaryPackedDecoders.java new file mode 100644 index 000000000000..e423c4c2519d --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/DeltaBinaryPackedDecoders.java @@ -0,0 +1,357 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.reader.decoders; + +import io.trino.parquet.reader.SimpleSliceInputStream; + +import static com.google.common.base.Preconditions.checkArgument; +import static io.trino.parquet.ParquetReaderUtils.ceilDiv; +import static io.trino.parquet.ParquetReaderUtils.readUleb128Int; +import static io.trino.parquet.ParquetReaderUtils.readUleb128Long; +import static io.trino.parquet.ParquetReaderUtils.zigzagDecode; +import static java.lang.Math.min; +import static java.util.Objects.requireNonNull; + +/** + * Implementation of the encoding described in + * ... + */ +public final class DeltaBinaryPackedDecoders +{ + // Block size is a multiple of 128 + // Mini-block size is a multiple of 32 + // Mini-block count per block is typically equal to 4 + private static final int COMMON_MINI_BLOCKS_NUMBER = 4; + private static final int FIRST_VALUE = -1; + + private DeltaBinaryPackedDecoders() {} + + public static class DeltaBinaryPackedByteDecoder + extends DeltaBinaryPackedDecoder + { + @Override + protected byte[] createMiniBlockBuffer(int size) + { + return new byte[size]; + } + + @Override + protected void setValue(byte[] values, int offset, long value) + { + values[offset] = (byte) value; + } + + @Override + public void read(byte[] values, int offset, int length) + { + readInternal(values, offset, length); + } + + @Override + protected long unpack(byte[] output, int outputOffset, int length, SimpleSliceInputStream input, long minDelta, byte bitWidth) + { + DeltaPackingUtils.unpackDelta(output, outputOffset, length, input, minDelta, bitWidth); + return output[outputOffset + length - 1]; + } + } + + public static class DeltaBinaryPackedShortDecoder + extends DeltaBinaryPackedDecoder + { + @Override + protected short[] createMiniBlockBuffer(int size) + { + return new short[size]; + } + + @Override + protected void setValue(short[] values, int offset, long value) + { + values[offset] = (short) value; + } + + @Override + public void read(short[] values, int offset, int length) + { + readInternal(values, offset, length); + } + + @Override + protected long unpack(short[] output, int outputOffset, int length, SimpleSliceInputStream input, long minDelta, byte bitWidth) + { + DeltaPackingUtils.unpackDelta(output, outputOffset, length, input, minDelta, bitWidth); + return output[outputOffset + length - 1]; + } + } + + public static class DeltaBinaryPackedIntDecoder + extends DeltaBinaryPackedDecoder + { + @Override + protected int[] 
createMiniBlockBuffer(int size) + { + return new int[size]; + } + + @Override + protected void setValue(int[] values, int offset, long value) + { + values[offset] = (int) value; + } + + @Override + public void read(int[] values, int offset, int length) + { + readInternal(values, offset, length); + } + + @Override + protected long unpack(int[] output, int outputOffset, int length, SimpleSliceInputStream input, long minDelta, byte bitWidth) + { + DeltaPackingUtils.unpackDelta(output, outputOffset, length, input, minDelta, bitWidth); + return output[outputOffset + length - 1]; + } + } + + public static class DeltaBinaryPackedLongDecoder + extends DeltaBinaryPackedDecoder + { + @Override + protected long[] createMiniBlockBuffer(int size) + { + return new long[size]; + } + + @Override + protected void setValue(long[] values, int offset, long value) + { + values[offset] = value; + } + + @Override + public void read(long[] values, int offset, int length) + { + readInternal(values, offset, length); + } + + @Override + protected long unpack(long[] output, int outputOffset, int length, SimpleSliceInputStream input, long minDelta, byte bitWidth) + { + DeltaPackingUtils.unpackDelta(output, outputOffset, length, input, minDelta, bitWidth); + return output[outputOffset + length - 1]; + } + } + + private abstract static class DeltaBinaryPackedDecoder + implements ValueDecoder + { + private SimpleSliceInputStream input; + + private int blockSize; + private int miniBlockSize; + // Last read value + private long previousValue; + + private int miniBlocksInBlock; + private byte[] bitWidths; + + private int alreadyReadInBlock; + private ValuesType blockValues; + private int valueCount; + + private long blockMinDelta; + private int miniBlocksRemaining; + + @Override + public void init(SimpleSliceInputStream input) + { + this.input = requireNonNull(input, "input is null"); + alreadyReadInBlock = FIRST_VALUE; + readHeader(); + blockValues = createMiniBlockBuffer(blockSize + 1); // First index is reserved for the last read value + bitWidths = new byte[miniBlocksInBlock]; + } + + protected abstract ValuesType createMiniBlockBuffer(int size); + + protected abstract void setValue(ValuesType values, int offset, long value); + + /** + * This method needs to do two things: + *

<ul> + * <li>Set output[outputOffset-1] to 'previousValue'. This way the inner loops are consistent and can handle iterations in batches of 32. + * This is needed since some values might have been skipped</li> + * <li>Delegate unpacking to the corresponding static method from DeltaPackingUtils class</li> + * </ul>
+ * + * @return Last value read + */ + protected abstract long unpack(ValuesType output, int outputOffset, int length, SimpleSliceInputStream input, long minDelta, byte bitWidth); + + private void readHeader() + { + blockSize = readUleb128Int(input); + checkArgument(blockSize % 128 == 0, "Corrupted Parquet file: block size of the delta encoding needs to be a multiple of 128"); + miniBlockSize = blockSize / readUleb128Int(input); + checkArgument(miniBlockSize % 32 == 0, "Corrupted Parquet file: mini block size of the delta encoding needs to be a multiple of 32"); + valueCount = readUleb128Int(input); + miniBlocksRemaining = ceilDiv(valueCount - 1, miniBlockSize); // -1 as the first value is stored in a header + previousValue = zigzagDecode(readUleb128Long(input)); + miniBlocksInBlock = blockSize / miniBlockSize; + } + + @SuppressWarnings("SuspiciousSystemArraycopy") + public void readInternal(ValuesType values, int offset, int length) + { + // This condition will be true only for the first time and then continue to + // return false hopefully making branch prediction efficient + if (alreadyReadInBlock == FIRST_VALUE && length > 0) { + setValue(values, offset++, previousValue); + length--; + alreadyReadInBlock = 0; + } + + if (alreadyReadInBlock != 0) { // Partially read block + int chunkSize = min(length, blockSize - alreadyReadInBlock); + // Leverage the fact that arrayCopy does not have array types specified + System.arraycopy(blockValues, alreadyReadInBlock + 1, values, offset, chunkSize); + markRead(chunkSize); + + offset += chunkSize; + length -= chunkSize; + } + + while (length > 0) { + readBlockHeader(); + + if (length <= blockSize) { // Read block partially + setValue(blockValues, 0, previousValue); + readBlock(blockValues, 1); // Write data to temporary buffer + System.arraycopy(blockValues, 1, values, offset, length); + markRead(length); + length = 0; + } + else { // read full block + if (offset == 0) { + // Special case: The decoder is in the middle of the page but the output offset is 0. This prevents + // us from leveraging output[offset-1] position to streamline the unpacking operation, + // The solution is to use the temporary buffer. 
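+ // (the block is first decoded into the temporary blockValues buffer and then copied into the output array)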
+ // This is a rare case that happens at most once every Trino page (~1MB or more) + setValue(blockValues, 0, previousValue); + readBlock(blockValues, 1); // Write data to temporary buffer + System.arraycopy(blockValues, 1, values, offset, blockSize); + } + else { + readBlock(values, offset); // Write data directly to output buffer + } + offset += blockSize; + length -= blockSize; + } + } + } + + public int getValueCount() + { + return valueCount; + } + + private boolean areBitWidthTheSame() + { + if (miniBlocksInBlock == COMMON_MINI_BLOCKS_NUMBER) { + byte and = (byte) (bitWidths[0] & bitWidths[1] & bitWidths[2] & bitWidths[3]); + byte or = (byte) (bitWidths[0] | bitWidths[1] | bitWidths[2] | bitWidths[3]); + return and == or; + } + byte first = bitWidths[0]; + boolean same = true; + for (int i = 1; i < miniBlocksInBlock; i++) { + same &= bitWidths[i] == first; + } + return same; + } + + @Override + public void skip(int n) + { + if (n == 0) { + return; + } + // This condition will be true only for the first time and then continue to + // return false hopefully making branch prediction efficient + if (alreadyReadInBlock == FIRST_VALUE) { + n--; + alreadyReadInBlock = 0; + } + + if (alreadyReadInBlock != 0) { // Partially read mini block + int chunkSize = min(n, blockSize - alreadyReadInBlock); + markRead(chunkSize); + n -= chunkSize; + } + + while (n > 0) { + readBlockHeader(); + setValue(blockValues, 0, previousValue); + readBlock(blockValues, 1); // Write data to temporary buffer + int chunkSize = min(n, blockSize); + markRead(chunkSize); + n -= chunkSize; + } + } + + /** + * @param chunkSize Needs to be less or equal to the number of values remaining in the current mini block + */ + private void markRead(int chunkSize) + { + alreadyReadInBlock += chunkSize; + // Trick to skip conditional statement, does the same as: + // if ( alreadyReadInMiniBlock == miniBlockSize) { currentMiniBlock++; } + alreadyReadInBlock %= blockSize; + } + + private void readBlock(ValuesType output, int outputOffset) + { + int miniBlocksToRead = Math.min(miniBlocksRemaining, miniBlocksInBlock); + if (miniBlocksToRead == miniBlocksInBlock && areBitWidthTheSame()) { + byte bitWidth = bitWidths[0]; + previousValue = unpack(output, outputOffset, blockSize, input, blockMinDelta, bitWidth); + } + else { + for (int i = 0; i < miniBlocksToRead; i++) { + byte bitWidth = bitWidths[i]; + previousValue = unpack(output, outputOffset, miniBlockSize, input, blockMinDelta, bitWidth); + outputOffset += miniBlockSize; + } + } + + miniBlocksRemaining -= miniBlocksToRead; + } + + private void readBlockHeader() + { + blockMinDelta = zigzagDecode(readUleb128Long(input)); + if (miniBlocksInBlock == COMMON_MINI_BLOCKS_NUMBER) { + int bitWidthsPacked = input.readInt(); + bitWidths[0] = (byte) bitWidthsPacked; + bitWidths[1] = (byte) (bitWidthsPacked >> 8); + bitWidths[2] = (byte) (bitWidthsPacked >> 16); + bitWidths[3] = (byte) (bitWidthsPacked >> 24); + } + else { + input.readBytes(bitWidths, 0, miniBlocksInBlock); + } + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/DeltaByteArrayDecoders.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/DeltaByteArrayDecoders.java new file mode 100644 index 000000000000..7da0a6d4a925 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/DeltaByteArrayDecoders.java @@ -0,0 +1,343 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in 
compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.reader.decoders; + +import io.airlift.slice.Slice; +import io.airlift.slice.Slices; +import io.trino.parquet.reader.SimpleSliceInputStream; +import io.trino.parquet.reader.flat.BinaryBuffer; +import io.trino.spi.type.CharType; +import io.trino.spi.type.VarcharType; + +import java.util.Arrays; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkPositionIndexes; +import static io.trino.parquet.reader.decoders.DeltaBinaryPackedDecoders.DeltaBinaryPackedIntDecoder; +import static io.trino.spi.type.Chars.byteCountWithoutTrailingSpace; +import static io.trino.spi.type.Varchars.byteCount; +import static java.lang.Math.max; +import static java.util.Objects.requireNonNull; + +/** + * Implementation of decoder for the encoding described at + * delta_byte_array + */ +public class DeltaByteArrayDecoders +{ + private DeltaByteArrayDecoders() {} + + public static final class BoundedVarcharDeltaByteArrayDecoder + extends AbstractDeltaByteArrayDecoder + { + private final int boundedLength; + + public BoundedVarcharDeltaByteArrayDecoder(VarcharType varcharType) + { + checkArgument( + !varcharType.isUnbounded(), + "Trino type %s is not a bounded varchar", + varcharType); + this.boundedLength = varcharType.getBoundedLength(); + } + + @Override + public void read(BinaryBuffer values, int offset, int length) + { + InputLengths lengths = getInputAndMaxLength(length); + int maxLength = lengths.maxInputLength(); + int totalInputLength = lengths.totalInputLength(); + boolean truncate = maxLength > boundedLength; + if (truncate) { + readBounded(values, offset, length, totalInputLength); + } + else { + readUnbounded(values, offset, length, totalInputLength); + } + } + + @Override + protected int truncatedLength(Slice slice, int offset, int length) + { + return byteCount(slice, offset, length, boundedLength); + } + } + + public static final class CharDeltaByteArrayDecoder + extends AbstractDeltaByteArrayDecoder + { + private final int maxLength; + + public CharDeltaByteArrayDecoder(CharType charType) + { + this.maxLength = charType.getLength(); + } + + @Override + public void read(BinaryBuffer values, int offset, int length) + { + int totalInputLength = getInputLength(length); + readBounded(values, offset, length, totalInputLength); + } + + @Override + protected int truncatedLength(Slice slice, int offset, int length) + { + return byteCountWithoutTrailingSpace(slice, offset, length, maxLength); + } + } + + public static final class BinaryDeltaByteArrayDecoder + extends AbstractDeltaByteArrayDecoder + { + @Override + protected int truncatedLength(Slice slice, int offset, int length) + { + throw new UnsupportedOperationException(); + } + + @Override + public void read(BinaryBuffer values, int offset, int length) + { + int totalInputLength = getInputLength(length); + readUnbounded(values, offset, length, totalInputLength); + } + } + + private abstract static class AbstractDeltaByteArrayDecoder + implements ValueDecoder + { + private int[] prefixLengths; + private int[] 
suffixLengths; + private int inputLengthsOffset; + // At the end of skip/read for each batch of positions, this field is + // populated with prefixLength bytes for the first position in the next read + private byte[] firstPrefix = new byte[0]; + private SimpleSliceInputStream input; + + @Override + public void init(SimpleSliceInputStream input) + { + this.input = requireNonNull(input, "input is null"); + this.prefixLengths = readDeltaEncodedLengths(input); + this.suffixLengths = readDeltaEncodedLengths(input); + } + + @Override + public void skip(int n) + { + checkPositionIndexes(inputLengthsOffset, inputLengthsOffset + n, prefixLengths.length); + if (n == 0) { + return; + } + + // If we've skipped to the end, there's no need to process anything + if (inputLengthsOffset + n == prefixLengths.length) { + inputLengthsOffset += n; + return; + } + + int totalSuffixesLength = getSuffixesLength(n); + Slice inputSlice = input.asSlice(); + // Start from the suffix and go back position by position to fill the prefix for next read + int bytesLeft = prefixLengths[inputLengthsOffset + n]; + byte[] newPrefix = new byte[bytesLeft]; + + int current = n - 1; + int inputOffset = totalSuffixesLength - suffixLengths[inputLengthsOffset + n - 1]; + while (bytesLeft > 0 && inputOffset >= 0) { + int currentPrefixLength = prefixLengths[inputLengthsOffset + current]; + if (currentPrefixLength < bytesLeft) { + int toCopy = bytesLeft - currentPrefixLength; + inputSlice.getBytes(inputOffset, newPrefix, currentPrefixLength, toCopy); + bytesLeft -= toCopy; + if (bytesLeft == 0) { + break; + } + } + inputOffset -= suffixLengths[inputLengthsOffset + current - 1]; + current--; + } + System.arraycopy(firstPrefix, 0, newPrefix, 0, bytesLeft); + firstPrefix = newPrefix; + + input.skip(totalSuffixesLength); + inputLengthsOffset += n; + } + + protected abstract int truncatedLength(Slice slice, int offset, int length); + + protected void readBounded(BinaryBuffer values, int offset, int length, int totalInputLength) + { + checkPositionIndexes(inputLengthsOffset, inputLengthsOffset + length, prefixLengths.length); + if (length == 0) { + return; + } + int[] outputOffsets = values.getOffsets(); + byte[] dataBuffer = readUnbounded(outputOffsets, offset, length, totalInputLength); + Slice inputSlice = Slices.wrappedBuffer(dataBuffer); + inputLengthsOffset += length; + + // Try to find the first truncated position + int i = 0; + int inputOffset = 0; + for (; i < length; i++) { + int inputLength = outputOffsets[offset + i + 1] - outputOffsets[offset + i]; + int outputLength = truncatedLength(inputSlice, inputOffset, inputLength); + if (inputLength != outputLength) { + break; + } + inputOffset += inputLength; + } + + if (i == length) { + // No trimming or truncating took place + values.addChunk(inputSlice); + return; + } + + // Resume the iteration, this time shifting positions left according to trimming/truncation + int outputOffset = inputOffset; + int nextOffset = outputOffsets[offset + i]; + for (; i < length; i++) { + int currentOffset = nextOffset; + nextOffset = outputOffsets[offset + i + 1]; + int inputLength = nextOffset - currentOffset; + int outputLength = truncatedLength(inputSlice, inputOffset, inputLength); + System.arraycopy(dataBuffer, inputOffset, dataBuffer, outputOffset, outputLength); + outputOffsets[offset + i + 1] = outputOffsets[offset + i] + outputLength; + inputOffset += inputLength; + outputOffset += outputLength; + } + + values.addChunk(inputSlice.slice(0, outputOffset)); + } + + protected void 
readUnbounded(BinaryBuffer values, int offset, int length, int totalInputLength) + { + checkPositionIndexes(inputLengthsOffset, inputLengthsOffset + length, prefixLengths.length); + if (length == 0) { + return; + } + int[] outputOffsets = values.getOffsets(); + Slice outputBuffer = Slices.wrappedBuffer(readUnbounded(outputOffsets, offset, length, totalInputLength)); + values.addChunk(outputBuffer); + inputLengthsOffset += length; + } + + protected int getSuffixesLength(int length) + { + int totalSuffixesLength = 0; + for (int i = 0; i < length; i++) { + totalSuffixesLength += suffixLengths[inputLengthsOffset + i]; + } + return totalSuffixesLength; + } + + protected int getInputLength(int length) + { + int totalInputLength = 0; + for (int i = 0; i < length; i++) { + totalInputLength += prefixLengths[inputLengthsOffset + i] + suffixLengths[inputLengthsOffset + i]; + } + return totalInputLength; + } + + protected InputLengths getInputAndMaxLength(int length) + { + int totalInputLength = 0; + int maxLength = 0; + for (int i = 0; i < length; i++) { + int inputLength = prefixLengths[inputLengthsOffset + i] + suffixLengths[inputLengthsOffset + i]; + totalInputLength += inputLength; + maxLength = max(maxLength, inputLength); + } + return new InputLengths(totalInputLength, maxLength); + } + + protected record InputLengths(int totalInputLength, int maxInputLength) {} + + private byte[] readUnbounded(int[] outputOffsets, int offset, int length, int totalInputLength) + { + byte[] output = new byte[totalInputLength]; + Slice inputSlice = input.asSlice(); + // System#arraycopy performs better than Slice#getBytes, therefore we + // process the input as a byte array rather than through SimpleSliceInputStream#readBytes + byte[] inputBytes; + int inputOffsetStart; + if (inputSlice.length() != 0) { + inputBytes = inputSlice.byteArray(); + inputOffsetStart = inputSlice.byteArrayOffset(); + } + else { + inputBytes = new byte[0]; + inputOffsetStart = 0; + } + int inputOffset = inputOffsetStart; + + // Read first position by copying prefix from previous read + outputOffsets[offset + 1] = outputOffsets[offset] + prefixLengths[inputLengthsOffset] + suffixLengths[inputLengthsOffset]; + System.arraycopy(firstPrefix, 0, output, 0, prefixLengths[inputLengthsOffset]); + int outputOffset = prefixLengths[inputLengthsOffset]; + int outputLength = suffixLengths[inputLengthsOffset]; + + // Read remaining length - 1 positions + for (int i = 1; i < length; i++) { + int prefixLength = prefixLengths[inputLengthsOffset + i]; + int suffixLength = suffixLengths[inputLengthsOffset + i]; + outputOffsets[offset + i + 1] = outputOffsets[offset + i] + prefixLength + suffixLength; + + // prefixLength of 0 is a common case, batching arraycopy calls for continuous runs of 0s + // performs better than copying position by position + if (prefixLength > 0) { + // Copy all previous continuous suffixes + System.arraycopy(inputBytes, inputOffset, output, outputOffset, outputLength); + inputOffset += outputLength; + outputOffset += outputLength; + outputLength = 0; + + // Copy the current prefix + int previousPositionLength = prefixLengths[inputLengthsOffset + i - 1] + suffixLengths[inputLengthsOffset + i - 1]; + int previousOutputStart = outputOffset - previousPositionLength; + System.arraycopy(output, previousOutputStart, output, outputOffset, prefixLength); + outputOffset += prefixLength; + } + outputLength += suffixLength; + } + // Copy any remaining suffixes + System.arraycopy(inputBytes, inputOffset, output, outputOffset, outputLength); + 
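+ // Advance both cursors past this final copy; the total number of input bytes consumed is then reported to the stream via input.skip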
inputOffset += outputLength; + outputOffset += outputLength; + input.skip(inputOffset - inputOffsetStart); + + if (inputLengthsOffset + length < prefixLengths.length) { + // Prepare prefix for next read if end of input has not been reached + int previousPositionLength = prefixLengths[inputLengthsOffset + length - 1] + suffixLengths[inputLengthsOffset + length - 1]; + int previousOutputStart = outputOffset - previousPositionLength; + firstPrefix = Arrays.copyOfRange(output, previousOutputStart, previousOutputStart + prefixLengths[inputLengthsOffset + length]); + } + return output; + } + } + + private static int[] readDeltaEncodedLengths(SimpleSliceInputStream input) + { + DeltaBinaryPackedIntDecoder decoder = new DeltaBinaryPackedIntDecoder(); + decoder.init(input); + int valueCount = decoder.getValueCount(); + int[] lengths = new int[valueCount]; + decoder.read(lengths, 0, valueCount); + return lengths; + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/DeltaLengthByteArrayDecoders.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/DeltaLengthByteArrayDecoders.java new file mode 100644 index 000000000000..f37f89bddf27 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/DeltaLengthByteArrayDecoders.java @@ -0,0 +1,254 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.reader.decoders; + +import io.airlift.slice.Slice; +import io.airlift.slice.Slices; +import io.trino.parquet.reader.SimpleSliceInputStream; +import io.trino.parquet.reader.flat.BinaryBuffer; +import io.trino.spi.type.CharType; +import io.trino.spi.type.VarcharType; + +import static com.google.common.base.Preconditions.checkArgument; +import static io.trino.parquet.reader.decoders.DeltaBinaryPackedDecoders.DeltaBinaryPackedIntDecoder; +import static io.trino.spi.type.Chars.byteCountWithoutTrailingSpace; +import static io.trino.spi.type.Varchars.byteCount; +import static java.lang.Math.max; +import static java.util.Objects.requireNonNull; + +/** + * Implementation of decoding for the encoding described at + * delta-length-byte-array. + * Data encoding here is identical to the one in a Trino block. If the values are + * not trimmed due to type bounds, it will push a single Slice object that will + * effectively be used in Trino blocks without copying. + *

+ * If the trimming occurs, data will be copied into a single byte array + */ +public class DeltaLengthByteArrayDecoders +{ + private DeltaLengthByteArrayDecoders() {} + + public static final class BoundedVarcharDeltaLengthDecoder + extends AbstractDeltaLengthDecoder + { + private final int boundedLength; + + public BoundedVarcharDeltaLengthDecoder(VarcharType varcharType) + { + checkArgument( + !varcharType.isUnbounded(), + "Trino type %s is not a bounded varchar", + varcharType); + this.boundedLength = varcharType.getBoundedLength(); + } + + @Override + public void read(BinaryBuffer values, int offset, int length) + { + InputLengths lengths = getInputAndMaxLength(length); + int maxLength = lengths.maxInputLength(); + int totalInputLength = lengths.totalInputLength(); + boolean truncate = maxLength > boundedLength; + if (truncate) { + readBounded(values, offset, length, totalInputLength); + } + else { + readUnbounded(values, offset, length, totalInputLength); + } + } + + @Override + protected int truncatedLength(Slice slice, int offset, int length) + { + return byteCount(slice, offset, length, boundedLength); + } + } + + public static final class CharDeltaLengthDecoder + extends AbstractDeltaLengthDecoder + { + private final int maxLength; + + public CharDeltaLengthDecoder(CharType charType) + { + this.maxLength = charType.getLength(); + } + + @Override + public void read(BinaryBuffer values, int offset, int length) + { + int totalInputLength = getInputLength(length); + readBounded(values, offset, length, totalInputLength); + } + + @Override + protected int truncatedLength(Slice slice, int offset, int length) + { + return byteCountWithoutTrailingSpace(slice, offset, length, maxLength); + } + } + + public static final class BinaryDeltaLengthDecoder + extends AbstractDeltaLengthDecoder + { + @Override + protected int truncatedLength(Slice slice, int offset, int length) + { + throw new UnsupportedOperationException(); + } + + @Override + public void read(BinaryBuffer values, int offset, int length) + { + int totalInputLength = getInputLength(length); + readUnbounded(values, offset, length, totalInputLength); + } + } + + private abstract static class AbstractDeltaLengthDecoder + implements ValueDecoder + { + private int[] inputLengths; + private int inputLengthsOffset; + private SimpleSliceInputStream input; + + @Override + public void init(SimpleSliceInputStream input) + { + this.input = requireNonNull(input, "input is null"); + this.inputLengths = readInputLengths(input); + } + + @Override + public void skip(int n) + { + int totalInputLength = getInputLength(n); + input.skip(totalInputLength); + inputLengthsOffset += n; + } + + protected abstract int truncatedLength(Slice slice, int offset, int length); + + protected int getInputLength(int length) + { + int totalInputLength = 0; + for (int i = 0; i < length; i++) { + totalInputLength += inputLengths[inputLengthsOffset + i]; + } + return totalInputLength; + } + + protected InputLengths getInputAndMaxLength(int length) + { + int totalInputLength = 0; + int maxLength = 0; + for (int i = 0; i < length; i++) { + int inputLength = inputLengths[inputLengthsOffset + i]; + totalInputLength += inputLength; + maxLength = max(maxLength, inputLength); + } + return new InputLengths(totalInputLength, maxLength); + } + + protected void readUnbounded(BinaryBuffer values, int offset, int length, int totalInputLength) + { + values.addChunk(input.readSlice(totalInputLength)); + int[] outputOffsets = values.getOffsets(); + + // Some positions in offsets might 
have been skipped before this read, + // adjust for this difference when copying offsets to the output + int outputLength = 0; + int baseOutputOffset = outputOffsets[offset]; + for (int i = 0; i < length; i++) { + outputLength += inputLengths[inputLengthsOffset + i]; + outputOffsets[offset + i + 1] = baseOutputOffset + outputLength; + } + inputLengthsOffset += length; + } + + protected void readBounded(BinaryBuffer values, int offset, int length, int totalInputLength) + { + // Use offset arrays to temporarily store output lengths + int[] outputOffsets = values.getOffsets(); + Slice inputSlice = input.readSlice(totalInputLength); + + int currentInputOffset = 0; + int totalOutputLength = 0; + int baseOutputOffset = outputOffsets[offset]; + for (int i = 0; i < length; i++) { + int currentLength = inputLengths[inputLengthsOffset + i]; + + int currentOutputLength = truncatedLength(inputSlice, currentInputOffset, currentLength); + currentInputOffset += currentLength; + totalOutputLength += currentOutputLength; + outputOffsets[offset + i + 1] = baseOutputOffset + totalOutputLength; + } + + // No copying needed if there was no truncation + if (totalOutputLength == totalInputLength) { + values.addChunk(inputSlice); + } + else { + values.addChunk(createOutputBuffer(outputOffsets, offset, length, inputSlice, totalOutputLength)); + } + inputLengthsOffset += length; + } + + /** + * Constructs the output buffer out of input data and length array + */ + private Slice createOutputBuffer( + int[] outputOffsets, + int offset, + int length, + Slice inputSlice, + int totalOutputLength) + { + byte[] output = new byte[totalOutputLength]; + int outputOffset = 0; + int outputLength = 0; + int currentInputOffset = 0; + int inputLength = 0; + for (int i = 0; i < length; i++) { + outputLength += outputOffsets[offset + i + 1] - outputOffsets[offset + i]; + inputLength += inputLengths[inputLengthsOffset + i]; + + if (outputLength != inputLength) { + inputSlice.getBytes(currentInputOffset, output, outputOffset, outputLength); + currentInputOffset += inputLength; + inputLength = 0; + outputOffset += outputLength; + outputLength = 0; + } + } + if (outputLength != 0) { // Write the remaining slice + inputSlice.getBytes(currentInputOffset, output, outputOffset, outputLength); + } + return Slices.wrappedBuffer(output); + } + + protected record InputLengths(int totalInputLength, int maxInputLength) {} + } + + private static int[] readInputLengths(SimpleSliceInputStream input) + { + DeltaBinaryPackedIntDecoder decoder = new DeltaBinaryPackedIntDecoder(); + decoder.init(input); + int valueCount = decoder.getValueCount(); + int[] lengths = new int[valueCount]; + decoder.read(lengths, 0, valueCount); + return lengths; + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/DeltaPackingUtils.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/DeltaPackingUtils.java new file mode 100644 index 000000000000..90579737ed41 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/DeltaPackingUtils.java @@ -0,0 +1,451 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.reader.decoders; + +import io.airlift.slice.Slices; +import io.trino.parquet.reader.SimpleSliceInputStream; + +import java.util.Arrays; + +import static io.trino.parquet.ParquetReaderUtils.toByteExact; +import static io.trino.parquet.ParquetReaderUtils.toShortExact; +import static io.trino.parquet.reader.decoders.ByteBitUnpackers.getByteBitUnpacker; +import static io.trino.parquet.reader.decoders.IntBitUnpackers.getIntBitUnpacker; +import static io.trino.parquet.reader.decoders.LongBitUnpackers.getLongBitUnpacker; +import static io.trino.parquet.reader.decoders.ShortBitUnpackers.getShortBitUnpacker; + +public final class DeltaPackingUtils +{ + private DeltaPackingUtils() {} + + public static void unpackDelta(byte[] output, int outputOffset, int length, SimpleSliceInputStream input, long minDelta, byte bitWidth) + { + if (bitWidth == 0) { + unpackEmpty(output, outputOffset, length, toByteExact(minDelta)); + } + else { + unpackByte(output, outputOffset, input, length, bitWidth, minDelta); + } + } + + public static void unpackDelta(short[] output, int outputOffset, int length, SimpleSliceInputStream input, long minDelta, byte bitWidth) + { + if (bitWidth == 0) { + unpackEmpty(output, outputOffset, length, toShortExact(minDelta)); + } + else { + unpackShort(output, outputOffset, input, length, bitWidth, minDelta); + } + } + + public static void unpackDelta(int[] output, int outputOffset, int length, SimpleSliceInputStream input, long minDelta, byte bitWidth) + { + if (bitWidth == 0) { + unpackEmpty(output, outputOffset, length, (int) minDelta); + } + else { + unpackInt(output, outputOffset, input, length, bitWidth, (int) minDelta); + } + } + + public static void unpackDelta(long[] output, int outputOffset, int length, SimpleSliceInputStream input, long minDelta, byte bitWidth) + { + if (bitWidth == 0) { + unpackEmpty(output, outputOffset, length, minDelta); + } + else { + unpackLong(output, outputOffset, input, length, bitWidth, minDelta); + } + } + + /** + * Fills output array with values that differ by a constant delta. 
+ * With delta = 0 all values are the same and equal to the last written one + */ + private static void unpackEmpty(byte[] output, int outputOffset, int length, byte delta) + { + if (delta == 0) { // Common case + fillArray8(output, outputOffset, length / 8, output[outputOffset - 1]); + } + else { + fillArray8(output, outputOffset, length / 8, delta); + for (int i = outputOffset; i < outputOffset + length; i += 32) { + output[i] += output[i - 1]; + output[i + 1] += output[i]; + output[i + 2] += output[i + 1]; + output[i + 3] += output[i + 2]; + output[i + 4] += output[i + 3]; + output[i + 5] += output[i + 4]; + output[i + 6] += output[i + 5]; + output[i + 7] += output[i + 6]; + output[i + 8] += output[i + 7]; + output[i + 9] += output[i + 8]; + output[i + 10] += output[i + 9]; + output[i + 11] += output[i + 10]; + output[i + 12] += output[i + 11]; + output[i + 13] += output[i + 12]; + output[i + 14] += output[i + 13]; + output[i + 15] += output[i + 14]; + output[i + 16] += output[i + 15]; + output[i + 17] += output[i + 16]; + output[i + 18] += output[i + 17]; + output[i + 19] += output[i + 18]; + output[i + 20] += output[i + 19]; + output[i + 21] += output[i + 20]; + output[i + 22] += output[i + 21]; + output[i + 23] += output[i + 22]; + output[i + 24] += output[i + 23]; + output[i + 25] += output[i + 24]; + output[i + 26] += output[i + 25]; + output[i + 27] += output[i + 26]; + output[i + 28] += output[i + 27]; + output[i + 29] += output[i + 28]; + output[i + 30] += output[i + 29]; + output[i + 31] += output[i + 30]; + } + } + } + + /** + * Fills output array with values that differ by a constant delta. + * With delta = 0 all values are the same and equal to the last written one + */ + private static void unpackEmpty(short[] output, int outputOffset, int length, short delta) + { + if (delta == 0) { // Common case + Arrays.fill(output, outputOffset, outputOffset + length, output[outputOffset - 1]); + } + else { + Arrays.fill(output, outputOffset, outputOffset + length, delta); + for (int i = outputOffset; i < outputOffset + length; i += 32) { + output[i] += output[i - 1]; + output[i + 1] += output[i]; + output[i + 2] += output[i + 1]; + output[i + 3] += output[i + 2]; + output[i + 4] += output[i + 3]; + output[i + 5] += output[i + 4]; + output[i + 6] += output[i + 5]; + output[i + 7] += output[i + 6]; + output[i + 8] += output[i + 7]; + output[i + 9] += output[i + 8]; + output[i + 10] += output[i + 9]; + output[i + 11] += output[i + 10]; + output[i + 12] += output[i + 11]; + output[i + 13] += output[i + 12]; + output[i + 14] += output[i + 13]; + output[i + 15] += output[i + 14]; + output[i + 16] += output[i + 15]; + output[i + 17] += output[i + 16]; + output[i + 18] += output[i + 17]; + output[i + 19] += output[i + 18]; + output[i + 20] += output[i + 19]; + output[i + 21] += output[i + 20]; + output[i + 22] += output[i + 21]; + output[i + 23] += output[i + 22]; + output[i + 24] += output[i + 23]; + output[i + 25] += output[i + 24]; + output[i + 26] += output[i + 25]; + output[i + 27] += output[i + 26]; + output[i + 28] += output[i + 27]; + output[i + 29] += output[i + 28]; + output[i + 30] += output[i + 29]; + output[i + 31] += output[i + 30]; + } + } + } + + /** + * Fills output array with values that differ by a constant delta. 
+ * With delta = 0 all values are the same and equal to the last written one + */ + private static void unpackEmpty(int[] output, int outputOffset, int length, int delta) + { + if (delta == 0) { // Common case + Arrays.fill(output, outputOffset, outputOffset + length, output[outputOffset - 1]); + } + else { + Arrays.fill(output, outputOffset, outputOffset + length, delta); + for (int i = outputOffset; i < outputOffset + length; i += 32) { + output[i] += output[i - 1]; + output[i + 1] += output[i]; + output[i + 2] += output[i + 1]; + output[i + 3] += output[i + 2]; + output[i + 4] += output[i + 3]; + output[i + 5] += output[i + 4]; + output[i + 6] += output[i + 5]; + output[i + 7] += output[i + 6]; + output[i + 8] += output[i + 7]; + output[i + 9] += output[i + 8]; + output[i + 10] += output[i + 9]; + output[i + 11] += output[i + 10]; + output[i + 12] += output[i + 11]; + output[i + 13] += output[i + 12]; + output[i + 14] += output[i + 13]; + output[i + 15] += output[i + 14]; + output[i + 16] += output[i + 15]; + output[i + 17] += output[i + 16]; + output[i + 18] += output[i + 17]; + output[i + 19] += output[i + 18]; + output[i + 20] += output[i + 19]; + output[i + 21] += output[i + 20]; + output[i + 22] += output[i + 21]; + output[i + 23] += output[i + 22]; + output[i + 24] += output[i + 23]; + output[i + 25] += output[i + 24]; + output[i + 26] += output[i + 25]; + output[i + 27] += output[i + 26]; + output[i + 28] += output[i + 27]; + output[i + 29] += output[i + 28]; + output[i + 30] += output[i + 29]; + output[i + 31] += output[i + 30]; + } + } + } + + /** + * Fills output array with values that differ by a constant delta. + * With delta = 0 all values are the same and equal to the last written one + */ + private static void unpackEmpty(long[] output, int outputOffset, int length, long delta) + { + if (delta == 0) { // Common case + Arrays.fill(output, outputOffset, outputOffset + length, output[outputOffset - 1]); + } + else { + Arrays.fill(output, outputOffset, outputOffset + length, delta); + for (int i = outputOffset; i < outputOffset + length; i += 32) { + output[i] += output[i - 1]; + output[i + 1] += output[i]; + output[i + 2] += output[i + 1]; + output[i + 3] += output[i + 2]; + output[i + 4] += output[i + 3]; + output[i + 5] += output[i + 4]; + output[i + 6] += output[i + 5]; + output[i + 7] += output[i + 6]; + output[i + 8] += output[i + 7]; + output[i + 9] += output[i + 8]; + output[i + 10] += output[i + 9]; + output[i + 11] += output[i + 10]; + output[i + 12] += output[i + 11]; + output[i + 13] += output[i + 12]; + output[i + 14] += output[i + 13]; + output[i + 15] += output[i + 14]; + output[i + 16] += output[i + 15]; + output[i + 17] += output[i + 16]; + output[i + 18] += output[i + 17]; + output[i + 19] += output[i + 18]; + output[i + 20] += output[i + 19]; + output[i + 21] += output[i + 20]; + output[i + 22] += output[i + 21]; + output[i + 23] += output[i + 22]; + output[i + 24] += output[i + 23]; + output[i + 25] += output[i + 24]; + output[i + 26] += output[i + 25]; + output[i + 27] += output[i + 26]; + output[i + 28] += output[i + 27]; + output[i + 29] += output[i + 28]; + output[i + 30] += output[i + 29]; + output[i + 31] += output[i + 30]; + } + } + } + + private static void unpackByte(byte[] output, int outputOffset, SimpleSliceInputStream input, int length, byte bitWidth, long minDelta) + { + getByteBitUnpacker(bitWidth).unpack(output, outputOffset, input, length); + inPlacePrefixSum(output, outputOffset, length, (short) minDelta); + } + + private static void 
unpackShort(short[] output, int outputOffset, SimpleSliceInputStream input, int length, byte bitWidth, long minDelta) + { + getShortBitUnpacker(bitWidth).unpack(output, outputOffset, input, length); + inPlacePrefixSum(output, outputOffset, length, (int) minDelta); + } + + private static void unpackInt(int[] output, int outputOffset, SimpleSliceInputStream input, int length, byte bitWidth, int minDelta) + { + getIntBitUnpacker(bitWidth).unpack(output, outputOffset, input, length); + inPlacePrefixSum(output, outputOffset, length, minDelta); + } + + private static void unpackLong(long[] output, int outputOffset, SimpleSliceInputStream input, int length, byte bitWidth, long minDelta) + { + getLongBitUnpacker(bitWidth).unpack(output, outputOffset, input, length); + inPlacePrefixSum(output, outputOffset, length, minDelta); + } + + /** + * Fill byte array with a value. Fills 8 values at a time + * + * @param length Number of LONG values to write i.e. number of bytes / 8 + */ + private static void fillArray8(byte[] output, int outputOffset, int length, byte baseValue) + { + int lengthInBytes = length * Long.BYTES; + Slices.wrappedBuffer(output, outputOffset, lengthInBytes) + .fill(baseValue); + } + + private static void inPlacePrefixSum(byte[] output, int outputOffset, int length, short minDelta) + { + for (int i = outputOffset; i < outputOffset + length; i += 32) { + output[i] = (byte) (output[i] + (output[i - 1] + minDelta)); + output[i + 1] = (byte) (output[i + 1] + (output[i] + minDelta)); + output[i + 2] = (byte) (output[i + 2] + (output[i + 1] + minDelta)); + output[i + 3] = (byte) (output[i + 3] + (output[i + 2] + minDelta)); + output[i + 4] = (byte) (output[i + 4] + (output[i + 3] + minDelta)); + output[i + 5] = (byte) (output[i + 5] + (output[i + 4] + minDelta)); + output[i + 6] = (byte) (output[i + 6] + (output[i + 5] + minDelta)); + output[i + 7] = (byte) (output[i + 7] + (output[i + 6] + minDelta)); + output[i + 8] = (byte) (output[i + 8] + (output[i + 7] + minDelta)); + output[i + 9] = (byte) (output[i + 9] + (output[i + 8] + minDelta)); + output[i + 10] = (byte) (output[i + 10] + (output[i + 9] + minDelta)); + output[i + 11] = (byte) (output[i + 11] + (output[i + 10] + minDelta)); + output[i + 12] = (byte) (output[i + 12] + (output[i + 11] + minDelta)); + output[i + 13] = (byte) (output[i + 13] + (output[i + 12] + minDelta)); + output[i + 14] = (byte) (output[i + 14] + (output[i + 13] + minDelta)); + output[i + 15] = (byte) (output[i + 15] + (output[i + 14] + minDelta)); + output[i + 16] = (byte) (output[i + 16] + (output[i + 15] + minDelta)); + output[i + 17] = (byte) (output[i + 17] + (output[i + 16] + minDelta)); + output[i + 18] = (byte) (output[i + 18] + (output[i + 17] + minDelta)); + output[i + 19] = (byte) (output[i + 19] + (output[i + 18] + minDelta)); + output[i + 20] = (byte) (output[i + 20] + (output[i + 19] + minDelta)); + output[i + 21] = (byte) (output[i + 21] + (output[i + 20] + minDelta)); + output[i + 22] = (byte) (output[i + 22] + (output[i + 21] + minDelta)); + output[i + 23] = (byte) (output[i + 23] + (output[i + 22] + minDelta)); + output[i + 24] = (byte) (output[i + 24] + (output[i + 23] + minDelta)); + output[i + 25] = (byte) (output[i + 25] + (output[i + 24] + minDelta)); + output[i + 26] = (byte) (output[i + 26] + (output[i + 25] + minDelta)); + output[i + 27] = (byte) (output[i + 27] + (output[i + 26] + minDelta)); + output[i + 28] = (byte) (output[i + 28] + (output[i + 27] + minDelta)); + output[i + 29] = (byte) (output[i + 29] + (output[i + 28] + 
minDelta)); + output[i + 30] = (byte) (output[i + 30] + (output[i + 29] + minDelta)); + output[i + 31] = (byte) (output[i + 31] + (output[i + 30] + minDelta)); + } + } + + private static void inPlacePrefixSum(short[] output, int outputOffset, int length, int minDelta) + { + for (int i = outputOffset; i < outputOffset + length; i += 32) { + output[i] = (short) (output[i] + output[i - 1] + minDelta); + output[i + 1] = (short) (output[i + 1] + output[i] + minDelta); + output[i + 2] = (short) (output[i + 2] + output[i + 1] + minDelta); + output[i + 3] = (short) (output[i + 3] + output[i + 2] + minDelta); + output[i + 4] = (short) (output[i + 4] + output[i + 3] + minDelta); + output[i + 5] = (short) (output[i + 5] + output[i + 4] + minDelta); + output[i + 6] = (short) (output[i + 6] + output[i + 5] + minDelta); + output[i + 7] = (short) (output[i + 7] + output[i + 6] + minDelta); + output[i + 8] = (short) (output[i + 8] + output[i + 7] + minDelta); + output[i + 9] = (short) (output[i + 9] + output[i + 8] + minDelta); + output[i + 10] = (short) (output[i + 10] + output[i + 9] + minDelta); + output[i + 11] = (short) (output[i + 11] + output[i + 10] + minDelta); + output[i + 12] = (short) (output[i + 12] + output[i + 11] + minDelta); + output[i + 13] = (short) (output[i + 13] + output[i + 12] + minDelta); + output[i + 14] = (short) (output[i + 14] + output[i + 13] + minDelta); + output[i + 15] = (short) (output[i + 15] + output[i + 14] + minDelta); + output[i + 16] = (short) (output[i + 16] + output[i + 15] + minDelta); + output[i + 17] = (short) (output[i + 17] + output[i + 16] + minDelta); + output[i + 18] = (short) (output[i + 18] + output[i + 17] + minDelta); + output[i + 19] = (short) (output[i + 19] + output[i + 18] + minDelta); + output[i + 20] = (short) (output[i + 20] + output[i + 19] + minDelta); + output[i + 21] = (short) (output[i + 21] + output[i + 20] + minDelta); + output[i + 22] = (short) (output[i + 22] + output[i + 21] + minDelta); + output[i + 23] = (short) (output[i + 23] + output[i + 22] + minDelta); + output[i + 24] = (short) (output[i + 24] + output[i + 23] + minDelta); + output[i + 25] = (short) (output[i + 25] + output[i + 24] + minDelta); + output[i + 26] = (short) (output[i + 26] + output[i + 25] + minDelta); + output[i + 27] = (short) (output[i + 27] + output[i + 26] + minDelta); + output[i + 28] = (short) (output[i + 28] + output[i + 27] + minDelta); + output[i + 29] = (short) (output[i + 29] + output[i + 28] + minDelta); + output[i + 30] = (short) (output[i + 30] + output[i + 29] + minDelta); + output[i + 31] = (short) (output[i + 31] + output[i + 30] + minDelta); + } + } + + private static void inPlacePrefixSum(int[] output, int outputOffset, int length, int minDelta) + { + for (int i = outputOffset; i < outputOffset + length; i += 32) { + output[i] += output[i - 1] + minDelta; + output[i + 1] += output[i] + minDelta; + output[i + 2] += output[i + 1] + minDelta; + output[i + 3] += output[i + 2] + minDelta; + output[i + 4] += output[i + 3] + minDelta; + output[i + 5] += output[i + 4] + minDelta; + output[i + 6] += output[i + 5] + minDelta; + output[i + 7] += output[i + 6] + minDelta; + output[i + 8] += output[i + 7] + minDelta; + output[i + 9] += output[i + 8] + minDelta; + output[i + 10] += output[i + 9] + minDelta; + output[i + 11] += output[i + 10] + minDelta; + output[i + 12] += output[i + 11] + minDelta; + output[i + 13] += output[i + 12] + minDelta; + output[i + 14] += output[i + 13] + minDelta; + output[i + 15] += output[i + 14] + minDelta; + output[i + 16] += 
output[i + 15] + minDelta; + output[i + 17] += output[i + 16] + minDelta; + output[i + 18] += output[i + 17] + minDelta; + output[i + 19] += output[i + 18] + minDelta; + output[i + 20] += output[i + 19] + minDelta; + output[i + 21] += output[i + 20] + minDelta; + output[i + 22] += output[i + 21] + minDelta; + output[i + 23] += output[i + 22] + minDelta; + output[i + 24] += output[i + 23] + minDelta; + output[i + 25] += output[i + 24] + minDelta; + output[i + 26] += output[i + 25] + minDelta; + output[i + 27] += output[i + 26] + minDelta; + output[i + 28] += output[i + 27] + minDelta; + output[i + 29] += output[i + 28] + minDelta; + output[i + 30] += output[i + 29] + minDelta; + output[i + 31] += output[i + 30] + minDelta; + } + } + + private static void inPlacePrefixSum(long[] output, int outputOffset, int length, long minDelta) + { + for (int i = outputOffset; i < outputOffset + length; i += 32) { + output[i] += output[i - 1] + minDelta; + output[i + 1] += output[i] + minDelta; + output[i + 2] += output[i + 1] + minDelta; + output[i + 3] += output[i + 2] + minDelta; + output[i + 4] += output[i + 3] + minDelta; + output[i + 5] += output[i + 4] + minDelta; + output[i + 6] += output[i + 5] + minDelta; + output[i + 7] += output[i + 6] + minDelta; + output[i + 8] += output[i + 7] + minDelta; + output[i + 9] += output[i + 8] + minDelta; + output[i + 10] += output[i + 9] + minDelta; + output[i + 11] += output[i + 10] + minDelta; + output[i + 12] += output[i + 11] + minDelta; + output[i + 13] += output[i + 12] + minDelta; + output[i + 14] += output[i + 13] + minDelta; + output[i + 15] += output[i + 14] + minDelta; + output[i + 16] += output[i + 15] + minDelta; + output[i + 17] += output[i + 16] + minDelta; + output[i + 18] += output[i + 17] + minDelta; + output[i + 19] += output[i + 18] + minDelta; + output[i + 20] += output[i + 19] + minDelta; + output[i + 21] += output[i + 20] + minDelta; + output[i + 22] += output[i + 21] + minDelta; + output[i + 23] += output[i + 22] + minDelta; + output[i + 24] += output[i + 23] + minDelta; + output[i + 25] += output[i + 24] + minDelta; + output[i + 26] += output[i + 25] + minDelta; + output[i + 27] += output[i + 26] + minDelta; + output[i + 28] += output[i + 27] + minDelta; + output[i + 29] += output[i + 28] + minDelta; + output[i + 30] += output[i + 29] + minDelta; + output[i + 31] += output[i + 30] + minDelta; + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/IntBitUnpacker.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/IntBitUnpacker.java new file mode 100644 index 000000000000..1daef303f6c4 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/IntBitUnpacker.java @@ -0,0 +1,24 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.parquet.reader.decoders; + +import io.trino.parquet.reader.SimpleSliceInputStream; + +public interface IntBitUnpacker +{ + /** + * @param length must be a multiple of 8 + */ + void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length); +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/IntBitUnpackers.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/IntBitUnpackers.java new file mode 100644 index 000000000000..656c9d273378 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/IntBitUnpackers.java @@ -0,0 +1,1367 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.reader.decoders; + +import io.trino.parquet.reader.SimpleSliceInputStream; + +public final class IntBitUnpackers +{ + private static final IntBitUnpacker[] UNPACKERS = { + new Unpacker0(), + new Unpacker1(), + new Unpacker2(), + new Unpacker3(), + new Unpacker4(), + new Unpacker5(), + new Unpacker6(), + new Unpacker7(), + new Unpacker8(), + new Unpacker9(), + new Unpacker10(), + new Unpacker11(), + new Unpacker12(), + new Unpacker13(), + new Unpacker14(), + new Unpacker15(), + new Unpacker16(), + new Unpacker17(), + new Unpacker18(), + new Unpacker19(), + new Unpacker20(), + new Unpacker21(), + new Unpacker22(), + new Unpacker23(), + new Unpacker24(), + new Unpacker25(), + new Unpacker26(), + new Unpacker27(), + new Unpacker28(), + new Unpacker29(), + new Unpacker30(), + new Unpacker31(), + new Unpacker32()}; + + public static IntBitUnpacker getIntBitUnpacker(int bitWidth) + { + return UNPACKERS[bitWidth]; + } + + private IntBitUnpackers() {} + + private static final class Unpacker0 + implements IntBitUnpacker + { + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, + int length) + { + // Do nothing + } + } + + private static final class Unpacker1 + implements IntBitUnpacker + { + private static void unpack8(int[] output, int outputOffset, byte[] input, int inputOffset) + { + byte v0 = input[inputOffset]; + output[outputOffset] = v0 & 1; + output[outputOffset + 1] = (v0 >>> 1) & 1; + output[outputOffset + 2] = (v0 >>> 2) & 1; + output[outputOffset + 3] = (v0 >>> 3) & 1; + output[outputOffset + 4] = (v0 >>> 4) & 1; + output[outputOffset + 5] = (v0 >>> 5) & 1; + output[outputOffset + 6] = (v0 >>> 6) & 1; + output[outputOffset + 7] = (v0 >>> 7) & 1; + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + byte[] inputArr = input.getByteArray(); + int inputOffset = input.getByteArrayOffset(); + int inputBytesRead = 0; + while (length >= 8) { + unpack8(output, outputOffset, inputArr, inputOffset + inputBytesRead); + outputOffset += 8; + length -= 8; + inputBytesRead++; + } + input.skip(inputBytesRead); + } + } + + private static final class Unpacker2 + implements IntBitUnpacker + { + private static void unpack8(int[] output, int outputOffset, byte[] input, 
int inputOffset) + { + byte v0 = input[inputOffset]; + byte v1 = input[inputOffset + 1]; + + output[outputOffset] = v0 & 0b11; + output[outputOffset + 1] = (v0 >>> 2) & 0b11; + output[outputOffset + 2] = (v0 >>> 4) & 0b11; + output[outputOffset + 3] = (v0 >>> 6) & 0b11; + + output[outputOffset + 4] = v1 & 0b11; + output[outputOffset + 5] = (v1 >>> 2) & 0b11; + output[outputOffset + 6] = (v1 >>> 4) & 0b11; + output[outputOffset + 7] = (v1 >>> 6) & 0b11; + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + byte[] inputArr = input.getByteArray(); + int inputOffset = input.getByteArrayOffset(); + int inputBytesRead = 0; + while (length >= 8) { + unpack8(output, outputOffset, inputArr, inputOffset + inputBytesRead); + outputOffset += 8; + length -= 8; + inputBytesRead += 2; + } + input.skip(inputBytesRead); + } + } + + private static final class Unpacker3 + implements IntBitUnpacker + { + private static void unpack64(int[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + output[outputOffset] = (int) (v0 & 0b111L); + output[outputOffset + 1] = (int) ((v0 >>> 3) & 0b111L); + output[outputOffset + 2] = (int) ((v0 >>> 6) & 0b111L); + output[outputOffset + 3] = (int) ((v0 >>> 9) & 0b111L); + output[outputOffset + 4] = (int) ((v0 >>> 12) & 0b111L); + output[outputOffset + 5] = (int) ((v0 >>> 15) & 0b111L); + output[outputOffset + 6] = (int) ((v0 >>> 18) & 0b111L); + output[outputOffset + 7] = (int) ((v0 >>> 21) & 0b111L); + output[outputOffset + 8] = (int) ((v0 >>> 24) & 0b111L); + output[outputOffset + 9] = (int) ((v0 >>> 27) & 0b111L); + output[outputOffset + 10] = (int) ((v0 >>> 30) & 0b111L); + output[outputOffset + 11] = (int) ((v0 >>> 33) & 0b111L); + output[outputOffset + 12] = (int) ((v0 >>> 36) & 0b111L); + output[outputOffset + 13] = (int) ((v0 >>> 39) & 0b111L); + output[outputOffset + 14] = (int) ((v0 >>> 42) & 0b111L); + output[outputOffset + 15] = (int) ((v0 >>> 45) & 0b111L); + output[outputOffset + 16] = (int) ((v0 >>> 48) & 0b111L); + output[outputOffset + 17] = (int) ((v0 >>> 51) & 0b111L); + output[outputOffset + 18] = (int) ((v0 >>> 54) & 0b111L); + output[outputOffset + 19] = (int) ((v0 >>> 57) & 0b111L); + output[outputOffset + 20] = (int) ((v0 >>> 60) & 0b111L); + output[outputOffset + 21] = (int) (((v0 >>> 63) & 0b1L) | ((v1 & 0b11L) << 1)); + output[outputOffset + 22] = (int) ((v1 >>> 2) & 0b111L); + output[outputOffset + 23] = (int) ((v1 >>> 5) & 0b111L); + output[outputOffset + 24] = (int) ((v1 >>> 8) & 0b111L); + output[outputOffset + 25] = (int) ((v1 >>> 11) & 0b111L); + output[outputOffset + 26] = (int) ((v1 >>> 14) & 0b111L); + output[outputOffset + 27] = (int) ((v1 >>> 17) & 0b111L); + output[outputOffset + 28] = (int) ((v1 >>> 20) & 0b111L); + output[outputOffset + 29] = (int) ((v1 >>> 23) & 0b111L); + output[outputOffset + 30] = (int) ((v1 >>> 26) & 0b111L); + output[outputOffset + 31] = (int) ((v1 >>> 29) & 0b111L); + output[outputOffset + 32] = (int) ((v1 >>> 32) & 0b111L); + output[outputOffset + 33] = (int) ((v1 >>> 35) & 0b111L); + output[outputOffset + 34] = (int) ((v1 >>> 38) & 0b111L); + output[outputOffset + 35] = (int) ((v1 >>> 41) & 0b111L); + output[outputOffset + 36] = (int) ((v1 >>> 44) & 0b111L); + output[outputOffset + 37] = (int) ((v1 >>> 47) & 0b111L); + output[outputOffset + 38] = (int) ((v1 >>> 50) & 0b111L); + output[outputOffset + 39] = (int) ((v1 >>> 53) & 0b111L); + 
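// Note (editorial illustration, not part of the patch): every 3-bit value is extracted by
// shifting the source word right and masking with (1L << 3) - 1 = 0b111. Values 0-20 fit
// entirely inside v0 (21 * 3 = 63 bits), so value 21 above straddles v0 and v1: its low bit
// is the top bit of v0 ((v0 >>> 63) & 0b1) and its high two bits come from the bottom of v1
// ((v1 & 0b11) << 1). Value 42 just below straddles v1 and v2 in the same way, taking two
// low bits from (v1 >>> 62) and one high bit from (v2 & 0b1) << 2.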
output[outputOffset + 40] = (int) ((v1 >>> 56) & 0b111L); + output[outputOffset + 41] = (int) ((v1 >>> 59) & 0b111L); + output[outputOffset + 42] = (int) (((v1 >>> 62) & 0b11L) | ((v2 & 0b1L) << 2)); + output[outputOffset + 43] = (int) ((v2 >>> 1) & 0b111L); + output[outputOffset + 44] = (int) ((v2 >>> 4) & 0b111L); + output[outputOffset + 45] = (int) ((v2 >>> 7) & 0b111L); + output[outputOffset + 46] = (int) ((v2 >>> 10) & 0b111L); + output[outputOffset + 47] = (int) ((v2 >>> 13) & 0b111L); + output[outputOffset + 48] = (int) ((v2 >>> 16) & 0b111L); + output[outputOffset + 49] = (int) ((v2 >>> 19) & 0b111L); + output[outputOffset + 50] = (int) ((v2 >>> 22) & 0b111L); + output[outputOffset + 51] = (int) ((v2 >>> 25) & 0b111L); + output[outputOffset + 52] = (int) ((v2 >>> 28) & 0b111L); + output[outputOffset + 53] = (int) ((v2 >>> 31) & 0b111L); + output[outputOffset + 54] = (int) ((v2 >>> 34) & 0b111L); + output[outputOffset + 55] = (int) ((v2 >>> 37) & 0b111L); + output[outputOffset + 56] = (int) ((v2 >>> 40) & 0b111L); + output[outputOffset + 57] = (int) ((v2 >>> 43) & 0b111L); + output[outputOffset + 58] = (int) ((v2 >>> 46) & 0b111L); + output[outputOffset + 59] = (int) ((v2 >>> 49) & 0b111L); + output[outputOffset + 60] = (int) ((v2 >>> 52) & 0b111L); + output[outputOffset + 61] = (int) ((v2 >>> 55) & 0b111L); + output[outputOffset + 62] = (int) ((v2 >>> 58) & 0b111L); + output[outputOffset + 63] = (int) ((v2 >>> 61) & 0b111L); + } + + private static void unpack8(int[] output, int outputOffset, SimpleSliceInputStream input) + { + short v0 = input.readShort(); + byte v1 = input.readByte(); + output[outputOffset] = (int) (v0 & 0b111L); + output[outputOffset + 1] = (int) ((v0 >>> 3) & 0b111L); + output[outputOffset + 2] = (int) ((v0 >>> 6) & 0b111L); + output[outputOffset + 3] = (int) ((v0 >>> 9) & 0b111L); + output[outputOffset + 4] = (int) ((v0 >>> 12) & 0b111L); + output[outputOffset + 5] = (int) (((v0 >>> 15) & 0b1L) | ((v1 & 0b11L) << 1)); + output[outputOffset + 6] = (int) ((v1 >>> 2) & 0b111L); + output[outputOffset + 7] = (int) ((v1 >>> 5) & 0b111L); + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 64) { + unpack64(output, outputOffset, input); + outputOffset += 64; + length -= 64; + } + switch (length) { + case 56: + unpack8(output, outputOffset, input); + outputOffset += 8; + // fall through + case 48: + unpack8(output, outputOffset, input); + outputOffset += 8; + // fall through + case 40: + unpack8(output, outputOffset, input); + outputOffset += 8; + // fall through + case 32: + unpack8(output, outputOffset, input); + outputOffset += 8; + // fall through + case 24: + unpack8(output, outputOffset, input); + outputOffset += 8; + // fall through + case 16: + unpack8(output, outputOffset, input); + outputOffset += 8; + // fall through + case 8: + unpack8(output, outputOffset, input); + } + } + } + + private static final class Unpacker4 + implements IntBitUnpacker + { + private static void unpack8(int[] output, int outputOffset, byte[] input, int inputOffset) + { + byte v0 = input[inputOffset]; + byte v1 = input[inputOffset + 1]; + byte v2 = input[inputOffset + 2]; + byte v3 = input[inputOffset + 3]; + + output[outputOffset] = v0 & 0b1111; + output[outputOffset + 1] = (v0 >>> 4) & 0b1111; + output[outputOffset + 2] = v1 & 0b1111; + output[outputOffset + 3] = (v1 >>> 4) & 0b1111; + output[outputOffset + 4] = v2 & 0b1111; + output[outputOffset + 5] = (v2 >>> 4) & 0b1111; + output[outputOffset + 6] 
= v3 & 0b1111; + output[outputOffset + 7] = (v3 >>> 4) & 0b1111; + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + byte[] inputArr = input.getByteArray(); + int inputOffset = input.getByteArrayOffset(); + int inputBytesRead = 0; + while (length >= 8) { + unpack8(output, outputOffset, inputArr, inputOffset + inputBytesRead); + outputOffset += 8; + length -= 8; + inputBytesRead += 4; + } + input.skip(inputBytesRead); + } + } + + private static final class Unpacker5 + implements IntBitUnpacker + { + private static void unpack64(int[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + output[outputOffset] = (int) (v0 & 0b11111L); + output[outputOffset + 1] = (int) ((v0 >>> 5) & 0b11111L); + output[outputOffset + 2] = (int) ((v0 >>> 10) & 0b11111L); + output[outputOffset + 3] = (int) ((v0 >>> 15) & 0b11111L); + output[outputOffset + 4] = (int) ((v0 >>> 20) & 0b11111L); + output[outputOffset + 5] = (int) ((v0 >>> 25) & 0b11111L); + output[outputOffset + 6] = (int) ((v0 >>> 30) & 0b11111L); + output[outputOffset + 7] = (int) ((v0 >>> 35) & 0b11111L); + output[outputOffset + 8] = (int) ((v0 >>> 40) & 0b11111L); + output[outputOffset + 9] = (int) ((v0 >>> 45) & 0b11111L); + output[outputOffset + 10] = (int) ((v0 >>> 50) & 0b11111L); + output[outputOffset + 11] = (int) ((v0 >>> 55) & 0b11111L); + output[outputOffset + 12] = (int) (((v0 >>> 60) & 0b1111L) | ((v1 & 0b1L) << 4)); + output[outputOffset + 13] = (int) ((v1 >>> 1) & 0b11111L); + output[outputOffset + 14] = (int) ((v1 >>> 6) & 0b11111L); + output[outputOffset + 15] = (int) ((v1 >>> 11) & 0b11111L); + output[outputOffset + 16] = (int) ((v1 >>> 16) & 0b11111L); + output[outputOffset + 17] = (int) ((v1 >>> 21) & 0b11111L); + output[outputOffset + 18] = (int) ((v1 >>> 26) & 0b11111L); + output[outputOffset + 19] = (int) ((v1 >>> 31) & 0b11111L); + output[outputOffset + 20] = (int) ((v1 >>> 36) & 0b11111L); + output[outputOffset + 21] = (int) ((v1 >>> 41) & 0b11111L); + output[outputOffset + 22] = (int) ((v1 >>> 46) & 0b11111L); + output[outputOffset + 23] = (int) ((v1 >>> 51) & 0b11111L); + output[outputOffset + 24] = (int) ((v1 >>> 56) & 0b11111L); + output[outputOffset + 25] = (int) (((v1 >>> 61) & 0b111L) | ((v2 & 0b11L) << 3)); + output[outputOffset + 26] = (int) ((v2 >>> 2) & 0b11111L); + output[outputOffset + 27] = (int) ((v2 >>> 7) & 0b11111L); + output[outputOffset + 28] = (int) ((v2 >>> 12) & 0b11111L); + output[outputOffset + 29] = (int) ((v2 >>> 17) & 0b11111L); + output[outputOffset + 30] = (int) ((v2 >>> 22) & 0b11111L); + output[outputOffset + 31] = (int) ((v2 >>> 27) & 0b11111L); + output[outputOffset + 32] = (int) ((v2 >>> 32) & 0b11111L); + output[outputOffset + 33] = (int) ((v2 >>> 37) & 0b11111L); + output[outputOffset + 34] = (int) ((v2 >>> 42) & 0b11111L); + output[outputOffset + 35] = (int) ((v2 >>> 47) & 0b11111L); + output[outputOffset + 36] = (int) ((v2 >>> 52) & 0b11111L); + output[outputOffset + 37] = (int) ((v2 >>> 57) & 0b11111L); + output[outputOffset + 38] = (int) (((v2 >>> 62) & 0b11L) | ((v3 & 0b111L) << 2)); + output[outputOffset + 39] = (int) ((v3 >>> 3) & 0b11111L); + output[outputOffset + 40] = (int) ((v3 >>> 8) & 0b11111L); + output[outputOffset + 41] = (int) ((v3 >>> 13) & 0b11111L); + output[outputOffset + 42] = (int) ((v3 >>> 18) & 0b11111L); + output[outputOffset + 43] = (int) 
((v3 >>> 23) & 0b11111L); + output[outputOffset + 44] = (int) ((v3 >>> 28) & 0b11111L); + output[outputOffset + 45] = (int) ((v3 >>> 33) & 0b11111L); + output[outputOffset + 46] = (int) ((v3 >>> 38) & 0b11111L); + output[outputOffset + 47] = (int) ((v3 >>> 43) & 0b11111L); + output[outputOffset + 48] = (int) ((v3 >>> 48) & 0b11111L); + output[outputOffset + 49] = (int) ((v3 >>> 53) & 0b11111L); + output[outputOffset + 50] = (int) ((v3 >>> 58) & 0b11111L); + output[outputOffset + 51] = (int) (((v3 >>> 63) & 0b1L) | ((v4 & 0b1111L) << 1)); + output[outputOffset + 52] = (int) ((v4 >>> 4) & 0b11111L); + output[outputOffset + 53] = (int) ((v4 >>> 9) & 0b11111L); + output[outputOffset + 54] = (int) ((v4 >>> 14) & 0b11111L); + output[outputOffset + 55] = (int) ((v4 >>> 19) & 0b11111L); + output[outputOffset + 56] = (int) ((v4 >>> 24) & 0b11111L); + output[outputOffset + 57] = (int) ((v4 >>> 29) & 0b11111L); + output[outputOffset + 58] = (int) ((v4 >>> 34) & 0b11111L); + output[outputOffset + 59] = (int) ((v4 >>> 39) & 0b11111L); + output[outputOffset + 60] = (int) ((v4 >>> 44) & 0b11111L); + output[outputOffset + 61] = (int) ((v4 >>> 49) & 0b11111L); + output[outputOffset + 62] = (int) ((v4 >>> 54) & 0b11111L); + output[outputOffset + 63] = (int) ((v4 >>> 59) & 0b11111L); + } + + private static void unpack8(int[] output, int outputOffset, SimpleSliceInputStream input) + { + int v0 = input.readInt(); + byte v1 = input.readByte(); + output[outputOffset] = (int) (v0 & 0b11111L); + output[outputOffset + 1] = (int) ((v0 >>> 5) & 0b11111L); + output[outputOffset + 2] = (int) ((v0 >>> 10) & 0b11111L); + output[outputOffset + 3] = (int) ((v0 >>> 15) & 0b11111L); + output[outputOffset + 4] = (int) ((v0 >>> 20) & 0b11111L); + output[outputOffset + 5] = (int) ((v0 >>> 25) & 0b11111L); + output[outputOffset + 6] = (int) (((v0 >>> 30) & 0b11L) | ((v1 & 0b111L) << 2)); + output[outputOffset + 7] = (int) ((v1 >>> 3) & 0b11111L); + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 64) { + unpack64(output, outputOffset, input); + outputOffset += 64; + length -= 64; + } + switch (length) { + case 56: + unpack8(output, outputOffset, input); + outputOffset += 8; + // fall through + case 48: + unpack8(output, outputOffset, input); + outputOffset += 8; + // fall through + case 40: + unpack8(output, outputOffset, input); + outputOffset += 8; + // fall through + case 32: + unpack8(output, outputOffset, input); + outputOffset += 8; + // fall through + case 24: + unpack8(output, outputOffset, input); + outputOffset += 8; + // fall through + case 16: + unpack8(output, outputOffset, input); + outputOffset += 8; + // fall through + case 8: + unpack8(output, outputOffset, input); + } + } + } + + private static final class Unpacker6 + implements IntBitUnpacker + { + private static void unpack32(int[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + output[outputOffset] = (int) (v0 & 0b111111L); + output[outputOffset + 1] = (int) ((v0 >>> 6) & 0b111111L); + output[outputOffset + 2] = (int) ((v0 >>> 12) & 0b111111L); + output[outputOffset + 3] = (int) ((v0 >>> 18) & 0b111111L); + output[outputOffset + 4] = (int) ((v0 >>> 24) & 0b111111L); + output[outputOffset + 5] = (int) ((v0 >>> 30) & 0b111111L); + output[outputOffset + 6] = (int) ((v0 >>> 36) & 0b111111L); + output[outputOffset + 7] = (int) ((v0 >>> 42) & 0b111111L); + output[outputOffset + 
8] = (int) ((v0 >>> 48) & 0b111111L); + output[outputOffset + 9] = (int) ((v0 >>> 54) & 0b111111L); + output[outputOffset + 10] = (int) (((v0 >>> 60) & 0b1111L) | ((v1 & 0b11L) << 4)); + output[outputOffset + 11] = (int) ((v1 >>> 2) & 0b111111L); + output[outputOffset + 12] = (int) ((v1 >>> 8) & 0b111111L); + output[outputOffset + 13] = (int) ((v1 >>> 14) & 0b111111L); + output[outputOffset + 14] = (int) ((v1 >>> 20) & 0b111111L); + output[outputOffset + 15] = (int) ((v1 >>> 26) & 0b111111L); + output[outputOffset + 16] = (int) ((v1 >>> 32) & 0b111111L); + output[outputOffset + 17] = (int) ((v1 >>> 38) & 0b111111L); + output[outputOffset + 18] = (int) ((v1 >>> 44) & 0b111111L); + output[outputOffset + 19] = (int) ((v1 >>> 50) & 0b111111L); + output[outputOffset + 20] = (int) ((v1 >>> 56) & 0b111111L); + output[outputOffset + 21] = (int) (((v1 >>> 62) & 0b11L) | ((v2 & 0b1111L) << 2)); + output[outputOffset + 22] = (int) ((v2 >>> 4) & 0b111111L); + output[outputOffset + 23] = (int) ((v2 >>> 10) & 0b111111L); + output[outputOffset + 24] = (int) ((v2 >>> 16) & 0b111111L); + output[outputOffset + 25] = (int) ((v2 >>> 22) & 0b111111L); + output[outputOffset + 26] = (int) ((v2 >>> 28) & 0b111111L); + output[outputOffset + 27] = (int) ((v2 >>> 34) & 0b111111L); + output[outputOffset + 28] = (int) ((v2 >>> 40) & 0b111111L); + output[outputOffset + 29] = (int) ((v2 >>> 46) & 0b111111L); + output[outputOffset + 30] = (int) ((v2 >>> 52) & 0b111111L); + output[outputOffset + 31] = (int) ((v2 >>> 58) & 0b111111L); + } + + private static void unpack8(int[] output, int outputOffset, SimpleSliceInputStream input) + { + int v0 = input.readInt(); + short v1 = input.readShort(); + output[outputOffset] = (int) (v0 & 0b111111L); + output[outputOffset + 1] = (int) ((v0 >>> 6) & 0b111111L); + output[outputOffset + 2] = (int) ((v0 >>> 12) & 0b111111L); + output[outputOffset + 3] = (int) ((v0 >>> 18) & 0b111111L); + output[outputOffset + 4] = (int) ((v0 >>> 24) & 0b111111L); + output[outputOffset + 5] = (int) (((v0 >>> 30) & 0b11L) | ((v1 & 0b1111L) << 2)); + output[outputOffset + 6] = (int) ((v1 >>> 4) & 0b111111L); + output[outputOffset + 7] = (int) ((v1 >>> 10) & 0b111111L); + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + switch (length) { + case 24: + unpack8(output, outputOffset, input); + outputOffset += 8; + // fall through + case 16: + unpack8(output, outputOffset, input); + outputOffset += 8; + // fall through + case 8: + unpack8(output, outputOffset, input); + } + } + } + + private static final class Unpacker7 + implements IntBitUnpacker + { + private static void unpack64(int[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + output[outputOffset] = (int) (v0 & 0b1111111L); + output[outputOffset + 1] = (int) ((v0 >>> 7) & 0b1111111L); + output[outputOffset + 2] = (int) ((v0 >>> 14) & 0b1111111L); + output[outputOffset + 3] = (int) ((v0 >>> 21) & 0b1111111L); + output[outputOffset + 4] = (int) ((v0 >>> 28) & 0b1111111L); + output[outputOffset + 5] = (int) ((v0 >>> 35) & 0b1111111L); + output[outputOffset + 6] = (int) ((v0 >>> 42) & 0b1111111L); + output[outputOffset + 7] = (int) ((v0 >>> 49) & 0b1111111L); + 
output[outputOffset + 8] = (int) ((v0 >>> 56) & 0b1111111L); + output[outputOffset + 9] = (int) (((v0 >>> 63) & 0b1L) | ((v1 & 0b111111L) << 1)); + output[outputOffset + 10] = (int) ((v1 >>> 6) & 0b1111111L); + output[outputOffset + 11] = (int) ((v1 >>> 13) & 0b1111111L); + output[outputOffset + 12] = (int) ((v1 >>> 20) & 0b1111111L); + output[outputOffset + 13] = (int) ((v1 >>> 27) & 0b1111111L); + output[outputOffset + 14] = (int) ((v1 >>> 34) & 0b1111111L); + output[outputOffset + 15] = (int) ((v1 >>> 41) & 0b1111111L); + output[outputOffset + 16] = (int) ((v1 >>> 48) & 0b1111111L); + output[outputOffset + 17] = (int) ((v1 >>> 55) & 0b1111111L); + output[outputOffset + 18] = (int) (((v1 >>> 62) & 0b11L) | ((v2 & 0b11111L) << 2)); + output[outputOffset + 19] = (int) ((v2 >>> 5) & 0b1111111L); + output[outputOffset + 20] = (int) ((v2 >>> 12) & 0b1111111L); + output[outputOffset + 21] = (int) ((v2 >>> 19) & 0b1111111L); + output[outputOffset + 22] = (int) ((v2 >>> 26) & 0b1111111L); + output[outputOffset + 23] = (int) ((v2 >>> 33) & 0b1111111L); + output[outputOffset + 24] = (int) ((v2 >>> 40) & 0b1111111L); + output[outputOffset + 25] = (int) ((v2 >>> 47) & 0b1111111L); + output[outputOffset + 26] = (int) ((v2 >>> 54) & 0b1111111L); + output[outputOffset + 27] = (int) (((v2 >>> 61) & 0b111L) | ((v3 & 0b1111L) << 3)); + output[outputOffset + 28] = (int) ((v3 >>> 4) & 0b1111111L); + output[outputOffset + 29] = (int) ((v3 >>> 11) & 0b1111111L); + output[outputOffset + 30] = (int) ((v3 >>> 18) & 0b1111111L); + output[outputOffset + 31] = (int) ((v3 >>> 25) & 0b1111111L); + output[outputOffset + 32] = (int) ((v3 >>> 32) & 0b1111111L); + output[outputOffset + 33] = (int) ((v3 >>> 39) & 0b1111111L); + output[outputOffset + 34] = (int) ((v3 >>> 46) & 0b1111111L); + output[outputOffset + 35] = (int) ((v3 >>> 53) & 0b1111111L); + output[outputOffset + 36] = (int) (((v3 >>> 60) & 0b1111L) | ((v4 & 0b111L) << 4)); + output[outputOffset + 37] = (int) ((v4 >>> 3) & 0b1111111L); + output[outputOffset + 38] = (int) ((v4 >>> 10) & 0b1111111L); + output[outputOffset + 39] = (int) ((v4 >>> 17) & 0b1111111L); + output[outputOffset + 40] = (int) ((v4 >>> 24) & 0b1111111L); + output[outputOffset + 41] = (int) ((v4 >>> 31) & 0b1111111L); + output[outputOffset + 42] = (int) ((v4 >>> 38) & 0b1111111L); + output[outputOffset + 43] = (int) ((v4 >>> 45) & 0b1111111L); + output[outputOffset + 44] = (int) ((v4 >>> 52) & 0b1111111L); + output[outputOffset + 45] = (int) (((v4 >>> 59) & 0b11111L) | ((v5 & 0b11L) << 5)); + output[outputOffset + 46] = (int) ((v5 >>> 2) & 0b1111111L); + output[outputOffset + 47] = (int) ((v5 >>> 9) & 0b1111111L); + output[outputOffset + 48] = (int) ((v5 >>> 16) & 0b1111111L); + output[outputOffset + 49] = (int) ((v5 >>> 23) & 0b1111111L); + output[outputOffset + 50] = (int) ((v5 >>> 30) & 0b1111111L); + output[outputOffset + 51] = (int) ((v5 >>> 37) & 0b1111111L); + output[outputOffset + 52] = (int) ((v5 >>> 44) & 0b1111111L); + output[outputOffset + 53] = (int) ((v5 >>> 51) & 0b1111111L); + output[outputOffset + 54] = (int) (((v5 >>> 58) & 0b111111L) | ((v6 & 0b1L) << 6)); + output[outputOffset + 55] = (int) ((v6 >>> 1) & 0b1111111L); + output[outputOffset + 56] = (int) ((v6 >>> 8) & 0b1111111L); + output[outputOffset + 57] = (int) ((v6 >>> 15) & 0b1111111L); + output[outputOffset + 58] = (int) ((v6 >>> 22) & 0b1111111L); + output[outputOffset + 59] = (int) ((v6 >>> 29) & 0b1111111L); + output[outputOffset + 60] = (int) ((v6 >>> 36) & 0b1111111L); + output[outputOffset + 61] = (int) ((v6 
>>> 43) & 0b1111111L); + output[outputOffset + 62] = (int) ((v6 >>> 50) & 0b1111111L); + output[outputOffset + 63] = (int) ((v6 >>> 57) & 0b1111111L); + } + + private static void unpack8(int[] output, int outputOffset, SimpleSliceInputStream input) + { + int v0 = input.readInt(); + short v1 = input.readShort(); + byte v2 = input.readByte(); + output[outputOffset] = (int) (v0 & 0b1111111L); + output[outputOffset + 1] = (int) ((v0 >>> 7) & 0b1111111L); + output[outputOffset + 2] = (int) ((v0 >>> 14) & 0b1111111L); + output[outputOffset + 3] = (int) ((v0 >>> 21) & 0b1111111L); + output[outputOffset + 4] = (int) (((v0 >>> 28) & 0b1111L) | ((v1 & 0b111L) << 4)); + output[outputOffset + 5] = (int) ((v1 >>> 3) & 0b1111111L); + output[outputOffset + 6] = (int) (((v1 >>> 10) & 0b111111L) | ((v2 & 0b1L) << 6)); + output[outputOffset + 7] = (int) ((v2 >>> 1) & 0b1111111L); + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, + int length) + { + while (length >= 64) { + unpack64(output, outputOffset, input); + outputOffset += 64; + length -= 64; + } + switch (length) { + case 56: + unpack8(output, outputOffset, input); + outputOffset += 8; + // fall through + case 48: + unpack8(output, outputOffset, input); + outputOffset += 8; + // fall through + case 40: + unpack8(output, outputOffset, input); + outputOffset += 8; + // fall through + case 32: + unpack8(output, outputOffset, input); + outputOffset += 8; + // fall through + case 24: + unpack8(output, outputOffset, input); + outputOffset += 8; + // fall through + case 16: + unpack8(output, outputOffset, input); + outputOffset += 8; + // fall through + case 8: + unpack8(output, outputOffset, input); + } + } + } + + private static final class Unpacker8 + implements IntBitUnpacker + { + private static void unpack8(int[] output, int outputOffset, byte[] input, int inputOffset) + { + output[outputOffset] = input[inputOffset] & 0b11111111; + output[outputOffset + 1] = input[inputOffset + 1] & 0b11111111; + output[outputOffset + 2] = input[inputOffset + 2] & 0b11111111; + output[outputOffset + 3] = input[inputOffset + 3] & 0b11111111; + output[outputOffset + 4] = input[inputOffset + 4] & 0b11111111; + output[outputOffset + 5] = input[inputOffset + 5] & 0b11111111; + output[outputOffset + 6] = input[inputOffset + 6] & 0b11111111; + output[outputOffset + 7] = input[inputOffset + 7] & 0b11111111; + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + byte[] inputArray = input.getByteArray(); + int inputOffset = input.getByteArrayOffset(); + int inputBytesRead = 0; + while (length >= 8) { + unpack8(output, outputOffset, inputArray, inputOffset + inputBytesRead); + outputOffset += 8; + length -= 8; + inputBytesRead += 8; + } + input.skip(inputBytesRead); + } + } + + private static final class Unpacker9 + implements IntBitUnpacker + { + private static void unpack8(int[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + byte v1 = input.readByte(); + output[outputOffset] = (int) (v0 & 0b111111111L); + output[outputOffset + 1] = (int) ((v0 >>> 9) & 0b111111111L); + output[outputOffset + 2] = (int) ((v0 >>> 18) & 0b111111111L); + output[outputOffset + 3] = (int) ((v0 >>> 27) & 0b111111111L); + output[outputOffset + 4] = (int) ((v0 >>> 36) & 0b111111111L); + output[outputOffset + 5] = (int) ((v0 >>> 45) & 0b111111111L); + output[outputOffset + 6] = (int) ((v0 >>> 54) & 0b111111111L); + output[outputOffset + 7] = (int) 
(((v0 >>> 63) & 0b1L) | ((v1 & 0b11111111L) << 1)); + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 8) { + unpack8(output, outputOffset, input); + outputOffset += 8; + length -= 8; + } + } + } + + private static final class Unpacker10 + implements IntBitUnpacker + { + private static void unpack8(int[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + short v1 = input.readShort(); + output[outputOffset] = (int) (v0 & 0b1111111111L); + output[outputOffset + 1] = (int) ((v0 >>> 10) & 0b1111111111L); + output[outputOffset + 2] = (int) ((v0 >>> 20) & 0b1111111111L); + output[outputOffset + 3] = (int) ((v0 >>> 30) & 0b1111111111L); + output[outputOffset + 4] = (int) ((v0 >>> 40) & 0b1111111111L); + output[outputOffset + 5] = (int) ((v0 >>> 50) & 0b1111111111L); + output[outputOffset + 6] = (int) (((v0 >>> 60) & 0b1111L) | ((v1 & 0b111111L) << 4)); + output[outputOffset + 7] = (int) ((v1 >>> 6) & 0b1111111111L); + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 8) { + unpack8(output, outputOffset, input); + outputOffset += 8; + length -= 8; + } + } + } + + private static final class Unpacker11 + implements IntBitUnpacker + { + private static void unpack8(int[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + short v1 = input.readShort(); + byte v2 = input.readByte(); + output[outputOffset] = (int) (v0 & 0b11111111111L); + output[outputOffset + 1] = (int) ((v0 >>> 11) & 0b11111111111L); + output[outputOffset + 2] = (int) ((v0 >>> 22) & 0b11111111111L); + output[outputOffset + 3] = (int) ((v0 >>> 33) & 0b11111111111L); + output[outputOffset + 4] = (int) ((v0 >>> 44) & 0b11111111111L); + output[outputOffset + 5] = (int) (((v0 >>> 55) & 0b111111111L) | ((v1 & 0b11L) << 9)); + output[outputOffset + 6] = (int) ((v1 >>> 2) & 0b11111111111L); + output[outputOffset + 7] = (int) (((v1 >>> 13) & 0b111L) | ((v2 & 0b11111111L) << 3)); + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 8) { + unpack8(output, outputOffset, input); + outputOffset += 8; + length -= 8; + } + } + } + + private static final class Unpacker12 + implements IntBitUnpacker + { + private static void unpack8(int[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + int v1 = input.readInt(); + output[outputOffset] = (int) (v0 & 0b111111111111L); + output[outputOffset + 1] = (int) ((v0 >>> 12) & 0b111111111111L); + output[outputOffset + 2] = (int) ((v0 >>> 24) & 0b111111111111L); + output[outputOffset + 3] = (int) ((v0 >>> 36) & 0b111111111111L); + output[outputOffset + 4] = (int) ((v0 >>> 48) & 0b111111111111L); + output[outputOffset + 5] = (int) (((v0 >>> 60) & 0b1111L) | ((v1 & 0b11111111L) << 4)); + output[outputOffset + 6] = (int) ((v1 >>> 8) & 0b111111111111L); + output[outputOffset + 7] = (int) ((v1 >>> 20) & 0b111111111111L); + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 8) { + unpack8(output, outputOffset, input); + outputOffset += 8; + length -= 8; + } + } + } + + private static final class Unpacker13 + implements IntBitUnpacker + { + private static void unpack8(int[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + int v1 = 
input.readInt(); + byte v2 = input.readByte(); + output[outputOffset] = (int) (v0 & 0b1111111111111L); + output[outputOffset + 1] = (int) ((v0 >>> 13) & 0b1111111111111L); + output[outputOffset + 2] = (int) ((v0 >>> 26) & 0b1111111111111L); + output[outputOffset + 3] = (int) ((v0 >>> 39) & 0b1111111111111L); + output[outputOffset + 4] = (int) (((v0 >>> 52) & 0b111111111111L) | ((v1 & 0b1L) << 12)); + output[outputOffset + 5] = (int) ((v1 >>> 1) & 0b1111111111111L); + output[outputOffset + 6] = (int) ((v1 >>> 14) & 0b1111111111111L); + output[outputOffset + 7] = (int) (((v1 >>> 27) & 0b11111L) | ((v2 & 0b11111111L) << 5)); + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 8) { + unpack8(output, outputOffset, input); + outputOffset += 8; + length -= 8; + } + } + } + + private static final class Unpacker14 + implements IntBitUnpacker + { + private static void unpack8(int[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + int v1 = input.readInt(); + short v2 = input.readShort(); + output[outputOffset] = (int) (v0 & 0b11111111111111L); + output[outputOffset + 1] = (int) ((v0 >>> 14) & 0b11111111111111L); + output[outputOffset + 2] = (int) ((v0 >>> 28) & 0b11111111111111L); + output[outputOffset + 3] = (int) ((v0 >>> 42) & 0b11111111111111L); + output[outputOffset + 4] = (int) (((v0 >>> 56) & 0b11111111L) | ((v1 & 0b111111L) << 8)); + output[outputOffset + 5] = (int) ((v1 >>> 6) & 0b11111111111111L); + output[outputOffset + 6] = (int) (((v1 >>> 20) & 0b111111111111L) | ((v2 & 0b11L) << 12)); + output[outputOffset + 7] = (int) ((v2 >>> 2) & 0b11111111111111L); + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 8) { + unpack8(output, outputOffset, input); + outputOffset += 8; + length -= 8; + } + } + } + + private static final class Unpacker15 + implements IntBitUnpacker + { + private static void unpack8(int[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + int v1 = input.readInt(); + short v2 = input.readShort(); + byte v3 = input.readByte(); + output[outputOffset] = (int) (v0 & 0b111111111111111L); + output[outputOffset + 1] = (int) ((v0 >>> 15) & 0b111111111111111L); + output[outputOffset + 2] = (int) ((v0 >>> 30) & 0b111111111111111L); + output[outputOffset + 3] = (int) ((v0 >>> 45) & 0b111111111111111L); + output[outputOffset + 4] = (int) (((v0 >>> 60) & 0b1111L) | ((v1 & 0b11111111111L) << 4)); + output[outputOffset + 5] = (int) ((v1 >>> 11) & 0b111111111111111L); + output[outputOffset + 6] = (int) (((v1 >>> 26) & 0b111111L) | ((v2 & 0b111111111L) << 6)); + output[outputOffset + 7] = (int) (((v2 >>> 9) & 0b1111111L) | ((v3 & 0b11111111L) << 7)); + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 8) { + unpack8(output, outputOffset, input); + outputOffset += 8; + length -= 8; + } + } + } + + private static final class Unpacker16 + implements IntBitUnpacker + { + private static void unpack8(int[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + output[outputOffset] = (int) (v0 & 0b1111111111111111L); + output[outputOffset + 1] = (int) ((v0 >>> 16) & 0b1111111111111111L); + output[outputOffset + 2] = (int) ((v0 >>> 32) & 0b1111111111111111L); + output[outputOffset + 3] = (int) ((v0 >>> 
48) & 0b1111111111111111L); + output[outputOffset + 4] = (int) (v1 & 0b1111111111111111L); + output[outputOffset + 5] = (int) ((v1 >>> 16) & 0b1111111111111111L); + output[outputOffset + 6] = (int) ((v1 >>> 32) & 0b1111111111111111L); + output[outputOffset + 7] = (int) ((v1 >>> 48) & 0b1111111111111111L); + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 8) { + unpack8(output, outputOffset, input); + outputOffset += 8; + length -= 8; + } + } + } + + private static final class Unpacker17 + implements IntBitUnpacker + { + private static void unpack8(int[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + byte v2 = input.readByte(); + output[outputOffset] = (int) (v0 & 0b11111111111111111L); + output[outputOffset + 1] = (int) ((v0 >>> 17) & 0b11111111111111111L); + output[outputOffset + 2] = (int) ((v0 >>> 34) & 0b11111111111111111L); + output[outputOffset + 3] = (int) (((v0 >>> 51) & 0b1111111111111L) | ((v1 & 0b1111L) << 13)); + output[outputOffset + 4] = (int) ((v1 >>> 4) & 0b11111111111111111L); + output[outputOffset + 5] = (int) ((v1 >>> 21) & 0b11111111111111111L); + output[outputOffset + 6] = (int) ((v1 >>> 38) & 0b11111111111111111L); + output[outputOffset + 7] = (int) (((v1 >>> 55) & 0b111111111L) | ((v2 & 0b11111111L) << 9)); + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 8) { + unpack8(output, outputOffset, input); + outputOffset += 8; + length -= 8; + } + } + } + + private static final class Unpacker18 + implements IntBitUnpacker + { + private static void unpack8(int[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + short v2 = input.readShort(); + output[outputOffset] = (int) (v0 & 0b111111111111111111L); + output[outputOffset + 1] = (int) ((v0 >>> 18) & 0b111111111111111111L); + output[outputOffset + 2] = (int) ((v0 >>> 36) & 0b111111111111111111L); + output[outputOffset + 3] = (int) (((v0 >>> 54) & 0b1111111111L) | ((v1 & 0b11111111L) << 10)); + output[outputOffset + 4] = (int) ((v1 >>> 8) & 0b111111111111111111L); + output[outputOffset + 5] = (int) ((v1 >>> 26) & 0b111111111111111111L); + output[outputOffset + 6] = (int) ((v1 >>> 44) & 0b111111111111111111L); + output[outputOffset + 7] = (int) (((v1 >>> 62) & 0b11L) | ((v2 & 0b1111111111111111L) << 2)); + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 8) { + unpack8(output, outputOffset, input); + outputOffset += 8; + length -= 8; + } + } + } + + private static final class Unpacker19 + implements IntBitUnpacker + { + private static void unpack8(int[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + short v2 = input.readShort(); + byte v3 = input.readByte(); + output[outputOffset] = (int) (v0 & 0b1111111111111111111L); + output[outputOffset + 1] = (int) ((v0 >>> 19) & 0b1111111111111111111L); + output[outputOffset + 2] = (int) ((v0 >>> 38) & 0b1111111111111111111L); + output[outputOffset + 3] = (int) (((v0 >>> 57) & 0b1111111L) | ((v1 & 0b111111111111L) << 7)); + output[outputOffset + 4] = (int) ((v1 >>> 12) & 0b1111111111111111111L); + output[outputOffset + 5] = (int) ((v1 >>> 31) & 0b1111111111111111111L); + output[outputOffset + 6] = (int) (((v1 >>> 50) 
& 0b11111111111111L) | ((v2 & 0b11111L) << 14)); + output[outputOffset + 7] = (int) (((v2 >>> 5) & 0b11111111111L) | ((v3 & 0b11111111L) << 11)); + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 8) { + unpack8(output, outputOffset, input); + outputOffset += 8; + length -= 8; + } + } + } + + private static final class Unpacker20 + implements IntBitUnpacker + { + private static void unpack8(int[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + int v2 = input.readInt(); + output[outputOffset] = (int) (v0 & 0b11111111111111111111L); + output[outputOffset + 1] = (int) ((v0 >>> 20) & 0b11111111111111111111L); + output[outputOffset + 2] = (int) ((v0 >>> 40) & 0b11111111111111111111L); + output[outputOffset + 3] = (int) (((v0 >>> 60) & 0b1111L) | ((v1 & 0b1111111111111111L) << 4)); + output[outputOffset + 4] = (int) ((v1 >>> 16) & 0b11111111111111111111L); + output[outputOffset + 5] = (int) ((v1 >>> 36) & 0b11111111111111111111L); + output[outputOffset + 6] = (int) (((v1 >>> 56) & 0b11111111L) | ((v2 & 0b111111111111L) << 8)); + output[outputOffset + 7] = (int) ((v2 >>> 12) & 0b11111111111111111111L); + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 8) { + unpack8(output, outputOffset, input); + outputOffset += 8; + length -= 8; + } + } + } + + private static final class Unpacker21 + implements IntBitUnpacker + { + private static void unpack8(int[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + int v2 = input.readInt(); + byte v3 = input.readByte(); + output[outputOffset] = (int) (v0 & 0b111111111111111111111L); + output[outputOffset + 1] = (int) ((v0 >>> 21) & 0b111111111111111111111L); + output[outputOffset + 2] = (int) ((v0 >>> 42) & 0b111111111111111111111L); + output[outputOffset + 3] = (int) (((v0 >>> 63) & 0b1L) | ((v1 & 0b11111111111111111111L) << 1)); + output[outputOffset + 4] = (int) ((v1 >>> 20) & 0b111111111111111111111L); + output[outputOffset + 5] = (int) ((v1 >>> 41) & 0b111111111111111111111L); + output[outputOffset + 6] = (int) (((v1 >>> 62) & 0b11L) | ((v2 & 0b1111111111111111111L) << 2)); + output[outputOffset + 7] = (int) (((v2 >>> 19) & 0b1111111111111L) | ((v3 & 0b11111111L) << 13)); + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 8) { + unpack8(output, outputOffset, input); + outputOffset += 8; + length -= 8; + } + } + } + + private static final class Unpacker22 + implements IntBitUnpacker + { + private static void unpack8(int[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + int v2 = input.readInt(); + short v3 = input.readShort(); + output[outputOffset] = (int) (v0 & 0b1111111111111111111111L); + output[outputOffset + 1] = (int) ((v0 >>> 22) & 0b1111111111111111111111L); + output[outputOffset + 2] = (int) (((v0 >>> 44) & 0b11111111111111111111L) | ((v1 & 0b11L) << 20)); + output[outputOffset + 3] = (int) ((v1 >>> 2) & 0b1111111111111111111111L); + output[outputOffset + 4] = (int) ((v1 >>> 24) & 0b1111111111111111111111L); + output[outputOffset + 5] = (int) (((v1 >>> 46) & 0b111111111111111111L) | ((v2 & 0b1111L) << 18)); + output[outputOffset + 6] = (int) ((v2 >>> 4) & 0b1111111111111111111111L); 
+ output[outputOffset + 7] = (int) (((v2 >>> 26) & 0b111111L) | ((v3 & 0b1111111111111111L) << 6)); + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 8) { + unpack8(output, outputOffset, input); + outputOffset += 8; + length -= 8; + } + } + } + + private static final class Unpacker23 + implements IntBitUnpacker + { + private static void unpack8(int[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + int v2 = input.readInt(); + short v3 = input.readShort(); + byte v4 = input.readByte(); + output[outputOffset] = (int) (v0 & 0b11111111111111111111111L); + output[outputOffset + 1] = (int) ((v0 >>> 23) & 0b11111111111111111111111L); + output[outputOffset + 2] = (int) (((v0 >>> 46) & 0b111111111111111111L) | ((v1 & 0b11111L) << 18)); + output[outputOffset + 3] = (int) ((v1 >>> 5) & 0b11111111111111111111111L); + output[outputOffset + 4] = (int) ((v1 >>> 28) & 0b11111111111111111111111L); + output[outputOffset + 5] = (int) (((v1 >>> 51) & 0b1111111111111L) | ((v2 & 0b1111111111L) << 13)); + output[outputOffset + 6] = (int) (((v2 >>> 10) & 0b1111111111111111111111L) | ((v3 & 0b1L) << 22)); + output[outputOffset + 7] = (int) (((v3 >>> 1) & 0b111111111111111L) | ((v4 & 0b11111111L) << 15)); + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 8) { + unpack8(output, outputOffset, input); + outputOffset += 8; + length -= 8; + } + } + } + + private static final class Unpacker24 + implements IntBitUnpacker + { + private static void unpack8(int[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + output[outputOffset] = (int) (v0 & 0b111111111111111111111111L); + output[outputOffset + 1] = (int) ((v0 >>> 24) & 0b111111111111111111111111L); + output[outputOffset + 2] = (int) (((v0 >>> 48) & 0b1111111111111111L) | ((v1 & 0b11111111L) << 16)); + output[outputOffset + 3] = (int) ((v1 >>> 8) & 0b111111111111111111111111L); + output[outputOffset + 4] = (int) ((v1 >>> 32) & 0b111111111111111111111111L); + output[outputOffset + 5] = (int) (((v1 >>> 56) & 0b11111111L) | ((v2 & 0b1111111111111111L) << 8)); + output[outputOffset + 6] = (int) ((v2 >>> 16) & 0b111111111111111111111111L); + output[outputOffset + 7] = (int) ((v2 >>> 40) & 0b111111111111111111111111L); + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 8) { + unpack8(output, outputOffset, input); + outputOffset += 8; + length -= 8; + } + } + } + + private static final class Unpacker25 + implements IntBitUnpacker + { + private static void unpack8(int[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + byte v3 = input.readByte(); + output[outputOffset] = (int) (v0 & 0b1111111111111111111111111L); + output[outputOffset + 1] = (int) ((v0 >>> 25) & 0b1111111111111111111111111L); + output[outputOffset + 2] = (int) (((v0 >>> 50) & 0b11111111111111L) | ((v1 & 0b11111111111L) << 14)); + output[outputOffset + 3] = (int) ((v1 >>> 11) & 0b1111111111111111111111111L); + output[outputOffset + 4] = (int) ((v1 >>> 36) & 0b1111111111111111111111111L); + output[outputOffset + 5] = (int) (((v1 >>> 61) & 0b111L) | ((v2 & 0b1111111111111111111111L) << 3)); 
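// Note (editorial illustration, not part of the patch): each unpack8 call reads exactly
// bitWidth bytes, since 8 values * bitWidth bits = bitWidth bytes. This 25-bit unpacker
// therefore reads 3 longs + 1 byte = 25 bytes per call, and the 22-bit unpacker above reads
// long + long + int + short = 22 bytes. Any tail shorter than 8 values is handled by the
// caller, which is why the IntBitUnpacker interface requires length to be a multiple of 8.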
+ output[outputOffset + 6] = (int) ((v2 >>> 22) & 0b1111111111111111111111111L); + output[outputOffset + 7] = (int) (((v2 >>> 47) & 0b11111111111111111L) | ((v3 & 0b11111111L) << 17)); + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 8) { + unpack8(output, outputOffset, input); + outputOffset += 8; + length -= 8; + } + } + } + + private static final class Unpacker26 + implements IntBitUnpacker + { + private static void unpack8(int[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + short v3 = input.readShort(); + output[outputOffset] = (int) (v0 & 0b11111111111111111111111111L); + output[outputOffset + 1] = (int) ((v0 >>> 26) & 0b11111111111111111111111111L); + output[outputOffset + 2] = (int) (((v0 >>> 52) & 0b111111111111L) | ((v1 & 0b11111111111111L) << 12)); + output[outputOffset + 3] = (int) ((v1 >>> 14) & 0b11111111111111111111111111L); + output[outputOffset + 4] = (int) (((v1 >>> 40) & 0b111111111111111111111111L) | ((v2 & 0b11L) << 24)); + output[outputOffset + 5] = (int) ((v2 >>> 2) & 0b11111111111111111111111111L); + output[outputOffset + 6] = (int) ((v2 >>> 28) & 0b11111111111111111111111111L); + output[outputOffset + 7] = (int) (((v2 >>> 54) & 0b1111111111L) | ((v3 & 0b1111111111111111L) << 10)); + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 8) { + unpack8(output, outputOffset, input); + outputOffset += 8; + length -= 8; + } + } + } + + private static final class Unpacker27 + implements IntBitUnpacker + { + private static void unpack8(int[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + short v3 = input.readShort(); + byte v4 = input.readByte(); + output[outputOffset] = (int) (v0 & 0b111111111111111111111111111L); + output[outputOffset + 1] = (int) ((v0 >>> 27) & 0b111111111111111111111111111L); + output[outputOffset + 2] = (int) (((v0 >>> 54) & 0b1111111111L) | ((v1 & 0b11111111111111111L) << 10)); + output[outputOffset + 3] = (int) ((v1 >>> 17) & 0b111111111111111111111111111L); + output[outputOffset + 4] = (int) (((v1 >>> 44) & 0b11111111111111111111L) | ((v2 & 0b1111111L) << 20)); + output[outputOffset + 5] = (int) ((v2 >>> 7) & 0b111111111111111111111111111L); + output[outputOffset + 6] = (int) ((v2 >>> 34) & 0b111111111111111111111111111L); + output[outputOffset + 7] = (int) (((v2 >>> 61) & 0b111L) | ((v3 & 0b1111111111111111L) << 3) | ((v4 & 0b11111111L) << 19)); + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 8) { + unpack8(output, outputOffset, input); + outputOffset += 8; + length -= 8; + } + } + } + + private static final class Unpacker28 + implements IntBitUnpacker + { + private static void unpack8(int[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + int v3 = input.readInt(); + output[outputOffset] = (int) (v0 & 0b1111111111111111111111111111L); + output[outputOffset + 1] = (int) ((v0 >>> 28) & 0b1111111111111111111111111111L); + output[outputOffset + 2] = (int) (((v0 >>> 56) & 0b11111111L) | ((v1 & 0b11111111111111111111L) << 8)); + output[outputOffset + 3] = (int) ((v1 >>> 20) & 
0b1111111111111111111111111111L); + output[outputOffset + 4] = (int) (((v1 >>> 48) & 0b1111111111111111L) | ((v2 & 0b111111111111L) << 16)); + output[outputOffset + 5] = (int) ((v2 >>> 12) & 0b1111111111111111111111111111L); + output[outputOffset + 6] = (int) (((v2 >>> 40) & 0b111111111111111111111111L) | ((v3 & 0b1111L) << 24)); + output[outputOffset + 7] = (int) ((v3 >>> 4) & 0b1111111111111111111111111111L); + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 8) { + unpack8(output, outputOffset, input); + outputOffset += 8; + length -= 8; + } + } + } + + private static final class Unpacker29 + implements IntBitUnpacker + { + private static void unpack8(int[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + int v3 = input.readInt(); + byte v4 = input.readByte(); + output[outputOffset] = (int) (v0 & 0b11111111111111111111111111111L); + output[outputOffset + 1] = (int) ((v0 >>> 29) & 0b11111111111111111111111111111L); + output[outputOffset + 2] = (int) (((v0 >>> 58) & 0b111111L) | ((v1 & 0b11111111111111111111111L) << 6)); + output[outputOffset + 3] = (int) ((v1 >>> 23) & 0b11111111111111111111111111111L); + output[outputOffset + 4] = (int) (((v1 >>> 52) & 0b111111111111L) | ((v2 & 0b11111111111111111L) << 12)); + output[outputOffset + 5] = (int) ((v2 >>> 17) & 0b11111111111111111111111111111L); + output[outputOffset + 6] = (int) (((v2 >>> 46) & 0b111111111111111111L) | ((v3 & 0b11111111111L) << 18)); + output[outputOffset + 7] = (int) (((v3 >>> 11) & 0b111111111111111111111L) | ((v4 & 0b11111111L) << 21)); + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, + int length) + { + while (length >= 8) { + unpack8(output, outputOffset, input); + outputOffset += 8; + length -= 8; + } + } + } + + private static final class Unpacker30 + implements IntBitUnpacker + { + private static void unpack8(int[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + int v3 = input.readInt(); + short v4 = input.readShort(); + output[outputOffset] = (int) (v0 & 0b111111111111111111111111111111L); + output[outputOffset + 1] = (int) ((v0 >>> 30) & 0b111111111111111111111111111111L); + output[outputOffset + 2] = (int) (((v0 >>> 60) & 0b1111L) | ((v1 & 0b11111111111111111111111111L) << 4)); + output[outputOffset + 3] = (int) ((v1 >>> 26) & 0b111111111111111111111111111111L); + output[outputOffset + 4] = (int) (((v1 >>> 56) & 0b11111111L) | ((v2 & 0b1111111111111111111111L) << 8)); + output[outputOffset + 5] = (int) ((v2 >>> 22) & 0b111111111111111111111111111111L); + output[outputOffset + 6] = (int) (((v2 >>> 52) & 0b111111111111L) | ((v3 & 0b111111111111111111L) << 12)); + output[outputOffset + 7] = (int) (((v3 >>> 18) & 0b11111111111111L) | ((v4 & 0b1111111111111111L) << 14)); + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, + int length) + { + while (length >= 8) { + unpack8(output, outputOffset, input); + outputOffset += 8; + length -= 8; + } + } + } + + private static final class Unpacker31 + implements IntBitUnpacker + { + private static void unpack8(int[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + int v3 = input.readInt(); 
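+ // descriptive note: together with the short and byte read next, this unpacker consumes 248 bits (8 x 31) per iteration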
+ short v4 = input.readShort(); + byte v5 = input.readByte(); + output[outputOffset] = (int) (v0 & 0b1111111111111111111111111111111L); + output[outputOffset + 1] = (int) ((v0 >>> 31) & 0b1111111111111111111111111111111L); + output[outputOffset + 2] = (int) (((v0 >>> 62) & 0b11L) | ((v1 & 0b11111111111111111111111111111L) << 2)); + output[outputOffset + 3] = (int) ((v1 >>> 29) & 0b1111111111111111111111111111111L); + output[outputOffset + 4] = (int) (((v1 >>> 60) & 0b1111L) | ((v2 & 0b111111111111111111111111111L) << 4)); + output[outputOffset + 5] = (int) ((v2 >>> 27) & 0b1111111111111111111111111111111L); + output[outputOffset + 6] = (int) (((v2 >>> 58) & 0b111111L) | ((v3 & 0b1111111111111111111111111L) << 6)); + output[outputOffset + 7] = (int) (((v3 >>> 25) & 0b1111111L) | ((v4 & 0b1111111111111111L) << 7) | ((v5 & 0b11111111L) << 23)); + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 8) { + unpack8(output, outputOffset, input); + outputOffset += 8; + length -= 8; + } + } + } + + private static final class Unpacker32 + implements IntBitUnpacker + { + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + input.readInts(output, outputOffset, length); + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/LongBitUnpacker.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/LongBitUnpacker.java new file mode 100644 index 000000000000..3ad03e97d2c7 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/LongBitUnpacker.java @@ -0,0 +1,24 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.reader.decoders; + +import io.trino.parquet.reader.SimpleSliceInputStream; + +public interface LongBitUnpacker +{ + /** + * @param length must be a multiple of 32 + */ + void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length); +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/LongBitUnpackers.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/LongBitUnpackers.java new file mode 100644 index 000000000000..105ef5ba7d1b --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/LongBitUnpackers.java @@ -0,0 +1,4279 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.parquet.reader.decoders; + +import io.trino.parquet.reader.SimpleSliceInputStream; + +import static com.google.common.base.Preconditions.checkArgument; + +public final class LongBitUnpackers +{ + private static final LongBitUnpacker[] UNPACKERS = { + new Unpacker1(), + new Unpacker2(), + new Unpacker3(), + new Unpacker4(), + new Unpacker5(), + new Unpacker6(), + new Unpacker7(), + new Unpacker8(), + new Unpacker9(), + new Unpacker10(), + new Unpacker11(), + new Unpacker12(), + new Unpacker13(), + new Unpacker14(), + new Unpacker15(), + new Unpacker16(), + new Unpacker17(), + new Unpacker18(), + new Unpacker19(), + new Unpacker20(), + new Unpacker21(), + new Unpacker22(), + new Unpacker23(), + new Unpacker24(), + new Unpacker25(), + new Unpacker26(), + new Unpacker27(), + new Unpacker28(), + new Unpacker29(), + new Unpacker30(), + new Unpacker31(), + new Unpacker32(), + new Unpacker33(), + new Unpacker34(), + new Unpacker35(), + new Unpacker36(), + new Unpacker37(), + new Unpacker38(), + new Unpacker39(), + new Unpacker40(), + new Unpacker41(), + new Unpacker42(), + new Unpacker43(), + new Unpacker44(), + new Unpacker45(), + new Unpacker46(), + new Unpacker47(), + new Unpacker48(), + new Unpacker49(), + new Unpacker50(), + new Unpacker51(), + new Unpacker52(), + new Unpacker53(), + new Unpacker54(), + new Unpacker55(), + new Unpacker56(), + new Unpacker57(), + new Unpacker58(), + new Unpacker59(), + new Unpacker60(), + new Unpacker61(), + new Unpacker62(), + new Unpacker63(), + new Unpacker64()}; + + public static LongBitUnpacker getLongBitUnpacker(int bitWidth) + { + checkArgument(bitWidth > 0 && bitWidth <= Long.SIZE, "bitWidth %s should be in the range 1-64", bitWidth); + return UNPACKERS[bitWidth - 1]; + } + + private LongBitUnpackers() {} + + private static final class Unpacker1 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + int v0 = input.readInt(); + output[outputOffset] = v0 & 0b1L; + output[outputOffset + 1] = (v0 >>> 1) & 0b1L; + output[outputOffset + 2] = (v0 >>> 2) & 0b1L; + output[outputOffset + 3] = (v0 >>> 3) & 0b1L; + output[outputOffset + 4] = (v0 >>> 4) & 0b1L; + output[outputOffset + 5] = (v0 >>> 5) & 0b1L; + output[outputOffset + 6] = (v0 >>> 6) & 0b1L; + output[outputOffset + 7] = (v0 >>> 7) & 0b1L; + output[outputOffset + 8] = (v0 >>> 8) & 0b1L; + output[outputOffset + 9] = (v0 >>> 9) & 0b1L; + output[outputOffset + 10] = (v0 >>> 10) & 0b1L; + output[outputOffset + 11] = (v0 >>> 11) & 0b1L; + output[outputOffset + 12] = (v0 >>> 12) & 0b1L; + output[outputOffset + 13] = (v0 >>> 13) & 0b1L; + output[outputOffset + 14] = (v0 >>> 14) & 0b1L; + output[outputOffset + 15] = (v0 >>> 15) & 0b1L; + output[outputOffset + 16] = (v0 >>> 16) & 0b1L; + output[outputOffset + 17] = (v0 >>> 17) & 0b1L; + output[outputOffset + 18] = (v0 >>> 18) & 0b1L; + output[outputOffset + 19] = (v0 >>> 19) & 0b1L; + output[outputOffset + 20] = (v0 >>> 20) & 0b1L; + output[outputOffset + 21] = (v0 >>> 21) & 0b1L; + output[outputOffset + 22] = (v0 >>> 22) & 0b1L; + output[outputOffset + 23] = (v0 >>> 23) & 0b1L; + output[outputOffset + 24] = (v0 >>> 24) & 0b1L; + output[outputOffset + 25] = (v0 >>> 25) & 0b1L; + output[outputOffset + 26] = (v0 >>> 26) & 0b1L; + 
output[outputOffset + 27] = (v0 >>> 27) & 0b1L; + output[outputOffset + 28] = (v0 >>> 28) & 0b1L; + output[outputOffset + 29] = (v0 >>> 29) & 0b1L; + output[outputOffset + 30] = (v0 >>> 30) & 0b1L; + output[outputOffset + 31] = (v0 >>> 31) & 0b1L; + } + } + + private static final class Unpacker2 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + output[outputOffset] = v0 & 0b11L; + output[outputOffset + 1] = (v0 >>> 2) & 0b11L; + output[outputOffset + 2] = (v0 >>> 4) & 0b11L; + output[outputOffset + 3] = (v0 >>> 6) & 0b11L; + output[outputOffset + 4] = (v0 >>> 8) & 0b11L; + output[outputOffset + 5] = (v0 >>> 10) & 0b11L; + output[outputOffset + 6] = (v0 >>> 12) & 0b11L; + output[outputOffset + 7] = (v0 >>> 14) & 0b11L; + output[outputOffset + 8] = (v0 >>> 16) & 0b11L; + output[outputOffset + 9] = (v0 >>> 18) & 0b11L; + output[outputOffset + 10] = (v0 >>> 20) & 0b11L; + output[outputOffset + 11] = (v0 >>> 22) & 0b11L; + output[outputOffset + 12] = (v0 >>> 24) & 0b11L; + output[outputOffset + 13] = (v0 >>> 26) & 0b11L; + output[outputOffset + 14] = (v0 >>> 28) & 0b11L; + output[outputOffset + 15] = (v0 >>> 30) & 0b11L; + output[outputOffset + 16] = (v0 >>> 32) & 0b11L; + output[outputOffset + 17] = (v0 >>> 34) & 0b11L; + output[outputOffset + 18] = (v0 >>> 36) & 0b11L; + output[outputOffset + 19] = (v0 >>> 38) & 0b11L; + output[outputOffset + 20] = (v0 >>> 40) & 0b11L; + output[outputOffset + 21] = (v0 >>> 42) & 0b11L; + output[outputOffset + 22] = (v0 >>> 44) & 0b11L; + output[outputOffset + 23] = (v0 >>> 46) & 0b11L; + output[outputOffset + 24] = (v0 >>> 48) & 0b11L; + output[outputOffset + 25] = (v0 >>> 50) & 0b11L; + output[outputOffset + 26] = (v0 >>> 52) & 0b11L; + output[outputOffset + 27] = (v0 >>> 54) & 0b11L; + output[outputOffset + 28] = (v0 >>> 56) & 0b11L; + output[outputOffset + 29] = (v0 >>> 58) & 0b11L; + output[outputOffset + 30] = (v0 >>> 60) & 0b11L; + output[outputOffset + 31] = (v0 >>> 62) & 0b11L; + } + } + + private static final class Unpacker3 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + int v1 = input.readInt(); + output[outputOffset] = v0 & 0b111L; + output[outputOffset + 1] = (v0 >>> 3) & 0b111L; + output[outputOffset + 2] = (v0 >>> 6) & 0b111L; + output[outputOffset + 3] = (v0 >>> 9) & 0b111L; + output[outputOffset + 4] = (v0 >>> 12) & 0b111L; + output[outputOffset + 5] = (v0 >>> 15) & 0b111L; + output[outputOffset + 6] = (v0 >>> 18) & 0b111L; + output[outputOffset + 7] = (v0 >>> 21) & 0b111L; + output[outputOffset + 8] = (v0 >>> 24) & 0b111L; + output[outputOffset + 9] = (v0 >>> 27) & 0b111L; + output[outputOffset + 10] = (v0 >>> 30) & 0b111L; + output[outputOffset + 11] = (v0 >>> 33) & 0b111L; + output[outputOffset + 12] = (v0 >>> 36) & 0b111L; + output[outputOffset + 13] = (v0 >>> 39) & 0b111L; + output[outputOffset + 14] = (v0 >>> 42) & 0b111L; + output[outputOffset + 15] = (v0 >>> 45) & 
0b111L; + output[outputOffset + 16] = (v0 >>> 48) & 0b111L; + output[outputOffset + 17] = (v0 >>> 51) & 0b111L; + output[outputOffset + 18] = (v0 >>> 54) & 0b111L; + output[outputOffset + 19] = (v0 >>> 57) & 0b111L; + output[outputOffset + 20] = (v0 >>> 60) & 0b111L; + output[outputOffset + 21] = ((v0 >>> 63) & 0b1L) | ((v1 & 0b11L) << 1); + output[outputOffset + 22] = (v1 >>> 2) & 0b111L; + output[outputOffset + 23] = (v1 >>> 5) & 0b111L; + output[outputOffset + 24] = (v1 >>> 8) & 0b111L; + output[outputOffset + 25] = (v1 >>> 11) & 0b111L; + output[outputOffset + 26] = (v1 >>> 14) & 0b111L; + output[outputOffset + 27] = (v1 >>> 17) & 0b111L; + output[outputOffset + 28] = (v1 >>> 20) & 0b111L; + output[outputOffset + 29] = (v1 >>> 23) & 0b111L; + output[outputOffset + 30] = (v1 >>> 26) & 0b111L; + output[outputOffset + 31] = (v1 >>> 29) & 0b111L; + } + } + + private static final class Unpacker4 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + output[outputOffset] = v0 & 0b1111L; + output[outputOffset + 1] = (v0 >>> 4) & 0b1111L; + output[outputOffset + 2] = (v0 >>> 8) & 0b1111L; + output[outputOffset + 3] = (v0 >>> 12) & 0b1111L; + output[outputOffset + 4] = (v0 >>> 16) & 0b1111L; + output[outputOffset + 5] = (v0 >>> 20) & 0b1111L; + output[outputOffset + 6] = (v0 >>> 24) & 0b1111L; + output[outputOffset + 7] = (v0 >>> 28) & 0b1111L; + output[outputOffset + 8] = (v0 >>> 32) & 0b1111L; + output[outputOffset + 9] = (v0 >>> 36) & 0b1111L; + output[outputOffset + 10] = (v0 >>> 40) & 0b1111L; + output[outputOffset + 11] = (v0 >>> 44) & 0b1111L; + output[outputOffset + 12] = (v0 >>> 48) & 0b1111L; + output[outputOffset + 13] = (v0 >>> 52) & 0b1111L; + output[outputOffset + 14] = (v0 >>> 56) & 0b1111L; + output[outputOffset + 15] = (v0 >>> 60) & 0b1111L; + output[outputOffset + 16] = v1 & 0b1111L; + output[outputOffset + 17] = (v1 >>> 4) & 0b1111L; + output[outputOffset + 18] = (v1 >>> 8) & 0b1111L; + output[outputOffset + 19] = (v1 >>> 12) & 0b1111L; + output[outputOffset + 20] = (v1 >>> 16) & 0b1111L; + output[outputOffset + 21] = (v1 >>> 20) & 0b1111L; + output[outputOffset + 22] = (v1 >>> 24) & 0b1111L; + output[outputOffset + 23] = (v1 >>> 28) & 0b1111L; + output[outputOffset + 24] = (v1 >>> 32) & 0b1111L; + output[outputOffset + 25] = (v1 >>> 36) & 0b1111L; + output[outputOffset + 26] = (v1 >>> 40) & 0b1111L; + output[outputOffset + 27] = (v1 >>> 44) & 0b1111L; + output[outputOffset + 28] = (v1 >>> 48) & 0b1111L; + output[outputOffset + 29] = (v1 >>> 52) & 0b1111L; + output[outputOffset + 30] = (v1 >>> 56) & 0b1111L; + output[outputOffset + 31] = (v1 >>> 60) & 0b1111L; + } + } + + private static final class Unpacker5 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + int v2 = input.readInt(); + output[outputOffset] = v0 & 0b11111L; + output[outputOffset + 1] = (v0 
>>> 5) & 0b11111L; + output[outputOffset + 2] = (v0 >>> 10) & 0b11111L; + output[outputOffset + 3] = (v0 >>> 15) & 0b11111L; + output[outputOffset + 4] = (v0 >>> 20) & 0b11111L; + output[outputOffset + 5] = (v0 >>> 25) & 0b11111L; + output[outputOffset + 6] = (v0 >>> 30) & 0b11111L; + output[outputOffset + 7] = (v0 >>> 35) & 0b11111L; + output[outputOffset + 8] = (v0 >>> 40) & 0b11111L; + output[outputOffset + 9] = (v0 >>> 45) & 0b11111L; + output[outputOffset + 10] = (v0 >>> 50) & 0b11111L; + output[outputOffset + 11] = (v0 >>> 55) & 0b11111L; + output[outputOffset + 12] = ((v0 >>> 60) & 0b1111L) | ((v1 & 0b1L) << 4); + output[outputOffset + 13] = (v1 >>> 1) & 0b11111L; + output[outputOffset + 14] = (v1 >>> 6) & 0b11111L; + output[outputOffset + 15] = (v1 >>> 11) & 0b11111L; + output[outputOffset + 16] = (v1 >>> 16) & 0b11111L; + output[outputOffset + 17] = (v1 >>> 21) & 0b11111L; + output[outputOffset + 18] = (v1 >>> 26) & 0b11111L; + output[outputOffset + 19] = (v1 >>> 31) & 0b11111L; + output[outputOffset + 20] = (v1 >>> 36) & 0b11111L; + output[outputOffset + 21] = (v1 >>> 41) & 0b11111L; + output[outputOffset + 22] = (v1 >>> 46) & 0b11111L; + output[outputOffset + 23] = (v1 >>> 51) & 0b11111L; + output[outputOffset + 24] = (v1 >>> 56) & 0b11111L; + output[outputOffset + 25] = ((v1 >>> 61) & 0b111L) | ((v2 & 0b11L) << 3); + output[outputOffset + 26] = (v2 >>> 2) & 0b11111L; + output[outputOffset + 27] = (v2 >>> 7) & 0b11111L; + output[outputOffset + 28] = (v2 >>> 12) & 0b11111L; + output[outputOffset + 29] = (v2 >>> 17) & 0b11111L; + output[outputOffset + 30] = (v2 >>> 22) & 0b11111L; + output[outputOffset + 31] = (v2 >>> 27) & 0b11111L; + } + } + + private static final class Unpacker6 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + output[outputOffset] = v0 & 0b111111L; + output[outputOffset + 1] = (v0 >>> 6) & 0b111111L; + output[outputOffset + 2] = (v0 >>> 12) & 0b111111L; + output[outputOffset + 3] = (v0 >>> 18) & 0b111111L; + output[outputOffset + 4] = (v0 >>> 24) & 0b111111L; + output[outputOffset + 5] = (v0 >>> 30) & 0b111111L; + output[outputOffset + 6] = (v0 >>> 36) & 0b111111L; + output[outputOffset + 7] = (v0 >>> 42) & 0b111111L; + output[outputOffset + 8] = (v0 >>> 48) & 0b111111L; + output[outputOffset + 9] = (v0 >>> 54) & 0b111111L; + output[outputOffset + 10] = ((v0 >>> 60) & 0b1111L) | ((v1 & 0b11L) << 4); + output[outputOffset + 11] = (v1 >>> 2) & 0b111111L; + output[outputOffset + 12] = (v1 >>> 8) & 0b111111L; + output[outputOffset + 13] = (v1 >>> 14) & 0b111111L; + output[outputOffset + 14] = (v1 >>> 20) & 0b111111L; + output[outputOffset + 15] = (v1 >>> 26) & 0b111111L; + output[outputOffset + 16] = (v1 >>> 32) & 0b111111L; + output[outputOffset + 17] = (v1 >>> 38) & 0b111111L; + output[outputOffset + 18] = (v1 >>> 44) & 0b111111L; + output[outputOffset + 19] = (v1 >>> 50) & 0b111111L; + output[outputOffset + 20] = (v1 >>> 56) & 0b111111L; + output[outputOffset + 21] = ((v1 >>> 62) & 0b11L) | ((v2 & 0b1111L) << 2); + output[outputOffset + 22] = (v2 >>> 4) & 0b111111L; + output[outputOffset + 23] = (v2 >>> 10) & 0b111111L; + output[outputOffset + 24] = (v2 >>> 16) & 
0b111111L; + output[outputOffset + 25] = (v2 >>> 22) & 0b111111L; + output[outputOffset + 26] = (v2 >>> 28) & 0b111111L; + output[outputOffset + 27] = (v2 >>> 34) & 0b111111L; + output[outputOffset + 28] = (v2 >>> 40) & 0b111111L; + output[outputOffset + 29] = (v2 >>> 46) & 0b111111L; + output[outputOffset + 30] = (v2 >>> 52) & 0b111111L; + output[outputOffset + 31] = (v2 >>> 58) & 0b111111L; + } + } + + private static final class Unpacker7 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + int v3 = input.readInt(); + output[outputOffset] = v0 & 0b1111111L; + output[outputOffset + 1] = (v0 >>> 7) & 0b1111111L; + output[outputOffset + 2] = (v0 >>> 14) & 0b1111111L; + output[outputOffset + 3] = (v0 >>> 21) & 0b1111111L; + output[outputOffset + 4] = (v0 >>> 28) & 0b1111111L; + output[outputOffset + 5] = (v0 >>> 35) & 0b1111111L; + output[outputOffset + 6] = (v0 >>> 42) & 0b1111111L; + output[outputOffset + 7] = (v0 >>> 49) & 0b1111111L; + output[outputOffset + 8] = (v0 >>> 56) & 0b1111111L; + output[outputOffset + 9] = ((v0 >>> 63) & 0b1L) | ((v1 & 0b111111L) << 1); + output[outputOffset + 10] = (v1 >>> 6) & 0b1111111L; + output[outputOffset + 11] = (v1 >>> 13) & 0b1111111L; + output[outputOffset + 12] = (v1 >>> 20) & 0b1111111L; + output[outputOffset + 13] = (v1 >>> 27) & 0b1111111L; + output[outputOffset + 14] = (v1 >>> 34) & 0b1111111L; + output[outputOffset + 15] = (v1 >>> 41) & 0b1111111L; + output[outputOffset + 16] = (v1 >>> 48) & 0b1111111L; + output[outputOffset + 17] = (v1 >>> 55) & 0b1111111L; + output[outputOffset + 18] = ((v1 >>> 62) & 0b11L) | ((v2 & 0b11111L) << 2); + output[outputOffset + 19] = (v2 >>> 5) & 0b1111111L; + output[outputOffset + 20] = (v2 >>> 12) & 0b1111111L; + output[outputOffset + 21] = (v2 >>> 19) & 0b1111111L; + output[outputOffset + 22] = (v2 >>> 26) & 0b1111111L; + output[outputOffset + 23] = (v2 >>> 33) & 0b1111111L; + output[outputOffset + 24] = (v2 >>> 40) & 0b1111111L; + output[outputOffset + 25] = (v2 >>> 47) & 0b1111111L; + output[outputOffset + 26] = (v2 >>> 54) & 0b1111111L; + output[outputOffset + 27] = ((v2 >>> 61) & 0b111L) | ((v3 & 0b1111L) << 3); + output[outputOffset + 28] = (v3 >>> 4) & 0b1111111L; + output[outputOffset + 29] = (v3 >>> 11) & 0b1111111L; + output[outputOffset + 30] = (v3 >>> 18) & 0b1111111L; + output[outputOffset + 31] = (v3 >>> 25) & 0b1111111L; + } + } + + private static final class Unpacker8 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + output[outputOffset] = v0 & 0b11111111L; + output[outputOffset + 1] = (v0 >>> 8) & 0b11111111L; + output[outputOffset + 2] = (v0 >>> 16) & 0b11111111L; + output[outputOffset + 3] = (v0 >>> 24) & 0b11111111L; + output[outputOffset + 4] = (v0 >>> 32) & 0b11111111L; + 
output[outputOffset + 5] = (v0 >>> 40) & 0b11111111L; + output[outputOffset + 6] = (v0 >>> 48) & 0b11111111L; + output[outputOffset + 7] = (v0 >>> 56) & 0b11111111L; + output[outputOffset + 8] = v1 & 0b11111111L; + output[outputOffset + 9] = (v1 >>> 8) & 0b11111111L; + output[outputOffset + 10] = (v1 >>> 16) & 0b11111111L; + output[outputOffset + 11] = (v1 >>> 24) & 0b11111111L; + output[outputOffset + 12] = (v1 >>> 32) & 0b11111111L; + output[outputOffset + 13] = (v1 >>> 40) & 0b11111111L; + output[outputOffset + 14] = (v1 >>> 48) & 0b11111111L; + output[outputOffset + 15] = (v1 >>> 56) & 0b11111111L; + output[outputOffset + 16] = v2 & 0b11111111L; + output[outputOffset + 17] = (v2 >>> 8) & 0b11111111L; + output[outputOffset + 18] = (v2 >>> 16) & 0b11111111L; + output[outputOffset + 19] = (v2 >>> 24) & 0b11111111L; + output[outputOffset + 20] = (v2 >>> 32) & 0b11111111L; + output[outputOffset + 21] = (v2 >>> 40) & 0b11111111L; + output[outputOffset + 22] = (v2 >>> 48) & 0b11111111L; + output[outputOffset + 23] = (v2 >>> 56) & 0b11111111L; + output[outputOffset + 24] = v3 & 0b11111111L; + output[outputOffset + 25] = (v3 >>> 8) & 0b11111111L; + output[outputOffset + 26] = (v3 >>> 16) & 0b11111111L; + output[outputOffset + 27] = (v3 >>> 24) & 0b11111111L; + output[outputOffset + 28] = (v3 >>> 32) & 0b11111111L; + output[outputOffset + 29] = (v3 >>> 40) & 0b11111111L; + output[outputOffset + 30] = (v3 >>> 48) & 0b11111111L; + output[outputOffset + 31] = (v3 >>> 56) & 0b11111111L; + } + } + + private static final class Unpacker9 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + int v4 = input.readInt(); + output[outputOffset] = v0 & 0b111111111L; + output[outputOffset + 1] = (v0 >>> 9) & 0b111111111L; + output[outputOffset + 2] = (v0 >>> 18) & 0b111111111L; + output[outputOffset + 3] = (v0 >>> 27) & 0b111111111L; + output[outputOffset + 4] = (v0 >>> 36) & 0b111111111L; + output[outputOffset + 5] = (v0 >>> 45) & 0b111111111L; + output[outputOffset + 6] = (v0 >>> 54) & 0b111111111L; + output[outputOffset + 7] = ((v0 >>> 63) & 0b1L) | ((v1 & 0b11111111L) << 1); + output[outputOffset + 8] = (v1 >>> 8) & 0b111111111L; + output[outputOffset + 9] = (v1 >>> 17) & 0b111111111L; + output[outputOffset + 10] = (v1 >>> 26) & 0b111111111L; + output[outputOffset + 11] = (v1 >>> 35) & 0b111111111L; + output[outputOffset + 12] = (v1 >>> 44) & 0b111111111L; + output[outputOffset + 13] = (v1 >>> 53) & 0b111111111L; + output[outputOffset + 14] = ((v1 >>> 62) & 0b11L) | ((v2 & 0b1111111L) << 2); + output[outputOffset + 15] = (v2 >>> 7) & 0b111111111L; + output[outputOffset + 16] = (v2 >>> 16) & 0b111111111L; + output[outputOffset + 17] = (v2 >>> 25) & 0b111111111L; + output[outputOffset + 18] = (v2 >>> 34) & 0b111111111L; + output[outputOffset + 19] = (v2 >>> 43) & 0b111111111L; + output[outputOffset + 20] = (v2 >>> 52) & 0b111111111L; + output[outputOffset + 21] = ((v2 >>> 61) & 0b111L) | ((v3 & 0b111111L) << 3); + output[outputOffset + 22] = (v3 >>> 6) & 0b111111111L; + output[outputOffset + 23] = (v3 >>> 15) & 0b111111111L; + output[outputOffset + 24] = (v3 >>> 24) & 0b111111111L; + 
output[outputOffset + 25] = (v3 >>> 33) & 0b111111111L; + output[outputOffset + 26] = (v3 >>> 42) & 0b111111111L; + output[outputOffset + 27] = (v3 >>> 51) & 0b111111111L; + output[outputOffset + 28] = ((v3 >>> 60) & 0b1111L) | ((v4 & 0b11111L) << 4); + output[outputOffset + 29] = (v4 >>> 5) & 0b111111111L; + output[outputOffset + 30] = (v4 >>> 14) & 0b111111111L; + output[outputOffset + 31] = (v4 >>> 23) & 0b111111111L; + } + } + + private static final class Unpacker10 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + output[outputOffset] = v0 & 0b1111111111L; + output[outputOffset + 1] = (v0 >>> 10) & 0b1111111111L; + output[outputOffset + 2] = (v0 >>> 20) & 0b1111111111L; + output[outputOffset + 3] = (v0 >>> 30) & 0b1111111111L; + output[outputOffset + 4] = (v0 >>> 40) & 0b1111111111L; + output[outputOffset + 5] = (v0 >>> 50) & 0b1111111111L; + output[outputOffset + 6] = ((v0 >>> 60) & 0b1111L) | ((v1 & 0b111111L) << 4); + output[outputOffset + 7] = (v1 >>> 6) & 0b1111111111L; + output[outputOffset + 8] = (v1 >>> 16) & 0b1111111111L; + output[outputOffset + 9] = (v1 >>> 26) & 0b1111111111L; + output[outputOffset + 10] = (v1 >>> 36) & 0b1111111111L; + output[outputOffset + 11] = (v1 >>> 46) & 0b1111111111L; + output[outputOffset + 12] = ((v1 >>> 56) & 0b11111111L) | ((v2 & 0b11L) << 8); + output[outputOffset + 13] = (v2 >>> 2) & 0b1111111111L; + output[outputOffset + 14] = (v2 >>> 12) & 0b1111111111L; + output[outputOffset + 15] = (v2 >>> 22) & 0b1111111111L; + output[outputOffset + 16] = (v2 >>> 32) & 0b1111111111L; + output[outputOffset + 17] = (v2 >>> 42) & 0b1111111111L; + output[outputOffset + 18] = (v2 >>> 52) & 0b1111111111L; + output[outputOffset + 19] = ((v2 >>> 62) & 0b11L) | ((v3 & 0b11111111L) << 2); + output[outputOffset + 20] = (v3 >>> 8) & 0b1111111111L; + output[outputOffset + 21] = (v3 >>> 18) & 0b1111111111L; + output[outputOffset + 22] = (v3 >>> 28) & 0b1111111111L; + output[outputOffset + 23] = (v3 >>> 38) & 0b1111111111L; + output[outputOffset + 24] = (v3 >>> 48) & 0b1111111111L; + output[outputOffset + 25] = ((v3 >>> 58) & 0b111111L) | ((v4 & 0b1111L) << 6); + output[outputOffset + 26] = (v4 >>> 4) & 0b1111111111L; + output[outputOffset + 27] = (v4 >>> 14) & 0b1111111111L; + output[outputOffset + 28] = (v4 >>> 24) & 0b1111111111L; + output[outputOffset + 29] = (v4 >>> 34) & 0b1111111111L; + output[outputOffset + 30] = (v4 >>> 44) & 0b1111111111L; + output[outputOffset + 31] = (v4 >>> 54) & 0b1111111111L; + } + } + + private static final class Unpacker11 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + int v5 = input.readInt(); + output[outputOffset] = v0 & 
0b11111111111L; + output[outputOffset + 1] = (v0 >>> 11) & 0b11111111111L; + output[outputOffset + 2] = (v0 >>> 22) & 0b11111111111L; + output[outputOffset + 3] = (v0 >>> 33) & 0b11111111111L; + output[outputOffset + 4] = (v0 >>> 44) & 0b11111111111L; + output[outputOffset + 5] = ((v0 >>> 55) & 0b111111111L) | ((v1 & 0b11L) << 9); + output[outputOffset + 6] = (v1 >>> 2) & 0b11111111111L; + output[outputOffset + 7] = (v1 >>> 13) & 0b11111111111L; + output[outputOffset + 8] = (v1 >>> 24) & 0b11111111111L; + output[outputOffset + 9] = (v1 >>> 35) & 0b11111111111L; + output[outputOffset + 10] = (v1 >>> 46) & 0b11111111111L; + output[outputOffset + 11] = ((v1 >>> 57) & 0b1111111L) | ((v2 & 0b1111L) << 7); + output[outputOffset + 12] = (v2 >>> 4) & 0b11111111111L; + output[outputOffset + 13] = (v2 >>> 15) & 0b11111111111L; + output[outputOffset + 14] = (v2 >>> 26) & 0b11111111111L; + output[outputOffset + 15] = (v2 >>> 37) & 0b11111111111L; + output[outputOffset + 16] = (v2 >>> 48) & 0b11111111111L; + output[outputOffset + 17] = ((v2 >>> 59) & 0b11111L) | ((v3 & 0b111111L) << 5); + output[outputOffset + 18] = (v3 >>> 6) & 0b11111111111L; + output[outputOffset + 19] = (v3 >>> 17) & 0b11111111111L; + output[outputOffset + 20] = (v3 >>> 28) & 0b11111111111L; + output[outputOffset + 21] = (v3 >>> 39) & 0b11111111111L; + output[outputOffset + 22] = (v3 >>> 50) & 0b11111111111L; + output[outputOffset + 23] = ((v3 >>> 61) & 0b111L) | ((v4 & 0b11111111L) << 3); + output[outputOffset + 24] = (v4 >>> 8) & 0b11111111111L; + output[outputOffset + 25] = (v4 >>> 19) & 0b11111111111L; + output[outputOffset + 26] = (v4 >>> 30) & 0b11111111111L; + output[outputOffset + 27] = (v4 >>> 41) & 0b11111111111L; + output[outputOffset + 28] = (v4 >>> 52) & 0b11111111111L; + output[outputOffset + 29] = ((v4 >>> 63) & 0b1L) | ((v5 & 0b1111111111L) << 1); + output[outputOffset + 30] = (v5 >>> 10) & 0b11111111111L; + output[outputOffset + 31] = (v5 >>> 21) & 0b11111111111L; + } + } + + private static final class Unpacker12 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + output[outputOffset] = v0 & 0b111111111111L; + output[outputOffset + 1] = (v0 >>> 12) & 0b111111111111L; + output[outputOffset + 2] = (v0 >>> 24) & 0b111111111111L; + output[outputOffset + 3] = (v0 >>> 36) & 0b111111111111L; + output[outputOffset + 4] = (v0 >>> 48) & 0b111111111111L; + output[outputOffset + 5] = ((v0 >>> 60) & 0b1111L) | ((v1 & 0b11111111L) << 4); + output[outputOffset + 6] = (v1 >>> 8) & 0b111111111111L; + output[outputOffset + 7] = (v1 >>> 20) & 0b111111111111L; + output[outputOffset + 8] = (v1 >>> 32) & 0b111111111111L; + output[outputOffset + 9] = (v1 >>> 44) & 0b111111111111L; + output[outputOffset + 10] = ((v1 >>> 56) & 0b11111111L) | ((v2 & 0b1111L) << 8); + output[outputOffset + 11] = (v2 >>> 4) & 0b111111111111L; + output[outputOffset + 12] = (v2 >>> 16) & 0b111111111111L; + output[outputOffset + 13] = (v2 >>> 28) & 0b111111111111L; + output[outputOffset + 14] = (v2 >>> 40) & 0b111111111111L; + output[outputOffset + 15] = (v2 >>> 52) & 0b111111111111L; 
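+ // descriptive note: values 16-31 mirror the layout of values 0-15 over v3..v5, since sixteen 12-bit values fill exactly three 64-bit words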
+ output[outputOffset + 16] = v3 & 0b111111111111L; + output[outputOffset + 17] = (v3 >>> 12) & 0b111111111111L; + output[outputOffset + 18] = (v3 >>> 24) & 0b111111111111L; + output[outputOffset + 19] = (v3 >>> 36) & 0b111111111111L; + output[outputOffset + 20] = (v3 >>> 48) & 0b111111111111L; + output[outputOffset + 21] = ((v3 >>> 60) & 0b1111L) | ((v4 & 0b11111111L) << 4); + output[outputOffset + 22] = (v4 >>> 8) & 0b111111111111L; + output[outputOffset + 23] = (v4 >>> 20) & 0b111111111111L; + output[outputOffset + 24] = (v4 >>> 32) & 0b111111111111L; + output[outputOffset + 25] = (v4 >>> 44) & 0b111111111111L; + output[outputOffset + 26] = ((v4 >>> 56) & 0b11111111L) | ((v5 & 0b1111L) << 8); + output[outputOffset + 27] = (v5 >>> 4) & 0b111111111111L; + output[outputOffset + 28] = (v5 >>> 16) & 0b111111111111L; + output[outputOffset + 29] = (v5 >>> 28) & 0b111111111111L; + output[outputOffset + 30] = (v5 >>> 40) & 0b111111111111L; + output[outputOffset + 31] = (v5 >>> 52) & 0b111111111111L; + } + } + + private static final class Unpacker13 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + int v6 = input.readInt(); + output[outputOffset] = v0 & 0b1111111111111L; + output[outputOffset + 1] = (v0 >>> 13) & 0b1111111111111L; + output[outputOffset + 2] = (v0 >>> 26) & 0b1111111111111L; + output[outputOffset + 3] = (v0 >>> 39) & 0b1111111111111L; + output[outputOffset + 4] = ((v0 >>> 52) & 0b111111111111L) | ((v1 & 0b1L) << 12); + output[outputOffset + 5] = (v1 >>> 1) & 0b1111111111111L; + output[outputOffset + 6] = (v1 >>> 14) & 0b1111111111111L; + output[outputOffset + 7] = (v1 >>> 27) & 0b1111111111111L; + output[outputOffset + 8] = (v1 >>> 40) & 0b1111111111111L; + output[outputOffset + 9] = ((v1 >>> 53) & 0b11111111111L) | ((v2 & 0b11L) << 11); + output[outputOffset + 10] = (v2 >>> 2) & 0b1111111111111L; + output[outputOffset + 11] = (v2 >>> 15) & 0b1111111111111L; + output[outputOffset + 12] = (v2 >>> 28) & 0b1111111111111L; + output[outputOffset + 13] = (v2 >>> 41) & 0b1111111111111L; + output[outputOffset + 14] = ((v2 >>> 54) & 0b1111111111L) | ((v3 & 0b111L) << 10); + output[outputOffset + 15] = (v3 >>> 3) & 0b1111111111111L; + output[outputOffset + 16] = (v3 >>> 16) & 0b1111111111111L; + output[outputOffset + 17] = (v3 >>> 29) & 0b1111111111111L; + output[outputOffset + 18] = (v3 >>> 42) & 0b1111111111111L; + output[outputOffset + 19] = ((v3 >>> 55) & 0b111111111L) | ((v4 & 0b1111L) << 9); + output[outputOffset + 20] = (v4 >>> 4) & 0b1111111111111L; + output[outputOffset + 21] = (v4 >>> 17) & 0b1111111111111L; + output[outputOffset + 22] = (v4 >>> 30) & 0b1111111111111L; + output[outputOffset + 23] = (v4 >>> 43) & 0b1111111111111L; + output[outputOffset + 24] = ((v4 >>> 56) & 0b11111111L) | ((v5 & 0b11111L) << 8); + output[outputOffset + 25] = (v5 >>> 5) & 0b1111111111111L; + output[outputOffset + 26] = (v5 >>> 18) & 0b1111111111111L; + output[outputOffset + 27] = (v5 >>> 31) & 0b1111111111111L; + output[outputOffset + 28] = (v5 >>> 44) & 0b1111111111111L; + output[outputOffset + 29] = ((v5 >>> 57) & 
0b1111111L) | ((v6 & 0b111111L) << 7); + output[outputOffset + 30] = (v6 >>> 6) & 0b1111111111111L; + output[outputOffset + 31] = (v6 >>> 19) & 0b1111111111111L; + } + } + + private static final class Unpacker14 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + output[outputOffset] = v0 & 0b11111111111111L; + output[outputOffset + 1] = (v0 >>> 14) & 0b11111111111111L; + output[outputOffset + 2] = (v0 >>> 28) & 0b11111111111111L; + output[outputOffset + 3] = (v0 >>> 42) & 0b11111111111111L; + output[outputOffset + 4] = ((v0 >>> 56) & 0b11111111L) | ((v1 & 0b111111L) << 8); + output[outputOffset + 5] = (v1 >>> 6) & 0b11111111111111L; + output[outputOffset + 6] = (v1 >>> 20) & 0b11111111111111L; + output[outputOffset + 7] = (v1 >>> 34) & 0b11111111111111L; + output[outputOffset + 8] = (v1 >>> 48) & 0b11111111111111L; + output[outputOffset + 9] = ((v1 >>> 62) & 0b11L) | ((v2 & 0b111111111111L) << 2); + output[outputOffset + 10] = (v2 >>> 12) & 0b11111111111111L; + output[outputOffset + 11] = (v2 >>> 26) & 0b11111111111111L; + output[outputOffset + 12] = (v2 >>> 40) & 0b11111111111111L; + output[outputOffset + 13] = ((v2 >>> 54) & 0b1111111111L) | ((v3 & 0b1111L) << 10); + output[outputOffset + 14] = (v3 >>> 4) & 0b11111111111111L; + output[outputOffset + 15] = (v3 >>> 18) & 0b11111111111111L; + output[outputOffset + 16] = (v3 >>> 32) & 0b11111111111111L; + output[outputOffset + 17] = (v3 >>> 46) & 0b11111111111111L; + output[outputOffset + 18] = ((v3 >>> 60) & 0b1111L) | ((v4 & 0b1111111111L) << 4); + output[outputOffset + 19] = (v4 >>> 10) & 0b11111111111111L; + output[outputOffset + 20] = (v4 >>> 24) & 0b11111111111111L; + output[outputOffset + 21] = (v4 >>> 38) & 0b11111111111111L; + output[outputOffset + 22] = ((v4 >>> 52) & 0b111111111111L) | ((v5 & 0b11L) << 12); + output[outputOffset + 23] = (v5 >>> 2) & 0b11111111111111L; + output[outputOffset + 24] = (v5 >>> 16) & 0b11111111111111L; + output[outputOffset + 25] = (v5 >>> 30) & 0b11111111111111L; + output[outputOffset + 26] = (v5 >>> 44) & 0b11111111111111L; + output[outputOffset + 27] = ((v5 >>> 58) & 0b111111L) | ((v6 & 0b11111111L) << 6); + output[outputOffset + 28] = (v6 >>> 8) & 0b11111111111111L; + output[outputOffset + 29] = (v6 >>> 22) & 0b11111111111111L; + output[outputOffset + 30] = (v6 >>> 36) & 0b11111111111111L; + output[outputOffset + 31] = (v6 >>> 50) & 0b11111111111111L; + } + } + + private static final class Unpacker15 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + int v7 = input.readInt(); + 
output[outputOffset] = v0 & 0b111111111111111L; + output[outputOffset + 1] = (v0 >>> 15) & 0b111111111111111L; + output[outputOffset + 2] = (v0 >>> 30) & 0b111111111111111L; + output[outputOffset + 3] = (v0 >>> 45) & 0b111111111111111L; + output[outputOffset + 4] = ((v0 >>> 60) & 0b1111L) | ((v1 & 0b11111111111L) << 4); + output[outputOffset + 5] = (v1 >>> 11) & 0b111111111111111L; + output[outputOffset + 6] = (v1 >>> 26) & 0b111111111111111L; + output[outputOffset + 7] = (v1 >>> 41) & 0b111111111111111L; + output[outputOffset + 8] = ((v1 >>> 56) & 0b11111111L) | ((v2 & 0b1111111L) << 8); + output[outputOffset + 9] = (v2 >>> 7) & 0b111111111111111L; + output[outputOffset + 10] = (v2 >>> 22) & 0b111111111111111L; + output[outputOffset + 11] = (v2 >>> 37) & 0b111111111111111L; + output[outputOffset + 12] = ((v2 >>> 52) & 0b111111111111L) | ((v3 & 0b111L) << 12); + output[outputOffset + 13] = (v3 >>> 3) & 0b111111111111111L; + output[outputOffset + 14] = (v3 >>> 18) & 0b111111111111111L; + output[outputOffset + 15] = (v3 >>> 33) & 0b111111111111111L; + output[outputOffset + 16] = (v3 >>> 48) & 0b111111111111111L; + output[outputOffset + 17] = ((v3 >>> 63) & 0b1L) | ((v4 & 0b11111111111111L) << 1); + output[outputOffset + 18] = (v4 >>> 14) & 0b111111111111111L; + output[outputOffset + 19] = (v4 >>> 29) & 0b111111111111111L; + output[outputOffset + 20] = (v4 >>> 44) & 0b111111111111111L; + output[outputOffset + 21] = ((v4 >>> 59) & 0b11111L) | ((v5 & 0b1111111111L) << 5); + output[outputOffset + 22] = (v5 >>> 10) & 0b111111111111111L; + output[outputOffset + 23] = (v5 >>> 25) & 0b111111111111111L; + output[outputOffset + 24] = (v5 >>> 40) & 0b111111111111111L; + output[outputOffset + 25] = ((v5 >>> 55) & 0b111111111L) | ((v6 & 0b111111L) << 9); + output[outputOffset + 26] = (v6 >>> 6) & 0b111111111111111L; + output[outputOffset + 27] = (v6 >>> 21) & 0b111111111111111L; + output[outputOffset + 28] = (v6 >>> 36) & 0b111111111111111L; + output[outputOffset + 29] = ((v6 >>> 51) & 0b1111111111111L) | ((v7 & 0b11L) << 13); + output[outputOffset + 30] = (v7 >>> 2) & 0b111111111111111L; + output[outputOffset + 31] = (v7 >>> 17) & 0b111111111111111L; + } + } + + private static final class Unpacker16 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + output[outputOffset] = v0 & 0b1111111111111111L; + output[outputOffset + 1] = (v0 >>> 16) & 0b1111111111111111L; + output[outputOffset + 2] = (v0 >>> 32) & 0b1111111111111111L; + output[outputOffset + 3] = (v0 >>> 48) & 0b1111111111111111L; + output[outputOffset + 4] = v1 & 0b1111111111111111L; + output[outputOffset + 5] = (v1 >>> 16) & 0b1111111111111111L; + output[outputOffset + 6] = (v1 >>> 32) & 0b1111111111111111L; + output[outputOffset + 7] = (v1 >>> 48) & 0b1111111111111111L; + output[outputOffset + 8] = v2 & 0b1111111111111111L; + output[outputOffset + 9] = (v2 >>> 16) & 0b1111111111111111L; + output[outputOffset + 10] = (v2 >>> 32) & 0b1111111111111111L; + output[outputOffset + 11] = (v2 >>> 48) & 
0b1111111111111111L; + output[outputOffset + 12] = v3 & 0b1111111111111111L; + output[outputOffset + 13] = (v3 >>> 16) & 0b1111111111111111L; + output[outputOffset + 14] = (v3 >>> 32) & 0b1111111111111111L; + output[outputOffset + 15] = (v3 >>> 48) & 0b1111111111111111L; + output[outputOffset + 16] = v4 & 0b1111111111111111L; + output[outputOffset + 17] = (v4 >>> 16) & 0b1111111111111111L; + output[outputOffset + 18] = (v4 >>> 32) & 0b1111111111111111L; + output[outputOffset + 19] = (v4 >>> 48) & 0b1111111111111111L; + output[outputOffset + 20] = v5 & 0b1111111111111111L; + output[outputOffset + 21] = (v5 >>> 16) & 0b1111111111111111L; + output[outputOffset + 22] = (v5 >>> 32) & 0b1111111111111111L; + output[outputOffset + 23] = (v5 >>> 48) & 0b1111111111111111L; + output[outputOffset + 24] = v6 & 0b1111111111111111L; + output[outputOffset + 25] = (v6 >>> 16) & 0b1111111111111111L; + output[outputOffset + 26] = (v6 >>> 32) & 0b1111111111111111L; + output[outputOffset + 27] = (v6 >>> 48) & 0b1111111111111111L; + output[outputOffset + 28] = v7 & 0b1111111111111111L; + output[outputOffset + 29] = (v7 >>> 16) & 0b1111111111111111L; + output[outputOffset + 30] = (v7 >>> 32) & 0b1111111111111111L; + output[outputOffset + 31] = (v7 >>> 48) & 0b1111111111111111L; + } + } + + private static final class Unpacker17 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + int v8 = input.readInt(); + output[outputOffset] = v0 & 0b11111111111111111L; + output[outputOffset + 1] = (v0 >>> 17) & 0b11111111111111111L; + output[outputOffset + 2] = (v0 >>> 34) & 0b11111111111111111L; + output[outputOffset + 3] = ((v0 >>> 51) & 0b1111111111111L) | ((v1 & 0b1111L) << 13); + output[outputOffset + 4] = (v1 >>> 4) & 0b11111111111111111L; + output[outputOffset + 5] = (v1 >>> 21) & 0b11111111111111111L; + output[outputOffset + 6] = (v1 >>> 38) & 0b11111111111111111L; + output[outputOffset + 7] = ((v1 >>> 55) & 0b111111111L) | ((v2 & 0b11111111L) << 9); + output[outputOffset + 8] = (v2 >>> 8) & 0b11111111111111111L; + output[outputOffset + 9] = (v2 >>> 25) & 0b11111111111111111L; + output[outputOffset + 10] = (v2 >>> 42) & 0b11111111111111111L; + output[outputOffset + 11] = ((v2 >>> 59) & 0b11111L) | ((v3 & 0b111111111111L) << 5); + output[outputOffset + 12] = (v3 >>> 12) & 0b11111111111111111L; + output[outputOffset + 13] = (v3 >>> 29) & 0b11111111111111111L; + output[outputOffset + 14] = (v3 >>> 46) & 0b11111111111111111L; + output[outputOffset + 15] = ((v3 >>> 63) & 0b1L) | ((v4 & 0b1111111111111111L) << 1); + output[outputOffset + 16] = (v4 >>> 16) & 0b11111111111111111L; + output[outputOffset + 17] = (v4 >>> 33) & 0b11111111111111111L; + output[outputOffset + 18] = ((v4 >>> 50) & 0b11111111111111L) | ((v5 & 0b111L) << 14); + output[outputOffset + 19] = (v5 >>> 3) & 0b11111111111111111L; + output[outputOffset + 20] = (v5 >>> 20) & 0b11111111111111111L; + output[outputOffset + 21] = (v5 >>> 37) & 0b11111111111111111L; + output[outputOffset + 22] = ((v5 >>> 54) & 0b1111111111L) | 
((v6 & 0b1111111L) << 10); + output[outputOffset + 23] = (v6 >>> 7) & 0b11111111111111111L; + output[outputOffset + 24] = (v6 >>> 24) & 0b11111111111111111L; + output[outputOffset + 25] = (v6 >>> 41) & 0b11111111111111111L; + output[outputOffset + 26] = ((v6 >>> 58) & 0b111111L) | ((v7 & 0b11111111111L) << 6); + output[outputOffset + 27] = (v7 >>> 11) & 0b11111111111111111L; + output[outputOffset + 28] = (v7 >>> 28) & 0b11111111111111111L; + output[outputOffset + 29] = (v7 >>> 45) & 0b11111111111111111L; + output[outputOffset + 30] = ((v7 >>> 62) & 0b11L) | ((v8 & 0b111111111111111L) << 2); + output[outputOffset + 31] = (v8 >>> 15) & 0b11111111111111111L; + } + } + + private static final class Unpacker18 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + long v8 = input.readLong(); + output[outputOffset] = v0 & 0b111111111111111111L; + output[outputOffset + 1] = (v0 >>> 18) & 0b111111111111111111L; + output[outputOffset + 2] = (v0 >>> 36) & 0b111111111111111111L; + output[outputOffset + 3] = ((v0 >>> 54) & 0b1111111111L) | ((v1 & 0b11111111L) << 10); + output[outputOffset + 4] = (v1 >>> 8) & 0b111111111111111111L; + output[outputOffset + 5] = (v1 >>> 26) & 0b111111111111111111L; + output[outputOffset + 6] = (v1 >>> 44) & 0b111111111111111111L; + output[outputOffset + 7] = ((v1 >>> 62) & 0b11L) | ((v2 & 0b1111111111111111L) << 2); + output[outputOffset + 8] = (v2 >>> 16) & 0b111111111111111111L; + output[outputOffset + 9] = (v2 >>> 34) & 0b111111111111111111L; + output[outputOffset + 10] = ((v2 >>> 52) & 0b111111111111L) | ((v3 & 0b111111L) << 12); + output[outputOffset + 11] = (v3 >>> 6) & 0b111111111111111111L; + output[outputOffset + 12] = (v3 >>> 24) & 0b111111111111111111L; + output[outputOffset + 13] = (v3 >>> 42) & 0b111111111111111111L; + output[outputOffset + 14] = ((v3 >>> 60) & 0b1111L) | ((v4 & 0b11111111111111L) << 4); + output[outputOffset + 15] = (v4 >>> 14) & 0b111111111111111111L; + output[outputOffset + 16] = (v4 >>> 32) & 0b111111111111111111L; + output[outputOffset + 17] = ((v4 >>> 50) & 0b11111111111111L) | ((v5 & 0b1111L) << 14); + output[outputOffset + 18] = (v5 >>> 4) & 0b111111111111111111L; + output[outputOffset + 19] = (v5 >>> 22) & 0b111111111111111111L; + output[outputOffset + 20] = (v5 >>> 40) & 0b111111111111111111L; + output[outputOffset + 21] = ((v5 >>> 58) & 0b111111L) | ((v6 & 0b111111111111L) << 6); + output[outputOffset + 22] = (v6 >>> 12) & 0b111111111111111111L; + output[outputOffset + 23] = (v6 >>> 30) & 0b111111111111111111L; + output[outputOffset + 24] = ((v6 >>> 48) & 0b1111111111111111L) | ((v7 & 0b11L) << 16); + output[outputOffset + 25] = (v7 >>> 2) & 0b111111111111111111L; + output[outputOffset + 26] = (v7 >>> 20) & 0b111111111111111111L; + output[outputOffset + 27] = (v7 >>> 38) & 0b111111111111111111L; + output[outputOffset + 28] = ((v7 >>> 56) & 0b11111111L) | ((v8 & 0b1111111111L) << 8); + output[outputOffset + 29] = (v8 >>> 10) & 0b111111111111111111L; + output[outputOffset + 30] = (v8 >>> 28) & 
0b111111111111111111L; + output[outputOffset + 31] = (v8 >>> 46) & 0b111111111111111111L; + } + } + + private static final class Unpacker19 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + long v8 = input.readLong(); + int v9 = input.readInt(); + output[outputOffset] = v0 & 0b1111111111111111111L; + output[outputOffset + 1] = (v0 >>> 19) & 0b1111111111111111111L; + output[outputOffset + 2] = (v0 >>> 38) & 0b1111111111111111111L; + output[outputOffset + 3] = ((v0 >>> 57) & 0b1111111L) | ((v1 & 0b111111111111L) << 7); + output[outputOffset + 4] = (v1 >>> 12) & 0b1111111111111111111L; + output[outputOffset + 5] = (v1 >>> 31) & 0b1111111111111111111L; + output[outputOffset + 6] = ((v1 >>> 50) & 0b11111111111111L) | ((v2 & 0b11111L) << 14); + output[outputOffset + 7] = (v2 >>> 5) & 0b1111111111111111111L; + output[outputOffset + 8] = (v2 >>> 24) & 0b1111111111111111111L; + output[outputOffset + 9] = (v2 >>> 43) & 0b1111111111111111111L; + output[outputOffset + 10] = ((v2 >>> 62) & 0b11L) | ((v3 & 0b11111111111111111L) << 2); + output[outputOffset + 11] = (v3 >>> 17) & 0b1111111111111111111L; + output[outputOffset + 12] = (v3 >>> 36) & 0b1111111111111111111L; + output[outputOffset + 13] = ((v3 >>> 55) & 0b111111111L) | ((v4 & 0b1111111111L) << 9); + output[outputOffset + 14] = (v4 >>> 10) & 0b1111111111111111111L; + output[outputOffset + 15] = (v4 >>> 29) & 0b1111111111111111111L; + output[outputOffset + 16] = ((v4 >>> 48) & 0b1111111111111111L) | ((v5 & 0b111L) << 16); + output[outputOffset + 17] = (v5 >>> 3) & 0b1111111111111111111L; + output[outputOffset + 18] = (v5 >>> 22) & 0b1111111111111111111L; + output[outputOffset + 19] = (v5 >>> 41) & 0b1111111111111111111L; + output[outputOffset + 20] = ((v5 >>> 60) & 0b1111L) | ((v6 & 0b111111111111111L) << 4); + output[outputOffset + 21] = (v6 >>> 15) & 0b1111111111111111111L; + output[outputOffset + 22] = (v6 >>> 34) & 0b1111111111111111111L; + output[outputOffset + 23] = ((v6 >>> 53) & 0b11111111111L) | ((v7 & 0b11111111L) << 11); + output[outputOffset + 24] = (v7 >>> 8) & 0b1111111111111111111L; + output[outputOffset + 25] = (v7 >>> 27) & 0b1111111111111111111L; + output[outputOffset + 26] = ((v7 >>> 46) & 0b111111111111111111L) | ((v8 & 0b1L) << 18); + output[outputOffset + 27] = (v8 >>> 1) & 0b1111111111111111111L; + output[outputOffset + 28] = (v8 >>> 20) & 0b1111111111111111111L; + output[outputOffset + 29] = (v8 >>> 39) & 0b1111111111111111111L; + output[outputOffset + 30] = ((v8 >>> 58) & 0b111111L) | ((v9 & 0b1111111111111L) << 6); + output[outputOffset + 31] = (v9 >>> 13) & 0b1111111111111111111L; + } + } + + private static final class Unpacker20 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + 
long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + long v8 = input.readLong(); + long v9 = input.readLong(); + output[outputOffset] = v0 & 0b11111111111111111111L; + output[outputOffset + 1] = (v0 >>> 20) & 0b11111111111111111111L; + output[outputOffset + 2] = (v0 >>> 40) & 0b11111111111111111111L; + output[outputOffset + 3] = ((v0 >>> 60) & 0b1111L) | ((v1 & 0b1111111111111111L) << 4); + output[outputOffset + 4] = (v1 >>> 16) & 0b11111111111111111111L; + output[outputOffset + 5] = (v1 >>> 36) & 0b11111111111111111111L; + output[outputOffset + 6] = ((v1 >>> 56) & 0b11111111L) | ((v2 & 0b111111111111L) << 8); + output[outputOffset + 7] = (v2 >>> 12) & 0b11111111111111111111L; + output[outputOffset + 8] = (v2 >>> 32) & 0b11111111111111111111L; + output[outputOffset + 9] = ((v2 >>> 52) & 0b111111111111L) | ((v3 & 0b11111111L) << 12); + output[outputOffset + 10] = (v3 >>> 8) & 0b11111111111111111111L; + output[outputOffset + 11] = (v3 >>> 28) & 0b11111111111111111111L; + output[outputOffset + 12] = ((v3 >>> 48) & 0b1111111111111111L) | ((v4 & 0b1111L) << 16); + output[outputOffset + 13] = (v4 >>> 4) & 0b11111111111111111111L; + output[outputOffset + 14] = (v4 >>> 24) & 0b11111111111111111111L; + output[outputOffset + 15] = (v4 >>> 44) & 0b11111111111111111111L; + output[outputOffset + 16] = v5 & 0b11111111111111111111L; + output[outputOffset + 17] = (v5 >>> 20) & 0b11111111111111111111L; + output[outputOffset + 18] = (v5 >>> 40) & 0b11111111111111111111L; + output[outputOffset + 19] = ((v5 >>> 60) & 0b1111L) | ((v6 & 0b1111111111111111L) << 4); + output[outputOffset + 20] = (v6 >>> 16) & 0b11111111111111111111L; + output[outputOffset + 21] = (v6 >>> 36) & 0b11111111111111111111L; + output[outputOffset + 22] = ((v6 >>> 56) & 0b11111111L) | ((v7 & 0b111111111111L) << 8); + output[outputOffset + 23] = (v7 >>> 12) & 0b11111111111111111111L; + output[outputOffset + 24] = (v7 >>> 32) & 0b11111111111111111111L; + output[outputOffset + 25] = ((v7 >>> 52) & 0b111111111111L) | ((v8 & 0b11111111L) << 12); + output[outputOffset + 26] = (v8 >>> 8) & 0b11111111111111111111L; + output[outputOffset + 27] = (v8 >>> 28) & 0b11111111111111111111L; + output[outputOffset + 28] = ((v8 >>> 48) & 0b1111111111111111L) | ((v9 & 0b1111L) << 16); + output[outputOffset + 29] = (v9 >>> 4) & 0b11111111111111111111L; + output[outputOffset + 30] = (v9 >>> 24) & 0b11111111111111111111L; + output[outputOffset + 31] = (v9 >>> 44) & 0b11111111111111111111L; + } + } + + private static final class Unpacker21 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + long v8 = input.readLong(); + long v9 = input.readLong(); + int v10 = input.readInt(); + output[outputOffset] = v0 & 0b111111111111111111111L; + output[outputOffset + 1] = (v0 >>> 21) & 0b111111111111111111111L; + output[outputOffset + 2] = (v0 >>> 42) & 
0b111111111111111111111L; + output[outputOffset + 3] = ((v0 >>> 63) & 0b1L) | ((v1 & 0b11111111111111111111L) << 1); + output[outputOffset + 4] = (v1 >>> 20) & 0b111111111111111111111L; + output[outputOffset + 5] = (v1 >>> 41) & 0b111111111111111111111L; + output[outputOffset + 6] = ((v1 >>> 62) & 0b11L) | ((v2 & 0b1111111111111111111L) << 2); + output[outputOffset + 7] = (v2 >>> 19) & 0b111111111111111111111L; + output[outputOffset + 8] = (v2 >>> 40) & 0b111111111111111111111L; + output[outputOffset + 9] = ((v2 >>> 61) & 0b111L) | ((v3 & 0b111111111111111111L) << 3); + output[outputOffset + 10] = (v3 >>> 18) & 0b111111111111111111111L; + output[outputOffset + 11] = (v3 >>> 39) & 0b111111111111111111111L; + output[outputOffset + 12] = ((v3 >>> 60) & 0b1111L) | ((v4 & 0b11111111111111111L) << 4); + output[outputOffset + 13] = (v4 >>> 17) & 0b111111111111111111111L; + output[outputOffset + 14] = (v4 >>> 38) & 0b111111111111111111111L; + output[outputOffset + 15] = ((v4 >>> 59) & 0b11111L) | ((v5 & 0b1111111111111111L) << 5); + output[outputOffset + 16] = (v5 >>> 16) & 0b111111111111111111111L; + output[outputOffset + 17] = (v5 >>> 37) & 0b111111111111111111111L; + output[outputOffset + 18] = ((v5 >>> 58) & 0b111111L) | ((v6 & 0b111111111111111L) << 6); + output[outputOffset + 19] = (v6 >>> 15) & 0b111111111111111111111L; + output[outputOffset + 20] = (v6 >>> 36) & 0b111111111111111111111L; + output[outputOffset + 21] = ((v6 >>> 57) & 0b1111111L) | ((v7 & 0b11111111111111L) << 7); + output[outputOffset + 22] = (v7 >>> 14) & 0b111111111111111111111L; + output[outputOffset + 23] = (v7 >>> 35) & 0b111111111111111111111L; + output[outputOffset + 24] = ((v7 >>> 56) & 0b11111111L) | ((v8 & 0b1111111111111L) << 8); + output[outputOffset + 25] = (v8 >>> 13) & 0b111111111111111111111L; + output[outputOffset + 26] = (v8 >>> 34) & 0b111111111111111111111L; + output[outputOffset + 27] = ((v8 >>> 55) & 0b111111111L) | ((v9 & 0b111111111111L) << 9); + output[outputOffset + 28] = (v9 >>> 12) & 0b111111111111111111111L; + output[outputOffset + 29] = (v9 >>> 33) & 0b111111111111111111111L; + output[outputOffset + 30] = ((v9 >>> 54) & 0b1111111111L) | ((v10 & 0b11111111111L) << 10); + output[outputOffset + 31] = (v10 >>> 11) & 0b111111111111111111111L; + } + } + + private static final class Unpacker22 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + long v8 = input.readLong(); + long v9 = input.readLong(); + long v10 = input.readLong(); + output[outputOffset] = v0 & 0b1111111111111111111111L; + output[outputOffset + 1] = (v0 >>> 22) & 0b1111111111111111111111L; + output[outputOffset + 2] = ((v0 >>> 44) & 0b11111111111111111111L) | ((v1 & 0b11L) << 20); + output[outputOffset + 3] = (v1 >>> 2) & 0b1111111111111111111111L; + output[outputOffset + 4] = (v1 >>> 24) & 0b1111111111111111111111L; + output[outputOffset + 5] = ((v1 >>> 46) & 0b111111111111111111L) | ((v2 & 0b1111L) << 18); + output[outputOffset + 6] = (v2 >>> 4) & 0b1111111111111111111111L; + output[outputOffset + 
7] = (v2 >>> 26) & 0b1111111111111111111111L; + output[outputOffset + 8] = ((v2 >>> 48) & 0b1111111111111111L) | ((v3 & 0b111111L) << 16); + output[outputOffset + 9] = (v3 >>> 6) & 0b1111111111111111111111L; + output[outputOffset + 10] = (v3 >>> 28) & 0b1111111111111111111111L; + output[outputOffset + 11] = ((v3 >>> 50) & 0b11111111111111L) | ((v4 & 0b11111111L) << 14); + output[outputOffset + 12] = (v4 >>> 8) & 0b1111111111111111111111L; + output[outputOffset + 13] = (v4 >>> 30) & 0b1111111111111111111111L; + output[outputOffset + 14] = ((v4 >>> 52) & 0b111111111111L) | ((v5 & 0b1111111111L) << 12); + output[outputOffset + 15] = (v5 >>> 10) & 0b1111111111111111111111L; + output[outputOffset + 16] = (v5 >>> 32) & 0b1111111111111111111111L; + output[outputOffset + 17] = ((v5 >>> 54) & 0b1111111111L) | ((v6 & 0b111111111111L) << 10); + output[outputOffset + 18] = (v6 >>> 12) & 0b1111111111111111111111L; + output[outputOffset + 19] = (v6 >>> 34) & 0b1111111111111111111111L; + output[outputOffset + 20] = ((v6 >>> 56) & 0b11111111L) | ((v7 & 0b11111111111111L) << 8); + output[outputOffset + 21] = (v7 >>> 14) & 0b1111111111111111111111L; + output[outputOffset + 22] = (v7 >>> 36) & 0b1111111111111111111111L; + output[outputOffset + 23] = ((v7 >>> 58) & 0b111111L) | ((v8 & 0b1111111111111111L) << 6); + output[outputOffset + 24] = (v8 >>> 16) & 0b1111111111111111111111L; + output[outputOffset + 25] = (v8 >>> 38) & 0b1111111111111111111111L; + output[outputOffset + 26] = ((v8 >>> 60) & 0b1111L) | ((v9 & 0b111111111111111111L) << 4); + output[outputOffset + 27] = (v9 >>> 18) & 0b1111111111111111111111L; + output[outputOffset + 28] = (v9 >>> 40) & 0b1111111111111111111111L; + output[outputOffset + 29] = ((v9 >>> 62) & 0b11L) | ((v10 & 0b11111111111111111111L) << 2); + output[outputOffset + 30] = (v10 >>> 20) & 0b1111111111111111111111L; + output[outputOffset + 31] = (v10 >>> 42) & 0b1111111111111111111111L; + } + } + + private static final class Unpacker23 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + long v8 = input.readLong(); + long v9 = input.readLong(); + long v10 = input.readLong(); + int v11 = input.readInt(); + output[outputOffset] = v0 & 0b11111111111111111111111L; + output[outputOffset + 1] = (v0 >>> 23) & 0b11111111111111111111111L; + output[outputOffset + 2] = ((v0 >>> 46) & 0b111111111111111111L) | ((v1 & 0b11111L) << 18); + output[outputOffset + 3] = (v1 >>> 5) & 0b11111111111111111111111L; + output[outputOffset + 4] = (v1 >>> 28) & 0b11111111111111111111111L; + output[outputOffset + 5] = ((v1 >>> 51) & 0b1111111111111L) | ((v2 & 0b1111111111L) << 13); + output[outputOffset + 6] = (v2 >>> 10) & 0b11111111111111111111111L; + output[outputOffset + 7] = (v2 >>> 33) & 0b11111111111111111111111L; + output[outputOffset + 8] = ((v2 >>> 56) & 0b11111111L) | ((v3 & 0b111111111111111L) << 8); + output[outputOffset + 9] = (v3 >>> 15) & 0b11111111111111111111111L; + output[outputOffset + 10] = (v3 >>> 38) & 0b11111111111111111111111L; + output[outputOffset 
+ 11] = ((v3 >>> 61) & 0b111L) | ((v4 & 0b11111111111111111111L) << 3); + output[outputOffset + 12] = (v4 >>> 20) & 0b11111111111111111111111L; + output[outputOffset + 13] = ((v4 >>> 43) & 0b111111111111111111111L) | ((v5 & 0b11L) << 21); + output[outputOffset + 14] = (v5 >>> 2) & 0b11111111111111111111111L; + output[outputOffset + 15] = (v5 >>> 25) & 0b11111111111111111111111L; + output[outputOffset + 16] = ((v5 >>> 48) & 0b1111111111111111L) | ((v6 & 0b1111111L) << 16); + output[outputOffset + 17] = (v6 >>> 7) & 0b11111111111111111111111L; + output[outputOffset + 18] = (v6 >>> 30) & 0b11111111111111111111111L; + output[outputOffset + 19] = ((v6 >>> 53) & 0b11111111111L) | ((v7 & 0b111111111111L) << 11); + output[outputOffset + 20] = (v7 >>> 12) & 0b11111111111111111111111L; + output[outputOffset + 21] = (v7 >>> 35) & 0b11111111111111111111111L; + output[outputOffset + 22] = ((v7 >>> 58) & 0b111111L) | ((v8 & 0b11111111111111111L) << 6); + output[outputOffset + 23] = (v8 >>> 17) & 0b11111111111111111111111L; + output[outputOffset + 24] = (v8 >>> 40) & 0b11111111111111111111111L; + output[outputOffset + 25] = ((v8 >>> 63) & 0b1L) | ((v9 & 0b1111111111111111111111L) << 1); + output[outputOffset + 26] = (v9 >>> 22) & 0b11111111111111111111111L; + output[outputOffset + 27] = ((v9 >>> 45) & 0b1111111111111111111L) | ((v10 & 0b1111L) << 19); + output[outputOffset + 28] = (v10 >>> 4) & 0b11111111111111111111111L; + output[outputOffset + 29] = (v10 >>> 27) & 0b11111111111111111111111L; + output[outputOffset + 30] = ((v10 >>> 50) & 0b11111111111111L) | ((v11 & 0b111111111L) << 14); + output[outputOffset + 31] = (v11 >>> 9) & 0b11111111111111111111111L; + } + } + + private static final class Unpacker24 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + long v8 = input.readLong(); + long v9 = input.readLong(); + long v10 = input.readLong(); + long v11 = input.readLong(); + output[outputOffset] = v0 & 0b111111111111111111111111L; + output[outputOffset + 1] = (v0 >>> 24) & 0b111111111111111111111111L; + output[outputOffset + 2] = ((v0 >>> 48) & 0b1111111111111111L) | ((v1 & 0b11111111L) << 16); + output[outputOffset + 3] = (v1 >>> 8) & 0b111111111111111111111111L; + output[outputOffset + 4] = (v1 >>> 32) & 0b111111111111111111111111L; + output[outputOffset + 5] = ((v1 >>> 56) & 0b11111111L) | ((v2 & 0b1111111111111111L) << 8); + output[outputOffset + 6] = (v2 >>> 16) & 0b111111111111111111111111L; + output[outputOffset + 7] = (v2 >>> 40) & 0b111111111111111111111111L; + output[outputOffset + 8] = v3 & 0b111111111111111111111111L; + output[outputOffset + 9] = (v3 >>> 24) & 0b111111111111111111111111L; + output[outputOffset + 10] = ((v3 >>> 48) & 0b1111111111111111L) | ((v4 & 0b11111111L) << 16); + output[outputOffset + 11] = (v4 >>> 8) & 0b111111111111111111111111L; + output[outputOffset + 12] = (v4 >>> 32) & 0b111111111111111111111111L; + output[outputOffset + 13] = ((v4 >>> 56) & 0b11111111L) | ((v5 & 0b1111111111111111L) << 8); + output[outputOffset + 14] = (v5 
>>> 16) & 0b111111111111111111111111L; + output[outputOffset + 15] = (v5 >>> 40) & 0b111111111111111111111111L; + output[outputOffset + 16] = v6 & 0b111111111111111111111111L; + output[outputOffset + 17] = (v6 >>> 24) & 0b111111111111111111111111L; + output[outputOffset + 18] = ((v6 >>> 48) & 0b1111111111111111L) | ((v7 & 0b11111111L) << 16); + output[outputOffset + 19] = (v7 >>> 8) & 0b111111111111111111111111L; + output[outputOffset + 20] = (v7 >>> 32) & 0b111111111111111111111111L; + output[outputOffset + 21] = ((v7 >>> 56) & 0b11111111L) | ((v8 & 0b1111111111111111L) << 8); + output[outputOffset + 22] = (v8 >>> 16) & 0b111111111111111111111111L; + output[outputOffset + 23] = (v8 >>> 40) & 0b111111111111111111111111L; + output[outputOffset + 24] = v9 & 0b111111111111111111111111L; + output[outputOffset + 25] = (v9 >>> 24) & 0b111111111111111111111111L; + output[outputOffset + 26] = ((v9 >>> 48) & 0b1111111111111111L) | ((v10 & 0b11111111L) << 16); + output[outputOffset + 27] = (v10 >>> 8) & 0b111111111111111111111111L; + output[outputOffset + 28] = (v10 >>> 32) & 0b111111111111111111111111L; + output[outputOffset + 29] = ((v10 >>> 56) & 0b11111111L) | ((v11 & 0b1111111111111111L) << 8); + output[outputOffset + 30] = (v11 >>> 16) & 0b111111111111111111111111L; + output[outputOffset + 31] = (v11 >>> 40) & 0b111111111111111111111111L; + } + } + + private static final class Unpacker25 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + long v8 = input.readLong(); + long v9 = input.readLong(); + long v10 = input.readLong(); + long v11 = input.readLong(); + int v12 = input.readInt(); + output[outputOffset] = v0 & 0b1111111111111111111111111L; + output[outputOffset + 1] = (v0 >>> 25) & 0b1111111111111111111111111L; + output[outputOffset + 2] = ((v0 >>> 50) & 0b11111111111111L) | ((v1 & 0b11111111111L) << 14); + output[outputOffset + 3] = (v1 >>> 11) & 0b1111111111111111111111111L; + output[outputOffset + 4] = (v1 >>> 36) & 0b1111111111111111111111111L; + output[outputOffset + 5] = ((v1 >>> 61) & 0b111L) | ((v2 & 0b1111111111111111111111L) << 3); + output[outputOffset + 6] = (v2 >>> 22) & 0b1111111111111111111111111L; + output[outputOffset + 7] = ((v2 >>> 47) & 0b11111111111111111L) | ((v3 & 0b11111111L) << 17); + output[outputOffset + 8] = (v3 >>> 8) & 0b1111111111111111111111111L; + output[outputOffset + 9] = (v3 >>> 33) & 0b1111111111111111111111111L; + output[outputOffset + 10] = ((v3 >>> 58) & 0b111111L) | ((v4 & 0b1111111111111111111L) << 6); + output[outputOffset + 11] = (v4 >>> 19) & 0b1111111111111111111111111L; + output[outputOffset + 12] = ((v4 >>> 44) & 0b11111111111111111111L) | ((v5 & 0b11111L) << 20); + output[outputOffset + 13] = (v5 >>> 5) & 0b1111111111111111111111111L; + output[outputOffset + 14] = (v5 >>> 30) & 0b1111111111111111111111111L; + output[outputOffset + 15] = ((v5 >>> 55) & 0b111111111L) | ((v6 & 0b1111111111111111L) << 9); + output[outputOffset + 16] = (v6 >>> 16) & 0b1111111111111111111111111L; + output[outputOffset + 17] = 
((v6 >>> 41) & 0b11111111111111111111111L) | ((v7 & 0b11L) << 23); + output[outputOffset + 18] = (v7 >>> 2) & 0b1111111111111111111111111L; + output[outputOffset + 19] = (v7 >>> 27) & 0b1111111111111111111111111L; + output[outputOffset + 20] = ((v7 >>> 52) & 0b111111111111L) | ((v8 & 0b1111111111111L) << 12); + output[outputOffset + 21] = (v8 >>> 13) & 0b1111111111111111111111111L; + output[outputOffset + 22] = (v8 >>> 38) & 0b1111111111111111111111111L; + output[outputOffset + 23] = ((v8 >>> 63) & 0b1L) | ((v9 & 0b111111111111111111111111L) << 1); + output[outputOffset + 24] = (v9 >>> 24) & 0b1111111111111111111111111L; + output[outputOffset + 25] = ((v9 >>> 49) & 0b111111111111111L) | ((v10 & 0b1111111111L) << 15); + output[outputOffset + 26] = (v10 >>> 10) & 0b1111111111111111111111111L; + output[outputOffset + 27] = (v10 >>> 35) & 0b1111111111111111111111111L; + output[outputOffset + 28] = ((v10 >>> 60) & 0b1111L) | ((v11 & 0b111111111111111111111L) << 4); + output[outputOffset + 29] = (v11 >>> 21) & 0b1111111111111111111111111L; + output[outputOffset + 30] = ((v11 >>> 46) & 0b111111111111111111L) | ((v12 & 0b1111111L) << 18); + output[outputOffset + 31] = (v12 >>> 7) & 0b1111111111111111111111111L; + } + } + + private static final class Unpacker26 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + long v8 = input.readLong(); + long v9 = input.readLong(); + long v10 = input.readLong(); + long v11 = input.readLong(); + long v12 = input.readLong(); + output[outputOffset] = v0 & 0b11111111111111111111111111L; + output[outputOffset + 1] = (v0 >>> 26) & 0b11111111111111111111111111L; + output[outputOffset + 2] = ((v0 >>> 52) & 0b111111111111L) | ((v1 & 0b11111111111111L) << 12); + output[outputOffset + 3] = (v1 >>> 14) & 0b11111111111111111111111111L; + output[outputOffset + 4] = ((v1 >>> 40) & 0b111111111111111111111111L) | ((v2 & 0b11L) << 24); + output[outputOffset + 5] = (v2 >>> 2) & 0b11111111111111111111111111L; + output[outputOffset + 6] = (v2 >>> 28) & 0b11111111111111111111111111L; + output[outputOffset + 7] = ((v2 >>> 54) & 0b1111111111L) | ((v3 & 0b1111111111111111L) << 10); + output[outputOffset + 8] = (v3 >>> 16) & 0b11111111111111111111111111L; + output[outputOffset + 9] = ((v3 >>> 42) & 0b1111111111111111111111L) | ((v4 & 0b1111L) << 22); + output[outputOffset + 10] = (v4 >>> 4) & 0b11111111111111111111111111L; + output[outputOffset + 11] = (v4 >>> 30) & 0b11111111111111111111111111L; + output[outputOffset + 12] = ((v4 >>> 56) & 0b11111111L) | ((v5 & 0b111111111111111111L) << 8); + output[outputOffset + 13] = (v5 >>> 18) & 0b11111111111111111111111111L; + output[outputOffset + 14] = ((v5 >>> 44) & 0b11111111111111111111L) | ((v6 & 0b111111L) << 20); + output[outputOffset + 15] = (v6 >>> 6) & 0b11111111111111111111111111L; + output[outputOffset + 16] = (v6 >>> 32) & 0b11111111111111111111111111L; + output[outputOffset + 17] = ((v6 >>> 58) & 0b111111L) | ((v7 & 0b11111111111111111111L) << 6); + output[outputOffset + 18] = (v7 >>> 20) & 
0b11111111111111111111111111L; + output[outputOffset + 19] = ((v7 >>> 46) & 0b111111111111111111L) | ((v8 & 0b11111111L) << 18); + output[outputOffset + 20] = (v8 >>> 8) & 0b11111111111111111111111111L; + output[outputOffset + 21] = (v8 >>> 34) & 0b11111111111111111111111111L; + output[outputOffset + 22] = ((v8 >>> 60) & 0b1111L) | ((v9 & 0b1111111111111111111111L) << 4); + output[outputOffset + 23] = (v9 >>> 22) & 0b11111111111111111111111111L; + output[outputOffset + 24] = ((v9 >>> 48) & 0b1111111111111111L) | ((v10 & 0b1111111111L) << 16); + output[outputOffset + 25] = (v10 >>> 10) & 0b11111111111111111111111111L; + output[outputOffset + 26] = (v10 >>> 36) & 0b11111111111111111111111111L; + output[outputOffset + 27] = ((v10 >>> 62) & 0b11L) | ((v11 & 0b111111111111111111111111L) << 2); + output[outputOffset + 28] = (v11 >>> 24) & 0b11111111111111111111111111L; + output[outputOffset + 29] = ((v11 >>> 50) & 0b11111111111111L) | ((v12 & 0b111111111111L) << 14); + output[outputOffset + 30] = (v12 >>> 12) & 0b11111111111111111111111111L; + output[outputOffset + 31] = (v12 >>> 38) & 0b11111111111111111111111111L; + } + } + + private static final class Unpacker27 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + long v8 = input.readLong(); + long v9 = input.readLong(); + long v10 = input.readLong(); + long v11 = input.readLong(); + long v12 = input.readLong(); + int v13 = input.readInt(); + output[outputOffset] = v0 & 0b111111111111111111111111111L; + output[outputOffset + 1] = (v0 >>> 27) & 0b111111111111111111111111111L; + output[outputOffset + 2] = ((v0 >>> 54) & 0b1111111111L) | ((v1 & 0b11111111111111111L) << 10); + output[outputOffset + 3] = (v1 >>> 17) & 0b111111111111111111111111111L; + output[outputOffset + 4] = ((v1 >>> 44) & 0b11111111111111111111L) | ((v2 & 0b1111111L) << 20); + output[outputOffset + 5] = (v2 >>> 7) & 0b111111111111111111111111111L; + output[outputOffset + 6] = (v2 >>> 34) & 0b111111111111111111111111111L; + output[outputOffset + 7] = ((v2 >>> 61) & 0b111L) | ((v3 & 0b111111111111111111111111L) << 3); + output[outputOffset + 8] = (v3 >>> 24) & 0b111111111111111111111111111L; + output[outputOffset + 9] = ((v3 >>> 51) & 0b1111111111111L) | ((v4 & 0b11111111111111L) << 13); + output[outputOffset + 10] = (v4 >>> 14) & 0b111111111111111111111111111L; + output[outputOffset + 11] = ((v4 >>> 41) & 0b11111111111111111111111L) | ((v5 & 0b1111L) << 23); + output[outputOffset + 12] = (v5 >>> 4) & 0b111111111111111111111111111L; + output[outputOffset + 13] = (v5 >>> 31) & 0b111111111111111111111111111L; + output[outputOffset + 14] = ((v5 >>> 58) & 0b111111L) | ((v6 & 0b111111111111111111111L) << 6); + output[outputOffset + 15] = (v6 >>> 21) & 0b111111111111111111111111111L; + output[outputOffset + 16] = ((v6 >>> 48) & 0b1111111111111111L) | ((v7 & 0b11111111111L) << 16); + output[outputOffset + 17] = (v7 >>> 11) & 0b111111111111111111111111111L; + output[outputOffset + 18] = ((v7 >>> 38) & 0b11111111111111111111111111L) | ((v8 & 0b1L) << 
26); + output[outputOffset + 19] = (v8 >>> 1) & 0b111111111111111111111111111L; + output[outputOffset + 20] = (v8 >>> 28) & 0b111111111111111111111111111L; + output[outputOffset + 21] = ((v8 >>> 55) & 0b111111111L) | ((v9 & 0b111111111111111111L) << 9); + output[outputOffset + 22] = (v9 >>> 18) & 0b111111111111111111111111111L; + output[outputOffset + 23] = ((v9 >>> 45) & 0b1111111111111111111L) | ((v10 & 0b11111111L) << 19); + output[outputOffset + 24] = (v10 >>> 8) & 0b111111111111111111111111111L; + output[outputOffset + 25] = (v10 >>> 35) & 0b111111111111111111111111111L; + output[outputOffset + 26] = ((v10 >>> 62) & 0b11L) | ((v11 & 0b1111111111111111111111111L) << 2); + output[outputOffset + 27] = (v11 >>> 25) & 0b111111111111111111111111111L; + output[outputOffset + 28] = ((v11 >>> 52) & 0b111111111111L) | ((v12 & 0b111111111111111L) << 12); + output[outputOffset + 29] = (v12 >>> 15) & 0b111111111111111111111111111L; + output[outputOffset + 30] = ((v12 >>> 42) & 0b1111111111111111111111L) | ((v13 & 0b11111L) << 22); + output[outputOffset + 31] = (v13 >>> 5) & 0b111111111111111111111111111L; + } + } + + private static final class Unpacker28 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + long v8 = input.readLong(); + long v9 = input.readLong(); + long v10 = input.readLong(); + long v11 = input.readLong(); + long v12 = input.readLong(); + long v13 = input.readLong(); + output[outputOffset] = v0 & 0b1111111111111111111111111111L; + output[outputOffset + 1] = (v0 >>> 28) & 0b1111111111111111111111111111L; + output[outputOffset + 2] = ((v0 >>> 56) & 0b11111111L) | ((v1 & 0b11111111111111111111L) << 8); + output[outputOffset + 3] = (v1 >>> 20) & 0b1111111111111111111111111111L; + output[outputOffset + 4] = ((v1 >>> 48) & 0b1111111111111111L) | ((v2 & 0b111111111111L) << 16); + output[outputOffset + 5] = (v2 >>> 12) & 0b1111111111111111111111111111L; + output[outputOffset + 6] = ((v2 >>> 40) & 0b111111111111111111111111L) | ((v3 & 0b1111L) << 24); + output[outputOffset + 7] = (v3 >>> 4) & 0b1111111111111111111111111111L; + output[outputOffset + 8] = (v3 >>> 32) & 0b1111111111111111111111111111L; + output[outputOffset + 9] = ((v3 >>> 60) & 0b1111L) | ((v4 & 0b111111111111111111111111L) << 4); + output[outputOffset + 10] = (v4 >>> 24) & 0b1111111111111111111111111111L; + output[outputOffset + 11] = ((v4 >>> 52) & 0b111111111111L) | ((v5 & 0b1111111111111111L) << 12); + output[outputOffset + 12] = (v5 >>> 16) & 0b1111111111111111111111111111L; + output[outputOffset + 13] = ((v5 >>> 44) & 0b11111111111111111111L) | ((v6 & 0b11111111L) << 20); + output[outputOffset + 14] = (v6 >>> 8) & 0b1111111111111111111111111111L; + output[outputOffset + 15] = (v6 >>> 36) & 0b1111111111111111111111111111L; + output[outputOffset + 16] = v7 & 0b1111111111111111111111111111L; + output[outputOffset + 17] = (v7 >>> 28) & 0b1111111111111111111111111111L; + output[outputOffset + 18] = ((v7 >>> 56) & 0b11111111L) | ((v8 & 0b11111111111111111111L) << 8); + output[outputOffset 
+ 19] = (v8 >>> 20) & 0b1111111111111111111111111111L; + output[outputOffset + 20] = ((v8 >>> 48) & 0b1111111111111111L) | ((v9 & 0b111111111111L) << 16); + output[outputOffset + 21] = (v9 >>> 12) & 0b1111111111111111111111111111L; + output[outputOffset + 22] = ((v9 >>> 40) & 0b111111111111111111111111L) | ((v10 & 0b1111L) << 24); + output[outputOffset + 23] = (v10 >>> 4) & 0b1111111111111111111111111111L; + output[outputOffset + 24] = (v10 >>> 32) & 0b1111111111111111111111111111L; + output[outputOffset + 25] = ((v10 >>> 60) & 0b1111L) | ((v11 & 0b111111111111111111111111L) << 4); + output[outputOffset + 26] = (v11 >>> 24) & 0b1111111111111111111111111111L; + output[outputOffset + 27] = ((v11 >>> 52) & 0b111111111111L) | ((v12 & 0b1111111111111111L) << 12); + output[outputOffset + 28] = (v12 >>> 16) & 0b1111111111111111111111111111L; + output[outputOffset + 29] = ((v12 >>> 44) & 0b11111111111111111111L) | ((v13 & 0b11111111L) << 20); + output[outputOffset + 30] = (v13 >>> 8) & 0b1111111111111111111111111111L; + output[outputOffset + 31] = (v13 >>> 36) & 0b1111111111111111111111111111L; + } + } + + private static final class Unpacker29 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + long v8 = input.readLong(); + long v9 = input.readLong(); + long v10 = input.readLong(); + long v11 = input.readLong(); + long v12 = input.readLong(); + long v13 = input.readLong(); + int v14 = input.readInt(); + output[outputOffset] = v0 & 0b11111111111111111111111111111L; + output[outputOffset + 1] = (v0 >>> 29) & 0b11111111111111111111111111111L; + output[outputOffset + 2] = ((v0 >>> 58) & 0b111111L) | ((v1 & 0b11111111111111111111111L) << 6); + output[outputOffset + 3] = (v1 >>> 23) & 0b11111111111111111111111111111L; + output[outputOffset + 4] = ((v1 >>> 52) & 0b111111111111L) | ((v2 & 0b11111111111111111L) << 12); + output[outputOffset + 5] = (v2 >>> 17) & 0b11111111111111111111111111111L; + output[outputOffset + 6] = ((v2 >>> 46) & 0b111111111111111111L) | ((v3 & 0b11111111111L) << 18); + output[outputOffset + 7] = (v3 >>> 11) & 0b11111111111111111111111111111L; + output[outputOffset + 8] = ((v3 >>> 40) & 0b111111111111111111111111L) | ((v4 & 0b11111L) << 24); + output[outputOffset + 9] = (v4 >>> 5) & 0b11111111111111111111111111111L; + output[outputOffset + 10] = (v4 >>> 34) & 0b11111111111111111111111111111L; + output[outputOffset + 11] = ((v4 >>> 63) & 0b1L) | ((v5 & 0b1111111111111111111111111111L) << 1); + output[outputOffset + 12] = (v5 >>> 28) & 0b11111111111111111111111111111L; + output[outputOffset + 13] = ((v5 >>> 57) & 0b1111111L) | ((v6 & 0b1111111111111111111111L) << 7); + output[outputOffset + 14] = (v6 >>> 22) & 0b11111111111111111111111111111L; + output[outputOffset + 15] = ((v6 >>> 51) & 0b1111111111111L) | ((v7 & 0b1111111111111111L) << 13); + output[outputOffset + 16] = (v7 >>> 16) & 0b11111111111111111111111111111L; + output[outputOffset + 17] = ((v7 >>> 45) & 0b1111111111111111111L) | ((v8 & 0b1111111111L) << 19); + output[outputOffset + 18] = 
(v8 >>> 10) & 0b11111111111111111111111111111L; + output[outputOffset + 19] = ((v8 >>> 39) & 0b1111111111111111111111111L) | ((v9 & 0b1111L) << 25); + output[outputOffset + 20] = (v9 >>> 4) & 0b11111111111111111111111111111L; + output[outputOffset + 21] = (v9 >>> 33) & 0b11111111111111111111111111111L; + output[outputOffset + 22] = ((v9 >>> 62) & 0b11L) | ((v10 & 0b111111111111111111111111111L) << 2); + output[outputOffset + 23] = (v10 >>> 27) & 0b11111111111111111111111111111L; + output[outputOffset + 24] = ((v10 >>> 56) & 0b11111111L) | ((v11 & 0b111111111111111111111L) << 8); + output[outputOffset + 25] = (v11 >>> 21) & 0b11111111111111111111111111111L; + output[outputOffset + 26] = ((v11 >>> 50) & 0b11111111111111L) | ((v12 & 0b111111111111111L) << 14); + output[outputOffset + 27] = (v12 >>> 15) & 0b11111111111111111111111111111L; + output[outputOffset + 28] = ((v12 >>> 44) & 0b11111111111111111111L) | ((v13 & 0b111111111L) << 20); + output[outputOffset + 29] = (v13 >>> 9) & 0b11111111111111111111111111111L; + output[outputOffset + 30] = ((v13 >>> 38) & 0b11111111111111111111111111L) | ((v14 & 0b111L) << 26); + output[outputOffset + 31] = (v14 >>> 3) & 0b11111111111111111111111111111L; + } + } + + private static final class Unpacker30 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + long v8 = input.readLong(); + long v9 = input.readLong(); + long v10 = input.readLong(); + long v11 = input.readLong(); + long v12 = input.readLong(); + long v13 = input.readLong(); + long v14 = input.readLong(); + output[outputOffset] = v0 & 0b111111111111111111111111111111L; + output[outputOffset + 1] = (v0 >>> 30) & 0b111111111111111111111111111111L; + output[outputOffset + 2] = ((v0 >>> 60) & 0b1111L) | ((v1 & 0b11111111111111111111111111L) << 4); + output[outputOffset + 3] = (v1 >>> 26) & 0b111111111111111111111111111111L; + output[outputOffset + 4] = ((v1 >>> 56) & 0b11111111L) | ((v2 & 0b1111111111111111111111L) << 8); + output[outputOffset + 5] = (v2 >>> 22) & 0b111111111111111111111111111111L; + output[outputOffset + 6] = ((v2 >>> 52) & 0b111111111111L) | ((v3 & 0b111111111111111111L) << 12); + output[outputOffset + 7] = (v3 >>> 18) & 0b111111111111111111111111111111L; + output[outputOffset + 8] = ((v3 >>> 48) & 0b1111111111111111L) | ((v4 & 0b11111111111111L) << 16); + output[outputOffset + 9] = (v4 >>> 14) & 0b111111111111111111111111111111L; + output[outputOffset + 10] = ((v4 >>> 44) & 0b11111111111111111111L) | ((v5 & 0b1111111111L) << 20); + output[outputOffset + 11] = (v5 >>> 10) & 0b111111111111111111111111111111L; + output[outputOffset + 12] = ((v5 >>> 40) & 0b111111111111111111111111L) | ((v6 & 0b111111L) << 24); + output[outputOffset + 13] = (v6 >>> 6) & 0b111111111111111111111111111111L; + output[outputOffset + 14] = ((v6 >>> 36) & 0b1111111111111111111111111111L) | ((v7 & 0b11L) << 28); + output[outputOffset + 15] = (v7 >>> 2) & 0b111111111111111111111111111111L; + output[outputOffset + 16] = (v7 >>> 32) & 0b111111111111111111111111111111L; + 
output[outputOffset + 17] = ((v7 >>> 62) & 0b11L) | ((v8 & 0b1111111111111111111111111111L) << 2); + output[outputOffset + 18] = (v8 >>> 28) & 0b111111111111111111111111111111L; + output[outputOffset + 19] = ((v8 >>> 58) & 0b111111L) | ((v9 & 0b111111111111111111111111L) << 6); + output[outputOffset + 20] = (v9 >>> 24) & 0b111111111111111111111111111111L; + output[outputOffset + 21] = ((v9 >>> 54) & 0b1111111111L) | ((v10 & 0b11111111111111111111L) << 10); + output[outputOffset + 22] = (v10 >>> 20) & 0b111111111111111111111111111111L; + output[outputOffset + 23] = ((v10 >>> 50) & 0b11111111111111L) | ((v11 & 0b1111111111111111L) << 14); + output[outputOffset + 24] = (v11 >>> 16) & 0b111111111111111111111111111111L; + output[outputOffset + 25] = ((v11 >>> 46) & 0b111111111111111111L) | ((v12 & 0b111111111111L) << 18); + output[outputOffset + 26] = (v12 >>> 12) & 0b111111111111111111111111111111L; + output[outputOffset + 27] = ((v12 >>> 42) & 0b1111111111111111111111L) | ((v13 & 0b11111111L) << 22); + output[outputOffset + 28] = (v13 >>> 8) & 0b111111111111111111111111111111L; + output[outputOffset + 29] = ((v13 >>> 38) & 0b11111111111111111111111111L) | ((v14 & 0b1111L) << 26); + output[outputOffset + 30] = (v14 >>> 4) & 0b111111111111111111111111111111L; + output[outputOffset + 31] = (v14 >>> 34) & 0b111111111111111111111111111111L; + } + } + + private static final class Unpacker31 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + long v8 = input.readLong(); + long v9 = input.readLong(); + long v10 = input.readLong(); + long v11 = input.readLong(); + long v12 = input.readLong(); + long v13 = input.readLong(); + long v14 = input.readLong(); + int v15 = input.readInt(); + output[outputOffset] = v0 & 0b1111111111111111111111111111111L; + output[outputOffset + 1] = (v0 >>> 31) & 0b1111111111111111111111111111111L; + output[outputOffset + 2] = ((v0 >>> 62) & 0b11L) | ((v1 & 0b11111111111111111111111111111L) << 2); + output[outputOffset + 3] = (v1 >>> 29) & 0b1111111111111111111111111111111L; + output[outputOffset + 4] = ((v1 >>> 60) & 0b1111L) | ((v2 & 0b111111111111111111111111111L) << 4); + output[outputOffset + 5] = (v2 >>> 27) & 0b1111111111111111111111111111111L; + output[outputOffset + 6] = ((v2 >>> 58) & 0b111111L) | ((v3 & 0b1111111111111111111111111L) << 6); + output[outputOffset + 7] = (v3 >>> 25) & 0b1111111111111111111111111111111L; + output[outputOffset + 8] = ((v3 >>> 56) & 0b11111111L) | ((v4 & 0b11111111111111111111111L) << 8); + output[outputOffset + 9] = (v4 >>> 23) & 0b1111111111111111111111111111111L; + output[outputOffset + 10] = ((v4 >>> 54) & 0b1111111111L) | ((v5 & 0b111111111111111111111L) << 10); + output[outputOffset + 11] = (v5 >>> 21) & 0b1111111111111111111111111111111L; + output[outputOffset + 12] = ((v5 >>> 52) & 0b111111111111L) | ((v6 & 0b1111111111111111111L) << 12); + output[outputOffset + 13] = (v6 >>> 19) & 0b1111111111111111111111111111111L; + output[outputOffset + 14] = ((v6 >>> 50) & 0b11111111111111L) | ((v7 & 
0b11111111111111111L) << 14); + output[outputOffset + 15] = (v7 >>> 17) & 0b1111111111111111111111111111111L; + output[outputOffset + 16] = ((v7 >>> 48) & 0b1111111111111111L) | ((v8 & 0b111111111111111L) << 16); + output[outputOffset + 17] = (v8 >>> 15) & 0b1111111111111111111111111111111L; + output[outputOffset + 18] = ((v8 >>> 46) & 0b111111111111111111L) | ((v9 & 0b1111111111111L) << 18); + output[outputOffset + 19] = (v9 >>> 13) & 0b1111111111111111111111111111111L; + output[outputOffset + 20] = ((v9 >>> 44) & 0b11111111111111111111L) | ((v10 & 0b11111111111L) << 20); + output[outputOffset + 21] = (v10 >>> 11) & 0b1111111111111111111111111111111L; + output[outputOffset + 22] = ((v10 >>> 42) & 0b1111111111111111111111L) | ((v11 & 0b111111111L) << 22); + output[outputOffset + 23] = (v11 >>> 9) & 0b1111111111111111111111111111111L; + output[outputOffset + 24] = ((v11 >>> 40) & 0b111111111111111111111111L) | ((v12 & 0b1111111L) << 24); + output[outputOffset + 25] = (v12 >>> 7) & 0b1111111111111111111111111111111L; + output[outputOffset + 26] = ((v12 >>> 38) & 0b11111111111111111111111111L) | ((v13 & 0b11111L) << 26); + output[outputOffset + 27] = (v13 >>> 5) & 0b1111111111111111111111111111111L; + output[outputOffset + 28] = ((v13 >>> 36) & 0b1111111111111111111111111111L) | ((v14 & 0b111L) << 28); + output[outputOffset + 29] = (v14 >>> 3) & 0b1111111111111111111111111111111L; + output[outputOffset + 30] = ((v14 >>> 34) & 0b111111111111111111111111111111L) | ((v15 & 0b1L) << 30); + output[outputOffset + 31] = (v15 >>> 1) & 0b1111111111111111111111111111111L; + } + } + + private static final class Unpacker32 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + long v8 = input.readLong(); + long v9 = input.readLong(); + long v10 = input.readLong(); + long v11 = input.readLong(); + long v12 = input.readLong(); + long v13 = input.readLong(); + long v14 = input.readLong(); + long v15 = input.readLong(); + output[outputOffset] = v0 & 0b11111111111111111111111111111111L; + output[outputOffset + 1] = (v0 >>> 32) & 0b11111111111111111111111111111111L; + output[outputOffset + 2] = v1 & 0b11111111111111111111111111111111L; + output[outputOffset + 3] = (v1 >>> 32) & 0b11111111111111111111111111111111L; + output[outputOffset + 4] = v2 & 0b11111111111111111111111111111111L; + output[outputOffset + 5] = (v2 >>> 32) & 0b11111111111111111111111111111111L; + output[outputOffset + 6] = v3 & 0b11111111111111111111111111111111L; + output[outputOffset + 7] = (v3 >>> 32) & 0b11111111111111111111111111111111L; + output[outputOffset + 8] = v4 & 0b11111111111111111111111111111111L; + output[outputOffset + 9] = (v4 >>> 32) & 0b11111111111111111111111111111111L; + output[outputOffset + 10] = v5 & 0b11111111111111111111111111111111L; + output[outputOffset + 11] = (v5 >>> 32) & 0b11111111111111111111111111111111L; + output[outputOffset + 12] = v6 & 0b11111111111111111111111111111111L; + output[outputOffset + 13] = (v6 >>> 32) & 0b11111111111111111111111111111111L; + 
output[outputOffset + 14] = v7 & 0b11111111111111111111111111111111L; + output[outputOffset + 15] = (v7 >>> 32) & 0b11111111111111111111111111111111L; + output[outputOffset + 16] = v8 & 0b11111111111111111111111111111111L; + output[outputOffset + 17] = (v8 >>> 32) & 0b11111111111111111111111111111111L; + output[outputOffset + 18] = v9 & 0b11111111111111111111111111111111L; + output[outputOffset + 19] = (v9 >>> 32) & 0b11111111111111111111111111111111L; + output[outputOffset + 20] = v10 & 0b11111111111111111111111111111111L; + output[outputOffset + 21] = (v10 >>> 32) & 0b11111111111111111111111111111111L; + output[outputOffset + 22] = v11 & 0b11111111111111111111111111111111L; + output[outputOffset + 23] = (v11 >>> 32) & 0b11111111111111111111111111111111L; + output[outputOffset + 24] = v12 & 0b11111111111111111111111111111111L; + output[outputOffset + 25] = (v12 >>> 32) & 0b11111111111111111111111111111111L; + output[outputOffset + 26] = v13 & 0b11111111111111111111111111111111L; + output[outputOffset + 27] = (v13 >>> 32) & 0b11111111111111111111111111111111L; + output[outputOffset + 28] = v14 & 0b11111111111111111111111111111111L; + output[outputOffset + 29] = (v14 >>> 32) & 0b11111111111111111111111111111111L; + output[outputOffset + 30] = v15 & 0b11111111111111111111111111111111L; + output[outputOffset + 31] = (v15 >>> 32) & 0b11111111111111111111111111111111L; + } + } + + private static final class Unpacker33 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + long v8 = input.readLong(); + long v9 = input.readLong(); + long v10 = input.readLong(); + long v11 = input.readLong(); + long v12 = input.readLong(); + long v13 = input.readLong(); + long v14 = input.readLong(); + long v15 = input.readLong(); + int v16 = input.readInt(); + output[outputOffset] = v0 & 0b111111111111111111111111111111111L; + output[outputOffset + 1] = ((v0 >>> 33) & 0b1111111111111111111111111111111L) | ((v1 & 0b11L) << 31); + output[outputOffset + 2] = (v1 >>> 2) & 0b111111111111111111111111111111111L; + output[outputOffset + 3] = ((v1 >>> 35) & 0b11111111111111111111111111111L) | ((v2 & 0b1111L) << 29); + output[outputOffset + 4] = (v2 >>> 4) & 0b111111111111111111111111111111111L; + output[outputOffset + 5] = ((v2 >>> 37) & 0b111111111111111111111111111L) | ((v3 & 0b111111L) << 27); + output[outputOffset + 6] = (v3 >>> 6) & 0b111111111111111111111111111111111L; + output[outputOffset + 7] = ((v3 >>> 39) & 0b1111111111111111111111111L) | ((v4 & 0b11111111L) << 25); + output[outputOffset + 8] = (v4 >>> 8) & 0b111111111111111111111111111111111L; + output[outputOffset + 9] = ((v4 >>> 41) & 0b11111111111111111111111L) | ((v5 & 0b1111111111L) << 23); + output[outputOffset + 10] = (v5 >>> 10) & 0b111111111111111111111111111111111L; + output[outputOffset + 11] = ((v5 >>> 43) & 0b111111111111111111111L) | ((v6 & 0b111111111111L) << 21); + output[outputOffset + 12] = (v6 >>> 12) & 0b111111111111111111111111111111111L; + output[outputOffset + 13] = ((v6 >>> 45) & 0b1111111111111111111L) 
| ((v7 & 0b11111111111111L) << 19); + output[outputOffset + 14] = (v7 >>> 14) & 0b111111111111111111111111111111111L; + output[outputOffset + 15] = ((v7 >>> 47) & 0b11111111111111111L) | ((v8 & 0b1111111111111111L) << 17); + output[outputOffset + 16] = (v8 >>> 16) & 0b111111111111111111111111111111111L; + output[outputOffset + 17] = ((v8 >>> 49) & 0b111111111111111L) | ((v9 & 0b111111111111111111L) << 15); + output[outputOffset + 18] = (v9 >>> 18) & 0b111111111111111111111111111111111L; + output[outputOffset + 19] = ((v9 >>> 51) & 0b1111111111111L) | ((v10 & 0b11111111111111111111L) << 13); + output[outputOffset + 20] = (v10 >>> 20) & 0b111111111111111111111111111111111L; + output[outputOffset + 21] = ((v10 >>> 53) & 0b11111111111L) | ((v11 & 0b1111111111111111111111L) << 11); + output[outputOffset + 22] = (v11 >>> 22) & 0b111111111111111111111111111111111L; + output[outputOffset + 23] = ((v11 >>> 55) & 0b111111111L) | ((v12 & 0b111111111111111111111111L) << 9); + output[outputOffset + 24] = (v12 >>> 24) & 0b111111111111111111111111111111111L; + output[outputOffset + 25] = ((v12 >>> 57) & 0b1111111L) | ((v13 & 0b11111111111111111111111111L) << 7); + output[outputOffset + 26] = (v13 >>> 26) & 0b111111111111111111111111111111111L; + output[outputOffset + 27] = ((v13 >>> 59) & 0b11111L) | ((v14 & 0b1111111111111111111111111111L) << 5); + output[outputOffset + 28] = (v14 >>> 28) & 0b111111111111111111111111111111111L; + output[outputOffset + 29] = ((v14 >>> 61) & 0b111L) | ((v15 & 0b111111111111111111111111111111L) << 3); + output[outputOffset + 30] = (v15 >>> 30) & 0b111111111111111111111111111111111L; + output[outputOffset + 31] = ((v15 >>> 63) & 0b1L) | ((v16 & 0b11111111111111111111111111111111L) << 1); + } + } + + private static final class Unpacker34 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + long v8 = input.readLong(); + long v9 = input.readLong(); + long v10 = input.readLong(); + long v11 = input.readLong(); + long v12 = input.readLong(); + long v13 = input.readLong(); + long v14 = input.readLong(); + long v15 = input.readLong(); + long v16 = input.readLong(); + output[outputOffset] = v0 & 0b1111111111111111111111111111111111L; + output[outputOffset + 1] = ((v0 >>> 34) & 0b111111111111111111111111111111L) | ((v1 & 0b1111L) << 30); + output[outputOffset + 2] = (v1 >>> 4) & 0b1111111111111111111111111111111111L; + output[outputOffset + 3] = ((v1 >>> 38) & 0b11111111111111111111111111L) | ((v2 & 0b11111111L) << 26); + output[outputOffset + 4] = (v2 >>> 8) & 0b1111111111111111111111111111111111L; + output[outputOffset + 5] = ((v2 >>> 42) & 0b1111111111111111111111L) | ((v3 & 0b111111111111L) << 22); + output[outputOffset + 6] = (v3 >>> 12) & 0b1111111111111111111111111111111111L; + output[outputOffset + 7] = ((v3 >>> 46) & 0b111111111111111111L) | ((v4 & 0b1111111111111111L) << 18); + output[outputOffset + 8] = (v4 >>> 16) & 0b1111111111111111111111111111111111L; + output[outputOffset + 9] = ((v4 >>> 50) & 0b11111111111111L) | ((v5 & 
0b11111111111111111111L) << 14); + output[outputOffset + 10] = (v5 >>> 20) & 0b1111111111111111111111111111111111L; + output[outputOffset + 11] = ((v5 >>> 54) & 0b1111111111L) | ((v6 & 0b111111111111111111111111L) << 10); + output[outputOffset + 12] = (v6 >>> 24) & 0b1111111111111111111111111111111111L; + output[outputOffset + 13] = ((v6 >>> 58) & 0b111111L) | ((v7 & 0b1111111111111111111111111111L) << 6); + output[outputOffset + 14] = (v7 >>> 28) & 0b1111111111111111111111111111111111L; + output[outputOffset + 15] = ((v7 >>> 62) & 0b11L) | ((v8 & 0b11111111111111111111111111111111L) << 2); + output[outputOffset + 16] = ((v8 >>> 32) & 0b11111111111111111111111111111111L) | ((v9 & 0b11L) << 32); + output[outputOffset + 17] = (v9 >>> 2) & 0b1111111111111111111111111111111111L; + output[outputOffset + 18] = ((v9 >>> 36) & 0b1111111111111111111111111111L) | ((v10 & 0b111111L) << 28); + output[outputOffset + 19] = (v10 >>> 6) & 0b1111111111111111111111111111111111L; + output[outputOffset + 20] = ((v10 >>> 40) & 0b111111111111111111111111L) | ((v11 & 0b1111111111L) << 24); + output[outputOffset + 21] = (v11 >>> 10) & 0b1111111111111111111111111111111111L; + output[outputOffset + 22] = ((v11 >>> 44) & 0b11111111111111111111L) | ((v12 & 0b11111111111111L) << 20); + output[outputOffset + 23] = (v12 >>> 14) & 0b1111111111111111111111111111111111L; + output[outputOffset + 24] = ((v12 >>> 48) & 0b1111111111111111L) | ((v13 & 0b111111111111111111L) << 16); + output[outputOffset + 25] = (v13 >>> 18) & 0b1111111111111111111111111111111111L; + output[outputOffset + 26] = ((v13 >>> 52) & 0b111111111111L) | ((v14 & 0b1111111111111111111111L) << 12); + output[outputOffset + 27] = (v14 >>> 22) & 0b1111111111111111111111111111111111L; + output[outputOffset + 28] = ((v14 >>> 56) & 0b11111111L) | ((v15 & 0b11111111111111111111111111L) << 8); + output[outputOffset + 29] = (v15 >>> 26) & 0b1111111111111111111111111111111111L; + output[outputOffset + 30] = ((v15 >>> 60) & 0b1111L) | ((v16 & 0b111111111111111111111111111111L) << 4); + output[outputOffset + 31] = (v16 >>> 30) & 0b1111111111111111111111111111111111L; + } + } + + private static final class Unpacker35 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + long v8 = input.readLong(); + long v9 = input.readLong(); + long v10 = input.readLong(); + long v11 = input.readLong(); + long v12 = input.readLong(); + long v13 = input.readLong(); + long v14 = input.readLong(); + long v15 = input.readLong(); + long v16 = input.readLong(); + int v17 = input.readInt(); + output[outputOffset] = v0 & 0b11111111111111111111111111111111111L; + output[outputOffset + 1] = ((v0 >>> 35) & 0b11111111111111111111111111111L) | ((v1 & 0b111111L) << 29); + output[outputOffset + 2] = (v1 >>> 6) & 0b11111111111111111111111111111111111L; + output[outputOffset + 3] = ((v1 >>> 41) & 0b11111111111111111111111L) | ((v2 & 0b111111111111L) << 23); + output[outputOffset + 4] = (v2 >>> 12) & 0b11111111111111111111111111111111111L; + output[outputOffset + 5] = 
((v2 >>> 47) & 0b11111111111111111L) | ((v3 & 0b111111111111111111L) << 17); + output[outputOffset + 6] = (v3 >>> 18) & 0b11111111111111111111111111111111111L; + output[outputOffset + 7] = ((v3 >>> 53) & 0b11111111111L) | ((v4 & 0b111111111111111111111111L) << 11); + output[outputOffset + 8] = (v4 >>> 24) & 0b11111111111111111111111111111111111L; + output[outputOffset + 9] = ((v4 >>> 59) & 0b11111L) | ((v5 & 0b111111111111111111111111111111L) << 5); + output[outputOffset + 10] = ((v5 >>> 30) & 0b1111111111111111111111111111111111L) | ((v6 & 0b1L) << 34); + output[outputOffset + 11] = (v6 >>> 1) & 0b11111111111111111111111111111111111L; + output[outputOffset + 12] = ((v6 >>> 36) & 0b1111111111111111111111111111L) | ((v7 & 0b1111111L) << 28); + output[outputOffset + 13] = (v7 >>> 7) & 0b11111111111111111111111111111111111L; + output[outputOffset + 14] = ((v7 >>> 42) & 0b1111111111111111111111L) | ((v8 & 0b1111111111111L) << 22); + output[outputOffset + 15] = (v8 >>> 13) & 0b11111111111111111111111111111111111L; + output[outputOffset + 16] = ((v8 >>> 48) & 0b1111111111111111L) | ((v9 & 0b1111111111111111111L) << 16); + output[outputOffset + 17] = (v9 >>> 19) & 0b11111111111111111111111111111111111L; + output[outputOffset + 18] = ((v9 >>> 54) & 0b1111111111L) | ((v10 & 0b1111111111111111111111111L) << 10); + output[outputOffset + 19] = (v10 >>> 25) & 0b11111111111111111111111111111111111L; + output[outputOffset + 20] = ((v10 >>> 60) & 0b1111L) | ((v11 & 0b1111111111111111111111111111111L) << 4); + output[outputOffset + 21] = ((v11 >>> 31) & 0b111111111111111111111111111111111L) | ((v12 & 0b11L) << 33); + output[outputOffset + 22] = (v12 >>> 2) & 0b11111111111111111111111111111111111L; + output[outputOffset + 23] = ((v12 >>> 37) & 0b111111111111111111111111111L) | ((v13 & 0b11111111L) << 27); + output[outputOffset + 24] = (v13 >>> 8) & 0b11111111111111111111111111111111111L; + output[outputOffset + 25] = ((v13 >>> 43) & 0b111111111111111111111L) | ((v14 & 0b11111111111111L) << 21); + output[outputOffset + 26] = (v14 >>> 14) & 0b11111111111111111111111111111111111L; + output[outputOffset + 27] = ((v14 >>> 49) & 0b111111111111111L) | ((v15 & 0b11111111111111111111L) << 15); + output[outputOffset + 28] = (v15 >>> 20) & 0b11111111111111111111111111111111111L; + output[outputOffset + 29] = ((v15 >>> 55) & 0b111111111L) | ((v16 & 0b11111111111111111111111111L) << 9); + output[outputOffset + 30] = (v16 >>> 26) & 0b11111111111111111111111111111111111L; + output[outputOffset + 31] = ((v16 >>> 61) & 0b111L) | ((v17 & 0b11111111111111111111111111111111L) << 3); + } + } + + private static final class Unpacker36 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + long v8 = input.readLong(); + long v9 = input.readLong(); + long v10 = input.readLong(); + long v11 = input.readLong(); + long v12 = input.readLong(); + long v13 = input.readLong(); + long v14 = input.readLong(); + long v15 = input.readLong(); + long v16 = input.readLong(); + long v17 = input.readLong(); + 
output[outputOffset] = v0 & 0b111111111111111111111111111111111111L; + output[outputOffset + 1] = ((v0 >>> 36) & 0b1111111111111111111111111111L) | ((v1 & 0b11111111L) << 28); + output[outputOffset + 2] = (v1 >>> 8) & 0b111111111111111111111111111111111111L; + output[outputOffset + 3] = ((v1 >>> 44) & 0b11111111111111111111L) | ((v2 & 0b1111111111111111L) << 20); + output[outputOffset + 4] = (v2 >>> 16) & 0b111111111111111111111111111111111111L; + output[outputOffset + 5] = ((v2 >>> 52) & 0b111111111111L) | ((v3 & 0b111111111111111111111111L) << 12); + output[outputOffset + 6] = (v3 >>> 24) & 0b111111111111111111111111111111111111L; + output[outputOffset + 7] = ((v3 >>> 60) & 0b1111L) | ((v4 & 0b11111111111111111111111111111111L) << 4); + output[outputOffset + 8] = ((v4 >>> 32) & 0b11111111111111111111111111111111L) | ((v5 & 0b1111L) << 32); + output[outputOffset + 9] = (v5 >>> 4) & 0b111111111111111111111111111111111111L; + output[outputOffset + 10] = ((v5 >>> 40) & 0b111111111111111111111111L) | ((v6 & 0b111111111111L) << 24); + output[outputOffset + 11] = (v6 >>> 12) & 0b111111111111111111111111111111111111L; + output[outputOffset + 12] = ((v6 >>> 48) & 0b1111111111111111L) | ((v7 & 0b11111111111111111111L) << 16); + output[outputOffset + 13] = (v7 >>> 20) & 0b111111111111111111111111111111111111L; + output[outputOffset + 14] = ((v7 >>> 56) & 0b11111111L) | ((v8 & 0b1111111111111111111111111111L) << 8); + output[outputOffset + 15] = (v8 >>> 28) & 0b111111111111111111111111111111111111L; + output[outputOffset + 16] = v9 & 0b111111111111111111111111111111111111L; + output[outputOffset + 17] = ((v9 >>> 36) & 0b1111111111111111111111111111L) | ((v10 & 0b11111111L) << 28); + output[outputOffset + 18] = (v10 >>> 8) & 0b111111111111111111111111111111111111L; + output[outputOffset + 19] = ((v10 >>> 44) & 0b11111111111111111111L) | ((v11 & 0b1111111111111111L) << 20); + output[outputOffset + 20] = (v11 >>> 16) & 0b111111111111111111111111111111111111L; + output[outputOffset + 21] = ((v11 >>> 52) & 0b111111111111L) | ((v12 & 0b111111111111111111111111L) << 12); + output[outputOffset + 22] = (v12 >>> 24) & 0b111111111111111111111111111111111111L; + output[outputOffset + 23] = ((v12 >>> 60) & 0b1111L) | ((v13 & 0b11111111111111111111111111111111L) << 4); + output[outputOffset + 24] = ((v13 >>> 32) & 0b11111111111111111111111111111111L) | ((v14 & 0b1111L) << 32); + output[outputOffset + 25] = (v14 >>> 4) & 0b111111111111111111111111111111111111L; + output[outputOffset + 26] = ((v14 >>> 40) & 0b111111111111111111111111L) | ((v15 & 0b111111111111L) << 24); + output[outputOffset + 27] = (v15 >>> 12) & 0b111111111111111111111111111111111111L; + output[outputOffset + 28] = ((v15 >>> 48) & 0b1111111111111111L) | ((v16 & 0b11111111111111111111L) << 16); + output[outputOffset + 29] = (v16 >>> 20) & 0b111111111111111111111111111111111111L; + output[outputOffset + 30] = ((v16 >>> 56) & 0b11111111L) | ((v17 & 0b1111111111111111111111111111L) << 8); + output[outputOffset + 31] = (v17 >>> 28) & 0b111111111111111111111111111111111111L; + } + } + + private static final class Unpacker37 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = 
input.readLong();
+            long v3 = input.readLong();
+            long v4 = input.readLong();
+            long v5 = input.readLong();
+            long v6 = input.readLong();
+            long v7 = input.readLong();
+            long v8 = input.readLong();
+            long v9 = input.readLong();
+            long v10 = input.readLong();
+            long v11 = input.readLong();
+            long v12 = input.readLong();
+            long v13 = input.readLong();
+            long v14 = input.readLong();
+            long v15 = input.readLong();
+            long v16 = input.readLong();
+            long v17 = input.readLong();
+            int v18 = input.readInt();
+            output[outputOffset] = v0 & 0b1111111111111111111111111111111111111L;
+            output[outputOffset + 1] = ((v0 >>> 37) & 0b111111111111111111111111111L) | ((v1 & 0b1111111111L) << 27);
+            output[outputOffset + 2] = (v1 >>> 10) & 0b1111111111111111111111111111111111111L;
+            output[outputOffset + 3] = ((v1 >>> 47) & 0b11111111111111111L) | ((v2 & 0b11111111111111111111L) << 17);
+            output[outputOffset + 4] = (v2 >>> 20) & 0b1111111111111111111111111111111111111L;
+            output[outputOffset + 5] = ((v2 >>> 57) & 0b1111111L) | ((v3 & 0b111111111111111111111111111111L) << 7);
+            output[outputOffset + 6] = ((v3 >>> 30) & 0b1111111111111111111111111111111111L) | ((v4 & 0b111L) << 34);
+            output[outputOffset + 7] = (v4 >>> 3) & 0b1111111111111111111111111111111111111L;
+            output[outputOffset + 8] = ((v4 >>> 40) & 0b111111111111111111111111L) | ((v5 & 0b1111111111111L) << 24);
+            output[outputOffset + 9] = (v5 >>> 13) & 0b1111111111111111111111111111111111111L;
+            output[outputOffset + 10] = ((v5 >>> 50) & 0b11111111111111L) | ((v6 & 0b11111111111111111111111L) << 14);
+            output[outputOffset + 11] = (v6 >>> 23) & 0b1111111111111111111111111111111111111L;
+            output[outputOffset + 12] = ((v6 >>> 60) & 0b1111L) | ((v7 & 0b111111111111111111111111111111111L) << 4);
+            output[outputOffset + 13] = ((v7 >>> 33) & 0b1111111111111111111111111111111L) | ((v8 & 0b111111L) << 31);
+            output[outputOffset + 14] = (v8 >>> 6) & 0b1111111111111111111111111111111111111L;
+            output[outputOffset + 15] = ((v8 >>> 43) & 0b111111111111111111111L) | ((v9 & 0b1111111111111111L) << 21);
+            output[outputOffset + 16] = (v9 >>> 16) & 0b1111111111111111111111111111111111111L;
+            output[outputOffset + 17] = ((v9 >>> 53) & 0b11111111111L) | ((v10 & 0b11111111111111111111111111L) << 11);
+            output[outputOffset + 18] = (v10 >>> 26) & 0b1111111111111111111111111111111111111L;
+            output[outputOffset + 19] = ((v10 >>> 63) & 0b1L) | ((v11 & 0b111111111111111111111111111111111111L) << 1);
+            output[outputOffset + 20] = ((v11 >>> 36) & 0b1111111111111111111111111111L) | ((v12 & 0b111111111L) << 28);
+            output[outputOffset + 21] = (v12 >>> 9) & 0b1111111111111111111111111111111111111L;
+            output[outputOffset + 22] = ((v12 >>> 46) & 0b111111111111111111L) | ((v13 & 0b1111111111111111111L) << 18);
+            output[outputOffset + 23] = (v13 >>> 19) & 0b1111111111111111111111111111111111111L;
+            output[outputOffset + 24] = ((v13 >>> 56) & 0b11111111L) | ((v14 & 0b11111111111111111111111111111L) << 8);
+            output[outputOffset + 25] = ((v14 >>> 29) & 0b11111111111111111111111111111111111L) | ((v15 & 0b11L) << 35);
+            output[outputOffset + 26] = (v15 >>> 2) & 0b1111111111111111111111111111111111111L;
+            output[outputOffset + 27] = ((v15 >>> 39) & 0b1111111111111111111111111L) | ((v16 & 0b111111111111L) << 25);
+            output[outputOffset + 28] = (v16 >>> 12) & 0b1111111111111111111111111111111111111L;
+            output[outputOffset + 29] = ((v16 >>> 49) & 0b111111111111111L) | ((v17 & 0b1111111111111111111111L) << 15);
+            output[outputOffset + 30] = (v17 >>> 22) & 0b1111111111111111111111111111111111111L;
+            output[outputOffset + 31] = ((v17 >>> 59) & 0b11111L) | ((v18 & 0b11111111111111111111111111111111L) << 5);
+        }
+    }
+
+    private static final class Unpacker38
+            implements LongBitUnpacker
+    {
+        @Override
+        public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length)
+        {
+            while (length >= 32) {
+                unpack32(output, outputOffset, input);
+                outputOffset += 32;
+                length -= 32;
+            }
+        }
+
+        private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input)
+        {
+            long v0 = input.readLong();
+            long v1 = input.readLong();
+            long v2 = input.readLong();
+            long v3 = input.readLong();
+            long v4 = input.readLong();
+            long v5 = input.readLong();
+            long v6 = input.readLong();
+            long v7 = input.readLong();
+            long v8 = input.readLong();
+            long v9 = input.readLong();
+            long v10 = input.readLong();
+            long v11 = input.readLong();
+            long v12 = input.readLong();
+            long v13 = input.readLong();
+            long v14 = input.readLong();
+            long v15 = input.readLong();
+            long v16 = input.readLong();
+            long v17 = input.readLong();
+            long v18 = input.readLong();
+            output[outputOffset] = v0 & 0b11111111111111111111111111111111111111L;
+            output[outputOffset + 1] = ((v0 >>> 38) & 0b11111111111111111111111111L) | ((v1 & 0b111111111111L) << 26);
+            output[outputOffset + 2] = (v1 >>> 12) & 0b11111111111111111111111111111111111111L;
+            output[outputOffset + 3] = ((v1 >>> 50) & 0b11111111111111L) | ((v2 & 0b111111111111111111111111L) << 14);
+            output[outputOffset + 4] = (v2 >>> 24) & 0b11111111111111111111111111111111111111L;
+            output[outputOffset + 5] = ((v2 >>> 62) & 0b11L) | ((v3 & 0b111111111111111111111111111111111111L) << 2);
+            output[outputOffset + 6] = ((v3 >>> 36) & 0b1111111111111111111111111111L) | ((v4 & 0b1111111111L) << 28);
+            output[outputOffset + 7] = (v4 >>> 10) & 0b11111111111111111111111111111111111111L;
+            output[outputOffset + 8] = ((v4 >>> 48) & 0b1111111111111111L) | ((v5 & 0b1111111111111111111111L) << 16);
+            output[outputOffset + 9] = (v5 >>> 22) & 0b11111111111111111111111111111111111111L;
+            output[outputOffset + 10] = ((v5 >>> 60) & 0b1111L) | ((v6 & 0b1111111111111111111111111111111111L) << 4);
+            output[outputOffset + 11] = ((v6 >>> 34) & 0b111111111111111111111111111111L) | ((v7 & 0b11111111L) << 30);
+            output[outputOffset + 12] = (v7 >>> 8) & 0b11111111111111111111111111111111111111L;
+            output[outputOffset + 13] = ((v7 >>> 46) & 0b111111111111111111L) | ((v8 & 0b11111111111111111111L) << 18);
+            output[outputOffset + 14] = (v8 >>> 20) & 0b11111111111111111111111111111111111111L;
+            output[outputOffset + 15] = ((v8 >>> 58) & 0b111111L) | ((v9 & 0b11111111111111111111111111111111L) << 6);
+            output[outputOffset + 16] = ((v9 >>> 32) & 0b11111111111111111111111111111111L) | ((v10 & 0b111111L) << 32);
+            output[outputOffset + 17] = (v10 >>> 6) & 0b11111111111111111111111111111111111111L;
+            output[outputOffset + 18] = ((v10 >>> 44) & 0b11111111111111111111L) | ((v11 & 0b111111111111111111L) << 20);
+            output[outputOffset + 19] = (v11 >>> 18) & 0b11111111111111111111111111111111111111L;
+            output[outputOffset + 20] = ((v11 >>> 56) & 0b11111111L) | ((v12 & 0b111111111111111111111111111111L) << 8);
+            output[outputOffset + 21] = ((v12 >>> 30) & 0b1111111111111111111111111111111111L) | ((v13 & 0b1111L) << 34);
+            output[outputOffset + 22] = (v13 >>> 4) & 0b11111111111111111111111111111111111111L;
+            output[outputOffset + 23] = ((v13 >>> 42) & 0b1111111111111111111111L) | ((v14 & 0b1111111111111111L) << 22);
+            output[outputOffset + 24] = (v14 >>> 16) &
0b11111111111111111111111111111111111111L; + output[outputOffset + 25] = ((v14 >>> 54) & 0b1111111111L) | ((v15 & 0b1111111111111111111111111111L) << 10); + output[outputOffset + 26] = ((v15 >>> 28) & 0b111111111111111111111111111111111111L) | ((v16 & 0b11L) << 36); + output[outputOffset + 27] = (v16 >>> 2) & 0b11111111111111111111111111111111111111L; + output[outputOffset + 28] = ((v16 >>> 40) & 0b111111111111111111111111L) | ((v17 & 0b11111111111111L) << 24); + output[outputOffset + 29] = (v17 >>> 14) & 0b11111111111111111111111111111111111111L; + output[outputOffset + 30] = ((v17 >>> 52) & 0b111111111111L) | ((v18 & 0b11111111111111111111111111L) << 12); + output[outputOffset + 31] = (v18 >>> 26) & 0b11111111111111111111111111111111111111L; + } + } + + private static final class Unpacker39 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + long v8 = input.readLong(); + long v9 = input.readLong(); + long v10 = input.readLong(); + long v11 = input.readLong(); + long v12 = input.readLong(); + long v13 = input.readLong(); + long v14 = input.readLong(); + long v15 = input.readLong(); + long v16 = input.readLong(); + long v17 = input.readLong(); + long v18 = input.readLong(); + int v19 = input.readInt(); + output[outputOffset] = v0 & 0b111111111111111111111111111111111111111L; + output[outputOffset + 1] = ((v0 >>> 39) & 0b1111111111111111111111111L) | ((v1 & 0b11111111111111L) << 25); + output[outputOffset + 2] = (v1 >>> 14) & 0b111111111111111111111111111111111111111L; + output[outputOffset + 3] = ((v1 >>> 53) & 0b11111111111L) | ((v2 & 0b1111111111111111111111111111L) << 11); + output[outputOffset + 4] = ((v2 >>> 28) & 0b111111111111111111111111111111111111L) | ((v3 & 0b111L) << 36); + output[outputOffset + 5] = (v3 >>> 3) & 0b111111111111111111111111111111111111111L; + output[outputOffset + 6] = ((v3 >>> 42) & 0b1111111111111111111111L) | ((v4 & 0b11111111111111111L) << 22); + output[outputOffset + 7] = (v4 >>> 17) & 0b111111111111111111111111111111111111111L; + output[outputOffset + 8] = ((v4 >>> 56) & 0b11111111L) | ((v5 & 0b1111111111111111111111111111111L) << 8); + output[outputOffset + 9] = ((v5 >>> 31) & 0b111111111111111111111111111111111L) | ((v6 & 0b111111L) << 33); + output[outputOffset + 10] = (v6 >>> 6) & 0b111111111111111111111111111111111111111L; + output[outputOffset + 11] = ((v6 >>> 45) & 0b1111111111111111111L) | ((v7 & 0b11111111111111111111L) << 19); + output[outputOffset + 12] = (v7 >>> 20) & 0b111111111111111111111111111111111111111L; + output[outputOffset + 13] = ((v7 >>> 59) & 0b11111L) | ((v8 & 0b1111111111111111111111111111111111L) << 5); + output[outputOffset + 14] = ((v8 >>> 34) & 0b111111111111111111111111111111L) | ((v9 & 0b111111111L) << 30); + output[outputOffset + 15] = (v9 >>> 9) & 0b111111111111111111111111111111111111111L; + output[outputOffset + 16] = ((v9 >>> 48) & 0b1111111111111111L) | ((v10 & 0b11111111111111111111111L) << 16); + output[outputOffset + 17] = (v10 >>> 23) & 
0b111111111111111111111111111111111111111L; + output[outputOffset + 18] = ((v10 >>> 62) & 0b11L) | ((v11 & 0b1111111111111111111111111111111111111L) << 2); + output[outputOffset + 19] = ((v11 >>> 37) & 0b111111111111111111111111111L) | ((v12 & 0b111111111111L) << 27); + output[outputOffset + 20] = (v12 >>> 12) & 0b111111111111111111111111111111111111111L; + output[outputOffset + 21] = ((v12 >>> 51) & 0b1111111111111L) | ((v13 & 0b11111111111111111111111111L) << 13); + output[outputOffset + 22] = ((v13 >>> 26) & 0b11111111111111111111111111111111111111L) | ((v14 & 0b1L) << 38); + output[outputOffset + 23] = (v14 >>> 1) & 0b111111111111111111111111111111111111111L; + output[outputOffset + 24] = ((v14 >>> 40) & 0b111111111111111111111111L) | ((v15 & 0b111111111111111L) << 24); + output[outputOffset + 25] = (v15 >>> 15) & 0b111111111111111111111111111111111111111L; + output[outputOffset + 26] = ((v15 >>> 54) & 0b1111111111L) | ((v16 & 0b11111111111111111111111111111L) << 10); + output[outputOffset + 27] = ((v16 >>> 29) & 0b11111111111111111111111111111111111L) | ((v17 & 0b1111L) << 35); + output[outputOffset + 28] = (v17 >>> 4) & 0b111111111111111111111111111111111111111L; + output[outputOffset + 29] = ((v17 >>> 43) & 0b111111111111111111111L) | ((v18 & 0b111111111111111111L) << 21); + output[outputOffset + 30] = (v18 >>> 18) & 0b111111111111111111111111111111111111111L; + output[outputOffset + 31] = ((v18 >>> 57) & 0b1111111L) | ((v19 & 0b11111111111111111111111111111111L) << 7); + } + } + + private static final class Unpacker40 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + long v8 = input.readLong(); + long v9 = input.readLong(); + long v10 = input.readLong(); + long v11 = input.readLong(); + long v12 = input.readLong(); + long v13 = input.readLong(); + long v14 = input.readLong(); + long v15 = input.readLong(); + long v16 = input.readLong(); + long v17 = input.readLong(); + long v18 = input.readLong(); + long v19 = input.readLong(); + output[outputOffset] = v0 & 0b1111111111111111111111111111111111111111L; + output[outputOffset + 1] = ((v0 >>> 40) & 0b111111111111111111111111L) | ((v1 & 0b1111111111111111L) << 24); + output[outputOffset + 2] = (v1 >>> 16) & 0b1111111111111111111111111111111111111111L; + output[outputOffset + 3] = ((v1 >>> 56) & 0b11111111L) | ((v2 & 0b11111111111111111111111111111111L) << 8); + output[outputOffset + 4] = ((v2 >>> 32) & 0b11111111111111111111111111111111L) | ((v3 & 0b11111111L) << 32); + output[outputOffset + 5] = (v3 >>> 8) & 0b1111111111111111111111111111111111111111L; + output[outputOffset + 6] = ((v3 >>> 48) & 0b1111111111111111L) | ((v4 & 0b111111111111111111111111L) << 16); + output[outputOffset + 7] = (v4 >>> 24) & 0b1111111111111111111111111111111111111111L; + output[outputOffset + 8] = v5 & 0b1111111111111111111111111111111111111111L; + output[outputOffset + 9] = ((v5 >>> 40) & 0b111111111111111111111111L) | ((v6 & 0b1111111111111111L) << 24); + output[outputOffset + 10] = (v6 >>> 16) & 
0b1111111111111111111111111111111111111111L; + output[outputOffset + 11] = ((v6 >>> 56) & 0b11111111L) | ((v7 & 0b11111111111111111111111111111111L) << 8); + output[outputOffset + 12] = ((v7 >>> 32) & 0b11111111111111111111111111111111L) | ((v8 & 0b11111111L) << 32); + output[outputOffset + 13] = (v8 >>> 8) & 0b1111111111111111111111111111111111111111L; + output[outputOffset + 14] = ((v8 >>> 48) & 0b1111111111111111L) | ((v9 & 0b111111111111111111111111L) << 16); + output[outputOffset + 15] = (v9 >>> 24) & 0b1111111111111111111111111111111111111111L; + output[outputOffset + 16] = v10 & 0b1111111111111111111111111111111111111111L; + output[outputOffset + 17] = ((v10 >>> 40) & 0b111111111111111111111111L) | ((v11 & 0b1111111111111111L) << 24); + output[outputOffset + 18] = (v11 >>> 16) & 0b1111111111111111111111111111111111111111L; + output[outputOffset + 19] = ((v11 >>> 56) & 0b11111111L) | ((v12 & 0b11111111111111111111111111111111L) << 8); + output[outputOffset + 20] = ((v12 >>> 32) & 0b11111111111111111111111111111111L) | ((v13 & 0b11111111L) << 32); + output[outputOffset + 21] = (v13 >>> 8) & 0b1111111111111111111111111111111111111111L; + output[outputOffset + 22] = ((v13 >>> 48) & 0b1111111111111111L) | ((v14 & 0b111111111111111111111111L) << 16); + output[outputOffset + 23] = (v14 >>> 24) & 0b1111111111111111111111111111111111111111L; + output[outputOffset + 24] = v15 & 0b1111111111111111111111111111111111111111L; + output[outputOffset + 25] = ((v15 >>> 40) & 0b111111111111111111111111L) | ((v16 & 0b1111111111111111L) << 24); + output[outputOffset + 26] = (v16 >>> 16) & 0b1111111111111111111111111111111111111111L; + output[outputOffset + 27] = ((v16 >>> 56) & 0b11111111L) | ((v17 & 0b11111111111111111111111111111111L) << 8); + output[outputOffset + 28] = ((v17 >>> 32) & 0b11111111111111111111111111111111L) | ((v18 & 0b11111111L) << 32); + output[outputOffset + 29] = (v18 >>> 8) & 0b1111111111111111111111111111111111111111L; + output[outputOffset + 30] = ((v18 >>> 48) & 0b1111111111111111L) | ((v19 & 0b111111111111111111111111L) << 16); + output[outputOffset + 31] = (v19 >>> 24) & 0b1111111111111111111111111111111111111111L; + } + } + + private static final class Unpacker41 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + long v8 = input.readLong(); + long v9 = input.readLong(); + long v10 = input.readLong(); + long v11 = input.readLong(); + long v12 = input.readLong(); + long v13 = input.readLong(); + long v14 = input.readLong(); + long v15 = input.readLong(); + long v16 = input.readLong(); + long v17 = input.readLong(); + long v18 = input.readLong(); + long v19 = input.readLong(); + int v20 = input.readInt(); + output[outputOffset] = v0 & 0b11111111111111111111111111111111111111111L; + output[outputOffset + 1] = ((v0 >>> 41) & 0b11111111111111111111111L) | ((v1 & 0b111111111111111111L) << 23); + output[outputOffset + 2] = (v1 >>> 18) & 0b11111111111111111111111111111111111111111L; + output[outputOffset + 3] = ((v1 >>> 59) & 0b11111L) | ((v2 & 
0b111111111111111111111111111111111111L) << 5); + output[outputOffset + 4] = ((v2 >>> 36) & 0b1111111111111111111111111111L) | ((v3 & 0b1111111111111L) << 28); + output[outputOffset + 5] = (v3 >>> 13) & 0b11111111111111111111111111111111111111111L; + output[outputOffset + 6] = ((v3 >>> 54) & 0b1111111111L) | ((v4 & 0b1111111111111111111111111111111L) << 10); + output[outputOffset + 7] = ((v4 >>> 31) & 0b111111111111111111111111111111111L) | ((v5 & 0b11111111L) << 33); + output[outputOffset + 8] = (v5 >>> 8) & 0b11111111111111111111111111111111111111111L; + output[outputOffset + 9] = ((v5 >>> 49) & 0b111111111111111L) | ((v6 & 0b11111111111111111111111111L) << 15); + output[outputOffset + 10] = ((v6 >>> 26) & 0b11111111111111111111111111111111111111L) | ((v7 & 0b111L) << 38); + output[outputOffset + 11] = (v7 >>> 3) & 0b11111111111111111111111111111111111111111L; + output[outputOffset + 12] = ((v7 >>> 44) & 0b11111111111111111111L) | ((v8 & 0b111111111111111111111L) << 20); + output[outputOffset + 13] = (v8 >>> 21) & 0b11111111111111111111111111111111111111111L; + output[outputOffset + 14] = ((v8 >>> 62) & 0b11L) | ((v9 & 0b111111111111111111111111111111111111111L) << 2); + output[outputOffset + 15] = ((v9 >>> 39) & 0b1111111111111111111111111L) | ((v10 & 0b1111111111111111L) << 25); + output[outputOffset + 16] = (v10 >>> 16) & 0b11111111111111111111111111111111111111111L; + output[outputOffset + 17] = ((v10 >>> 57) & 0b1111111L) | ((v11 & 0b1111111111111111111111111111111111L) << 7); + output[outputOffset + 18] = ((v11 >>> 34) & 0b111111111111111111111111111111L) | ((v12 & 0b11111111111L) << 30); + output[outputOffset + 19] = (v12 >>> 11) & 0b11111111111111111111111111111111111111111L; + output[outputOffset + 20] = ((v12 >>> 52) & 0b111111111111L) | ((v13 & 0b11111111111111111111111111111L) << 12); + output[outputOffset + 21] = ((v13 >>> 29) & 0b11111111111111111111111111111111111L) | ((v14 & 0b111111L) << 35); + output[outputOffset + 22] = (v14 >>> 6) & 0b11111111111111111111111111111111111111111L; + output[outputOffset + 23] = ((v14 >>> 47) & 0b11111111111111111L) | ((v15 & 0b111111111111111111111111L) << 17); + output[outputOffset + 24] = ((v15 >>> 24) & 0b1111111111111111111111111111111111111111L) | ((v16 & 0b1L) << 40); + output[outputOffset + 25] = (v16 >>> 1) & 0b11111111111111111111111111111111111111111L; + output[outputOffset + 26] = ((v16 >>> 42) & 0b1111111111111111111111L) | ((v17 & 0b1111111111111111111L) << 22); + output[outputOffset + 27] = (v17 >>> 19) & 0b11111111111111111111111111111111111111111L; + output[outputOffset + 28] = ((v17 >>> 60) & 0b1111L) | ((v18 & 0b1111111111111111111111111111111111111L) << 4); + output[outputOffset + 29] = ((v18 >>> 37) & 0b111111111111111111111111111L) | ((v19 & 0b11111111111111L) << 27); + output[outputOffset + 30] = (v19 >>> 14) & 0b11111111111111111111111111111111111111111L; + output[outputOffset + 31] = ((v19 >>> 55) & 0b111111111L) | ((v20 & 0b11111111111111111111111111111111L) << 9); + } + } + + private static final class Unpacker42 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + 
long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + long v8 = input.readLong(); + long v9 = input.readLong(); + long v10 = input.readLong(); + long v11 = input.readLong(); + long v12 = input.readLong(); + long v13 = input.readLong(); + long v14 = input.readLong(); + long v15 = input.readLong(); + long v16 = input.readLong(); + long v17 = input.readLong(); + long v18 = input.readLong(); + long v19 = input.readLong(); + long v20 = input.readLong(); + output[outputOffset] = v0 & 0b111111111111111111111111111111111111111111L; + output[outputOffset + 1] = ((v0 >>> 42) & 0b1111111111111111111111L) | ((v1 & 0b11111111111111111111L) << 22); + output[outputOffset + 2] = (v1 >>> 20) & 0b111111111111111111111111111111111111111111L; + output[outputOffset + 3] = ((v1 >>> 62) & 0b11L) | ((v2 & 0b1111111111111111111111111111111111111111L) << 2); + output[outputOffset + 4] = ((v2 >>> 40) & 0b111111111111111111111111L) | ((v3 & 0b111111111111111111L) << 24); + output[outputOffset + 5] = (v3 >>> 18) & 0b111111111111111111111111111111111111111111L; + output[outputOffset + 6] = ((v3 >>> 60) & 0b1111L) | ((v4 & 0b11111111111111111111111111111111111111L) << 4); + output[outputOffset + 7] = ((v4 >>> 38) & 0b11111111111111111111111111L) | ((v5 & 0b1111111111111111L) << 26); + output[outputOffset + 8] = (v5 >>> 16) & 0b111111111111111111111111111111111111111111L; + output[outputOffset + 9] = ((v5 >>> 58) & 0b111111L) | ((v6 & 0b111111111111111111111111111111111111L) << 6); + output[outputOffset + 10] = ((v6 >>> 36) & 0b1111111111111111111111111111L) | ((v7 & 0b11111111111111L) << 28); + output[outputOffset + 11] = (v7 >>> 14) & 0b111111111111111111111111111111111111111111L; + output[outputOffset + 12] = ((v7 >>> 56) & 0b11111111L) | ((v8 & 0b1111111111111111111111111111111111L) << 8); + output[outputOffset + 13] = ((v8 >>> 34) & 0b111111111111111111111111111111L) | ((v9 & 0b111111111111L) << 30); + output[outputOffset + 14] = (v9 >>> 12) & 0b111111111111111111111111111111111111111111L; + output[outputOffset + 15] = ((v9 >>> 54) & 0b1111111111L) | ((v10 & 0b11111111111111111111111111111111L) << 10); + output[outputOffset + 16] = ((v10 >>> 32) & 0b11111111111111111111111111111111L) | ((v11 & 0b1111111111L) << 32); + output[outputOffset + 17] = (v11 >>> 10) & 0b111111111111111111111111111111111111111111L; + output[outputOffset + 18] = ((v11 >>> 52) & 0b111111111111L) | ((v12 & 0b111111111111111111111111111111L) << 12); + output[outputOffset + 19] = ((v12 >>> 30) & 0b1111111111111111111111111111111111L) | ((v13 & 0b11111111L) << 34); + output[outputOffset + 20] = (v13 >>> 8) & 0b111111111111111111111111111111111111111111L; + output[outputOffset + 21] = ((v13 >>> 50) & 0b11111111111111L) | ((v14 & 0b1111111111111111111111111111L) << 14); + output[outputOffset + 22] = ((v14 >>> 28) & 0b111111111111111111111111111111111111L) | ((v15 & 0b111111L) << 36); + output[outputOffset + 23] = (v15 >>> 6) & 0b111111111111111111111111111111111111111111L; + output[outputOffset + 24] = ((v15 >>> 48) & 0b1111111111111111L) | ((v16 & 0b11111111111111111111111111L) << 16); + output[outputOffset + 25] = ((v16 >>> 26) & 0b11111111111111111111111111111111111111L) | ((v17 & 0b1111L) << 38); + output[outputOffset + 26] = (v17 >>> 4) & 0b111111111111111111111111111111111111111111L; + output[outputOffset + 27] = ((v17 >>> 46) & 0b111111111111111111L) | ((v18 & 0b111111111111111111111111L) << 18); + output[outputOffset + 28] = ((v18 >>> 24) & 0b1111111111111111111111111111111111111111L) | ((v19 & 0b11L) << 
40); + output[outputOffset + 29] = (v19 >>> 2) & 0b111111111111111111111111111111111111111111L; + output[outputOffset + 30] = ((v19 >>> 44) & 0b11111111111111111111L) | ((v20 & 0b1111111111111111111111L) << 20); + output[outputOffset + 31] = (v20 >>> 22) & 0b111111111111111111111111111111111111111111L; + } + } + + private static final class Unpacker43 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + long v8 = input.readLong(); + long v9 = input.readLong(); + long v10 = input.readLong(); + long v11 = input.readLong(); + long v12 = input.readLong(); + long v13 = input.readLong(); + long v14 = input.readLong(); + long v15 = input.readLong(); + long v16 = input.readLong(); + long v17 = input.readLong(); + long v18 = input.readLong(); + long v19 = input.readLong(); + long v20 = input.readLong(); + int v21 = input.readInt(); + output[outputOffset] = v0 & 0b1111111111111111111111111111111111111111111L; + output[outputOffset + 1] = ((v0 >>> 43) & 0b111111111111111111111L) | ((v1 & 0b1111111111111111111111L) << 21); + output[outputOffset + 2] = ((v1 >>> 22) & 0b111111111111111111111111111111111111111111L) | ((v2 & 0b1L) << 42); + output[outputOffset + 3] = (v2 >>> 1) & 0b1111111111111111111111111111111111111111111L; + output[outputOffset + 4] = ((v2 >>> 44) & 0b11111111111111111111L) | ((v3 & 0b11111111111111111111111L) << 20); + output[outputOffset + 5] = ((v3 >>> 23) & 0b11111111111111111111111111111111111111111L) | ((v4 & 0b11L) << 41); + output[outputOffset + 6] = (v4 >>> 2) & 0b1111111111111111111111111111111111111111111L; + output[outputOffset + 7] = ((v4 >>> 45) & 0b1111111111111111111L) | ((v5 & 0b111111111111111111111111L) << 19); + output[outputOffset + 8] = ((v5 >>> 24) & 0b1111111111111111111111111111111111111111L) | ((v6 & 0b111L) << 40); + output[outputOffset + 9] = (v6 >>> 3) & 0b1111111111111111111111111111111111111111111L; + output[outputOffset + 10] = ((v6 >>> 46) & 0b111111111111111111L) | ((v7 & 0b1111111111111111111111111L) << 18); + output[outputOffset + 11] = ((v7 >>> 25) & 0b111111111111111111111111111111111111111L) | ((v8 & 0b1111L) << 39); + output[outputOffset + 12] = (v8 >>> 4) & 0b1111111111111111111111111111111111111111111L; + output[outputOffset + 13] = ((v8 >>> 47) & 0b11111111111111111L) | ((v9 & 0b11111111111111111111111111L) << 17); + output[outputOffset + 14] = ((v9 >>> 26) & 0b11111111111111111111111111111111111111L) | ((v10 & 0b11111L) << 38); + output[outputOffset + 15] = (v10 >>> 5) & 0b1111111111111111111111111111111111111111111L; + output[outputOffset + 16] = ((v10 >>> 48) & 0b1111111111111111L) | ((v11 & 0b111111111111111111111111111L) << 16); + output[outputOffset + 17] = ((v11 >>> 27) & 0b1111111111111111111111111111111111111L) | ((v12 & 0b111111L) << 37); + output[outputOffset + 18] = (v12 >>> 6) & 0b1111111111111111111111111111111111111111111L; + output[outputOffset + 19] = ((v12 >>> 49) & 0b111111111111111L) | ((v13 & 0b1111111111111111111111111111L) << 15); + output[outputOffset + 20] = ((v13 >>> 28) & 
0b111111111111111111111111111111111111L) | ((v14 & 0b1111111L) << 36); + output[outputOffset + 21] = (v14 >>> 7) & 0b1111111111111111111111111111111111111111111L; + output[outputOffset + 22] = ((v14 >>> 50) & 0b11111111111111L) | ((v15 & 0b11111111111111111111111111111L) << 14); + output[outputOffset + 23] = ((v15 >>> 29) & 0b11111111111111111111111111111111111L) | ((v16 & 0b11111111L) << 35); + output[outputOffset + 24] = (v16 >>> 8) & 0b1111111111111111111111111111111111111111111L; + output[outputOffset + 25] = ((v16 >>> 51) & 0b1111111111111L) | ((v17 & 0b111111111111111111111111111111L) << 13); + output[outputOffset + 26] = ((v17 >>> 30) & 0b1111111111111111111111111111111111L) | ((v18 & 0b111111111L) << 34); + output[outputOffset + 27] = (v18 >>> 9) & 0b1111111111111111111111111111111111111111111L; + output[outputOffset + 28] = ((v18 >>> 52) & 0b111111111111L) | ((v19 & 0b1111111111111111111111111111111L) << 12); + output[outputOffset + 29] = ((v19 >>> 31) & 0b111111111111111111111111111111111L) | ((v20 & 0b1111111111L) << 33); + output[outputOffset + 30] = (v20 >>> 10) & 0b1111111111111111111111111111111111111111111L; + output[outputOffset + 31] = ((v20 >>> 53) & 0b11111111111L) | ((v21 & 0b11111111111111111111111111111111L) << 11); + } + } + + private static final class Unpacker44 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + long v8 = input.readLong(); + long v9 = input.readLong(); + long v10 = input.readLong(); + long v11 = input.readLong(); + long v12 = input.readLong(); + long v13 = input.readLong(); + long v14 = input.readLong(); + long v15 = input.readLong(); + long v16 = input.readLong(); + long v17 = input.readLong(); + long v18 = input.readLong(); + long v19 = input.readLong(); + long v20 = input.readLong(); + long v21 = input.readLong(); + output[outputOffset] = v0 & 0b11111111111111111111111111111111111111111111L; + output[outputOffset + 1] = ((v0 >>> 44) & 0b11111111111111111111L) | ((v1 & 0b111111111111111111111111L) << 20); + output[outputOffset + 2] = ((v1 >>> 24) & 0b1111111111111111111111111111111111111111L) | ((v2 & 0b1111L) << 40); + output[outputOffset + 3] = (v2 >>> 4) & 0b11111111111111111111111111111111111111111111L; + output[outputOffset + 4] = ((v2 >>> 48) & 0b1111111111111111L) | ((v3 & 0b1111111111111111111111111111L) << 16); + output[outputOffset + 5] = ((v3 >>> 28) & 0b111111111111111111111111111111111111L) | ((v4 & 0b11111111L) << 36); + output[outputOffset + 6] = (v4 >>> 8) & 0b11111111111111111111111111111111111111111111L; + output[outputOffset + 7] = ((v4 >>> 52) & 0b111111111111L) | ((v5 & 0b11111111111111111111111111111111L) << 12); + output[outputOffset + 8] = ((v5 >>> 32) & 0b11111111111111111111111111111111L) | ((v6 & 0b111111111111L) << 32); + output[outputOffset + 9] = (v6 >>> 12) & 0b11111111111111111111111111111111111111111111L; + output[outputOffset + 10] = ((v6 >>> 56) & 0b11111111L) | ((v7 & 0b111111111111111111111111111111111111L) << 8); + output[outputOffset + 11] = ((v7 >>> 36) & 
0b1111111111111111111111111111L) | ((v8 & 0b1111111111111111L) << 28); + output[outputOffset + 12] = (v8 >>> 16) & 0b11111111111111111111111111111111111111111111L; + output[outputOffset + 13] = ((v8 >>> 60) & 0b1111L) | ((v9 & 0b1111111111111111111111111111111111111111L) << 4); + output[outputOffset + 14] = ((v9 >>> 40) & 0b111111111111111111111111L) | ((v10 & 0b11111111111111111111L) << 24); + output[outputOffset + 15] = (v10 >>> 20) & 0b11111111111111111111111111111111111111111111L; + output[outputOffset + 16] = v11 & 0b11111111111111111111111111111111111111111111L; + output[outputOffset + 17] = ((v11 >>> 44) & 0b11111111111111111111L) | ((v12 & 0b111111111111111111111111L) << 20); + output[outputOffset + 18] = ((v12 >>> 24) & 0b1111111111111111111111111111111111111111L) | ((v13 & 0b1111L) << 40); + output[outputOffset + 19] = (v13 >>> 4) & 0b11111111111111111111111111111111111111111111L; + output[outputOffset + 20] = ((v13 >>> 48) & 0b1111111111111111L) | ((v14 & 0b1111111111111111111111111111L) << 16); + output[outputOffset + 21] = ((v14 >>> 28) & 0b111111111111111111111111111111111111L) | ((v15 & 0b11111111L) << 36); + output[outputOffset + 22] = (v15 >>> 8) & 0b11111111111111111111111111111111111111111111L; + output[outputOffset + 23] = ((v15 >>> 52) & 0b111111111111L) | ((v16 & 0b11111111111111111111111111111111L) << 12); + output[outputOffset + 24] = ((v16 >>> 32) & 0b11111111111111111111111111111111L) | ((v17 & 0b111111111111L) << 32); + output[outputOffset + 25] = (v17 >>> 12) & 0b11111111111111111111111111111111111111111111L; + output[outputOffset + 26] = ((v17 >>> 56) & 0b11111111L) | ((v18 & 0b111111111111111111111111111111111111L) << 8); + output[outputOffset + 27] = ((v18 >>> 36) & 0b1111111111111111111111111111L) | ((v19 & 0b1111111111111111L) << 28); + output[outputOffset + 28] = (v19 >>> 16) & 0b11111111111111111111111111111111111111111111L; + output[outputOffset + 29] = ((v19 >>> 60) & 0b1111L) | ((v20 & 0b1111111111111111111111111111111111111111L) << 4); + output[outputOffset + 30] = ((v20 >>> 40) & 0b111111111111111111111111L) | ((v21 & 0b11111111111111111111L) << 24); + output[outputOffset + 31] = (v21 >>> 20) & 0b11111111111111111111111111111111111111111111L; + } + } + + private static final class Unpacker45 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + long v8 = input.readLong(); + long v9 = input.readLong(); + long v10 = input.readLong(); + long v11 = input.readLong(); + long v12 = input.readLong(); + long v13 = input.readLong(); + long v14 = input.readLong(); + long v15 = input.readLong(); + long v16 = input.readLong(); + long v17 = input.readLong(); + long v18 = input.readLong(); + long v19 = input.readLong(); + long v20 = input.readLong(); + long v21 = input.readLong(); + int v22 = input.readInt(); + output[outputOffset] = v0 & 0b111111111111111111111111111111111111111111111L; + output[outputOffset + 1] = ((v0 >>> 45) & 0b1111111111111111111L) | ((v1 & 0b11111111111111111111111111L) << 19); + output[outputOffset + 2] = 
((v1 >>> 26) & 0b11111111111111111111111111111111111111L) | ((v2 & 0b1111111L) << 38); + output[outputOffset + 3] = (v2 >>> 7) & 0b111111111111111111111111111111111111111111111L; + output[outputOffset + 4] = ((v2 >>> 52) & 0b111111111111L) | ((v3 & 0b111111111111111111111111111111111L) << 12); + output[outputOffset + 5] = ((v3 >>> 33) & 0b1111111111111111111111111111111L) | ((v4 & 0b11111111111111L) << 31); + output[outputOffset + 6] = (v4 >>> 14) & 0b111111111111111111111111111111111111111111111L; + output[outputOffset + 7] = ((v4 >>> 59) & 0b11111L) | ((v5 & 0b1111111111111111111111111111111111111111L) << 5); + output[outputOffset + 8] = ((v5 >>> 40) & 0b111111111111111111111111L) | ((v6 & 0b111111111111111111111L) << 24); + output[outputOffset + 9] = ((v6 >>> 21) & 0b1111111111111111111111111111111111111111111L) | ((v7 & 0b11L) << 43); + output[outputOffset + 10] = (v7 >>> 2) & 0b111111111111111111111111111111111111111111111L; + output[outputOffset + 11] = ((v7 >>> 47) & 0b11111111111111111L) | ((v8 & 0b1111111111111111111111111111L) << 17); + output[outputOffset + 12] = ((v8 >>> 28) & 0b111111111111111111111111111111111111L) | ((v9 & 0b111111111L) << 36); + output[outputOffset + 13] = (v9 >>> 9) & 0b111111111111111111111111111111111111111111111L; + output[outputOffset + 14] = ((v9 >>> 54) & 0b1111111111L) | ((v10 & 0b11111111111111111111111111111111111L) << 10); + output[outputOffset + 15] = ((v10 >>> 35) & 0b11111111111111111111111111111L) | ((v11 & 0b1111111111111111L) << 29); + output[outputOffset + 16] = (v11 >>> 16) & 0b111111111111111111111111111111111111111111111L; + output[outputOffset + 17] = ((v11 >>> 61) & 0b111L) | ((v12 & 0b111111111111111111111111111111111111111111L) << 3); + output[outputOffset + 18] = ((v12 >>> 42) & 0b1111111111111111111111L) | ((v13 & 0b11111111111111111111111L) << 22); + output[outputOffset + 19] = ((v13 >>> 23) & 0b11111111111111111111111111111111111111111L) | ((v14 & 0b1111L) << 41); + output[outputOffset + 20] = (v14 >>> 4) & 0b111111111111111111111111111111111111111111111L; + output[outputOffset + 21] = ((v14 >>> 49) & 0b111111111111111L) | ((v15 & 0b111111111111111111111111111111L) << 15); + output[outputOffset + 22] = ((v15 >>> 30) & 0b1111111111111111111111111111111111L) | ((v16 & 0b11111111111L) << 34); + output[outputOffset + 23] = (v16 >>> 11) & 0b111111111111111111111111111111111111111111111L; + output[outputOffset + 24] = ((v16 >>> 56) & 0b11111111L) | ((v17 & 0b1111111111111111111111111111111111111L) << 8); + output[outputOffset + 25] = ((v17 >>> 37) & 0b111111111111111111111111111L) | ((v18 & 0b111111111111111111L) << 27); + output[outputOffset + 26] = (v18 >>> 18) & 0b111111111111111111111111111111111111111111111L; + output[outputOffset + 27] = ((v18 >>> 63) & 0b1L) | ((v19 & 0b11111111111111111111111111111111111111111111L) << 1); + output[outputOffset + 28] = ((v19 >>> 44) & 0b11111111111111111111L) | ((v20 & 0b1111111111111111111111111L) << 20); + output[outputOffset + 29] = ((v20 >>> 25) & 0b111111111111111111111111111111111111111L) | ((v21 & 0b111111L) << 39); + output[outputOffset + 30] = (v21 >>> 6) & 0b111111111111111111111111111111111111111111111L; + output[outputOffset + 31] = ((v21 >>> 51) & 0b1111111111111L) | ((v22 & 0b11111111111111111111111111111111L) << 13); + } + } + + private static final class Unpacker46 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + 
outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + long v8 = input.readLong(); + long v9 = input.readLong(); + long v10 = input.readLong(); + long v11 = input.readLong(); + long v12 = input.readLong(); + long v13 = input.readLong(); + long v14 = input.readLong(); + long v15 = input.readLong(); + long v16 = input.readLong(); + long v17 = input.readLong(); + long v18 = input.readLong(); + long v19 = input.readLong(); + long v20 = input.readLong(); + long v21 = input.readLong(); + long v22 = input.readLong(); + output[outputOffset] = v0 & 0b1111111111111111111111111111111111111111111111L; + output[outputOffset + 1] = ((v0 >>> 46) & 0b111111111111111111L) | ((v1 & 0b1111111111111111111111111111L) << 18); + output[outputOffset + 2] = ((v1 >>> 28) & 0b111111111111111111111111111111111111L) | ((v2 & 0b1111111111L) << 36); + output[outputOffset + 3] = (v2 >>> 10) & 0b1111111111111111111111111111111111111111111111L; + output[outputOffset + 4] = ((v2 >>> 56) & 0b11111111L) | ((v3 & 0b11111111111111111111111111111111111111L) << 8); + output[outputOffset + 5] = ((v3 >>> 38) & 0b11111111111111111111111111L) | ((v4 & 0b11111111111111111111L) << 26); + output[outputOffset + 6] = ((v4 >>> 20) & 0b11111111111111111111111111111111111111111111L) | ((v5 & 0b11L) << 44); + output[outputOffset + 7] = (v5 >>> 2) & 0b1111111111111111111111111111111111111111111111L; + output[outputOffset + 8] = ((v5 >>> 48) & 0b1111111111111111L) | ((v6 & 0b111111111111111111111111111111L) << 16); + output[outputOffset + 9] = ((v6 >>> 30) & 0b1111111111111111111111111111111111L) | ((v7 & 0b111111111111L) << 34); + output[outputOffset + 10] = (v7 >>> 12) & 0b1111111111111111111111111111111111111111111111L; + output[outputOffset + 11] = ((v7 >>> 58) & 0b111111L) | ((v8 & 0b1111111111111111111111111111111111111111L) << 6); + output[outputOffset + 12] = ((v8 >>> 40) & 0b111111111111111111111111L) | ((v9 & 0b1111111111111111111111L) << 24); + output[outputOffset + 13] = ((v9 >>> 22) & 0b111111111111111111111111111111111111111111L) | ((v10 & 0b1111L) << 42); + output[outputOffset + 14] = (v10 >>> 4) & 0b1111111111111111111111111111111111111111111111L; + output[outputOffset + 15] = ((v10 >>> 50) & 0b11111111111111L) | ((v11 & 0b11111111111111111111111111111111L) << 14); + output[outputOffset + 16] = ((v11 >>> 32) & 0b11111111111111111111111111111111L) | ((v12 & 0b11111111111111L) << 32); + output[outputOffset + 17] = (v12 >>> 14) & 0b1111111111111111111111111111111111111111111111L; + output[outputOffset + 18] = ((v12 >>> 60) & 0b1111L) | ((v13 & 0b111111111111111111111111111111111111111111L) << 4); + output[outputOffset + 19] = ((v13 >>> 42) & 0b1111111111111111111111L) | ((v14 & 0b111111111111111111111111L) << 22); + output[outputOffset + 20] = ((v14 >>> 24) & 0b1111111111111111111111111111111111111111L) | ((v15 & 0b111111L) << 40); + output[outputOffset + 21] = (v15 >>> 6) & 0b1111111111111111111111111111111111111111111111L; + output[outputOffset + 22] = ((v15 >>> 52) & 0b111111111111L) | ((v16 & 0b1111111111111111111111111111111111L) << 12); + output[outputOffset + 23] = ((v16 >>> 34) & 0b111111111111111111111111111111L) | ((v17 & 0b1111111111111111L) << 30); + output[outputOffset + 24] = (v17 >>> 16) & 
0b1111111111111111111111111111111111111111111111L; + output[outputOffset + 25] = ((v17 >>> 62) & 0b11L) | ((v18 & 0b11111111111111111111111111111111111111111111L) << 2); + output[outputOffset + 26] = ((v18 >>> 44) & 0b11111111111111111111L) | ((v19 & 0b11111111111111111111111111L) << 20); + output[outputOffset + 27] = ((v19 >>> 26) & 0b11111111111111111111111111111111111111L) | ((v20 & 0b11111111L) << 38); + output[outputOffset + 28] = (v20 >>> 8) & 0b1111111111111111111111111111111111111111111111L; + output[outputOffset + 29] = ((v20 >>> 54) & 0b1111111111L) | ((v21 & 0b111111111111111111111111111111111111L) << 10); + output[outputOffset + 30] = ((v21 >>> 36) & 0b1111111111111111111111111111L) | ((v22 & 0b111111111111111111L) << 28); + output[outputOffset + 31] = (v22 >>> 18) & 0b1111111111111111111111111111111111111111111111L; + } + } + + private static final class Unpacker47 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + long v8 = input.readLong(); + long v9 = input.readLong(); + long v10 = input.readLong(); + long v11 = input.readLong(); + long v12 = input.readLong(); + long v13 = input.readLong(); + long v14 = input.readLong(); + long v15 = input.readLong(); + long v16 = input.readLong(); + long v17 = input.readLong(); + long v18 = input.readLong(); + long v19 = input.readLong(); + long v20 = input.readLong(); + long v21 = input.readLong(); + long v22 = input.readLong(); + int v23 = input.readInt(); + output[outputOffset] = v0 & 0b11111111111111111111111111111111111111111111111L; + output[outputOffset + 1] = ((v0 >>> 47) & 0b11111111111111111L) | ((v1 & 0b111111111111111111111111111111L) << 17); + output[outputOffset + 2] = ((v1 >>> 30) & 0b1111111111111111111111111111111111L) | ((v2 & 0b1111111111111L) << 34); + output[outputOffset + 3] = (v2 >>> 13) & 0b11111111111111111111111111111111111111111111111L; + output[outputOffset + 4] = ((v2 >>> 60) & 0b1111L) | ((v3 & 0b1111111111111111111111111111111111111111111L) << 4); + output[outputOffset + 5] = ((v3 >>> 43) & 0b111111111111111111111L) | ((v4 & 0b11111111111111111111111111L) << 21); + output[outputOffset + 6] = ((v4 >>> 26) & 0b11111111111111111111111111111111111111L) | ((v5 & 0b111111111L) << 38); + output[outputOffset + 7] = (v5 >>> 9) & 0b11111111111111111111111111111111111111111111111L; + output[outputOffset + 8] = ((v5 >>> 56) & 0b11111111L) | ((v6 & 0b111111111111111111111111111111111111111L) << 8); + output[outputOffset + 9] = ((v6 >>> 39) & 0b1111111111111111111111111L) | ((v7 & 0b1111111111111111111111L) << 25); + output[outputOffset + 10] = ((v7 >>> 22) & 0b111111111111111111111111111111111111111111L) | ((v8 & 0b11111L) << 42); + output[outputOffset + 11] = (v8 >>> 5) & 0b11111111111111111111111111111111111111111111111L; + output[outputOffset + 12] = ((v8 >>> 52) & 0b111111111111L) | ((v9 & 0b11111111111111111111111111111111111L) << 12); + output[outputOffset + 13] = ((v9 >>> 35) & 0b11111111111111111111111111111L) | ((v10 & 0b111111111111111111L) << 29); + output[outputOffset + 
14] = ((v10 >>> 18) & 0b1111111111111111111111111111111111111111111111L) | ((v11 & 0b1L) << 46); + output[outputOffset + 15] = (v11 >>> 1) & 0b11111111111111111111111111111111111111111111111L; + output[outputOffset + 16] = ((v11 >>> 48) & 0b1111111111111111L) | ((v12 & 0b1111111111111111111111111111111L) << 16); + output[outputOffset + 17] = ((v12 >>> 31) & 0b111111111111111111111111111111111L) | ((v13 & 0b11111111111111L) << 33); + output[outputOffset + 18] = (v13 >>> 14) & 0b11111111111111111111111111111111111111111111111L; + output[outputOffset + 19] = ((v13 >>> 61) & 0b111L) | ((v14 & 0b11111111111111111111111111111111111111111111L) << 3); + output[outputOffset + 20] = ((v14 >>> 44) & 0b11111111111111111111L) | ((v15 & 0b111111111111111111111111111L) << 20); + output[outputOffset + 21] = ((v15 >>> 27) & 0b1111111111111111111111111111111111111L) | ((v16 & 0b1111111111L) << 37); + output[outputOffset + 22] = (v16 >>> 10) & 0b11111111111111111111111111111111111111111111111L; + output[outputOffset + 23] = ((v16 >>> 57) & 0b1111111L) | ((v17 & 0b1111111111111111111111111111111111111111L) << 7); + output[outputOffset + 24] = ((v17 >>> 40) & 0b111111111111111111111111L) | ((v18 & 0b11111111111111111111111L) << 24); + output[outputOffset + 25] = ((v18 >>> 23) & 0b11111111111111111111111111111111111111111L) | ((v19 & 0b111111L) << 41); + output[outputOffset + 26] = (v19 >>> 6) & 0b11111111111111111111111111111111111111111111111L; + output[outputOffset + 27] = ((v19 >>> 53) & 0b11111111111L) | ((v20 & 0b111111111111111111111111111111111111L) << 11); + output[outputOffset + 28] = ((v20 >>> 36) & 0b1111111111111111111111111111L) | ((v21 & 0b1111111111111111111L) << 28); + output[outputOffset + 29] = ((v21 >>> 19) & 0b111111111111111111111111111111111111111111111L) | ((v22 & 0b11L) << 45); + output[outputOffset + 30] = (v22 >>> 2) & 0b11111111111111111111111111111111111111111111111L; + output[outputOffset + 31] = ((v22 >>> 49) & 0b111111111111111L) | ((v23 & 0b11111111111111111111111111111111L) << 15); + } + } + + private static final class Unpacker48 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + long v8 = input.readLong(); + long v9 = input.readLong(); + long v10 = input.readLong(); + long v11 = input.readLong(); + long v12 = input.readLong(); + long v13 = input.readLong(); + long v14 = input.readLong(); + long v15 = input.readLong(); + long v16 = input.readLong(); + long v17 = input.readLong(); + long v18 = input.readLong(); + long v19 = input.readLong(); + long v20 = input.readLong(); + long v21 = input.readLong(); + long v22 = input.readLong(); + long v23 = input.readLong(); + output[outputOffset] = v0 & 0b111111111111111111111111111111111111111111111111L; + output[outputOffset + 1] = ((v0 >>> 48) & 0b1111111111111111L) | ((v1 & 0b11111111111111111111111111111111L) << 16); + output[outputOffset + 2] = ((v1 >>> 32) & 0b11111111111111111111111111111111L) | ((v2 & 0b1111111111111111L) << 32); + output[outputOffset + 3] = (v2 >>> 16) & 
0b111111111111111111111111111111111111111111111111L; + output[outputOffset + 4] = v3 & 0b111111111111111111111111111111111111111111111111L; + output[outputOffset + 5] = ((v3 >>> 48) & 0b1111111111111111L) | ((v4 & 0b11111111111111111111111111111111L) << 16); + output[outputOffset + 6] = ((v4 >>> 32) & 0b11111111111111111111111111111111L) | ((v5 & 0b1111111111111111L) << 32); + output[outputOffset + 7] = (v5 >>> 16) & 0b111111111111111111111111111111111111111111111111L; + output[outputOffset + 8] = v6 & 0b111111111111111111111111111111111111111111111111L; + output[outputOffset + 9] = ((v6 >>> 48) & 0b1111111111111111L) | ((v7 & 0b11111111111111111111111111111111L) << 16); + output[outputOffset + 10] = ((v7 >>> 32) & 0b11111111111111111111111111111111L) | ((v8 & 0b1111111111111111L) << 32); + output[outputOffset + 11] = (v8 >>> 16) & 0b111111111111111111111111111111111111111111111111L; + output[outputOffset + 12] = v9 & 0b111111111111111111111111111111111111111111111111L; + output[outputOffset + 13] = ((v9 >>> 48) & 0b1111111111111111L) | ((v10 & 0b11111111111111111111111111111111L) << 16); + output[outputOffset + 14] = ((v10 >>> 32) & 0b11111111111111111111111111111111L) | ((v11 & 0b1111111111111111L) << 32); + output[outputOffset + 15] = (v11 >>> 16) & 0b111111111111111111111111111111111111111111111111L; + output[outputOffset + 16] = v12 & 0b111111111111111111111111111111111111111111111111L; + output[outputOffset + 17] = ((v12 >>> 48) & 0b1111111111111111L) | ((v13 & 0b11111111111111111111111111111111L) << 16); + output[outputOffset + 18] = ((v13 >>> 32) & 0b11111111111111111111111111111111L) | ((v14 & 0b1111111111111111L) << 32); + output[outputOffset + 19] = (v14 >>> 16) & 0b111111111111111111111111111111111111111111111111L; + output[outputOffset + 20] = v15 & 0b111111111111111111111111111111111111111111111111L; + output[outputOffset + 21] = ((v15 >>> 48) & 0b1111111111111111L) | ((v16 & 0b11111111111111111111111111111111L) << 16); + output[outputOffset + 22] = ((v16 >>> 32) & 0b11111111111111111111111111111111L) | ((v17 & 0b1111111111111111L) << 32); + output[outputOffset + 23] = (v17 >>> 16) & 0b111111111111111111111111111111111111111111111111L; + output[outputOffset + 24] = v18 & 0b111111111111111111111111111111111111111111111111L; + output[outputOffset + 25] = ((v18 >>> 48) & 0b1111111111111111L) | ((v19 & 0b11111111111111111111111111111111L) << 16); + output[outputOffset + 26] = ((v19 >>> 32) & 0b11111111111111111111111111111111L) | ((v20 & 0b1111111111111111L) << 32); + output[outputOffset + 27] = (v20 >>> 16) & 0b111111111111111111111111111111111111111111111111L; + output[outputOffset + 28] = v21 & 0b111111111111111111111111111111111111111111111111L; + output[outputOffset + 29] = ((v21 >>> 48) & 0b1111111111111111L) | ((v22 & 0b11111111111111111111111111111111L) << 16); + output[outputOffset + 30] = ((v22 >>> 32) & 0b11111111111111111111111111111111L) | ((v23 & 0b1111111111111111L) << 32); + output[outputOffset + 31] = (v23 >>> 16) & 0b111111111111111111111111111111111111111111111111L; + } + } + + private static final class Unpacker49 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long 
v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + long v8 = input.readLong(); + long v9 = input.readLong(); + long v10 = input.readLong(); + long v11 = input.readLong(); + long v12 = input.readLong(); + long v13 = input.readLong(); + long v14 = input.readLong(); + long v15 = input.readLong(); + long v16 = input.readLong(); + long v17 = input.readLong(); + long v18 = input.readLong(); + long v19 = input.readLong(); + long v20 = input.readLong(); + long v21 = input.readLong(); + long v22 = input.readLong(); + long v23 = input.readLong(); + int v24 = input.readInt(); + output[outputOffset] = v0 & 0b1111111111111111111111111111111111111111111111111L; + output[outputOffset + 1] = ((v0 >>> 49) & 0b111111111111111L) | ((v1 & 0b1111111111111111111111111111111111L) << 15); + output[outputOffset + 2] = ((v1 >>> 34) & 0b111111111111111111111111111111L) | ((v2 & 0b1111111111111111111L) << 30); + output[outputOffset + 3] = ((v2 >>> 19) & 0b111111111111111111111111111111111111111111111L) | ((v3 & 0b1111L) << 45); + output[outputOffset + 4] = (v3 >>> 4) & 0b1111111111111111111111111111111111111111111111111L; + output[outputOffset + 5] = ((v3 >>> 53) & 0b11111111111L) | ((v4 & 0b11111111111111111111111111111111111111L) << 11); + output[outputOffset + 6] = ((v4 >>> 38) & 0b11111111111111111111111111L) | ((v5 & 0b11111111111111111111111L) << 26); + output[outputOffset + 7] = ((v5 >>> 23) & 0b11111111111111111111111111111111111111111L) | ((v6 & 0b11111111L) << 41); + output[outputOffset + 8] = (v6 >>> 8) & 0b1111111111111111111111111111111111111111111111111L; + output[outputOffset + 9] = ((v6 >>> 57) & 0b1111111L) | ((v7 & 0b111111111111111111111111111111111111111111L) << 7); + output[outputOffset + 10] = ((v7 >>> 42) & 0b1111111111111111111111L) | ((v8 & 0b111111111111111111111111111L) << 22); + output[outputOffset + 11] = ((v8 >>> 27) & 0b1111111111111111111111111111111111111L) | ((v9 & 0b111111111111L) << 37); + output[outputOffset + 12] = (v9 >>> 12) & 0b1111111111111111111111111111111111111111111111111L; + output[outputOffset + 13] = ((v9 >>> 61) & 0b111L) | ((v10 & 0b1111111111111111111111111111111111111111111111L) << 3); + output[outputOffset + 14] = ((v10 >>> 46) & 0b111111111111111111L) | ((v11 & 0b1111111111111111111111111111111L) << 18); + output[outputOffset + 15] = ((v11 >>> 31) & 0b111111111111111111111111111111111L) | ((v12 & 0b1111111111111111L) << 33); + output[outputOffset + 16] = ((v12 >>> 16) & 0b111111111111111111111111111111111111111111111111L) | ((v13 & 0b1L) << 48); + output[outputOffset + 17] = (v13 >>> 1) & 0b1111111111111111111111111111111111111111111111111L; + output[outputOffset + 18] = ((v13 >>> 50) & 0b11111111111111L) | ((v14 & 0b11111111111111111111111111111111111L) << 14); + output[outputOffset + 19] = ((v14 >>> 35) & 0b11111111111111111111111111111L) | ((v15 & 0b11111111111111111111L) << 29); + output[outputOffset + 20] = ((v15 >>> 20) & 0b11111111111111111111111111111111111111111111L) | ((v16 & 0b11111L) << 44); + output[outputOffset + 21] = (v16 >>> 5) & 0b1111111111111111111111111111111111111111111111111L; + output[outputOffset + 22] = ((v16 >>> 54) & 0b1111111111L) | ((v17 & 0b111111111111111111111111111111111111111L) << 10); + output[outputOffset + 23] = ((v17 >>> 39) & 0b1111111111111111111111111L) | ((v18 & 0b111111111111111111111111L) << 25); + output[outputOffset + 24] = ((v18 >>> 24) & 0b1111111111111111111111111111111111111111L) | ((v19 & 0b111111111L) << 40); + 
output[outputOffset + 25] = (v19 >>> 9) & 0b1111111111111111111111111111111111111111111111111L; + output[outputOffset + 26] = ((v19 >>> 58) & 0b111111L) | ((v20 & 0b1111111111111111111111111111111111111111111L) << 6); + output[outputOffset + 27] = ((v20 >>> 43) & 0b111111111111111111111L) | ((v21 & 0b1111111111111111111111111111L) << 21); + output[outputOffset + 28] = ((v21 >>> 28) & 0b111111111111111111111111111111111111L) | ((v22 & 0b1111111111111L) << 36); + output[outputOffset + 29] = (v22 >>> 13) & 0b1111111111111111111111111111111111111111111111111L; + output[outputOffset + 30] = ((v22 >>> 62) & 0b11L) | ((v23 & 0b11111111111111111111111111111111111111111111111L) << 2); + output[outputOffset + 31] = ((v23 >>> 47) & 0b11111111111111111L) | ((v24 & 0b11111111111111111111111111111111L) << 17); + } + } + + private static final class Unpacker50 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + long v8 = input.readLong(); + long v9 = input.readLong(); + long v10 = input.readLong(); + long v11 = input.readLong(); + long v12 = input.readLong(); + long v13 = input.readLong(); + long v14 = input.readLong(); + long v15 = input.readLong(); + long v16 = input.readLong(); + long v17 = input.readLong(); + long v18 = input.readLong(); + long v19 = input.readLong(); + long v20 = input.readLong(); + long v21 = input.readLong(); + long v22 = input.readLong(); + long v23 = input.readLong(); + long v24 = input.readLong(); + output[outputOffset] = v0 & 0b11111111111111111111111111111111111111111111111111L; + output[outputOffset + 1] = ((v0 >>> 50) & 0b11111111111111L) | ((v1 & 0b111111111111111111111111111111111111L) << 14); + output[outputOffset + 2] = ((v1 >>> 36) & 0b1111111111111111111111111111L) | ((v2 & 0b1111111111111111111111L) << 28); + output[outputOffset + 3] = ((v2 >>> 22) & 0b111111111111111111111111111111111111111111L) | ((v3 & 0b11111111L) << 42); + output[outputOffset + 4] = (v3 >>> 8) & 0b11111111111111111111111111111111111111111111111111L; + output[outputOffset + 5] = ((v3 >>> 58) & 0b111111L) | ((v4 & 0b11111111111111111111111111111111111111111111L) << 6); + output[outputOffset + 6] = ((v4 >>> 44) & 0b11111111111111111111L) | ((v5 & 0b111111111111111111111111111111L) << 20); + output[outputOffset + 7] = ((v5 >>> 30) & 0b1111111111111111111111111111111111L) | ((v6 & 0b1111111111111111L) << 34); + output[outputOffset + 8] = ((v6 >>> 16) & 0b111111111111111111111111111111111111111111111111L) | ((v7 & 0b11L) << 48); + output[outputOffset + 9] = (v7 >>> 2) & 0b11111111111111111111111111111111111111111111111111L; + output[outputOffset + 10] = ((v7 >>> 52) & 0b111111111111L) | ((v8 & 0b11111111111111111111111111111111111111L) << 12); + output[outputOffset + 11] = ((v8 >>> 38) & 0b11111111111111111111111111L) | ((v9 & 0b111111111111111111111111L) << 26); + output[outputOffset + 12] = ((v9 >>> 24) & 0b1111111111111111111111111111111111111111L) | ((v10 & 0b1111111111L) << 40); + output[outputOffset + 13] = (v10 >>> 10) & 
0b11111111111111111111111111111111111111111111111111L; + output[outputOffset + 14] = ((v10 >>> 60) & 0b1111L) | ((v11 & 0b1111111111111111111111111111111111111111111111L) << 4); + output[outputOffset + 15] = ((v11 >>> 46) & 0b111111111111111111L) | ((v12 & 0b11111111111111111111111111111111L) << 18); + output[outputOffset + 16] = ((v12 >>> 32) & 0b11111111111111111111111111111111L) | ((v13 & 0b111111111111111111L) << 32); + output[outputOffset + 17] = ((v13 >>> 18) & 0b1111111111111111111111111111111111111111111111L) | ((v14 & 0b1111L) << 46); + output[outputOffset + 18] = (v14 >>> 4) & 0b11111111111111111111111111111111111111111111111111L; + output[outputOffset + 19] = ((v14 >>> 54) & 0b1111111111L) | ((v15 & 0b1111111111111111111111111111111111111111L) << 10); + output[outputOffset + 20] = ((v15 >>> 40) & 0b111111111111111111111111L) | ((v16 & 0b11111111111111111111111111L) << 24); + output[outputOffset + 21] = ((v16 >>> 26) & 0b11111111111111111111111111111111111111L) | ((v17 & 0b111111111111L) << 38); + output[outputOffset + 22] = (v17 >>> 12) & 0b11111111111111111111111111111111111111111111111111L; + output[outputOffset + 23] = ((v17 >>> 62) & 0b11L) | ((v18 & 0b111111111111111111111111111111111111111111111111L) << 2); + output[outputOffset + 24] = ((v18 >>> 48) & 0b1111111111111111L) | ((v19 & 0b1111111111111111111111111111111111L) << 16); + output[outputOffset + 25] = ((v19 >>> 34) & 0b111111111111111111111111111111L) | ((v20 & 0b11111111111111111111L) << 30); + output[outputOffset + 26] = ((v20 >>> 20) & 0b11111111111111111111111111111111111111111111L) | ((v21 & 0b111111L) << 44); + output[outputOffset + 27] = (v21 >>> 6) & 0b11111111111111111111111111111111111111111111111111L; + output[outputOffset + 28] = ((v21 >>> 56) & 0b11111111L) | ((v22 & 0b111111111111111111111111111111111111111111L) << 8); + output[outputOffset + 29] = ((v22 >>> 42) & 0b1111111111111111111111L) | ((v23 & 0b1111111111111111111111111111L) << 22); + output[outputOffset + 30] = ((v23 >>> 28) & 0b111111111111111111111111111111111111L) | ((v24 & 0b11111111111111L) << 36); + output[outputOffset + 31] = (v24 >>> 14) & 0b11111111111111111111111111111111111111111111111111L; + } + } + + private static final class Unpacker51 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + long v8 = input.readLong(); + long v9 = input.readLong(); + long v10 = input.readLong(); + long v11 = input.readLong(); + long v12 = input.readLong(); + long v13 = input.readLong(); + long v14 = input.readLong(); + long v15 = input.readLong(); + long v16 = input.readLong(); + long v17 = input.readLong(); + long v18 = input.readLong(); + long v19 = input.readLong(); + long v20 = input.readLong(); + long v21 = input.readLong(); + long v22 = input.readLong(); + long v23 = input.readLong(); + long v24 = input.readLong(); + int v25 = input.readInt(); + output[outputOffset] = v0 & 0b111111111111111111111111111111111111111111111111111L; + output[outputOffset + 1] = ((v0 >>> 51) & 0b1111111111111L) | ((v1 & 
0b11111111111111111111111111111111111111L) << 13); + output[outputOffset + 2] = ((v1 >>> 38) & 0b11111111111111111111111111L) | ((v2 & 0b1111111111111111111111111L) << 26); + output[outputOffset + 3] = ((v2 >>> 25) & 0b111111111111111111111111111111111111111L) | ((v3 & 0b111111111111L) << 39); + output[outputOffset + 4] = (v3 >>> 12) & 0b111111111111111111111111111111111111111111111111111L; + output[outputOffset + 5] = ((v3 >>> 63) & 0b1L) | ((v4 & 0b11111111111111111111111111111111111111111111111111L) << 1); + output[outputOffset + 6] = ((v4 >>> 50) & 0b11111111111111L) | ((v5 & 0b1111111111111111111111111111111111111L) << 14); + output[outputOffset + 7] = ((v5 >>> 37) & 0b111111111111111111111111111L) | ((v6 & 0b111111111111111111111111L) << 27); + output[outputOffset + 8] = ((v6 >>> 24) & 0b1111111111111111111111111111111111111111L) | ((v7 & 0b11111111111L) << 40); + output[outputOffset + 9] = (v7 >>> 11) & 0b111111111111111111111111111111111111111111111111111L; + output[outputOffset + 10] = ((v7 >>> 62) & 0b11L) | ((v8 & 0b1111111111111111111111111111111111111111111111111L) << 2); + output[outputOffset + 11] = ((v8 >>> 49) & 0b111111111111111L) | ((v9 & 0b111111111111111111111111111111111111L) << 15); + output[outputOffset + 12] = ((v9 >>> 36) & 0b1111111111111111111111111111L) | ((v10 & 0b11111111111111111111111L) << 28); + output[outputOffset + 13] = ((v10 >>> 23) & 0b11111111111111111111111111111111111111111L) | ((v11 & 0b1111111111L) << 41); + output[outputOffset + 14] = (v11 >>> 10) & 0b111111111111111111111111111111111111111111111111111L; + output[outputOffset + 15] = ((v11 >>> 61) & 0b111L) | ((v12 & 0b111111111111111111111111111111111111111111111111L) << 3); + output[outputOffset + 16] = ((v12 >>> 48) & 0b1111111111111111L) | ((v13 & 0b11111111111111111111111111111111111L) << 16); + output[outputOffset + 17] = ((v13 >>> 35) & 0b11111111111111111111111111111L) | ((v14 & 0b1111111111111111111111L) << 29); + output[outputOffset + 18] = ((v14 >>> 22) & 0b111111111111111111111111111111111111111111L) | ((v15 & 0b111111111L) << 42); + output[outputOffset + 19] = (v15 >>> 9) & 0b111111111111111111111111111111111111111111111111111L; + output[outputOffset + 20] = ((v15 >>> 60) & 0b1111L) | ((v16 & 0b11111111111111111111111111111111111111111111111L) << 4); + output[outputOffset + 21] = ((v16 >>> 47) & 0b11111111111111111L) | ((v17 & 0b1111111111111111111111111111111111L) << 17); + output[outputOffset + 22] = ((v17 >>> 34) & 0b111111111111111111111111111111L) | ((v18 & 0b111111111111111111111L) << 30); + output[outputOffset + 23] = ((v18 >>> 21) & 0b1111111111111111111111111111111111111111111L) | ((v19 & 0b11111111L) << 43); + output[outputOffset + 24] = (v19 >>> 8) & 0b111111111111111111111111111111111111111111111111111L; + output[outputOffset + 25] = ((v19 >>> 59) & 0b11111L) | ((v20 & 0b1111111111111111111111111111111111111111111111L) << 5); + output[outputOffset + 26] = ((v20 >>> 46) & 0b111111111111111111L) | ((v21 & 0b111111111111111111111111111111111L) << 18); + output[outputOffset + 27] = ((v21 >>> 33) & 0b1111111111111111111111111111111L) | ((v22 & 0b11111111111111111111L) << 31); + output[outputOffset + 28] = ((v22 >>> 20) & 0b11111111111111111111111111111111111111111111L) | ((v23 & 0b1111111L) << 44); + output[outputOffset + 29] = (v23 >>> 7) & 0b111111111111111111111111111111111111111111111111111L; + output[outputOffset + 30] = ((v23 >>> 58) & 0b111111L) | ((v24 & 0b111111111111111111111111111111111111111111111L) << 6); + output[outputOffset + 31] = ((v24 >>> 45) & 
0b1111111111111111111L) | ((v25 & 0b11111111111111111111111111111111L) << 19); + } + } + + private static final class Unpacker52 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + long v8 = input.readLong(); + long v9 = input.readLong(); + long v10 = input.readLong(); + long v11 = input.readLong(); + long v12 = input.readLong(); + long v13 = input.readLong(); + long v14 = input.readLong(); + long v15 = input.readLong(); + long v16 = input.readLong(); + long v17 = input.readLong(); + long v18 = input.readLong(); + long v19 = input.readLong(); + long v20 = input.readLong(); + long v21 = input.readLong(); + long v22 = input.readLong(); + long v23 = input.readLong(); + long v24 = input.readLong(); + long v25 = input.readLong(); + output[outputOffset] = v0 & 0b1111111111111111111111111111111111111111111111111111L; + output[outputOffset + 1] = ((v0 >>> 52) & 0b111111111111L) | ((v1 & 0b1111111111111111111111111111111111111111L) << 12); + output[outputOffset + 2] = ((v1 >>> 40) & 0b111111111111111111111111L) | ((v2 & 0b1111111111111111111111111111L) << 24); + output[outputOffset + 3] = ((v2 >>> 28) & 0b111111111111111111111111111111111111L) | ((v3 & 0b1111111111111111L) << 36); + output[outputOffset + 4] = ((v3 >>> 16) & 0b111111111111111111111111111111111111111111111111L) | ((v4 & 0b1111L) << 48); + output[outputOffset + 5] = (v4 >>> 4) & 0b1111111111111111111111111111111111111111111111111111L; + output[outputOffset + 6] = ((v4 >>> 56) & 0b11111111L) | ((v5 & 0b11111111111111111111111111111111111111111111L) << 8); + output[outputOffset + 7] = ((v5 >>> 44) & 0b11111111111111111111L) | ((v6 & 0b11111111111111111111111111111111L) << 20); + output[outputOffset + 8] = ((v6 >>> 32) & 0b11111111111111111111111111111111L) | ((v7 & 0b11111111111111111111L) << 32); + output[outputOffset + 9] = ((v7 >>> 20) & 0b11111111111111111111111111111111111111111111L) | ((v8 & 0b11111111L) << 44); + output[outputOffset + 10] = (v8 >>> 8) & 0b1111111111111111111111111111111111111111111111111111L; + output[outputOffset + 11] = ((v8 >>> 60) & 0b1111L) | ((v9 & 0b111111111111111111111111111111111111111111111111L) << 4); + output[outputOffset + 12] = ((v9 >>> 48) & 0b1111111111111111L) | ((v10 & 0b111111111111111111111111111111111111L) << 16); + output[outputOffset + 13] = ((v10 >>> 36) & 0b1111111111111111111111111111L) | ((v11 & 0b111111111111111111111111L) << 28); + output[outputOffset + 14] = ((v11 >>> 24) & 0b1111111111111111111111111111111111111111L) | ((v12 & 0b111111111111L) << 40); + output[outputOffset + 15] = (v12 >>> 12) & 0b1111111111111111111111111111111111111111111111111111L; + output[outputOffset + 16] = v13 & 0b1111111111111111111111111111111111111111111111111111L; + output[outputOffset + 17] = ((v13 >>> 52) & 0b111111111111L) | ((v14 & 0b1111111111111111111111111111111111111111L) << 12); + output[outputOffset + 18] = ((v14 >>> 40) & 0b111111111111111111111111L) | ((v15 & 0b1111111111111111111111111111L) << 24); + output[outputOffset + 19] = ((v15 >>> 28) & 
0b111111111111111111111111111111111111L) | ((v16 & 0b1111111111111111L) << 36); + output[outputOffset + 20] = ((v16 >>> 16) & 0b111111111111111111111111111111111111111111111111L) | ((v17 & 0b1111L) << 48); + output[outputOffset + 21] = (v17 >>> 4) & 0b1111111111111111111111111111111111111111111111111111L; + output[outputOffset + 22] = ((v17 >>> 56) & 0b11111111L) | ((v18 & 0b11111111111111111111111111111111111111111111L) << 8); + output[outputOffset + 23] = ((v18 >>> 44) & 0b11111111111111111111L) | ((v19 & 0b11111111111111111111111111111111L) << 20); + output[outputOffset + 24] = ((v19 >>> 32) & 0b11111111111111111111111111111111L) | ((v20 & 0b11111111111111111111L) << 32); + output[outputOffset + 25] = ((v20 >>> 20) & 0b11111111111111111111111111111111111111111111L) | ((v21 & 0b11111111L) << 44); + output[outputOffset + 26] = (v21 >>> 8) & 0b1111111111111111111111111111111111111111111111111111L; + output[outputOffset + 27] = ((v21 >>> 60) & 0b1111L) | ((v22 & 0b111111111111111111111111111111111111111111111111L) << 4); + output[outputOffset + 28] = ((v22 >>> 48) & 0b1111111111111111L) | ((v23 & 0b111111111111111111111111111111111111L) << 16); + output[outputOffset + 29] = ((v23 >>> 36) & 0b1111111111111111111111111111L) | ((v24 & 0b111111111111111111111111L) << 28); + output[outputOffset + 30] = ((v24 >>> 24) & 0b1111111111111111111111111111111111111111L) | ((v25 & 0b111111111111L) << 40); + output[outputOffset + 31] = (v25 >>> 12) & 0b1111111111111111111111111111111111111111111111111111L; + } + } + + private static final class Unpacker53 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + long v8 = input.readLong(); + long v9 = input.readLong(); + long v10 = input.readLong(); + long v11 = input.readLong(); + long v12 = input.readLong(); + long v13 = input.readLong(); + long v14 = input.readLong(); + long v15 = input.readLong(); + long v16 = input.readLong(); + long v17 = input.readLong(); + long v18 = input.readLong(); + long v19 = input.readLong(); + long v20 = input.readLong(); + long v21 = input.readLong(); + long v22 = input.readLong(); + long v23 = input.readLong(); + long v24 = input.readLong(); + long v25 = input.readLong(); + int v26 = input.readInt(); + output[outputOffset] = v0 & 0b11111111111111111111111111111111111111111111111111111L; + output[outputOffset + 1] = ((v0 >>> 53) & 0b11111111111L) | ((v1 & 0b111111111111111111111111111111111111111111L) << 11); + output[outputOffset + 2] = ((v1 >>> 42) & 0b1111111111111111111111L) | ((v2 & 0b1111111111111111111111111111111L) << 22); + output[outputOffset + 3] = ((v2 >>> 31) & 0b111111111111111111111111111111111L) | ((v3 & 0b11111111111111111111L) << 33); + output[outputOffset + 4] = ((v3 >>> 20) & 0b11111111111111111111111111111111111111111111L) | ((v4 & 0b111111111L) << 44); + output[outputOffset + 5] = (v4 >>> 9) & 0b11111111111111111111111111111111111111111111111111111L; + output[outputOffset + 6] = ((v4 >>> 62) & 0b11L) | ((v5 & 0b111111111111111111111111111111111111111111111111111L) << 2); 
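+ // Documentation note (comment only, describing the pattern of these generated unpackers):
+ // the packed input is read as little-endian 64-bit words, and logical value k (k = 0..31)
+ // occupies bits [k * W, (k + 1) * W) of that bit stream, where W is the bit width (53 here).
+ // A value contained in a single word is extracted as (word >>> startBit) masked to W bits;
+ // a value that straddles a word boundary - as in the statement above - takes its low bits
+ // from the tail of one word and its high bits from the head of the next, combined with |.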
+ output[outputOffset + 7] = ((v5 >>> 51) & 0b1111111111111L) | ((v6 & 0b1111111111111111111111111111111111111111L) << 13); + output[outputOffset + 8] = ((v6 >>> 40) & 0b111111111111111111111111L) | ((v7 & 0b11111111111111111111111111111L) << 24); + output[outputOffset + 9] = ((v7 >>> 29) & 0b11111111111111111111111111111111111L) | ((v8 & 0b111111111111111111L) << 35); + output[outputOffset + 10] = ((v8 >>> 18) & 0b1111111111111111111111111111111111111111111111L) | ((v9 & 0b1111111L) << 46); + output[outputOffset + 11] = (v9 >>> 7) & 0b11111111111111111111111111111111111111111111111111111L; + output[outputOffset + 12] = ((v9 >>> 60) & 0b1111L) | ((v10 & 0b1111111111111111111111111111111111111111111111111L) << 4); + output[outputOffset + 13] = ((v10 >>> 49) & 0b111111111111111L) | ((v11 & 0b11111111111111111111111111111111111111L) << 15); + output[outputOffset + 14] = ((v11 >>> 38) & 0b11111111111111111111111111L) | ((v12 & 0b111111111111111111111111111L) << 26); + output[outputOffset + 15] = ((v12 >>> 27) & 0b1111111111111111111111111111111111111L) | ((v13 & 0b1111111111111111L) << 37); + output[outputOffset + 16] = ((v13 >>> 16) & 0b111111111111111111111111111111111111111111111111L) | ((v14 & 0b11111L) << 48); + output[outputOffset + 17] = (v14 >>> 5) & 0b11111111111111111111111111111111111111111111111111111L; + output[outputOffset + 18] = ((v14 >>> 58) & 0b111111L) | ((v15 & 0b11111111111111111111111111111111111111111111111L) << 6); + output[outputOffset + 19] = ((v15 >>> 47) & 0b11111111111111111L) | ((v16 & 0b111111111111111111111111111111111111L) << 17); + output[outputOffset + 20] = ((v16 >>> 36) & 0b1111111111111111111111111111L) | ((v17 & 0b1111111111111111111111111L) << 28); + output[outputOffset + 21] = ((v17 >>> 25) & 0b111111111111111111111111111111111111111L) | ((v18 & 0b11111111111111L) << 39); + output[outputOffset + 22] = ((v18 >>> 14) & 0b11111111111111111111111111111111111111111111111111L) | ((v19 & 0b111L) << 50); + output[outputOffset + 23] = (v19 >>> 3) & 0b11111111111111111111111111111111111111111111111111111L; + output[outputOffset + 24] = ((v19 >>> 56) & 0b11111111L) | ((v20 & 0b111111111111111111111111111111111111111111111L) << 8); + output[outputOffset + 25] = ((v20 >>> 45) & 0b1111111111111111111L) | ((v21 & 0b1111111111111111111111111111111111L) << 19); + output[outputOffset + 26] = ((v21 >>> 34) & 0b111111111111111111111111111111L) | ((v22 & 0b11111111111111111111111L) << 30); + output[outputOffset + 27] = ((v22 >>> 23) & 0b11111111111111111111111111111111111111111L) | ((v23 & 0b111111111111L) << 41); + output[outputOffset + 28] = ((v23 >>> 12) & 0b1111111111111111111111111111111111111111111111111111L) | ((v24 & 0b1L) << 52); + output[outputOffset + 29] = (v24 >>> 1) & 0b11111111111111111111111111111111111111111111111111111L; + output[outputOffset + 30] = ((v24 >>> 54) & 0b1111111111L) | ((v25 & 0b1111111111111111111111111111111111111111111L) << 10); + output[outputOffset + 31] = ((v25 >>> 43) & 0b111111111111111111111L) | ((v26 & 0b11111111111111111111111111111111L) << 21); + } + } + + private static final class Unpacker54 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = 
input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + long v8 = input.readLong(); + long v9 = input.readLong(); + long v10 = input.readLong(); + long v11 = input.readLong(); + long v12 = input.readLong(); + long v13 = input.readLong(); + long v14 = input.readLong(); + long v15 = input.readLong(); + long v16 = input.readLong(); + long v17 = input.readLong(); + long v18 = input.readLong(); + long v19 = input.readLong(); + long v20 = input.readLong(); + long v21 = input.readLong(); + long v22 = input.readLong(); + long v23 = input.readLong(); + long v24 = input.readLong(); + long v25 = input.readLong(); + long v26 = input.readLong(); + output[outputOffset] = v0 & 0b111111111111111111111111111111111111111111111111111111L; + output[outputOffset + 1] = ((v0 >>> 54) & 0b1111111111L) | ((v1 & 0b11111111111111111111111111111111111111111111L) << 10); + output[outputOffset + 2] = ((v1 >>> 44) & 0b11111111111111111111L) | ((v2 & 0b1111111111111111111111111111111111L) << 20); + output[outputOffset + 3] = ((v2 >>> 34) & 0b111111111111111111111111111111L) | ((v3 & 0b111111111111111111111111L) << 30); + output[outputOffset + 4] = ((v3 >>> 24) & 0b1111111111111111111111111111111111111111L) | ((v4 & 0b11111111111111L) << 40); + output[outputOffset + 5] = ((v4 >>> 14) & 0b11111111111111111111111111111111111111111111111111L) | ((v5 & 0b1111L) << 50); + output[outputOffset + 6] = (v5 >>> 4) & 0b111111111111111111111111111111111111111111111111111111L; + output[outputOffset + 7] = ((v5 >>> 58) & 0b111111L) | ((v6 & 0b111111111111111111111111111111111111111111111111L) << 6); + output[outputOffset + 8] = ((v6 >>> 48) & 0b1111111111111111L) | ((v7 & 0b11111111111111111111111111111111111111L) << 16); + output[outputOffset + 9] = ((v7 >>> 38) & 0b11111111111111111111111111L) | ((v8 & 0b1111111111111111111111111111L) << 26); + output[outputOffset + 10] = ((v8 >>> 28) & 0b111111111111111111111111111111111111L) | ((v9 & 0b111111111111111111L) << 36); + output[outputOffset + 11] = ((v9 >>> 18) & 0b1111111111111111111111111111111111111111111111L) | ((v10 & 0b11111111L) << 46); + output[outputOffset + 12] = (v10 >>> 8) & 0b111111111111111111111111111111111111111111111111111111L; + output[outputOffset + 13] = ((v10 >>> 62) & 0b11L) | ((v11 & 0b1111111111111111111111111111111111111111111111111111L) << 2); + output[outputOffset + 14] = ((v11 >>> 52) & 0b111111111111L) | ((v12 & 0b111111111111111111111111111111111111111111L) << 12); + output[outputOffset + 15] = ((v12 >>> 42) & 0b1111111111111111111111L) | ((v13 & 0b11111111111111111111111111111111L) << 22); + output[outputOffset + 16] = ((v13 >>> 32) & 0b11111111111111111111111111111111L) | ((v14 & 0b1111111111111111111111L) << 32); + output[outputOffset + 17] = ((v14 >>> 22) & 0b111111111111111111111111111111111111111111L) | ((v15 & 0b111111111111L) << 42); + output[outputOffset + 18] = ((v15 >>> 12) & 0b1111111111111111111111111111111111111111111111111111L) | ((v16 & 0b11L) << 52); + output[outputOffset + 19] = (v16 >>> 2) & 0b111111111111111111111111111111111111111111111111111111L; + output[outputOffset + 20] = ((v16 >>> 56) & 0b11111111L) | ((v17 & 0b1111111111111111111111111111111111111111111111L) << 8); + output[outputOffset + 21] = ((v17 >>> 46) & 0b111111111111111111L) | ((v18 & 0b111111111111111111111111111111111111L) << 18); + output[outputOffset + 22] = ((v18 >>> 36) & 0b1111111111111111111111111111L) | ((v19 & 0b11111111111111111111111111L) << 28); + 
output[outputOffset + 23] = ((v19 >>> 26) & 0b11111111111111111111111111111111111111L) | ((v20 & 0b1111111111111111L) << 38); + output[outputOffset + 24] = ((v20 >>> 16) & 0b111111111111111111111111111111111111111111111111L) | ((v21 & 0b111111L) << 48); + output[outputOffset + 25] = (v21 >>> 6) & 0b111111111111111111111111111111111111111111111111111111L; + output[outputOffset + 26] = ((v21 >>> 60) & 0b1111L) | ((v22 & 0b11111111111111111111111111111111111111111111111111L) << 4); + output[outputOffset + 27] = ((v22 >>> 50) & 0b11111111111111L) | ((v23 & 0b1111111111111111111111111111111111111111L) << 14); + output[outputOffset + 28] = ((v23 >>> 40) & 0b111111111111111111111111L) | ((v24 & 0b111111111111111111111111111111L) << 24); + output[outputOffset + 29] = ((v24 >>> 30) & 0b1111111111111111111111111111111111L) | ((v25 & 0b11111111111111111111L) << 34); + output[outputOffset + 30] = ((v25 >>> 20) & 0b11111111111111111111111111111111111111111111L) | ((v26 & 0b1111111111L) << 44); + output[outputOffset + 31] = (v26 >>> 10) & 0b111111111111111111111111111111111111111111111111111111L; + } + } + + private static final class Unpacker55 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + long v8 = input.readLong(); + long v9 = input.readLong(); + long v10 = input.readLong(); + long v11 = input.readLong(); + long v12 = input.readLong(); + long v13 = input.readLong(); + long v14 = input.readLong(); + long v15 = input.readLong(); + long v16 = input.readLong(); + long v17 = input.readLong(); + long v18 = input.readLong(); + long v19 = input.readLong(); + long v20 = input.readLong(); + long v21 = input.readLong(); + long v22 = input.readLong(); + long v23 = input.readLong(); + long v24 = input.readLong(); + long v25 = input.readLong(); + long v26 = input.readLong(); + int v27 = input.readInt(); + output[outputOffset] = v0 & 0b1111111111111111111111111111111111111111111111111111111L; + output[outputOffset + 1] = ((v0 >>> 55) & 0b111111111L) | ((v1 & 0b1111111111111111111111111111111111111111111111L) << 9); + output[outputOffset + 2] = ((v1 >>> 46) & 0b111111111111111111L) | ((v2 & 0b1111111111111111111111111111111111111L) << 18); + output[outputOffset + 3] = ((v2 >>> 37) & 0b111111111111111111111111111L) | ((v3 & 0b1111111111111111111111111111L) << 27); + output[outputOffset + 4] = ((v3 >>> 28) & 0b111111111111111111111111111111111111L) | ((v4 & 0b1111111111111111111L) << 36); + output[outputOffset + 5] = ((v4 >>> 19) & 0b111111111111111111111111111111111111111111111L) | ((v5 & 0b1111111111L) << 45); + output[outputOffset + 6] = ((v5 >>> 10) & 0b111111111111111111111111111111111111111111111111111111L) | ((v6 & 0b1L) << 54); + output[outputOffset + 7] = (v6 >>> 1) & 0b1111111111111111111111111111111111111111111111111111111L; + output[outputOffset + 8] = ((v6 >>> 56) & 0b11111111L) | ((v7 & 0b11111111111111111111111111111111111111111111111L) << 8); + output[outputOffset + 9] = ((v7 >>> 47) & 0b11111111111111111L) | ((v8 & 0b11111111111111111111111111111111111111L) 
<< 17); + output[outputOffset + 10] = ((v8 >>> 38) & 0b11111111111111111111111111L) | ((v9 & 0b11111111111111111111111111111L) << 26); + output[outputOffset + 11] = ((v9 >>> 29) & 0b11111111111111111111111111111111111L) | ((v10 & 0b11111111111111111111L) << 35); + output[outputOffset + 12] = ((v10 >>> 20) & 0b11111111111111111111111111111111111111111111L) | ((v11 & 0b11111111111L) << 44); + output[outputOffset + 13] = ((v11 >>> 11) & 0b11111111111111111111111111111111111111111111111111111L) | ((v12 & 0b11L) << 53); + output[outputOffset + 14] = (v12 >>> 2) & 0b1111111111111111111111111111111111111111111111111111111L; + output[outputOffset + 15] = ((v12 >>> 57) & 0b1111111L) | ((v13 & 0b111111111111111111111111111111111111111111111111L) << 7); + output[outputOffset + 16] = ((v13 >>> 48) & 0b1111111111111111L) | ((v14 & 0b111111111111111111111111111111111111111L) << 16); + output[outputOffset + 17] = ((v14 >>> 39) & 0b1111111111111111111111111L) | ((v15 & 0b111111111111111111111111111111L) << 25); + output[outputOffset + 18] = ((v15 >>> 30) & 0b1111111111111111111111111111111111L) | ((v16 & 0b111111111111111111111L) << 34); + output[outputOffset + 19] = ((v16 >>> 21) & 0b1111111111111111111111111111111111111111111L) | ((v17 & 0b111111111111L) << 43); + output[outputOffset + 20] = ((v17 >>> 12) & 0b1111111111111111111111111111111111111111111111111111L) | ((v18 & 0b111L) << 52); + output[outputOffset + 21] = (v18 >>> 3) & 0b1111111111111111111111111111111111111111111111111111111L; + output[outputOffset + 22] = ((v18 >>> 58) & 0b111111L) | ((v19 & 0b1111111111111111111111111111111111111111111111111L) << 6); + output[outputOffset + 23] = ((v19 >>> 49) & 0b111111111111111L) | ((v20 & 0b1111111111111111111111111111111111111111L) << 15); + output[outputOffset + 24] = ((v20 >>> 40) & 0b111111111111111111111111L) | ((v21 & 0b1111111111111111111111111111111L) << 24); + output[outputOffset + 25] = ((v21 >>> 31) & 0b111111111111111111111111111111111L) | ((v22 & 0b1111111111111111111111L) << 33); + output[outputOffset + 26] = ((v22 >>> 22) & 0b111111111111111111111111111111111111111111L) | ((v23 & 0b1111111111111L) << 42); + output[outputOffset + 27] = ((v23 >>> 13) & 0b111111111111111111111111111111111111111111111111111L) | ((v24 & 0b1111L) << 51); + output[outputOffset + 28] = (v24 >>> 4) & 0b1111111111111111111111111111111111111111111111111111111L; + output[outputOffset + 29] = ((v24 >>> 59) & 0b11111L) | ((v25 & 0b11111111111111111111111111111111111111111111111111L) << 5); + output[outputOffset + 30] = ((v25 >>> 50) & 0b11111111111111L) | ((v26 & 0b11111111111111111111111111111111111111111L) << 14); + output[outputOffset + 31] = ((v26 >>> 41) & 0b11111111111111111111111L) | ((v27 & 0b11111111111111111111111111111111L) << 23); + } + } + + private static final class Unpacker56 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + long v8 = input.readLong(); + long v9 = input.readLong(); + long v10 = input.readLong(); + long v11 = input.readLong(); + long v12 = 
input.readLong(); + long v13 = input.readLong(); + long v14 = input.readLong(); + long v15 = input.readLong(); + long v16 = input.readLong(); + long v17 = input.readLong(); + long v18 = input.readLong(); + long v19 = input.readLong(); + long v20 = input.readLong(); + long v21 = input.readLong(); + long v22 = input.readLong(); + long v23 = input.readLong(); + long v24 = input.readLong(); + long v25 = input.readLong(); + long v26 = input.readLong(); + long v27 = input.readLong(); + output[outputOffset] = v0 & 0b11111111111111111111111111111111111111111111111111111111L; + output[outputOffset + 1] = ((v0 >>> 56) & 0b11111111L) | ((v1 & 0b111111111111111111111111111111111111111111111111L) << 8); + output[outputOffset + 2] = ((v1 >>> 48) & 0b1111111111111111L) | ((v2 & 0b1111111111111111111111111111111111111111L) << 16); + output[outputOffset + 3] = ((v2 >>> 40) & 0b111111111111111111111111L) | ((v3 & 0b11111111111111111111111111111111L) << 24); + output[outputOffset + 4] = ((v3 >>> 32) & 0b11111111111111111111111111111111L) | ((v4 & 0b111111111111111111111111L) << 32); + output[outputOffset + 5] = ((v4 >>> 24) & 0b1111111111111111111111111111111111111111L) | ((v5 & 0b1111111111111111L) << 40); + output[outputOffset + 6] = ((v5 >>> 16) & 0b111111111111111111111111111111111111111111111111L) | ((v6 & 0b11111111L) << 48); + output[outputOffset + 7] = (v6 >>> 8) & 0b11111111111111111111111111111111111111111111111111111111L; + output[outputOffset + 8] = v7 & 0b11111111111111111111111111111111111111111111111111111111L; + output[outputOffset + 9] = ((v7 >>> 56) & 0b11111111L) | ((v8 & 0b111111111111111111111111111111111111111111111111L) << 8); + output[outputOffset + 10] = ((v8 >>> 48) & 0b1111111111111111L) | ((v9 & 0b1111111111111111111111111111111111111111L) << 16); + output[outputOffset + 11] = ((v9 >>> 40) & 0b111111111111111111111111L) | ((v10 & 0b11111111111111111111111111111111L) << 24); + output[outputOffset + 12] = ((v10 >>> 32) & 0b11111111111111111111111111111111L) | ((v11 & 0b111111111111111111111111L) << 32); + output[outputOffset + 13] = ((v11 >>> 24) & 0b1111111111111111111111111111111111111111L) | ((v12 & 0b1111111111111111L) << 40); + output[outputOffset + 14] = ((v12 >>> 16) & 0b111111111111111111111111111111111111111111111111L) | ((v13 & 0b11111111L) << 48); + output[outputOffset + 15] = (v13 >>> 8) & 0b11111111111111111111111111111111111111111111111111111111L; + output[outputOffset + 16] = v14 & 0b11111111111111111111111111111111111111111111111111111111L; + output[outputOffset + 17] = ((v14 >>> 56) & 0b11111111L) | ((v15 & 0b111111111111111111111111111111111111111111111111L) << 8); + output[outputOffset + 18] = ((v15 >>> 48) & 0b1111111111111111L) | ((v16 & 0b1111111111111111111111111111111111111111L) << 16); + output[outputOffset + 19] = ((v16 >>> 40) & 0b111111111111111111111111L) | ((v17 & 0b11111111111111111111111111111111L) << 24); + output[outputOffset + 20] = ((v17 >>> 32) & 0b11111111111111111111111111111111L) | ((v18 & 0b111111111111111111111111L) << 32); + output[outputOffset + 21] = ((v18 >>> 24) & 0b1111111111111111111111111111111111111111L) | ((v19 & 0b1111111111111111L) << 40); + output[outputOffset + 22] = ((v19 >>> 16) & 0b111111111111111111111111111111111111111111111111L) | ((v20 & 0b11111111L) << 48); + output[outputOffset + 23] = (v20 >>> 8) & 0b11111111111111111111111111111111111111111111111111111111L; + output[outputOffset + 24] = v21 & 0b11111111111111111111111111111111111111111111111111111111L; + output[outputOffset + 25] = ((v21 >>> 56) & 0b11111111L) | 
((v22 & 0b111111111111111111111111111111111111111111111111L) << 8); + output[outputOffset + 26] = ((v22 >>> 48) & 0b1111111111111111L) | ((v23 & 0b1111111111111111111111111111111111111111L) << 16); + output[outputOffset + 27] = ((v23 >>> 40) & 0b111111111111111111111111L) | ((v24 & 0b11111111111111111111111111111111L) << 24); + output[outputOffset + 28] = ((v24 >>> 32) & 0b11111111111111111111111111111111L) | ((v25 & 0b111111111111111111111111L) << 32); + output[outputOffset + 29] = ((v25 >>> 24) & 0b1111111111111111111111111111111111111111L) | ((v26 & 0b1111111111111111L) << 40); + output[outputOffset + 30] = ((v26 >>> 16) & 0b111111111111111111111111111111111111111111111111L) | ((v27 & 0b11111111L) << 48); + output[outputOffset + 31] = (v27 >>> 8) & 0b11111111111111111111111111111111111111111111111111111111L; + } + } + + private static final class Unpacker57 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + long v8 = input.readLong(); + long v9 = input.readLong(); + long v10 = input.readLong(); + long v11 = input.readLong(); + long v12 = input.readLong(); + long v13 = input.readLong(); + long v14 = input.readLong(); + long v15 = input.readLong(); + long v16 = input.readLong(); + long v17 = input.readLong(); + long v18 = input.readLong(); + long v19 = input.readLong(); + long v20 = input.readLong(); + long v21 = input.readLong(); + long v22 = input.readLong(); + long v23 = input.readLong(); + long v24 = input.readLong(); + long v25 = input.readLong(); + long v26 = input.readLong(); + long v27 = input.readLong(); + int v28 = input.readInt(); + output[outputOffset] = v0 & 0b111111111111111111111111111111111111111111111111111111111L; + output[outputOffset + 1] = ((v0 >>> 57) & 0b1111111L) | ((v1 & 0b11111111111111111111111111111111111111111111111111L) << 7); + output[outputOffset + 2] = ((v1 >>> 50) & 0b11111111111111L) | ((v2 & 0b1111111111111111111111111111111111111111111L) << 14); + output[outputOffset + 3] = ((v2 >>> 43) & 0b111111111111111111111L) | ((v3 & 0b111111111111111111111111111111111111L) << 21); + output[outputOffset + 4] = ((v3 >>> 36) & 0b1111111111111111111111111111L) | ((v4 & 0b11111111111111111111111111111L) << 28); + output[outputOffset + 5] = ((v4 >>> 29) & 0b11111111111111111111111111111111111L) | ((v5 & 0b1111111111111111111111L) << 35); + output[outputOffset + 6] = ((v5 >>> 22) & 0b111111111111111111111111111111111111111111L) | ((v6 & 0b111111111111111L) << 42); + output[outputOffset + 7] = ((v6 >>> 15) & 0b1111111111111111111111111111111111111111111111111L) | ((v7 & 0b11111111L) << 49); + output[outputOffset + 8] = ((v7 >>> 8) & 0b11111111111111111111111111111111111111111111111111111111L) | ((v8 & 0b1L) << 56); + output[outputOffset + 9] = (v8 >>> 1) & 0b111111111111111111111111111111111111111111111111111111111L; + output[outputOffset + 10] = ((v8 >>> 58) & 0b111111L) | ((v9 & 0b111111111111111111111111111111111111111111111111111L) << 6); + output[outputOffset + 11] = ((v9 >>> 51) & 0b1111111111111L) | ((v10 & 
0b11111111111111111111111111111111111111111111L) << 13); + output[outputOffset + 12] = ((v10 >>> 44) & 0b11111111111111111111L) | ((v11 & 0b1111111111111111111111111111111111111L) << 20); + output[outputOffset + 13] = ((v11 >>> 37) & 0b111111111111111111111111111L) | ((v12 & 0b111111111111111111111111111111L) << 27); + output[outputOffset + 14] = ((v12 >>> 30) & 0b1111111111111111111111111111111111L) | ((v13 & 0b11111111111111111111111L) << 34); + output[outputOffset + 15] = ((v13 >>> 23) & 0b11111111111111111111111111111111111111111L) | ((v14 & 0b1111111111111111L) << 41); + output[outputOffset + 16] = ((v14 >>> 16) & 0b111111111111111111111111111111111111111111111111L) | ((v15 & 0b111111111L) << 48); + output[outputOffset + 17] = ((v15 >>> 9) & 0b1111111111111111111111111111111111111111111111111111111L) | ((v16 & 0b11L) << 55); + output[outputOffset + 18] = (v16 >>> 2) & 0b111111111111111111111111111111111111111111111111111111111L; + output[outputOffset + 19] = ((v16 >>> 59) & 0b11111L) | ((v17 & 0b1111111111111111111111111111111111111111111111111111L) << 5); + output[outputOffset + 20] = ((v17 >>> 52) & 0b111111111111L) | ((v18 & 0b111111111111111111111111111111111111111111111L) << 12); + output[outputOffset + 21] = ((v18 >>> 45) & 0b1111111111111111111L) | ((v19 & 0b11111111111111111111111111111111111111L) << 19); + output[outputOffset + 22] = ((v19 >>> 38) & 0b11111111111111111111111111L) | ((v20 & 0b1111111111111111111111111111111L) << 26); + output[outputOffset + 23] = ((v20 >>> 31) & 0b111111111111111111111111111111111L) | ((v21 & 0b111111111111111111111111L) << 33); + output[outputOffset + 24] = ((v21 >>> 24) & 0b1111111111111111111111111111111111111111L) | ((v22 & 0b11111111111111111L) << 40); + output[outputOffset + 25] = ((v22 >>> 17) & 0b11111111111111111111111111111111111111111111111L) | ((v23 & 0b1111111111L) << 47); + output[outputOffset + 26] = ((v23 >>> 10) & 0b111111111111111111111111111111111111111111111111111111L) | ((v24 & 0b111L) << 54); + output[outputOffset + 27] = (v24 >>> 3) & 0b111111111111111111111111111111111111111111111111111111111L; + output[outputOffset + 28] = ((v24 >>> 60) & 0b1111L) | ((v25 & 0b11111111111111111111111111111111111111111111111111111L) << 4); + output[outputOffset + 29] = ((v25 >>> 53) & 0b11111111111L) | ((v26 & 0b1111111111111111111111111111111111111111111111L) << 11); + output[outputOffset + 30] = ((v26 >>> 46) & 0b111111111111111111L) | ((v27 & 0b111111111111111111111111111111111111111L) << 18); + output[outputOffset + 31] = ((v27 >>> 39) & 0b1111111111111111111111111L) | ((v28 & 0b11111111111111111111111111111111L) << 25); + } + } + + private static final class Unpacker58 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + long v8 = input.readLong(); + long v9 = input.readLong(); + long v10 = input.readLong(); + long v11 = input.readLong(); + long v12 = input.readLong(); + long v13 = input.readLong(); + long v14 = input.readLong(); + long v15 = input.readLong(); + long v16 = input.readLong(); + long 
v17 = input.readLong(); + long v18 = input.readLong(); + long v19 = input.readLong(); + long v20 = input.readLong(); + long v21 = input.readLong(); + long v22 = input.readLong(); + long v23 = input.readLong(); + long v24 = input.readLong(); + long v25 = input.readLong(); + long v26 = input.readLong(); + long v27 = input.readLong(); + long v28 = input.readLong(); + output[outputOffset] = v0 & 0b1111111111111111111111111111111111111111111111111111111111L; + output[outputOffset + 1] = ((v0 >>> 58) & 0b111111L) | ((v1 & 0b1111111111111111111111111111111111111111111111111111L) << 6); + output[outputOffset + 2] = ((v1 >>> 52) & 0b111111111111L) | ((v2 & 0b1111111111111111111111111111111111111111111111L) << 12); + output[outputOffset + 3] = ((v2 >>> 46) & 0b111111111111111111L) | ((v3 & 0b1111111111111111111111111111111111111111L) << 18); + output[outputOffset + 4] = ((v3 >>> 40) & 0b111111111111111111111111L) | ((v4 & 0b1111111111111111111111111111111111L) << 24); + output[outputOffset + 5] = ((v4 >>> 34) & 0b111111111111111111111111111111L) | ((v5 & 0b1111111111111111111111111111L) << 30); + output[outputOffset + 6] = ((v5 >>> 28) & 0b111111111111111111111111111111111111L) | ((v6 & 0b1111111111111111111111L) << 36); + output[outputOffset + 7] = ((v6 >>> 22) & 0b111111111111111111111111111111111111111111L) | ((v7 & 0b1111111111111111L) << 42); + output[outputOffset + 8] = ((v7 >>> 16) & 0b111111111111111111111111111111111111111111111111L) | ((v8 & 0b1111111111L) << 48); + output[outputOffset + 9] = ((v8 >>> 10) & 0b111111111111111111111111111111111111111111111111111111L) | ((v9 & 0b1111L) << 54); + output[outputOffset + 10] = (v9 >>> 4) & 0b1111111111111111111111111111111111111111111111111111111111L; + output[outputOffset + 11] = ((v9 >>> 62) & 0b11L) | ((v10 & 0b11111111111111111111111111111111111111111111111111111111L) << 2); + output[outputOffset + 12] = ((v10 >>> 56) & 0b11111111L) | ((v11 & 0b11111111111111111111111111111111111111111111111111L) << 8); + output[outputOffset + 13] = ((v11 >>> 50) & 0b11111111111111L) | ((v12 & 0b11111111111111111111111111111111111111111111L) << 14); + output[outputOffset + 14] = ((v12 >>> 44) & 0b11111111111111111111L) | ((v13 & 0b11111111111111111111111111111111111111L) << 20); + output[outputOffset + 15] = ((v13 >>> 38) & 0b11111111111111111111111111L) | ((v14 & 0b11111111111111111111111111111111L) << 26); + output[outputOffset + 16] = ((v14 >>> 32) & 0b11111111111111111111111111111111L) | ((v15 & 0b11111111111111111111111111L) << 32); + output[outputOffset + 17] = ((v15 >>> 26) & 0b11111111111111111111111111111111111111L) | ((v16 & 0b11111111111111111111L) << 38); + output[outputOffset + 18] = ((v16 >>> 20) & 0b11111111111111111111111111111111111111111111L) | ((v17 & 0b11111111111111L) << 44); + output[outputOffset + 19] = ((v17 >>> 14) & 0b11111111111111111111111111111111111111111111111111L) | ((v18 & 0b11111111L) << 50); + output[outputOffset + 20] = ((v18 >>> 8) & 0b11111111111111111111111111111111111111111111111111111111L) | ((v19 & 0b11L) << 56); + output[outputOffset + 21] = (v19 >>> 2) & 0b1111111111111111111111111111111111111111111111111111111111L; + output[outputOffset + 22] = ((v19 >>> 60) & 0b1111L) | ((v20 & 0b111111111111111111111111111111111111111111111111111111L) << 4); + output[outputOffset + 23] = ((v20 >>> 54) & 0b1111111111L) | ((v21 & 0b111111111111111111111111111111111111111111111111L) << 10); + output[outputOffset + 24] = ((v21 >>> 48) & 0b1111111111111111L) | ((v22 & 0b111111111111111111111111111111111111111111L) << 16); + 
output[outputOffset + 25] = ((v22 >>> 42) & 0b1111111111111111111111L) | ((v23 & 0b111111111111111111111111111111111111L) << 22); + output[outputOffset + 26] = ((v23 >>> 36) & 0b1111111111111111111111111111L) | ((v24 & 0b111111111111111111111111111111L) << 28); + output[outputOffset + 27] = ((v24 >>> 30) & 0b1111111111111111111111111111111111L) | ((v25 & 0b111111111111111111111111L) << 34); + output[outputOffset + 28] = ((v25 >>> 24) & 0b1111111111111111111111111111111111111111L) | ((v26 & 0b111111111111111111L) << 40); + output[outputOffset + 29] = ((v26 >>> 18) & 0b1111111111111111111111111111111111111111111111L) | ((v27 & 0b111111111111L) << 46); + output[outputOffset + 30] = ((v27 >>> 12) & 0b1111111111111111111111111111111111111111111111111111L) | ((v28 & 0b111111L) << 52); + output[outputOffset + 31] = (v28 >>> 6) & 0b1111111111111111111111111111111111111111111111111111111111L; + } + } + + private static final class Unpacker59 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + long v8 = input.readLong(); + long v9 = input.readLong(); + long v10 = input.readLong(); + long v11 = input.readLong(); + long v12 = input.readLong(); + long v13 = input.readLong(); + long v14 = input.readLong(); + long v15 = input.readLong(); + long v16 = input.readLong(); + long v17 = input.readLong(); + long v18 = input.readLong(); + long v19 = input.readLong(); + long v20 = input.readLong(); + long v21 = input.readLong(); + long v22 = input.readLong(); + long v23 = input.readLong(); + long v24 = input.readLong(); + long v25 = input.readLong(); + long v26 = input.readLong(); + long v27 = input.readLong(); + long v28 = input.readLong(); + int v29 = input.readInt(); + output[outputOffset] = v0 & 0b11111111111111111111111111111111111111111111111111111111111L; + output[outputOffset + 1] = ((v0 >>> 59) & 0b11111L) | ((v1 & 0b111111111111111111111111111111111111111111111111111111L) << 5); + output[outputOffset + 2] = ((v1 >>> 54) & 0b1111111111L) | ((v2 & 0b1111111111111111111111111111111111111111111111111L) << 10); + output[outputOffset + 3] = ((v2 >>> 49) & 0b111111111111111L) | ((v3 & 0b11111111111111111111111111111111111111111111L) << 15); + output[outputOffset + 4] = ((v3 >>> 44) & 0b11111111111111111111L) | ((v4 & 0b111111111111111111111111111111111111111L) << 20); + output[outputOffset + 5] = ((v4 >>> 39) & 0b1111111111111111111111111L) | ((v5 & 0b1111111111111111111111111111111111L) << 25); + output[outputOffset + 6] = ((v5 >>> 34) & 0b111111111111111111111111111111L) | ((v6 & 0b11111111111111111111111111111L) << 30); + output[outputOffset + 7] = ((v6 >>> 29) & 0b11111111111111111111111111111111111L) | ((v7 & 0b111111111111111111111111L) << 35); + output[outputOffset + 8] = ((v7 >>> 24) & 0b1111111111111111111111111111111111111111L) | ((v8 & 0b1111111111111111111L) << 40); + output[outputOffset + 9] = ((v8 >>> 19) & 0b111111111111111111111111111111111111111111111L) | ((v9 & 0b11111111111111L) << 45); + output[outputOffset + 10] = ((v9 >>> 14) & 
0b11111111111111111111111111111111111111111111111111L) | ((v10 & 0b111111111L) << 50); + output[outputOffset + 11] = ((v10 >>> 9) & 0b1111111111111111111111111111111111111111111111111111111L) | ((v11 & 0b1111L) << 55); + output[outputOffset + 12] = (v11 >>> 4) & 0b11111111111111111111111111111111111111111111111111111111111L; + output[outputOffset + 13] = ((v11 >>> 63) & 0b1L) | ((v12 & 0b1111111111111111111111111111111111111111111111111111111111L) << 1); + output[outputOffset + 14] = ((v12 >>> 58) & 0b111111L) | ((v13 & 0b11111111111111111111111111111111111111111111111111111L) << 6); + output[outputOffset + 15] = ((v13 >>> 53) & 0b11111111111L) | ((v14 & 0b111111111111111111111111111111111111111111111111L) << 11); + output[outputOffset + 16] = ((v14 >>> 48) & 0b1111111111111111L) | ((v15 & 0b1111111111111111111111111111111111111111111L) << 16); + output[outputOffset + 17] = ((v15 >>> 43) & 0b111111111111111111111L) | ((v16 & 0b11111111111111111111111111111111111111L) << 21); + output[outputOffset + 18] = ((v16 >>> 38) & 0b11111111111111111111111111L) | ((v17 & 0b111111111111111111111111111111111L) << 26); + output[outputOffset + 19] = ((v17 >>> 33) & 0b1111111111111111111111111111111L) | ((v18 & 0b1111111111111111111111111111L) << 31); + output[outputOffset + 20] = ((v18 >>> 28) & 0b111111111111111111111111111111111111L) | ((v19 & 0b11111111111111111111111L) << 36); + output[outputOffset + 21] = ((v19 >>> 23) & 0b11111111111111111111111111111111111111111L) | ((v20 & 0b111111111111111111L) << 41); + output[outputOffset + 22] = ((v20 >>> 18) & 0b1111111111111111111111111111111111111111111111L) | ((v21 & 0b1111111111111L) << 46); + output[outputOffset + 23] = ((v21 >>> 13) & 0b111111111111111111111111111111111111111111111111111L) | ((v22 & 0b11111111L) << 51); + output[outputOffset + 24] = ((v22 >>> 8) & 0b11111111111111111111111111111111111111111111111111111111L) | ((v23 & 0b111L) << 56); + output[outputOffset + 25] = (v23 >>> 3) & 0b11111111111111111111111111111111111111111111111111111111111L; + output[outputOffset + 26] = ((v23 >>> 62) & 0b11L) | ((v24 & 0b111111111111111111111111111111111111111111111111111111111L) << 2); + output[outputOffset + 27] = ((v24 >>> 57) & 0b1111111L) | ((v25 & 0b1111111111111111111111111111111111111111111111111111L) << 7); + output[outputOffset + 28] = ((v25 >>> 52) & 0b111111111111L) | ((v26 & 0b11111111111111111111111111111111111111111111111L) << 12); + output[outputOffset + 29] = ((v26 >>> 47) & 0b11111111111111111L) | ((v27 & 0b111111111111111111111111111111111111111111L) << 17); + output[outputOffset + 30] = ((v27 >>> 42) & 0b1111111111111111111111L) | ((v28 & 0b1111111111111111111111111111111111111L) << 22); + output[outputOffset + 31] = ((v28 >>> 37) & 0b111111111111111111111111111L) | ((v29 & 0b11111111111111111111111111111111L) << 27); + } + } + + private static final class Unpacker60 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + long v8 = input.readLong(); + long v9 = input.readLong(); + long v10 = 
input.readLong(); + long v11 = input.readLong(); + long v12 = input.readLong(); + long v13 = input.readLong(); + long v14 = input.readLong(); + long v15 = input.readLong(); + long v16 = input.readLong(); + long v17 = input.readLong(); + long v18 = input.readLong(); + long v19 = input.readLong(); + long v20 = input.readLong(); + long v21 = input.readLong(); + long v22 = input.readLong(); + long v23 = input.readLong(); + long v24 = input.readLong(); + long v25 = input.readLong(); + long v26 = input.readLong(); + long v27 = input.readLong(); + long v28 = input.readLong(); + long v29 = input.readLong(); + output[outputOffset] = v0 & 0b111111111111111111111111111111111111111111111111111111111111L; + output[outputOffset + 1] = ((v0 >>> 60) & 0b1111L) | ((v1 & 0b11111111111111111111111111111111111111111111111111111111L) << 4); + output[outputOffset + 2] = ((v1 >>> 56) & 0b11111111L) | ((v2 & 0b1111111111111111111111111111111111111111111111111111L) << 8); + output[outputOffset + 3] = ((v2 >>> 52) & 0b111111111111L) | ((v3 & 0b111111111111111111111111111111111111111111111111L) << 12); + output[outputOffset + 4] = ((v3 >>> 48) & 0b1111111111111111L) | ((v4 & 0b11111111111111111111111111111111111111111111L) << 16); + output[outputOffset + 5] = ((v4 >>> 44) & 0b11111111111111111111L) | ((v5 & 0b1111111111111111111111111111111111111111L) << 20); + output[outputOffset + 6] = ((v5 >>> 40) & 0b111111111111111111111111L) | ((v6 & 0b111111111111111111111111111111111111L) << 24); + output[outputOffset + 7] = ((v6 >>> 36) & 0b1111111111111111111111111111L) | ((v7 & 0b11111111111111111111111111111111L) << 28); + output[outputOffset + 8] = ((v7 >>> 32) & 0b11111111111111111111111111111111L) | ((v8 & 0b1111111111111111111111111111L) << 32); + output[outputOffset + 9] = ((v8 >>> 28) & 0b111111111111111111111111111111111111L) | ((v9 & 0b111111111111111111111111L) << 36); + output[outputOffset + 10] = ((v9 >>> 24) & 0b1111111111111111111111111111111111111111L) | ((v10 & 0b11111111111111111111L) << 40); + output[outputOffset + 11] = ((v10 >>> 20) & 0b11111111111111111111111111111111111111111111L) | ((v11 & 0b1111111111111111L) << 44); + output[outputOffset + 12] = ((v11 >>> 16) & 0b111111111111111111111111111111111111111111111111L) | ((v12 & 0b111111111111L) << 48); + output[outputOffset + 13] = ((v12 >>> 12) & 0b1111111111111111111111111111111111111111111111111111L) | ((v13 & 0b11111111L) << 52); + output[outputOffset + 14] = ((v13 >>> 8) & 0b11111111111111111111111111111111111111111111111111111111L) | ((v14 & 0b1111L) << 56); + output[outputOffset + 15] = (v14 >>> 4) & 0b111111111111111111111111111111111111111111111111111111111111L; + output[outputOffset + 16] = v15 & 0b111111111111111111111111111111111111111111111111111111111111L; + output[outputOffset + 17] = ((v15 >>> 60) & 0b1111L) | ((v16 & 0b11111111111111111111111111111111111111111111111111111111L) << 4); + output[outputOffset + 18] = ((v16 >>> 56) & 0b11111111L) | ((v17 & 0b1111111111111111111111111111111111111111111111111111L) << 8); + output[outputOffset + 19] = ((v17 >>> 52) & 0b111111111111L) | ((v18 & 0b111111111111111111111111111111111111111111111111L) << 12); + output[outputOffset + 20] = ((v18 >>> 48) & 0b1111111111111111L) | ((v19 & 0b11111111111111111111111111111111111111111111L) << 16); + output[outputOffset + 21] = ((v19 >>> 44) & 0b11111111111111111111L) | ((v20 & 0b1111111111111111111111111111111111111111L) << 20); + output[outputOffset + 22] = ((v20 >>> 40) & 0b111111111111111111111111L) | ((v21 & 0b111111111111111111111111111111111111L) << 
24); + output[outputOffset + 23] = ((v21 >>> 36) & 0b1111111111111111111111111111L) | ((v22 & 0b11111111111111111111111111111111L) << 28); + output[outputOffset + 24] = ((v22 >>> 32) & 0b11111111111111111111111111111111L) | ((v23 & 0b1111111111111111111111111111L) << 32); + output[outputOffset + 25] = ((v23 >>> 28) & 0b111111111111111111111111111111111111L) | ((v24 & 0b111111111111111111111111L) << 36); + output[outputOffset + 26] = ((v24 >>> 24) & 0b1111111111111111111111111111111111111111L) | ((v25 & 0b11111111111111111111L) << 40); + output[outputOffset + 27] = ((v25 >>> 20) & 0b11111111111111111111111111111111111111111111L) | ((v26 & 0b1111111111111111L) << 44); + output[outputOffset + 28] = ((v26 >>> 16) & 0b111111111111111111111111111111111111111111111111L) | ((v27 & 0b111111111111L) << 48); + output[outputOffset + 29] = ((v27 >>> 12) & 0b1111111111111111111111111111111111111111111111111111L) | ((v28 & 0b11111111L) << 52); + output[outputOffset + 30] = ((v28 >>> 8) & 0b11111111111111111111111111111111111111111111111111111111L) | ((v29 & 0b1111L) << 56); + output[outputOffset + 31] = (v29 >>> 4) & 0b111111111111111111111111111111111111111111111111111111111111L; + } + } + + private static final class Unpacker61 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + long v8 = input.readLong(); + long v9 = input.readLong(); + long v10 = input.readLong(); + long v11 = input.readLong(); + long v12 = input.readLong(); + long v13 = input.readLong(); + long v14 = input.readLong(); + long v15 = input.readLong(); + long v16 = input.readLong(); + long v17 = input.readLong(); + long v18 = input.readLong(); + long v19 = input.readLong(); + long v20 = input.readLong(); + long v21 = input.readLong(); + long v22 = input.readLong(); + long v23 = input.readLong(); + long v24 = input.readLong(); + long v25 = input.readLong(); + long v26 = input.readLong(); + long v27 = input.readLong(); + long v28 = input.readLong(); + long v29 = input.readLong(); + int v30 = input.readInt(); + output[outputOffset] = v0 & 0b1111111111111111111111111111111111111111111111111111111111111L; + output[outputOffset + 1] = ((v0 >>> 61) & 0b111L) | ((v1 & 0b1111111111111111111111111111111111111111111111111111111111L) << 3); + output[outputOffset + 2] = ((v1 >>> 58) & 0b111111L) | ((v2 & 0b1111111111111111111111111111111111111111111111111111111L) << 6); + output[outputOffset + 3] = ((v2 >>> 55) & 0b111111111L) | ((v3 & 0b1111111111111111111111111111111111111111111111111111L) << 9); + output[outputOffset + 4] = ((v3 >>> 52) & 0b111111111111L) | ((v4 & 0b1111111111111111111111111111111111111111111111111L) << 12); + output[outputOffset + 5] = ((v4 >>> 49) & 0b111111111111111L) | ((v5 & 0b1111111111111111111111111111111111111111111111L) << 15); + output[outputOffset + 6] = ((v5 >>> 46) & 0b111111111111111111L) | ((v6 & 0b1111111111111111111111111111111111111111111L) << 18); + output[outputOffset + 7] = ((v6 >>> 43) & 0b111111111111111111111L) | ((v7 & 0b1111111111111111111111111111111111111111L) << 
21); + output[outputOffset + 8] = ((v7 >>> 40) & 0b111111111111111111111111L) | ((v8 & 0b1111111111111111111111111111111111111L) << 24); + output[outputOffset + 9] = ((v8 >>> 37) & 0b111111111111111111111111111L) | ((v9 & 0b1111111111111111111111111111111111L) << 27); + output[outputOffset + 10] = ((v9 >>> 34) & 0b111111111111111111111111111111L) | ((v10 & 0b1111111111111111111111111111111L) << 30); + output[outputOffset + 11] = ((v10 >>> 31) & 0b111111111111111111111111111111111L) | ((v11 & 0b1111111111111111111111111111L) << 33); + output[outputOffset + 12] = ((v11 >>> 28) & 0b111111111111111111111111111111111111L) | ((v12 & 0b1111111111111111111111111L) << 36); + output[outputOffset + 13] = ((v12 >>> 25) & 0b111111111111111111111111111111111111111L) | ((v13 & 0b1111111111111111111111L) << 39); + output[outputOffset + 14] = ((v13 >>> 22) & 0b111111111111111111111111111111111111111111L) | ((v14 & 0b1111111111111111111L) << 42); + output[outputOffset + 15] = ((v14 >>> 19) & 0b111111111111111111111111111111111111111111111L) | ((v15 & 0b1111111111111111L) << 45); + output[outputOffset + 16] = ((v15 >>> 16) & 0b111111111111111111111111111111111111111111111111L) | ((v16 & 0b1111111111111L) << 48); + output[outputOffset + 17] = ((v16 >>> 13) & 0b111111111111111111111111111111111111111111111111111L) | ((v17 & 0b1111111111L) << 51); + output[outputOffset + 18] = ((v17 >>> 10) & 0b111111111111111111111111111111111111111111111111111111L) | ((v18 & 0b1111111L) << 54); + output[outputOffset + 19] = ((v18 >>> 7) & 0b111111111111111111111111111111111111111111111111111111111L) | ((v19 & 0b1111L) << 57); + output[outputOffset + 20] = ((v19 >>> 4) & 0b111111111111111111111111111111111111111111111111111111111111L) | ((v20 & 0b1L) << 60); + output[outputOffset + 21] = (v20 >>> 1) & 0b1111111111111111111111111111111111111111111111111111111111111L; + output[outputOffset + 22] = ((v20 >>> 62) & 0b11L) | ((v21 & 0b11111111111111111111111111111111111111111111111111111111111L) << 2); + output[outputOffset + 23] = ((v21 >>> 59) & 0b11111L) | ((v22 & 0b11111111111111111111111111111111111111111111111111111111L) << 5); + output[outputOffset + 24] = ((v22 >>> 56) & 0b11111111L) | ((v23 & 0b11111111111111111111111111111111111111111111111111111L) << 8); + output[outputOffset + 25] = ((v23 >>> 53) & 0b11111111111L) | ((v24 & 0b11111111111111111111111111111111111111111111111111L) << 11); + output[outputOffset + 26] = ((v24 >>> 50) & 0b11111111111111L) | ((v25 & 0b11111111111111111111111111111111111111111111111L) << 14); + output[outputOffset + 27] = ((v25 >>> 47) & 0b11111111111111111L) | ((v26 & 0b11111111111111111111111111111111111111111111L) << 17); + output[outputOffset + 28] = ((v26 >>> 44) & 0b11111111111111111111L) | ((v27 & 0b11111111111111111111111111111111111111111L) << 20); + output[outputOffset + 29] = ((v27 >>> 41) & 0b11111111111111111111111L) | ((v28 & 0b11111111111111111111111111111111111111L) << 23); + output[outputOffset + 30] = ((v28 >>> 38) & 0b11111111111111111111111111L) | ((v29 & 0b11111111111111111111111111111111111L) << 26); + output[outputOffset + 31] = ((v29 >>> 35) & 0b11111111111111111111111111111L) | ((v30 & 0b11111111111111111111111111111111L) << 29); + } + } + + private static final class Unpacker62 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] 
output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + long v8 = input.readLong(); + long v9 = input.readLong(); + long v10 = input.readLong(); + long v11 = input.readLong(); + long v12 = input.readLong(); + long v13 = input.readLong(); + long v14 = input.readLong(); + long v15 = input.readLong(); + long v16 = input.readLong(); + long v17 = input.readLong(); + long v18 = input.readLong(); + long v19 = input.readLong(); + long v20 = input.readLong(); + long v21 = input.readLong(); + long v22 = input.readLong(); + long v23 = input.readLong(); + long v24 = input.readLong(); + long v25 = input.readLong(); + long v26 = input.readLong(); + long v27 = input.readLong(); + long v28 = input.readLong(); + long v29 = input.readLong(); + long v30 = input.readLong(); + output[outputOffset] = v0 & 0b11111111111111111111111111111111111111111111111111111111111111L; + output[outputOffset + 1] = ((v0 >>> 62) & 0b11L) | ((v1 & 0b111111111111111111111111111111111111111111111111111111111111L) << 2); + output[outputOffset + 2] = ((v1 >>> 60) & 0b1111L) | ((v2 & 0b1111111111111111111111111111111111111111111111111111111111L) << 4); + output[outputOffset + 3] = ((v2 >>> 58) & 0b111111L) | ((v3 & 0b11111111111111111111111111111111111111111111111111111111L) << 6); + output[outputOffset + 4] = ((v3 >>> 56) & 0b11111111L) | ((v4 & 0b111111111111111111111111111111111111111111111111111111L) << 8); + output[outputOffset + 5] = ((v4 >>> 54) & 0b1111111111L) | ((v5 & 0b1111111111111111111111111111111111111111111111111111L) << 10); + output[outputOffset + 6] = ((v5 >>> 52) & 0b111111111111L) | ((v6 & 0b11111111111111111111111111111111111111111111111111L) << 12); + output[outputOffset + 7] = ((v6 >>> 50) & 0b11111111111111L) | ((v7 & 0b111111111111111111111111111111111111111111111111L) << 14); + output[outputOffset + 8] = ((v7 >>> 48) & 0b1111111111111111L) | ((v8 & 0b1111111111111111111111111111111111111111111111L) << 16); + output[outputOffset + 9] = ((v8 >>> 46) & 0b111111111111111111L) | ((v9 & 0b11111111111111111111111111111111111111111111L) << 18); + output[outputOffset + 10] = ((v9 >>> 44) & 0b11111111111111111111L) | ((v10 & 0b111111111111111111111111111111111111111111L) << 20); + output[outputOffset + 11] = ((v10 >>> 42) & 0b1111111111111111111111L) | ((v11 & 0b1111111111111111111111111111111111111111L) << 22); + output[outputOffset + 12] = ((v11 >>> 40) & 0b111111111111111111111111L) | ((v12 & 0b11111111111111111111111111111111111111L) << 24); + output[outputOffset + 13] = ((v12 >>> 38) & 0b11111111111111111111111111L) | ((v13 & 0b111111111111111111111111111111111111L) << 26); + output[outputOffset + 14] = ((v13 >>> 36) & 0b1111111111111111111111111111L) | ((v14 & 0b1111111111111111111111111111111111L) << 28); + output[outputOffset + 15] = ((v14 >>> 34) & 0b111111111111111111111111111111L) | ((v15 & 0b11111111111111111111111111111111L) << 30); + output[outputOffset + 16] = ((v15 >>> 32) & 0b11111111111111111111111111111111L) | ((v16 & 0b111111111111111111111111111111L) << 32); + output[outputOffset + 17] = ((v16 >>> 30) & 0b1111111111111111111111111111111111L) | ((v17 & 0b1111111111111111111111111111L) << 34); + output[outputOffset + 18] = ((v17 >>> 28) & 0b111111111111111111111111111111111111L) | ((v18 & 0b11111111111111111111111111L) << 36); + output[outputOffset + 19] = 
((v18 >>> 26) & 0b11111111111111111111111111111111111111L) | ((v19 & 0b111111111111111111111111L) << 38); + output[outputOffset + 20] = ((v19 >>> 24) & 0b1111111111111111111111111111111111111111L) | ((v20 & 0b1111111111111111111111L) << 40); + output[outputOffset + 21] = ((v20 >>> 22) & 0b111111111111111111111111111111111111111111L) | ((v21 & 0b11111111111111111111L) << 42); + output[outputOffset + 22] = ((v21 >>> 20) & 0b11111111111111111111111111111111111111111111L) | ((v22 & 0b111111111111111111L) << 44); + output[outputOffset + 23] = ((v22 >>> 18) & 0b1111111111111111111111111111111111111111111111L) | ((v23 & 0b1111111111111111L) << 46); + output[outputOffset + 24] = ((v23 >>> 16) & 0b111111111111111111111111111111111111111111111111L) | ((v24 & 0b11111111111111L) << 48); + output[outputOffset + 25] = ((v24 >>> 14) & 0b11111111111111111111111111111111111111111111111111L) | ((v25 & 0b111111111111L) << 50); + output[outputOffset + 26] = ((v25 >>> 12) & 0b1111111111111111111111111111111111111111111111111111L) | ((v26 & 0b1111111111L) << 52); + output[outputOffset + 27] = ((v26 >>> 10) & 0b111111111111111111111111111111111111111111111111111111L) | ((v27 & 0b11111111L) << 54); + output[outputOffset + 28] = ((v27 >>> 8) & 0b11111111111111111111111111111111111111111111111111111111L) | ((v28 & 0b111111L) << 56); + output[outputOffset + 29] = ((v28 >>> 6) & 0b1111111111111111111111111111111111111111111111111111111111L) | ((v29 & 0b1111L) << 58); + output[outputOffset + 30] = ((v29 >>> 4) & 0b111111111111111111111111111111111111111111111111111111111111L) | ((v30 & 0b11L) << 60); + output[outputOffset + 31] = (v30 >>> 2) & 0b11111111111111111111111111111111111111111111111111111111111111L; + } + } + + private static final class Unpacker63 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(long[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + long v8 = input.readLong(); + long v9 = input.readLong(); + long v10 = input.readLong(); + long v11 = input.readLong(); + long v12 = input.readLong(); + long v13 = input.readLong(); + long v14 = input.readLong(); + long v15 = input.readLong(); + long v16 = input.readLong(); + long v17 = input.readLong(); + long v18 = input.readLong(); + long v19 = input.readLong(); + long v20 = input.readLong(); + long v21 = input.readLong(); + long v22 = input.readLong(); + long v23 = input.readLong(); + long v24 = input.readLong(); + long v25 = input.readLong(); + long v26 = input.readLong(); + long v27 = input.readLong(); + long v28 = input.readLong(); + long v29 = input.readLong(); + long v30 = input.readLong(); + int v31 = input.readInt(); + output[outputOffset] = v0 & 0b111111111111111111111111111111111111111111111111111111111111111L; + output[outputOffset + 1] = ((v0 >>> 63) & 0b1L) | ((v1 & 0b11111111111111111111111111111111111111111111111111111111111111L) << 1); + output[outputOffset + 2] = ((v1 >>> 62) & 0b11L) | ((v2 & 0b1111111111111111111111111111111111111111111111111111111111111L) << 2); + output[outputOffset + 3] = ((v2 >>> 61) & 0b111L) | ((v3 & 
0b111111111111111111111111111111111111111111111111111111111111L) << 3); + output[outputOffset + 4] = ((v3 >>> 60) & 0b1111L) | ((v4 & 0b11111111111111111111111111111111111111111111111111111111111L) << 4); + output[outputOffset + 5] = ((v4 >>> 59) & 0b11111L) | ((v5 & 0b1111111111111111111111111111111111111111111111111111111111L) << 5); + output[outputOffset + 6] = ((v5 >>> 58) & 0b111111L) | ((v6 & 0b111111111111111111111111111111111111111111111111111111111L) << 6); + output[outputOffset + 7] = ((v6 >>> 57) & 0b1111111L) | ((v7 & 0b11111111111111111111111111111111111111111111111111111111L) << 7); + output[outputOffset + 8] = ((v7 >>> 56) & 0b11111111L) | ((v8 & 0b1111111111111111111111111111111111111111111111111111111L) << 8); + output[outputOffset + 9] = ((v8 >>> 55) & 0b111111111L) | ((v9 & 0b111111111111111111111111111111111111111111111111111111L) << 9); + output[outputOffset + 10] = ((v9 >>> 54) & 0b1111111111L) | ((v10 & 0b11111111111111111111111111111111111111111111111111111L) << 10); + output[outputOffset + 11] = ((v10 >>> 53) & 0b11111111111L) | ((v11 & 0b1111111111111111111111111111111111111111111111111111L) << 11); + output[outputOffset + 12] = ((v11 >>> 52) & 0b111111111111L) | ((v12 & 0b111111111111111111111111111111111111111111111111111L) << 12); + output[outputOffset + 13] = ((v12 >>> 51) & 0b1111111111111L) | ((v13 & 0b11111111111111111111111111111111111111111111111111L) << 13); + output[outputOffset + 14] = ((v13 >>> 50) & 0b11111111111111L) | ((v14 & 0b1111111111111111111111111111111111111111111111111L) << 14); + output[outputOffset + 15] = ((v14 >>> 49) & 0b111111111111111L) | ((v15 & 0b111111111111111111111111111111111111111111111111L) << 15); + output[outputOffset + 16] = ((v15 >>> 48) & 0b1111111111111111L) | ((v16 & 0b11111111111111111111111111111111111111111111111L) << 16); + output[outputOffset + 17] = ((v16 >>> 47) & 0b11111111111111111L) | ((v17 & 0b1111111111111111111111111111111111111111111111L) << 17); + output[outputOffset + 18] = ((v17 >>> 46) & 0b111111111111111111L) | ((v18 & 0b111111111111111111111111111111111111111111111L) << 18); + output[outputOffset + 19] = ((v18 >>> 45) & 0b1111111111111111111L) | ((v19 & 0b11111111111111111111111111111111111111111111L) << 19); + output[outputOffset + 20] = ((v19 >>> 44) & 0b11111111111111111111L) | ((v20 & 0b1111111111111111111111111111111111111111111L) << 20); + output[outputOffset + 21] = ((v20 >>> 43) & 0b111111111111111111111L) | ((v21 & 0b111111111111111111111111111111111111111111L) << 21); + output[outputOffset + 22] = ((v21 >>> 42) & 0b1111111111111111111111L) | ((v22 & 0b11111111111111111111111111111111111111111L) << 22); + output[outputOffset + 23] = ((v22 >>> 41) & 0b11111111111111111111111L) | ((v23 & 0b1111111111111111111111111111111111111111L) << 23); + output[outputOffset + 24] = ((v23 >>> 40) & 0b111111111111111111111111L) | ((v24 & 0b111111111111111111111111111111111111111L) << 24); + output[outputOffset + 25] = ((v24 >>> 39) & 0b1111111111111111111111111L) | ((v25 & 0b11111111111111111111111111111111111111L) << 25); + output[outputOffset + 26] = ((v25 >>> 38) & 0b11111111111111111111111111L) | ((v26 & 0b1111111111111111111111111111111111111L) << 26); + output[outputOffset + 27] = ((v26 >>> 37) & 0b111111111111111111111111111L) | ((v27 & 0b111111111111111111111111111111111111L) << 27); + output[outputOffset + 28] = ((v27 >>> 36) & 0b1111111111111111111111111111L) | ((v28 & 0b11111111111111111111111111111111111L) << 28); + output[outputOffset + 29] = ((v28 >>> 35) & 0b11111111111111111111111111111L) | 
((v29 & 0b1111111111111111111111111111111111L) << 29); + output[outputOffset + 30] = ((v29 >>> 34) & 0b111111111111111111111111111111L) | ((v30 & 0b111111111111111111111111111111111L) << 30); + output[outputOffset + 31] = ((v30 >>> 33) & 0b1111111111111111111111111111111L) | ((v31 & 0b11111111111111111111111111111111L) << 31); + } + } + + private static class Unpacker64 + implements LongBitUnpacker + { + @Override + public void unpack(long[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + input.readLongs(output, outputOffset, length); + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/PlainByteArrayDecoders.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/PlainByteArrayDecoders.java new file mode 100644 index 000000000000..f3d0e333f940 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/PlainByteArrayDecoders.java @@ -0,0 +1,241 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.reader.decoders; + +import io.airlift.slice.Slice; +import io.airlift.slice.Slices; +import io.trino.parquet.reader.SimpleSliceInputStream; +import io.trino.parquet.reader.flat.BinaryBuffer; +import io.trino.spi.type.CharType; +import io.trino.spi.type.VarcharType; + +import static com.google.common.base.Preconditions.checkArgument; +import static io.trino.spi.type.Chars.byteCountWithoutTrailingSpace; +import static io.trino.spi.type.Varchars.byteCount; +import static java.lang.System.arraycopy; +import static java.util.Objects.requireNonNull; + +/** + * read methods in this class calculate offsets and lengths of positions and then + * create a single byte array that is pushed to the output buffer + */ +public class PlainByteArrayDecoders +{ + private PlainByteArrayDecoders() {} + + public static final class BoundedVarcharPlainValueDecoder + implements ValueDecoder + { + private final int boundedLength; + + private SimpleSliceInputStream input; + + public BoundedVarcharPlainValueDecoder(VarcharType varcharType) + { + checkArgument( + !varcharType.isUnbounded(), + "Trino type %s is not a bounded varchar", + varcharType); + this.boundedLength = varcharType.getBoundedLength(); + } + + @Override + public void init(SimpleSliceInputStream input) + { + this.input = requireNonNull(input, "input is null"); + } + + @Override + public void read(BinaryBuffer values, int offset, int length) + { + Slice inputSlice = input.asSlice(); + // Output offsets array is used as a temporary space for position lengths + int[] offsets = values.getOffsets(); + int currentInputOffset = 0; + int outputBufferSize = 0; + + for (int i = offset; i < offset + length; i++) { + int positionLength = inputSlice.getInt(currentInputOffset); + currentInputOffset += Integer.BYTES; + int outputLength = byteCount(inputSlice, currentInputOffset, positionLength, boundedLength); + offsets[i + 1] = outputLength; + outputBufferSize += outputLength; + currentInputOffset += positionLength; + } + + 
createOutputBuffer(values, offset, length, inputSlice, outputBufferSize); + input.skip(currentInputOffset); + } + + @Override + public void skip(int n) + { + skipPlainValues(input, n); + } + } + + public static final class CharPlainValueDecoder + implements ValueDecoder + { + private final int maxLength; + + private SimpleSliceInputStream input; + + public CharPlainValueDecoder(CharType charType) + { + this.maxLength = charType.getLength(); + } + + @Override + public void init(SimpleSliceInputStream input) + { + this.input = requireNonNull(input, "input is null"); + } + + @Override + public void read(BinaryBuffer values, int offset, int length) + { + Slice inputSlice = input.asSlice(); + // Output offsets array is used as a temporary space for position lengths + int[] offsets = values.getOffsets(); + int currentInputOffset = 0; + int outputBufferSize = 0; + + for (int i = offset; i < offset + length; i++) { + int positionLength = inputSlice.getInt(currentInputOffset); + currentInputOffset += Integer.BYTES; + int outputLength = byteCountWithoutTrailingSpace(inputSlice, currentInputOffset, positionLength, maxLength); + offsets[i + 1] = outputLength; + outputBufferSize += outputLength; + currentInputOffset += positionLength; + } + + createOutputBuffer(values, offset, length, inputSlice, outputBufferSize); + input.skip(currentInputOffset); + } + + @Override + public void skip(int n) + { + skipPlainValues(input, n); + } + } + + public static final class BinaryPlainValueDecoder + implements ValueDecoder + { + private SimpleSliceInputStream input; + + @Override + public void init(SimpleSliceInputStream input) + { + this.input = requireNonNull(input, "input is null"); + } + + @Override + public void read(BinaryBuffer values, int offset, int length) + { + Slice inputSlice = input.asSlice(); + // Output offsets array is used as a temporary space for position lengths + int[] offsets = values.getOffsets(); + int currentInputOffset = 0; + int outputBufferSize = 0; + + for (int i = 0; i < length; i++) { + int positionLength = inputSlice.getInt(currentInputOffset); + offsets[offset + i + 1] = positionLength; + outputBufferSize += positionLength; + currentInputOffset += positionLength + Integer.BYTES; + } + + values.addChunk(createOutputBuffer(values.getOffsets(), offset, length, inputSlice, outputBufferSize)); + input.skip(currentInputOffset); + } + + @Override + public void skip(int n) + { + skipPlainValues(input, n); + } + + /** + * Create one big slice of data and add it to the output buffer since buffer size is known. 
+ * Specialized for the case of strings with no truncation + */ + private static Slice createOutputBuffer(int[] offsets, int offset, int length, Slice inputSlice, int outputBufferSize) + { + byte[] outputBuffer = new byte[outputBufferSize]; + int currentInputOffset = 0; + int currentOutputOffset = 0; + + byte[] inputArray; + int inputArrayOffset; + if (length != 0) { + inputArray = inputSlice.byteArray(); + inputArrayOffset = inputSlice.byteArrayOffset(); + } + else { + inputArray = new byte[0]; + inputArrayOffset = 0; + } + for (int i = 0; i < length; i++) { + int positionLength = offsets[offset + i + 1]; + arraycopy(inputArray, inputArrayOffset + currentInputOffset + Integer.BYTES, outputBuffer, currentOutputOffset, positionLength); + offsets[offset + i + 1] = offsets[offset + i] + positionLength; + currentInputOffset += positionLength + Integer.BYTES; + currentOutputOffset += positionLength; + } + return Slices.wrappedBuffer(outputBuffer); + } + } + + private static void skipPlainValues(SimpleSliceInputStream input, int n) + { + for (int i = 0; i < n; i++) { + int positionLength = input.readInt(); + input.skip(positionLength); + } + } + + /** + * Create one big slice of data and add it to the output buffer since buffer size is known + */ + private static void createOutputBuffer(BinaryBuffer values, int offset, int length, Slice inputSlice, int outputBufferSize) + { + int[] offsets = values.getOffsets(); + byte[] outputBuffer = new byte[outputBufferSize]; + int currentInputOffset = 0; + int currentOutputOffset = 0; + + byte[] inputArray; + int inputArrayOffset; + if (length != 0) { + inputArray = inputSlice.byteArray(); + inputArrayOffset = inputSlice.byteArrayOffset(); + } + else { + inputArray = new byte[0]; + inputArrayOffset = 0; + } + for (int i = 0; i < length; i++) { + int inputPositionLength = inputSlice.getInt(currentInputOffset); + int outputPositionLength = offsets[offset + i + 1]; + arraycopy(inputArray, inputArrayOffset + currentInputOffset + Integer.BYTES, outputBuffer, currentOutputOffset, outputPositionLength); + offsets[offset + i + 1] = offsets[offset + i] + outputPositionLength; + currentInputOffset += inputPositionLength + Integer.BYTES; + currentOutputOffset += outputPositionLength; + } + values.addChunk(Slices.wrappedBuffer(outputBuffer)); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/PlainValueDecoders.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/PlainValueDecoders.java new file mode 100644 index 000000000000..fa315ec3e61e --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/PlainValueDecoders.java @@ -0,0 +1,329 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
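The next file, PlainValueDecoders.java, includes ShortDecimalFixedLengthByteArrayDecoder, which reads unscaled decimal values stored as big-endian two's-complement bytes in a FIXED_LEN_BYTE_ARRAY column. As a rough standalone illustration of that conversion (a sketch of the idea, not Trino's actual getShortDecimalValue), a fixed-width big-endian byte sequence maps onto a Java long like so:

// Interprets `length` big-endian two's-complement bytes (1 <= length <= 8) as a signed long,
// the form in which short-decimal unscaled values are stored in FIXED_LEN_BYTE_ARRAY columns.
static long bigEndianBytesToLong(byte[] bytes, int offset, int length)
{
    // Sign-extend from the most significant byte, then shift the remaining bytes in
    long value = bytes[offset] < 0 ? -1 : 0;
    for (int i = 0; i < length; i++) {
        value = (value << 8) | (bytes[offset + i] & 0xFF);
    }
    return value;
}

For columns wider than 8 bytes, the decoder below keeps only the low 8 bytes after verifying, via checkBytesFitInShortDecimal, that the discarded leading bytes carry no significant digits.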
+ */ +package io.trino.parquet.reader.decoders; + +import io.airlift.slice.Slices; +import io.trino.parquet.reader.SimpleSliceInputStream; +import io.trino.parquet.reader.flat.BinaryBuffer; +import io.trino.plugin.base.type.DecodedTimestamp; +import io.trino.spi.type.Decimals; +import io.trino.spi.type.Int128; +import org.apache.parquet.column.ColumnDescriptor; + +import static com.google.common.base.Preconditions.checkArgument; +import static io.airlift.slice.SizeOf.SIZE_OF_INT; +import static io.airlift.slice.SizeOf.SIZE_OF_LONG; +import static io.trino.parquet.ParquetReaderUtils.toByteExact; +import static io.trino.parquet.ParquetReaderUtils.toShortExact; +import static io.trino.parquet.ParquetTimestampUtils.decodeInt96Timestamp; +import static io.trino.parquet.ParquetTypeUtils.checkBytesFitInShortDecimal; +import static io.trino.parquet.ParquetTypeUtils.getShortDecimalValue; +import static io.trino.spi.block.Fixed12Block.encodeFixed12; +import static java.util.Objects.requireNonNull; +import static org.apache.parquet.schema.LogicalTypeAnnotation.DecimalLogicalTypeAnnotation; + +public final class PlainValueDecoders +{ + private PlainValueDecoders() {} + + public static final class LongPlainValueDecoder + implements ValueDecoder + { + private SimpleSliceInputStream input; + + @Override + public void init(SimpleSliceInputStream input) + { + this.input = requireNonNull(input, "input is null"); + } + + @Override + public void read(long[] values, int offset, int length) + { + input.readLongs(values, offset, length); + } + + @Override + public void skip(int n) + { + input.skip(n * Long.BYTES); + } + } + + public static final class IntPlainValueDecoder + implements ValueDecoder + { + private SimpleSliceInputStream input; + + @Override + public void init(SimpleSliceInputStream input) + { + this.input = requireNonNull(input, "input is null"); + } + + @Override + public void read(int[] values, int offset, int length) + { + input.readInts(values, offset, length); + } + + @Override + public void skip(int n) + { + input.skip(n * Integer.BYTES); + } + } + + public static final class IntToShortPlainValueDecoder + implements ValueDecoder + { + private SimpleSliceInputStream input; + + @Override + public void init(SimpleSliceInputStream input) + { + this.input = requireNonNull(input, "input is null"); + } + + @Override + public void read(short[] values, int offset, int length) + { + for (int i = offset; i < offset + length; i++) { + values[i] = toShortExact(input.readIntUnchecked()); + } + } + + @Override + public void skip(int n) + { + input.skip(n * Integer.BYTES); + } + } + + public static final class IntToBytePlainValueDecoder + implements ValueDecoder + { + private SimpleSliceInputStream input; + + @Override + public void init(SimpleSliceInputStream input) + { + this.input = requireNonNull(input, "input is null"); + } + + @Override + public void read(byte[] values, int offset, int length) + { + for (int i = offset; i < offset + length; i++) { + values[i] = toByteExact(input.readIntUnchecked()); + } + } + + @Override + public void skip(int n) + { + input.skip(n * Integer.BYTES); + } + } + + public static final class ShortDecimalFixedLengthByteArrayDecoder + implements ValueDecoder + { + private final int typeLength; + private final ColumnDescriptor descriptor; + private final ShortDecimalFixedWidthByteArrayBatchDecoder decimalValueDecoder; + + private SimpleSliceInputStream input; + + public ShortDecimalFixedLengthByteArrayDecoder(ColumnDescriptor descriptor) + { + DecimalLogicalTypeAnnotation 
decimalAnnotation = (DecimalLogicalTypeAnnotation) descriptor.getPrimitiveType().getLogicalTypeAnnotation(); + checkArgument( + decimalAnnotation.getPrecision() <= Decimals.MAX_SHORT_PRECISION, + "Decimal type %s is not a short decimal", + decimalAnnotation); + this.typeLength = descriptor.getPrimitiveType().getTypeLength(); + checkArgument(typeLength > 0 && typeLength <= 16, "Expected column %s to have type length in range (1-16)", descriptor); + this.descriptor = descriptor; + this.decimalValueDecoder = new ShortDecimalFixedWidthByteArrayBatchDecoder(Math.min(typeLength, Long.BYTES)); + } + + @Override + public void init(SimpleSliceInputStream input) + { + this.input = requireNonNull(input, "input is null"); + } + + @Override + public void read(long[] values, int offset, int length) + { + if (typeLength <= Long.BYTES) { + decimalValueDecoder.getShortDecimalValues(input, values, offset, length); + return; + } + int extraBytesLength = typeLength - Long.BYTES; + byte[] inputBytes = input.getByteArray(); + int inputBytesOffset = input.getByteArrayOffset(); + for (int i = offset; i < offset + length; i++) { + checkBytesFitInShortDecimal(inputBytes, inputBytesOffset, extraBytesLength, descriptor); + values[i] = getShortDecimalValue(inputBytes, inputBytesOffset + extraBytesLength, Long.BYTES); + inputBytesOffset += typeLength; + } + input.skip(length * typeLength); + } + + @Override + public void skip(int n) + { + input.skip(n * typeLength); + } + } + + public static final class LongDecimalPlainValueDecoder + implements ValueDecoder + { + private final int typeLength; + private final byte[] inputBytes; + + private SimpleSliceInputStream input; + + public LongDecimalPlainValueDecoder(int typeLength) + { + checkArgument(typeLength > 0 && typeLength <= 16, "typeLength %s should be in range (1-16) for a long decimal", typeLength); + this.typeLength = typeLength; + this.inputBytes = new byte[typeLength]; + } + + @Override + public void init(SimpleSliceInputStream input) + { + this.input = requireNonNull(input, "input is null"); + } + + @Override + public void read(long[] values, int offset, int length) + { + int endOffset = (offset + length) * 2; + for (int currentOutputOffset = offset * 2; currentOutputOffset < endOffset; currentOutputOffset += 2) { + input.readBytes(Slices.wrappedBuffer(inputBytes), 0, typeLength); + Int128 value = Int128.fromBigEndian(inputBytes); + values[currentOutputOffset] = value.getHigh(); + values[currentOutputOffset + 1] = value.getLow(); + } + } + + @Override + public void skip(int n) + { + input.skip(n * typeLength); + } + } + + public static final class UuidPlainValueDecoder + implements ValueDecoder + { + private static final int UUID_SIZE = 16; + + private SimpleSliceInputStream input; + + @Override + public void init(SimpleSliceInputStream input) + { + this.input = requireNonNull(input, "input is null"); + } + + @Override + public void read(long[] values, int offset, int length) + { + int endOffset = (offset + length) * 2; + for (int currentOutputOffset = offset * 2; currentOutputOffset < endOffset; currentOutputOffset += 2) { + values[currentOutputOffset] = input.readLong(); + values[currentOutputOffset + 1] = input.readLong(); + } + } + + @Override + public void skip(int n) + { + input.skip(n * UUID_SIZE); + } + } + + public static final class Int96TimestampPlainValueDecoder + implements ValueDecoder + { + private static final int LENGTH = SIZE_OF_LONG + SIZE_OF_INT; + + private SimpleSliceInputStream input; + + @Override + public void init(SimpleSliceInputStream 
input) + { + this.input = requireNonNull(input, "input is null"); + } + + @Override + public void read(int[] values, int offset, int length) + { + for (int i = offset; i < offset + length; i++) { + DecodedTimestamp timestamp = decodeInt96Timestamp(input.readLongUnchecked(), input.readIntUnchecked()); + encodeFixed12(timestamp.epochSeconds(), timestamp.nanosOfSecond(), values, i); + } + } + + @Override + public void skip(int n) + { + input.skip(n * LENGTH); + } + } + + public static final class FixedLengthPlainValueDecoder + implements ValueDecoder + { + private final int typeLength; + + private SimpleSliceInputStream input; + + public FixedLengthPlainValueDecoder(int typeLength) + { + this.typeLength = typeLength; + } + + @Override + public void init(SimpleSliceInputStream input) + { + this.input = requireNonNull(input, "input is null"); + } + + @Override + public void read(BinaryBuffer values, int offset, int length) + { + values.addChunk(input.readSlice(typeLength * length)); + int[] outputOffsets = values.getOffsets(); + + int inputLength = outputOffsets[offset] + typeLength; + for (int i = offset; i < offset + length; i++) { + outputOffsets[i + 1] = inputLength; + inputLength += typeLength; + } + } + + @Override + public void skip(int n) + { + input.skip(n * typeLength); + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/RleBitPackingHybridBooleanDecoder.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/RleBitPackingHybridBooleanDecoder.java new file mode 100644 index 000000000000..501a8dc93859 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/RleBitPackingHybridBooleanDecoder.java @@ -0,0 +1,61 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.reader.decoders; + +import io.trino.parquet.reader.SimpleSliceInputStream; +import io.trino.parquet.reader.flat.FlatDefinitionLevelDecoder; + +import static io.trino.parquet.reader.flat.NullsDecoders.createNullsDecoder; + +/** + * Decoder for RLE encoded values of BOOLEAN primitive type + * + * Run Length Encoding / Bit-Packing Hybrid (RLE) + * + */ +public final class RleBitPackingHybridBooleanDecoder + implements ValueDecoder +{ + private final FlatDefinitionLevelDecoder decoder; + + public RleBitPackingHybridBooleanDecoder(boolean vectorizedDecodingEnabled) + { + this.decoder = createNullsDecoder(vectorizedDecodingEnabled); + } + + @Override + public void init(SimpleSliceInputStream input) + { + // First int is size in bytes which is not needed here + input.skip(Integer.BYTES); + this.decoder.init(input.asSlice()); + } + + @Override + public void read(byte[] values, int offset, int length) + { + boolean[] buffer = new boolean[length]; + decoder.readNext(buffer, 0, length); + for (int i = 0; i < length; i++) { + // NullsDecoder returns false for 1 (non-null) and true for 0 (null) + values[offset + i] = buffer[i] ? 
(byte) 0 : (byte) 1; + } + } + + @Override + public void skip(int n) + { + decoder.skip(n); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/RleBitPackingHybridDecoder.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/RleBitPackingHybridDecoder.java new file mode 100644 index 000000000000..3e0a7460a38c --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/RleBitPackingHybridDecoder.java @@ -0,0 +1,171 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.reader.decoders; + +import io.trino.parquet.reader.SimpleSliceInputStream; + +import java.util.Arrays; + +import static com.google.common.base.Preconditions.checkArgument; +import static io.trino.parquet.ParquetReaderUtils.readFixedWidthInt; +import static io.trino.parquet.ParquetReaderUtils.readUleb128Int; +import static io.trino.parquet.reader.decoders.IntBitUnpackers.getIntBitUnpacker; +import static io.trino.parquet.reader.decoders.VectorIntBitUnpackers.getVectorIntBitUnpacker; +import static java.lang.Math.min; +import static java.util.Objects.requireNonNull; + +/** + * + * Run Length Encoding / Bit-Packing Hybrid (RLE) + * + * This class is similar to {@link io.trino.parquet.reader.flat.NullsDecoders} but specialized for reading integers stored in bit width of 0 - 32. + * It is used specifically for decoding dictionary ids currently. + * It can be used for decoding definition and repetition levels of nested columns in future. + */ +public final class RleBitPackingHybridDecoder + implements ValueDecoder +{ + // Bit-packed values comes in batches of 8 according to specs + private static final int EXTRACT_BATCH_SIZE = 8; + + private final int bitWidth; + private final int byteWidth; + private final IntBitUnpacker unpacker; + + private SimpleSliceInputStream input; + + // Encoding type if decoding stopped in the middle of the group + private boolean isRle; + // Values left to decode in the current group + private int valuesLeftInGroup; + // With RLE encoding - the current value + private int rleValue; + // With bit-packing - buffer of EXTRACT_BATCH_SIZE values currently read. + private int[] valuesBuffer; + // Number of values already read in the current buffer while reading bit-packed values + private int alreadyReadInBuffer; + + public RleBitPackingHybridDecoder(int bitWidth, boolean vectorizedDecodingEnabled) + { + checkArgument(bitWidth >= 0 && bitWidth <= 32, "bit width need to be between 0 and 32"); + this.bitWidth = bitWidth; + this.byteWidth = byteWidth(bitWidth); + this.unpacker = vectorizedDecodingEnabled ? 
getVectorIntBitUnpacker(bitWidth) : getIntBitUnpacker(bitWidth); + } + + @Override + public void init(SimpleSliceInputStream input) + { + this.input = requireNonNull(input, "input is null"); + this.valuesBuffer = new int[EXTRACT_BATCH_SIZE]; + } + + @Override + public void read(int[] values, int offset, int length) + { + while (length > 0) { + if (valuesLeftInGroup == 0) { + readGroupHeader(); + } + + if (isRle) { + int chunkSize = min(length, valuesLeftInGroup); + Arrays.fill(values, offset, offset + chunkSize, rleValue); + valuesLeftInGroup -= chunkSize; + offset += chunkSize; + length -= chunkSize; + } + else if (alreadyReadInBuffer != 0) { // bit-packed - read remaining bytes stored in the buffer + int remainingValues = EXTRACT_BATCH_SIZE - alreadyReadInBuffer; + int chunkSize = min(remainingValues, length); + System.arraycopy(valuesBuffer, alreadyReadInBuffer, values, offset, chunkSize); + valuesLeftInGroup -= chunkSize; + alreadyReadInBuffer = (alreadyReadInBuffer + chunkSize) % EXTRACT_BATCH_SIZE; + offset += chunkSize; + length -= chunkSize; + } + else { // bit-packed + // At this point we have only full batches to read and valuesLeftInGroup is a multiplication of 8 + int chunkSize = min(length, valuesLeftInGroup); + int leftToRead = chunkSize % EXTRACT_BATCH_SIZE; + int fullBatchesToRead = chunkSize - leftToRead; + unpacker.unpack(values, offset, input, fullBatchesToRead); + offset += fullBatchesToRead; + if (leftToRead > 0) { + unpacker.unpack(valuesBuffer, 0, input, EXTRACT_BATCH_SIZE); // Unpack to temporary buffer + System.arraycopy(valuesBuffer, 0, values, offset, leftToRead); + offset += leftToRead; + } + alreadyReadInBuffer = leftToRead; + valuesLeftInGroup -= chunkSize; + length -= chunkSize; + } + } + } + + @Override + public void skip(int n) + { + while (n > 0) { + if (valuesLeftInGroup == 0) { + readGroupHeader(); + } + + if (isRle) { + int chunkSize = min(n, valuesLeftInGroup); + valuesLeftInGroup -= chunkSize; + n -= chunkSize; + } + else if (alreadyReadInBuffer != 0) { // bit-packed - skip remaining bytes stored in the buffer + int remainingValues = EXTRACT_BATCH_SIZE - alreadyReadInBuffer; + int chunkSize = min(remainingValues, n); + valuesLeftInGroup -= chunkSize; + alreadyReadInBuffer = (alreadyReadInBuffer + chunkSize) % EXTRACT_BATCH_SIZE; + n -= chunkSize; + } + else { // bit-packed + int chunkSize = min(n, valuesLeftInGroup); + int fullBatchesToRead = chunkSize / EXTRACT_BATCH_SIZE; + input.skip(fullBatchesToRead * bitWidth * EXTRACT_BATCH_SIZE / Byte.SIZE); + int leftToRead = chunkSize % EXTRACT_BATCH_SIZE; + if (leftToRead > 0) { + unpacker.unpack(valuesBuffer, 0, input, EXTRACT_BATCH_SIZE); + } + alreadyReadInBuffer = leftToRead; + valuesLeftInGroup -= chunkSize; + n -= chunkSize; + } + } + } + + private void readGroupHeader() + { + int header = readUleb128Int(input); + isRle = (header & 1) == 0; + valuesLeftInGroup = header >>> 1; + if (isRle) { + rleValue = readFixedWidthInt(input, byteWidth); + } + else { + // Only full bytes are encoded + valuesLeftInGroup *= 8; + } + } + + private static int byteWidth(int bitWidth) + { + // Equivalent of Math.ceil(bitWidth / Byte.SIZE) but without double arithmetics + return (bitWidth + Byte.SIZE - 1) / Byte.SIZE; + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/ShortBitUnpacker.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/ShortBitUnpacker.java new file mode 100644 index 000000000000..a41ac6fcaf6d --- /dev/null +++ 
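To make the group handling in RleBitPackingHybridDecoder above easier to follow: in Parquet's hybrid encoding each group starts with a ULEB128 varint header whose least significant bit selects the mode. The remaining bits are a repeat count for RLE groups and a count of 8-value batches for bit-packed groups, and an RLE group is followed by its repeated value in ceil(bitWidth / 8) little-endian bytes. A self-contained sketch of just the header parsing, with illustrative names and ByteBuffer instead of SimpleSliceInputStream:

import java.nio.ByteBuffer;

final class RleHybridHeaderSketch
{
    record Group(boolean rle, int valueCount, int rleValue) {}

    static Group readGroupHeader(ByteBuffer input, int bitWidth)
    {
        int header = readUleb128(input);
        if ((header & 1) == 0) {
            // RLE group: the repeated value follows in ceil(bitWidth / 8) little-endian bytes
            int byteWidth = (bitWidth + Byte.SIZE - 1) / Byte.SIZE;
            int value = 0;
            for (int i = 0; i < byteWidth; i++) {
                value |= (input.get() & 0xFF) << (8 * i);
            }
            return new Group(true, header >>> 1, value);
        }
        // Bit-packed group: the header counts batches of 8 values, not individual values
        return new Group(false, (header >>> 1) * 8, 0);
    }

    private static int readUleb128(ByteBuffer input)
    {
        int value = 0;
        int shift = 0;
        byte current;
        do {
            current = input.get();
            value |= (current & 0x7F) << shift;
            shift += 7;
        }
        while ((current & 0x80) != 0);
        return value;
    }

    private RleHybridHeaderSketch() {}
}

This is also why skip() above can jump over whole bit-packed batches with a single input.skip call: a batch of 8 values of width bitWidth always occupies exactly bitWidth bytes.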
b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/ShortBitUnpacker.java @@ -0,0 +1,24 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.reader.decoders; + +import io.trino.parquet.reader.SimpleSliceInputStream; + +public interface ShortBitUnpacker +{ + /** + * @param length must be a multiple of 32 + */ + void unpack(short[] output, int outputOffset, SimpleSliceInputStream input, int length); +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/ShortBitUnpackers.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/ShortBitUnpackers.java new file mode 100644 index 000000000000..e2542267c055 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/ShortBitUnpackers.java @@ -0,0 +1,935 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.reader.decoders; + +import io.trino.parquet.reader.SimpleSliceInputStream; + +import static com.google.common.base.Preconditions.checkArgument; + +public final class ShortBitUnpackers +{ + private static final ShortBitUnpacker[] UNPACKERS = { + new Unpacker1(), + new Unpacker2(), + new Unpacker3(), + new Unpacker4(), + new Unpacker5(), + new Unpacker6(), + new Unpacker7(), + new Unpacker8(), + new Unpacker9(), + new Unpacker10(), + new Unpacker11(), + new Unpacker12(), + new Unpacker13(), + new Unpacker14(), + new Unpacker15(), + new Unpacker16(), + new Unpacker17()}; + + // Short unpacker also exists for the out-of-range 17 value. + // This unpacker truncates the most significant bit of the resulted numbers. + // This is due to the fact that deltas may require more than 16 bits to be stored. 
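For readers skimming the generated classes that follow: every UnpackerN.unpack32 below is an unrolled form of the reference loop sketched here, which extracts fixed-bit-width values from an LSB-first little-endian bit stream. The loop is illustrative only (it is not part of the change) and reads a plain byte[] rather than SimpleSliceInputStream; note that the (short) cast keeps only the low 16 bits, which is exactly the most-significant-bit truncation described above for the out-of-range bit width of 17.

// Reference (scalar, non-unrolled) equivalent of the generated unpack32 methods.
// Value i of width `bitWidth` occupies bits [i * bitWidth, (i + 1) * bitWidth) of the stream.
static void unpackReference(short[] output, int outputOffset, byte[] input, int inputOffset, int bitWidth, int count)
{
    long mask = (1L << bitWidth) - 1;
    long bits = 0;    // buffered bits, least significant first
    int bitCount = 0; // number of valid buffered bits
    int position = inputOffset;
    for (int i = 0; i < count; i++) {
        while (bitCount < bitWidth) {
            bits |= (input[position++] & 0xFFL) << bitCount;
            bitCount += 8;
        }
        output[outputOffset + i] = (short) (bits & mask);
        bits >>>= bitWidth;
        bitCount -= bitWidth;
    }
}

The generated versions produce the same result by reading whole longs and ints and unrolling 32 values per call, which is why unpack is only defined for lengths that are multiples of 32.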
+ public static ShortBitUnpacker getShortBitUnpacker(int bitWidth) + { + checkArgument(bitWidth > 0 && bitWidth <= 17, "bitWidth %s should be in the range 1-17", bitWidth); + return UNPACKERS[bitWidth - 1]; + } + + private ShortBitUnpackers() {} + + private static final class Unpacker1 + implements ShortBitUnpacker + { + @Override + public void unpack(short[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(short[] output, int outputOffset, SimpleSliceInputStream input) + { + int v0 = input.readInt(); + output[outputOffset] = (short) (v0 & 0b1L); + output[outputOffset + 1] = (short) ((v0 >>> 1) & 0b1L); + output[outputOffset + 2] = (short) ((v0 >>> 2) & 0b1L); + output[outputOffset + 3] = (short) ((v0 >>> 3) & 0b1L); + output[outputOffset + 4] = (short) ((v0 >>> 4) & 0b1L); + output[outputOffset + 5] = (short) ((v0 >>> 5) & 0b1L); + output[outputOffset + 6] = (short) ((v0 >>> 6) & 0b1L); + output[outputOffset + 7] = (short) ((v0 >>> 7) & 0b1L); + output[outputOffset + 8] = (short) ((v0 >>> 8) & 0b1L); + output[outputOffset + 9] = (short) ((v0 >>> 9) & 0b1L); + output[outputOffset + 10] = (short) ((v0 >>> 10) & 0b1L); + output[outputOffset + 11] = (short) ((v0 >>> 11) & 0b1L); + output[outputOffset + 12] = (short) ((v0 >>> 12) & 0b1L); + output[outputOffset + 13] = (short) ((v0 >>> 13) & 0b1L); + output[outputOffset + 14] = (short) ((v0 >>> 14) & 0b1L); + output[outputOffset + 15] = (short) ((v0 >>> 15) & 0b1L); + output[outputOffset + 16] = (short) ((v0 >>> 16) & 0b1L); + output[outputOffset + 17] = (short) ((v0 >>> 17) & 0b1L); + output[outputOffset + 18] = (short) ((v0 >>> 18) & 0b1L); + output[outputOffset + 19] = (short) ((v0 >>> 19) & 0b1L); + output[outputOffset + 20] = (short) ((v0 >>> 20) & 0b1L); + output[outputOffset + 21] = (short) ((v0 >>> 21) & 0b1L); + output[outputOffset + 22] = (short) ((v0 >>> 22) & 0b1L); + output[outputOffset + 23] = (short) ((v0 >>> 23) & 0b1L); + output[outputOffset + 24] = (short) ((v0 >>> 24) & 0b1L); + output[outputOffset + 25] = (short) ((v0 >>> 25) & 0b1L); + output[outputOffset + 26] = (short) ((v0 >>> 26) & 0b1L); + output[outputOffset + 27] = (short) ((v0 >>> 27) & 0b1L); + output[outputOffset + 28] = (short) ((v0 >>> 28) & 0b1L); + output[outputOffset + 29] = (short) ((v0 >>> 29) & 0b1L); + output[outputOffset + 30] = (short) ((v0 >>> 30) & 0b1L); + output[outputOffset + 31] = (short) ((v0 >>> 31) & 0b1L); + } + } + + private static final class Unpacker2 + implements ShortBitUnpacker + { + @Override + public void unpack(short[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(short[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + output[outputOffset] = (short) (v0 & 0b11L); + output[outputOffset + 1] = (short) ((v0 >>> 2) & 0b11L); + output[outputOffset + 2] = (short) ((v0 >>> 4) & 0b11L); + output[outputOffset + 3] = (short) ((v0 >>> 6) & 0b11L); + output[outputOffset + 4] = (short) ((v0 >>> 8) & 0b11L); + output[outputOffset + 5] = (short) ((v0 >>> 10) & 0b11L); + output[outputOffset + 6] = (short) ((v0 >>> 12) & 0b11L); + output[outputOffset + 7] = (short) ((v0 >>> 14) & 0b11L); + output[outputOffset + 8] = (short) ((v0 >>> 16) & 0b11L); + output[outputOffset + 9] = 
(short) ((v0 >>> 18) & 0b11L); + output[outputOffset + 10] = (short) ((v0 >>> 20) & 0b11L); + output[outputOffset + 11] = (short) ((v0 >>> 22) & 0b11L); + output[outputOffset + 12] = (short) ((v0 >>> 24) & 0b11L); + output[outputOffset + 13] = (short) ((v0 >>> 26) & 0b11L); + output[outputOffset + 14] = (short) ((v0 >>> 28) & 0b11L); + output[outputOffset + 15] = (short) ((v0 >>> 30) & 0b11L); + output[outputOffset + 16] = (short) ((v0 >>> 32) & 0b11L); + output[outputOffset + 17] = (short) ((v0 >>> 34) & 0b11L); + output[outputOffset + 18] = (short) ((v0 >>> 36) & 0b11L); + output[outputOffset + 19] = (short) ((v0 >>> 38) & 0b11L); + output[outputOffset + 20] = (short) ((v0 >>> 40) & 0b11L); + output[outputOffset + 21] = (short) ((v0 >>> 42) & 0b11L); + output[outputOffset + 22] = (short) ((v0 >>> 44) & 0b11L); + output[outputOffset + 23] = (short) ((v0 >>> 46) & 0b11L); + output[outputOffset + 24] = (short) ((v0 >>> 48) & 0b11L); + output[outputOffset + 25] = (short) ((v0 >>> 50) & 0b11L); + output[outputOffset + 26] = (short) ((v0 >>> 52) & 0b11L); + output[outputOffset + 27] = (short) ((v0 >>> 54) & 0b11L); + output[outputOffset + 28] = (short) ((v0 >>> 56) & 0b11L); + output[outputOffset + 29] = (short) ((v0 >>> 58) & 0b11L); + output[outputOffset + 30] = (short) ((v0 >>> 60) & 0b11L); + output[outputOffset + 31] = (short) ((v0 >>> 62) & 0b11L); + } + } + + private static final class Unpacker3 + implements ShortBitUnpacker + { + @Override + public void unpack(short[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(short[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + int v1 = input.readInt(); + output[outputOffset] = (short) (v0 & 0b111L); + output[outputOffset + 1] = (short) ((v0 >>> 3) & 0b111L); + output[outputOffset + 2] = (short) ((v0 >>> 6) & 0b111L); + output[outputOffset + 3] = (short) ((v0 >>> 9) & 0b111L); + output[outputOffset + 4] = (short) ((v0 >>> 12) & 0b111L); + output[outputOffset + 5] = (short) ((v0 >>> 15) & 0b111L); + output[outputOffset + 6] = (short) ((v0 >>> 18) & 0b111L); + output[outputOffset + 7] = (short) ((v0 >>> 21) & 0b111L); + output[outputOffset + 8] = (short) ((v0 >>> 24) & 0b111L); + output[outputOffset + 9] = (short) ((v0 >>> 27) & 0b111L); + output[outputOffset + 10] = (short) ((v0 >>> 30) & 0b111L); + output[outputOffset + 11] = (short) ((v0 >>> 33) & 0b111L); + output[outputOffset + 12] = (short) ((v0 >>> 36) & 0b111L); + output[outputOffset + 13] = (short) ((v0 >>> 39) & 0b111L); + output[outputOffset + 14] = (short) ((v0 >>> 42) & 0b111L); + output[outputOffset + 15] = (short) ((v0 >>> 45) & 0b111L); + output[outputOffset + 16] = (short) ((v0 >>> 48) & 0b111L); + output[outputOffset + 17] = (short) ((v0 >>> 51) & 0b111L); + output[outputOffset + 18] = (short) ((v0 >>> 54) & 0b111L); + output[outputOffset + 19] = (short) ((v0 >>> 57) & 0b111L); + output[outputOffset + 20] = (short) ((v0 >>> 60) & 0b111L); + output[outputOffset + 21] = (short) (((v0 >>> 63) & 0b1L) | ((v1 & 0b11L) << 1)); + output[outputOffset + 22] = (short) ((v1 >>> 2) & 0b111L); + output[outputOffset + 23] = (short) ((v1 >>> 5) & 0b111L); + output[outputOffset + 24] = (short) ((v1 >>> 8) & 0b111L); + output[outputOffset + 25] = (short) ((v1 >>> 11) & 0b111L); + output[outputOffset + 26] = (short) ((v1 >>> 14) & 0b111L); + output[outputOffset + 27] = (short) ((v1 >>> 
17) & 0b111L); + output[outputOffset + 28] = (short) ((v1 >>> 20) & 0b111L); + output[outputOffset + 29] = (short) ((v1 >>> 23) & 0b111L); + output[outputOffset + 30] = (short) ((v1 >>> 26) & 0b111L); + output[outputOffset + 31] = (short) ((v1 >>> 29) & 0b111L); + } + } + + private static final class Unpacker4 + implements ShortBitUnpacker + { + @Override + public void unpack(short[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(short[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + output[outputOffset] = (short) (v0 & 0b1111L); + output[outputOffset + 1] = (short) ((v0 >>> 4) & 0b1111L); + output[outputOffset + 2] = (short) ((v0 >>> 8) & 0b1111L); + output[outputOffset + 3] = (short) ((v0 >>> 12) & 0b1111L); + output[outputOffset + 4] = (short) ((v0 >>> 16) & 0b1111L); + output[outputOffset + 5] = (short) ((v0 >>> 20) & 0b1111L); + output[outputOffset + 6] = (short) ((v0 >>> 24) & 0b1111L); + output[outputOffset + 7] = (short) ((v0 >>> 28) & 0b1111L); + output[outputOffset + 8] = (short) ((v0 >>> 32) & 0b1111L); + output[outputOffset + 9] = (short) ((v0 >>> 36) & 0b1111L); + output[outputOffset + 10] = (short) ((v0 >>> 40) & 0b1111L); + output[outputOffset + 11] = (short) ((v0 >>> 44) & 0b1111L); + output[outputOffset + 12] = (short) ((v0 >>> 48) & 0b1111L); + output[outputOffset + 13] = (short) ((v0 >>> 52) & 0b1111L); + output[outputOffset + 14] = (short) ((v0 >>> 56) & 0b1111L); + output[outputOffset + 15] = (short) ((v0 >>> 60) & 0b1111L); + output[outputOffset + 16] = (short) (v1 & 0b1111L); + output[outputOffset + 17] = (short) ((v1 >>> 4) & 0b1111L); + output[outputOffset + 18] = (short) ((v1 >>> 8) & 0b1111L); + output[outputOffset + 19] = (short) ((v1 >>> 12) & 0b1111L); + output[outputOffset + 20] = (short) ((v1 >>> 16) & 0b1111L); + output[outputOffset + 21] = (short) ((v1 >>> 20) & 0b1111L); + output[outputOffset + 22] = (short) ((v1 >>> 24) & 0b1111L); + output[outputOffset + 23] = (short) ((v1 >>> 28) & 0b1111L); + output[outputOffset + 24] = (short) ((v1 >>> 32) & 0b1111L); + output[outputOffset + 25] = (short) ((v1 >>> 36) & 0b1111L); + output[outputOffset + 26] = (short) ((v1 >>> 40) & 0b1111L); + output[outputOffset + 27] = (short) ((v1 >>> 44) & 0b1111L); + output[outputOffset + 28] = (short) ((v1 >>> 48) & 0b1111L); + output[outputOffset + 29] = (short) ((v1 >>> 52) & 0b1111L); + output[outputOffset + 30] = (short) ((v1 >>> 56) & 0b1111L); + output[outputOffset + 31] = (short) ((v1 >>> 60) & 0b1111L); + } + } + + private static final class Unpacker5 + implements ShortBitUnpacker + { + @Override + public void unpack(short[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(short[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + int v2 = input.readInt(); + output[outputOffset] = (short) (v0 & 0b11111L); + output[outputOffset + 1] = (short) ((v0 >>> 5) & 0b11111L); + output[outputOffset + 2] = (short) ((v0 >>> 10) & 0b11111L); + output[outputOffset + 3] = (short) ((v0 >>> 15) & 0b11111L); + output[outputOffset + 4] = (short) ((v0 >>> 20) & 0b11111L); + output[outputOffset + 5] = (short) ((v0 
>>> 25) & 0b11111L); + output[outputOffset + 6] = (short) ((v0 >>> 30) & 0b11111L); + output[outputOffset + 7] = (short) ((v0 >>> 35) & 0b11111L); + output[outputOffset + 8] = (short) ((v0 >>> 40) & 0b11111L); + output[outputOffset + 9] = (short) ((v0 >>> 45) & 0b11111L); + output[outputOffset + 10] = (short) ((v0 >>> 50) & 0b11111L); + output[outputOffset + 11] = (short) ((v0 >>> 55) & 0b11111L); + output[outputOffset + 12] = (short) (((v0 >>> 60) & 0b1111L) | ((v1 & 0b1L) << 4)); + output[outputOffset + 13] = (short) ((v1 >>> 1) & 0b11111L); + output[outputOffset + 14] = (short) ((v1 >>> 6) & 0b11111L); + output[outputOffset + 15] = (short) ((v1 >>> 11) & 0b11111L); + output[outputOffset + 16] = (short) ((v1 >>> 16) & 0b11111L); + output[outputOffset + 17] = (short) ((v1 >>> 21) & 0b11111L); + output[outputOffset + 18] = (short) ((v1 >>> 26) & 0b11111L); + output[outputOffset + 19] = (short) ((v1 >>> 31) & 0b11111L); + output[outputOffset + 20] = (short) ((v1 >>> 36) & 0b11111L); + output[outputOffset + 21] = (short) ((v1 >>> 41) & 0b11111L); + output[outputOffset + 22] = (short) ((v1 >>> 46) & 0b11111L); + output[outputOffset + 23] = (short) ((v1 >>> 51) & 0b11111L); + output[outputOffset + 24] = (short) ((v1 >>> 56) & 0b11111L); + output[outputOffset + 25] = (short) (((v1 >>> 61) & 0b111L) | ((v2 & 0b11L) << 3)); + output[outputOffset + 26] = (short) ((v2 >>> 2) & 0b11111L); + output[outputOffset + 27] = (short) ((v2 >>> 7) & 0b11111L); + output[outputOffset + 28] = (short) ((v2 >>> 12) & 0b11111L); + output[outputOffset + 29] = (short) ((v2 >>> 17) & 0b11111L); + output[outputOffset + 30] = (short) ((v2 >>> 22) & 0b11111L); + output[outputOffset + 31] = (short) ((v2 >>> 27) & 0b11111L); + } + } + + private static final class Unpacker6 + implements ShortBitUnpacker + { + @Override + public void unpack(short[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(short[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + output[outputOffset] = (short) (v0 & 0b111111L); + output[outputOffset + 1] = (short) ((v0 >>> 6) & 0b111111L); + output[outputOffset + 2] = (short) ((v0 >>> 12) & 0b111111L); + output[outputOffset + 3] = (short) ((v0 >>> 18) & 0b111111L); + output[outputOffset + 4] = (short) ((v0 >>> 24) & 0b111111L); + output[outputOffset + 5] = (short) ((v0 >>> 30) & 0b111111L); + output[outputOffset + 6] = (short) ((v0 >>> 36) & 0b111111L); + output[outputOffset + 7] = (short) ((v0 >>> 42) & 0b111111L); + output[outputOffset + 8] = (short) ((v0 >>> 48) & 0b111111L); + output[outputOffset + 9] = (short) ((v0 >>> 54) & 0b111111L); + output[outputOffset + 10] = (short) (((v0 >>> 60) & 0b1111L) | ((v1 & 0b11L) << 4)); + output[outputOffset + 11] = (short) ((v1 >>> 2) & 0b111111L); + output[outputOffset + 12] = (short) ((v1 >>> 8) & 0b111111L); + output[outputOffset + 13] = (short) ((v1 >>> 14) & 0b111111L); + output[outputOffset + 14] = (short) ((v1 >>> 20) & 0b111111L); + output[outputOffset + 15] = (short) ((v1 >>> 26) & 0b111111L); + output[outputOffset + 16] = (short) ((v1 >>> 32) & 0b111111L); + output[outputOffset + 17] = (short) ((v1 >>> 38) & 0b111111L); + output[outputOffset + 18] = (short) ((v1 >>> 44) & 0b111111L); + output[outputOffset + 19] = (short) ((v1 >>> 50) & 0b111111L); + output[outputOffset + 20] = (short) 
((v1 >>> 56) & 0b111111L); + output[outputOffset + 21] = (short) (((v1 >>> 62) & 0b11L) | ((v2 & 0b1111L) << 2)); + output[outputOffset + 22] = (short) ((v2 >>> 4) & 0b111111L); + output[outputOffset + 23] = (short) ((v2 >>> 10) & 0b111111L); + output[outputOffset + 24] = (short) ((v2 >>> 16) & 0b111111L); + output[outputOffset + 25] = (short) ((v2 >>> 22) & 0b111111L); + output[outputOffset + 26] = (short) ((v2 >>> 28) & 0b111111L); + output[outputOffset + 27] = (short) ((v2 >>> 34) & 0b111111L); + output[outputOffset + 28] = (short) ((v2 >>> 40) & 0b111111L); + output[outputOffset + 29] = (short) ((v2 >>> 46) & 0b111111L); + output[outputOffset + 30] = (short) ((v2 >>> 52) & 0b111111L); + output[outputOffset + 31] = (short) ((v2 >>> 58) & 0b111111L); + } + } + + private static final class Unpacker7 + implements ShortBitUnpacker + { + @Override + public void unpack(short[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(short[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + int v3 = input.readInt(); + output[outputOffset] = (short) (v0 & 0b1111111L); + output[outputOffset + 1] = (short) ((v0 >>> 7) & 0b1111111L); + output[outputOffset + 2] = (short) ((v0 >>> 14) & 0b1111111L); + output[outputOffset + 3] = (short) ((v0 >>> 21) & 0b1111111L); + output[outputOffset + 4] = (short) ((v0 >>> 28) & 0b1111111L); + output[outputOffset + 5] = (short) ((v0 >>> 35) & 0b1111111L); + output[outputOffset + 6] = (short) ((v0 >>> 42) & 0b1111111L); + output[outputOffset + 7] = (short) ((v0 >>> 49) & 0b1111111L); + output[outputOffset + 8] = (short) ((v0 >>> 56) & 0b1111111L); + output[outputOffset + 9] = (short) (((v0 >>> 63) & 0b1L) | ((v1 & 0b111111L) << 1)); + output[outputOffset + 10] = (short) ((v1 >>> 6) & 0b1111111L); + output[outputOffset + 11] = (short) ((v1 >>> 13) & 0b1111111L); + output[outputOffset + 12] = (short) ((v1 >>> 20) & 0b1111111L); + output[outputOffset + 13] = (short) ((v1 >>> 27) & 0b1111111L); + output[outputOffset + 14] = (short) ((v1 >>> 34) & 0b1111111L); + output[outputOffset + 15] = (short) ((v1 >>> 41) & 0b1111111L); + output[outputOffset + 16] = (short) ((v1 >>> 48) & 0b1111111L); + output[outputOffset + 17] = (short) ((v1 >>> 55) & 0b1111111L); + output[outputOffset + 18] = (short) (((v1 >>> 62) & 0b11L) | ((v2 & 0b11111L) << 2)); + output[outputOffset + 19] = (short) ((v2 >>> 5) & 0b1111111L); + output[outputOffset + 20] = (short) ((v2 >>> 12) & 0b1111111L); + output[outputOffset + 21] = (short) ((v2 >>> 19) & 0b1111111L); + output[outputOffset + 22] = (short) ((v2 >>> 26) & 0b1111111L); + output[outputOffset + 23] = (short) ((v2 >>> 33) & 0b1111111L); + output[outputOffset + 24] = (short) ((v2 >>> 40) & 0b1111111L); + output[outputOffset + 25] = (short) ((v2 >>> 47) & 0b1111111L); + output[outputOffset + 26] = (short) ((v2 >>> 54) & 0b1111111L); + output[outputOffset + 27] = (short) (((v2 >>> 61) & 0b111L) | ((v3 & 0b1111L) << 3)); + output[outputOffset + 28] = (short) ((v3 >>> 4) & 0b1111111L); + output[outputOffset + 29] = (short) ((v3 >>> 11) & 0b1111111L); + output[outputOffset + 30] = (short) ((v3 >>> 18) & 0b1111111L); + output[outputOffset + 31] = (short) ((v3 >>> 25) & 0b1111111L); + } + } + + private static final class Unpacker8 + implements ShortBitUnpacker + { + @Override + public void 
unpack(short[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(short[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + output[outputOffset] = (short) (v0 & 0b11111111L); + output[outputOffset + 1] = (short) ((v0 >>> 8) & 0b11111111L); + output[outputOffset + 2] = (short) ((v0 >>> 16) & 0b11111111L); + output[outputOffset + 3] = (short) ((v0 >>> 24) & 0b11111111L); + output[outputOffset + 4] = (short) ((v0 >>> 32) & 0b11111111L); + output[outputOffset + 5] = (short) ((v0 >>> 40) & 0b11111111L); + output[outputOffset + 6] = (short) ((v0 >>> 48) & 0b11111111L); + output[outputOffset + 7] = (short) ((v0 >>> 56) & 0b11111111L); + output[outputOffset + 8] = (short) (v1 & 0b11111111L); + output[outputOffset + 9] = (short) ((v1 >>> 8) & 0b11111111L); + output[outputOffset + 10] = (short) ((v1 >>> 16) & 0b11111111L); + output[outputOffset + 11] = (short) ((v1 >>> 24) & 0b11111111L); + output[outputOffset + 12] = (short) ((v1 >>> 32) & 0b11111111L); + output[outputOffset + 13] = (short) ((v1 >>> 40) & 0b11111111L); + output[outputOffset + 14] = (short) ((v1 >>> 48) & 0b11111111L); + output[outputOffset + 15] = (short) ((v1 >>> 56) & 0b11111111L); + output[outputOffset + 16] = (short) (v2 & 0b11111111L); + output[outputOffset + 17] = (short) ((v2 >>> 8) & 0b11111111L); + output[outputOffset + 18] = (short) ((v2 >>> 16) & 0b11111111L); + output[outputOffset + 19] = (short) ((v2 >>> 24) & 0b11111111L); + output[outputOffset + 20] = (short) ((v2 >>> 32) & 0b11111111L); + output[outputOffset + 21] = (short) ((v2 >>> 40) & 0b11111111L); + output[outputOffset + 22] = (short) ((v2 >>> 48) & 0b11111111L); + output[outputOffset + 23] = (short) ((v2 >>> 56) & 0b11111111L); + output[outputOffset + 24] = (short) (v3 & 0b11111111L); + output[outputOffset + 25] = (short) ((v3 >>> 8) & 0b11111111L); + output[outputOffset + 26] = (short) ((v3 >>> 16) & 0b11111111L); + output[outputOffset + 27] = (short) ((v3 >>> 24) & 0b11111111L); + output[outputOffset + 28] = (short) ((v3 >>> 32) & 0b11111111L); + output[outputOffset + 29] = (short) ((v3 >>> 40) & 0b11111111L); + output[outputOffset + 30] = (short) ((v3 >>> 48) & 0b11111111L); + output[outputOffset + 31] = (short) ((v3 >>> 56) & 0b11111111L); + } + } + + private static final class Unpacker9 + implements ShortBitUnpacker + { + @Override + public void unpack(short[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(short[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + int v4 = input.readInt(); + output[outputOffset] = (short) (v0 & 0b111111111L); + output[outputOffset + 1] = (short) ((v0 >>> 9) & 0b111111111L); + output[outputOffset + 2] = (short) ((v0 >>> 18) & 0b111111111L); + output[outputOffset + 3] = (short) ((v0 >>> 27) & 0b111111111L); + output[outputOffset + 4] = (short) ((v0 >>> 36) & 0b111111111L); + output[outputOffset + 5] = (short) ((v0 >>> 45) & 0b111111111L); + output[outputOffset + 6] = (short) ((v0 >>> 54) & 0b111111111L); + output[outputOffset + 7] = 
(short) (((v0 >>> 63) & 0b1L) | ((v1 & 0b11111111L) << 1)); + output[outputOffset + 8] = (short) ((v1 >>> 8) & 0b111111111L); + output[outputOffset + 9] = (short) ((v1 >>> 17) & 0b111111111L); + output[outputOffset + 10] = (short) ((v1 >>> 26) & 0b111111111L); + output[outputOffset + 11] = (short) ((v1 >>> 35) & 0b111111111L); + output[outputOffset + 12] = (short) ((v1 >>> 44) & 0b111111111L); + output[outputOffset + 13] = (short) ((v1 >>> 53) & 0b111111111L); + output[outputOffset + 14] = (short) (((v1 >>> 62) & 0b11L) | ((v2 & 0b1111111L) << 2)); + output[outputOffset + 15] = (short) ((v2 >>> 7) & 0b111111111L); + output[outputOffset + 16] = (short) ((v2 >>> 16) & 0b111111111L); + output[outputOffset + 17] = (short) ((v2 >>> 25) & 0b111111111L); + output[outputOffset + 18] = (short) ((v2 >>> 34) & 0b111111111L); + output[outputOffset + 19] = (short) ((v2 >>> 43) & 0b111111111L); + output[outputOffset + 20] = (short) ((v2 >>> 52) & 0b111111111L); + output[outputOffset + 21] = (short) (((v2 >>> 61) & 0b111L) | ((v3 & 0b111111L) << 3)); + output[outputOffset + 22] = (short) ((v3 >>> 6) & 0b111111111L); + output[outputOffset + 23] = (short) ((v3 >>> 15) & 0b111111111L); + output[outputOffset + 24] = (short) ((v3 >>> 24) & 0b111111111L); + output[outputOffset + 25] = (short) ((v3 >>> 33) & 0b111111111L); + output[outputOffset + 26] = (short) ((v3 >>> 42) & 0b111111111L); + output[outputOffset + 27] = (short) ((v3 >>> 51) & 0b111111111L); + output[outputOffset + 28] = (short) (((v3 >>> 60) & 0b1111L) | ((v4 & 0b11111L) << 4)); + output[outputOffset + 29] = (short) ((v4 >>> 5) & 0b111111111L); + output[outputOffset + 30] = (short) ((v4 >>> 14) & 0b111111111L); + output[outputOffset + 31] = (short) ((v4 >>> 23) & 0b111111111L); + } + } + + private static final class Unpacker10 + implements ShortBitUnpacker + { + @Override + public void unpack(short[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(short[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + output[outputOffset] = (short) (v0 & 0b1111111111L); + output[outputOffset + 1] = (short) ((v0 >>> 10) & 0b1111111111L); + output[outputOffset + 2] = (short) ((v0 >>> 20) & 0b1111111111L); + output[outputOffset + 3] = (short) ((v0 >>> 30) & 0b1111111111L); + output[outputOffset + 4] = (short) ((v0 >>> 40) & 0b1111111111L); + output[outputOffset + 5] = (short) ((v0 >>> 50) & 0b1111111111L); + output[outputOffset + 6] = (short) (((v0 >>> 60) & 0b1111L) | ((v1 & 0b111111L) << 4)); + output[outputOffset + 7] = (short) ((v1 >>> 6) & 0b1111111111L); + output[outputOffset + 8] = (short) ((v1 >>> 16) & 0b1111111111L); + output[outputOffset + 9] = (short) ((v1 >>> 26) & 0b1111111111L); + output[outputOffset + 10] = (short) ((v1 >>> 36) & 0b1111111111L); + output[outputOffset + 11] = (short) ((v1 >>> 46) & 0b1111111111L); + output[outputOffset + 12] = (short) (((v1 >>> 56) & 0b11111111L) | ((v2 & 0b11L) << 8)); + output[outputOffset + 13] = (short) ((v2 >>> 2) & 0b1111111111L); + output[outputOffset + 14] = (short) ((v2 >>> 12) & 0b1111111111L); + output[outputOffset + 15] = (short) ((v2 >>> 22) & 0b1111111111L); + output[outputOffset + 16] = (short) ((v2 >>> 32) & 0b1111111111L); + output[outputOffset + 17] = (short) ((v2 >>> 42) 
& 0b1111111111L); + output[outputOffset + 18] = (short) ((v2 >>> 52) & 0b1111111111L); + output[outputOffset + 19] = (short) (((v2 >>> 62) & 0b11L) | ((v3 & 0b11111111L) << 2)); + output[outputOffset + 20] = (short) ((v3 >>> 8) & 0b1111111111L); + output[outputOffset + 21] = (short) ((v3 >>> 18) & 0b1111111111L); + output[outputOffset + 22] = (short) ((v3 >>> 28) & 0b1111111111L); + output[outputOffset + 23] = (short) ((v3 >>> 38) & 0b1111111111L); + output[outputOffset + 24] = (short) ((v3 >>> 48) & 0b1111111111L); + output[outputOffset + 25] = (short) (((v3 >>> 58) & 0b111111L) | ((v4 & 0b1111L) << 6)); + output[outputOffset + 26] = (short) ((v4 >>> 4) & 0b1111111111L); + output[outputOffset + 27] = (short) ((v4 >>> 14) & 0b1111111111L); + output[outputOffset + 28] = (short) ((v4 >>> 24) & 0b1111111111L); + output[outputOffset + 29] = (short) ((v4 >>> 34) & 0b1111111111L); + output[outputOffset + 30] = (short) ((v4 >>> 44) & 0b1111111111L); + output[outputOffset + 31] = (short) ((v4 >>> 54) & 0b1111111111L); + } + } + + private static final class Unpacker11 + implements ShortBitUnpacker + { + @Override + public void unpack(short[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(short[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + int v5 = input.readInt(); + output[outputOffset] = (short) (v0 & 0b11111111111L); + output[outputOffset + 1] = (short) ((v0 >>> 11) & 0b11111111111L); + output[outputOffset + 2] = (short) ((v0 >>> 22) & 0b11111111111L); + output[outputOffset + 3] = (short) ((v0 >>> 33) & 0b11111111111L); + output[outputOffset + 4] = (short) ((v0 >>> 44) & 0b11111111111L); + output[outputOffset + 5] = (short) (((v0 >>> 55) & 0b111111111L) | ((v1 & 0b11L) << 9)); + output[outputOffset + 6] = (short) ((v1 >>> 2) & 0b11111111111L); + output[outputOffset + 7] = (short) ((v1 >>> 13) & 0b11111111111L); + output[outputOffset + 8] = (short) ((v1 >>> 24) & 0b11111111111L); + output[outputOffset + 9] = (short) ((v1 >>> 35) & 0b11111111111L); + output[outputOffset + 10] = (short) ((v1 >>> 46) & 0b11111111111L); + output[outputOffset + 11] = (short) (((v1 >>> 57) & 0b1111111L) | ((v2 & 0b1111L) << 7)); + output[outputOffset + 12] = (short) ((v2 >>> 4) & 0b11111111111L); + output[outputOffset + 13] = (short) ((v2 >>> 15) & 0b11111111111L); + output[outputOffset + 14] = (short) ((v2 >>> 26) & 0b11111111111L); + output[outputOffset + 15] = (short) ((v2 >>> 37) & 0b11111111111L); + output[outputOffset + 16] = (short) ((v2 >>> 48) & 0b11111111111L); + output[outputOffset + 17] = (short) (((v2 >>> 59) & 0b11111L) | ((v3 & 0b111111L) << 5)); + output[outputOffset + 18] = (short) ((v3 >>> 6) & 0b11111111111L); + output[outputOffset + 19] = (short) ((v3 >>> 17) & 0b11111111111L); + output[outputOffset + 20] = (short) ((v3 >>> 28) & 0b11111111111L); + output[outputOffset + 21] = (short) ((v3 >>> 39) & 0b11111111111L); + output[outputOffset + 22] = (short) ((v3 >>> 50) & 0b11111111111L); + output[outputOffset + 23] = (short) (((v3 >>> 61) & 0b111L) | ((v4 & 0b11111111L) << 3)); + output[outputOffset + 24] = (short) ((v4 >>> 8) & 0b11111111111L); + output[outputOffset + 25] = (short) ((v4 >>> 19) & 0b11111111111L); + output[outputOffset + 26] = (short) ((v4 >>> 30) & 
0b11111111111L); + output[outputOffset + 27] = (short) ((v4 >>> 41) & 0b11111111111L); + output[outputOffset + 28] = (short) ((v4 >>> 52) & 0b11111111111L); + output[outputOffset + 29] = (short) (((v4 >>> 63) & 0b1L) | ((v5 & 0b1111111111L) << 1)); + output[outputOffset + 30] = (short) ((v5 >>> 10) & 0b11111111111L); + output[outputOffset + 31] = (short) ((v5 >>> 21) & 0b11111111111L); + } + } + + private static final class Unpacker12 + implements ShortBitUnpacker + { + @Override + public void unpack(short[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(short[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + output[outputOffset] = (short) (v0 & 0b111111111111L); + output[outputOffset + 1] = (short) ((v0 >>> 12) & 0b111111111111L); + output[outputOffset + 2] = (short) ((v0 >>> 24) & 0b111111111111L); + output[outputOffset + 3] = (short) ((v0 >>> 36) & 0b111111111111L); + output[outputOffset + 4] = (short) ((v0 >>> 48) & 0b111111111111L); + output[outputOffset + 5] = (short) (((v0 >>> 60) & 0b1111L) | ((v1 & 0b11111111L) << 4)); + output[outputOffset + 6] = (short) ((v1 >>> 8) & 0b111111111111L); + output[outputOffset + 7] = (short) ((v1 >>> 20) & 0b111111111111L); + output[outputOffset + 8] = (short) ((v1 >>> 32) & 0b111111111111L); + output[outputOffset + 9] = (short) ((v1 >>> 44) & 0b111111111111L); + output[outputOffset + 10] = (short) (((v1 >>> 56) & 0b11111111L) | ((v2 & 0b1111L) << 8)); + output[outputOffset + 11] = (short) ((v2 >>> 4) & 0b111111111111L); + output[outputOffset + 12] = (short) ((v2 >>> 16) & 0b111111111111L); + output[outputOffset + 13] = (short) ((v2 >>> 28) & 0b111111111111L); + output[outputOffset + 14] = (short) ((v2 >>> 40) & 0b111111111111L); + output[outputOffset + 15] = (short) ((v2 >>> 52) & 0b111111111111L); + output[outputOffset + 16] = (short) (v3 & 0b111111111111L); + output[outputOffset + 17] = (short) ((v3 >>> 12) & 0b111111111111L); + output[outputOffset + 18] = (short) ((v3 >>> 24) & 0b111111111111L); + output[outputOffset + 19] = (short) ((v3 >>> 36) & 0b111111111111L); + output[outputOffset + 20] = (short) ((v3 >>> 48) & 0b111111111111L); + output[outputOffset + 21] = (short) (((v3 >>> 60) & 0b1111L) | ((v4 & 0b11111111L) << 4)); + output[outputOffset + 22] = (short) ((v4 >>> 8) & 0b111111111111L); + output[outputOffset + 23] = (short) ((v4 >>> 20) & 0b111111111111L); + output[outputOffset + 24] = (short) ((v4 >>> 32) & 0b111111111111L); + output[outputOffset + 25] = (short) ((v4 >>> 44) & 0b111111111111L); + output[outputOffset + 26] = (short) (((v4 >>> 56) & 0b11111111L) | ((v5 & 0b1111L) << 8)); + output[outputOffset + 27] = (short) ((v5 >>> 4) & 0b111111111111L); + output[outputOffset + 28] = (short) ((v5 >>> 16) & 0b111111111111L); + output[outputOffset + 29] = (short) ((v5 >>> 28) & 0b111111111111L); + output[outputOffset + 30] = (short) ((v5 >>> 40) & 0b111111111111L); + output[outputOffset + 31] = (short) ((v5 >>> 52) & 0b111111111111L); + } + } + + private static final class Unpacker13 + implements ShortBitUnpacker + { + @Override + public void unpack(short[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, 
outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(short[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + int v6 = input.readInt(); + output[outputOffset] = (short) (v0 & 0b1111111111111L); + output[outputOffset + 1] = (short) ((v0 >>> 13) & 0b1111111111111L); + output[outputOffset + 2] = (short) ((v0 >>> 26) & 0b1111111111111L); + output[outputOffset + 3] = (short) ((v0 >>> 39) & 0b1111111111111L); + output[outputOffset + 4] = (short) (((v0 >>> 52) & 0b111111111111L) | ((v1 & 0b1L) << 12)); + output[outputOffset + 5] = (short) ((v1 >>> 1) & 0b1111111111111L); + output[outputOffset + 6] = (short) ((v1 >>> 14) & 0b1111111111111L); + output[outputOffset + 7] = (short) ((v1 >>> 27) & 0b1111111111111L); + output[outputOffset + 8] = (short) ((v1 >>> 40) & 0b1111111111111L); + output[outputOffset + 9] = (short) (((v1 >>> 53) & 0b11111111111L) | ((v2 & 0b11L) << 11)); + output[outputOffset + 10] = (short) ((v2 >>> 2) & 0b1111111111111L); + output[outputOffset + 11] = (short) ((v2 >>> 15) & 0b1111111111111L); + output[outputOffset + 12] = (short) ((v2 >>> 28) & 0b1111111111111L); + output[outputOffset + 13] = (short) ((v2 >>> 41) & 0b1111111111111L); + output[outputOffset + 14] = (short) (((v2 >>> 54) & 0b1111111111L) | ((v3 & 0b111L) << 10)); + output[outputOffset + 15] = (short) ((v3 >>> 3) & 0b1111111111111L); + output[outputOffset + 16] = (short) ((v3 >>> 16) & 0b1111111111111L); + output[outputOffset + 17] = (short) ((v3 >>> 29) & 0b1111111111111L); + output[outputOffset + 18] = (short) ((v3 >>> 42) & 0b1111111111111L); + output[outputOffset + 19] = (short) (((v3 >>> 55) & 0b111111111L) | ((v4 & 0b1111L) << 9)); + output[outputOffset + 20] = (short) ((v4 >>> 4) & 0b1111111111111L); + output[outputOffset + 21] = (short) ((v4 >>> 17) & 0b1111111111111L); + output[outputOffset + 22] = (short) ((v4 >>> 30) & 0b1111111111111L); + output[outputOffset + 23] = (short) ((v4 >>> 43) & 0b1111111111111L); + output[outputOffset + 24] = (short) (((v4 >>> 56) & 0b11111111L) | ((v5 & 0b11111L) << 8)); + output[outputOffset + 25] = (short) ((v5 >>> 5) & 0b1111111111111L); + output[outputOffset + 26] = (short) ((v5 >>> 18) & 0b1111111111111L); + output[outputOffset + 27] = (short) ((v5 >>> 31) & 0b1111111111111L); + output[outputOffset + 28] = (short) ((v5 >>> 44) & 0b1111111111111L); + output[outputOffset + 29] = (short) (((v5 >>> 57) & 0b1111111L) | ((v6 & 0b111111L) << 7)); + output[outputOffset + 30] = (short) ((v6 >>> 6) & 0b1111111111111L); + output[outputOffset + 31] = (short) ((v6 >>> 19) & 0b1111111111111L); + } + } + + private static final class Unpacker14 + implements ShortBitUnpacker + { + @Override + public void unpack(short[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(short[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + output[outputOffset] = (short) (v0 & 0b11111111111111L); + output[outputOffset + 1] = (short) ((v0 >>> 14) & 0b11111111111111L); + 
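// Illustration (not part of this change): each 14-bit value i occupies bits [14 * i, 14 * i + 14)
// of the little-endian packed stream, so a value contained in a single 64-bit word is extracted as
// (word >>> startBit) & 0b11111111111111L. A value that straddles a word boundary stitches the
// remaining high bits of the current word to the low bits of the next one, e.g. output[4] below
// combines ((v0 >>> 56) & 0b11111111L) with ((v1 & 0b111111L) << 8). The unrolled assignments in
// these unpackers are exactly this formula with constant shifts and masks.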
output[outputOffset + 2] = (short) ((v0 >>> 28) & 0b11111111111111L); + output[outputOffset + 3] = (short) ((v0 >>> 42) & 0b11111111111111L); + output[outputOffset + 4] = (short) (((v0 >>> 56) & 0b11111111L) | ((v1 & 0b111111L) << 8)); + output[outputOffset + 5] = (short) ((v1 >>> 6) & 0b11111111111111L); + output[outputOffset + 6] = (short) ((v1 >>> 20) & 0b11111111111111L); + output[outputOffset + 7] = (short) ((v1 >>> 34) & 0b11111111111111L); + output[outputOffset + 8] = (short) ((v1 >>> 48) & 0b11111111111111L); + output[outputOffset + 9] = (short) (((v1 >>> 62) & 0b11L) | ((v2 & 0b111111111111L) << 2)); + output[outputOffset + 10] = (short) ((v2 >>> 12) & 0b11111111111111L); + output[outputOffset + 11] = (short) ((v2 >>> 26) & 0b11111111111111L); + output[outputOffset + 12] = (short) ((v2 >>> 40) & 0b11111111111111L); + output[outputOffset + 13] = (short) (((v2 >>> 54) & 0b1111111111L) | ((v3 & 0b1111L) << 10)); + output[outputOffset + 14] = (short) ((v3 >>> 4) & 0b11111111111111L); + output[outputOffset + 15] = (short) ((v3 >>> 18) & 0b11111111111111L); + output[outputOffset + 16] = (short) ((v3 >>> 32) & 0b11111111111111L); + output[outputOffset + 17] = (short) ((v3 >>> 46) & 0b11111111111111L); + output[outputOffset + 18] = (short) (((v3 >>> 60) & 0b1111L) | ((v4 & 0b1111111111L) << 4)); + output[outputOffset + 19] = (short) ((v4 >>> 10) & 0b11111111111111L); + output[outputOffset + 20] = (short) ((v4 >>> 24) & 0b11111111111111L); + output[outputOffset + 21] = (short) ((v4 >>> 38) & 0b11111111111111L); + output[outputOffset + 22] = (short) (((v4 >>> 52) & 0b111111111111L) | ((v5 & 0b11L) << 12)); + output[outputOffset + 23] = (short) ((v5 >>> 2) & 0b11111111111111L); + output[outputOffset + 24] = (short) ((v5 >>> 16) & 0b11111111111111L); + output[outputOffset + 25] = (short) ((v5 >>> 30) & 0b11111111111111L); + output[outputOffset + 26] = (short) ((v5 >>> 44) & 0b11111111111111L); + output[outputOffset + 27] = (short) (((v5 >>> 58) & 0b111111L) | ((v6 & 0b11111111L) << 6)); + output[outputOffset + 28] = (short) ((v6 >>> 8) & 0b11111111111111L); + output[outputOffset + 29] = (short) ((v6 >>> 22) & 0b11111111111111L); + output[outputOffset + 30] = (short) ((v6 >>> 36) & 0b11111111111111L); + output[outputOffset + 31] = (short) ((v6 >>> 50) & 0b11111111111111L); + } + } + + private static final class Unpacker15 + implements ShortBitUnpacker + { + @Override + public void unpack(short[] output, int outputOffset, SimpleSliceInputStream input, + int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(short[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + int v7 = input.readInt(); + output[outputOffset] = (short) (v0 & 0b111111111111111L); + output[outputOffset + 1] = (short) ((v0 >>> 15) & 0b111111111111111L); + output[outputOffset + 2] = (short) ((v0 >>> 30) & 0b111111111111111L); + output[outputOffset + 3] = (short) ((v0 >>> 45) & 0b111111111111111L); + output[outputOffset + 4] = (short) (((v0 >>> 60) & 0b1111L) | ((v1 & 0b11111111111L) << 4)); + output[outputOffset + 5] = (short) ((v1 >>> 11) & 0b111111111111111L); + output[outputOffset + 6] = (short) ((v1 >>> 26) & 0b111111111111111L); + output[outputOffset + 7] = (short) ((v1 >>> 41) & 0b111111111111111L); + 
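// Illustration (not part of this change): unpack32 decodes blocks of 32 values because
// 32 * bitWidth bits is always a whole number of bytes; for 15-bit values that is
// 32 * 15 / 8 = 60 bytes, exactly the seven longs plus one int read above. The while loop in
// unpack() appears to assume the caller passes a length that is a multiple of 32, since any
// remainder smaller than 32 is left unprocessed. (Unpacker16 further below can skip the shifting
// entirely and copy shorts directly.)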
output[outputOffset + 8] = (short) (((v1 >>> 56) & 0b11111111L) | ((v2 & 0b1111111L) << 8)); + output[outputOffset + 9] = (short) ((v2 >>> 7) & 0b111111111111111L); + output[outputOffset + 10] = (short) ((v2 >>> 22) & 0b111111111111111L); + output[outputOffset + 11] = (short) ((v2 >>> 37) & 0b111111111111111L); + output[outputOffset + 12] = (short) (((v2 >>> 52) & 0b111111111111L) | ((v3 & 0b111L) << 12)); + output[outputOffset + 13] = (short) ((v3 >>> 3) & 0b111111111111111L); + output[outputOffset + 14] = (short) ((v3 >>> 18) & 0b111111111111111L); + output[outputOffset + 15] = (short) ((v3 >>> 33) & 0b111111111111111L); + output[outputOffset + 16] = (short) ((v3 >>> 48) & 0b111111111111111L); + output[outputOffset + 17] = (short) (((v3 >>> 63) & 0b1L) | ((v4 & 0b11111111111111L) << 1)); + output[outputOffset + 18] = (short) ((v4 >>> 14) & 0b111111111111111L); + output[outputOffset + 19] = (short) ((v4 >>> 29) & 0b111111111111111L); + output[outputOffset + 20] = (short) ((v4 >>> 44) & 0b111111111111111L); + output[outputOffset + 21] = (short) (((v4 >>> 59) & 0b11111L) | ((v5 & 0b1111111111L) << 5)); + output[outputOffset + 22] = (short) ((v5 >>> 10) & 0b111111111111111L); + output[outputOffset + 23] = (short) ((v5 >>> 25) & 0b111111111111111L); + output[outputOffset + 24] = (short) ((v5 >>> 40) & 0b111111111111111L); + output[outputOffset + 25] = (short) (((v5 >>> 55) & 0b111111111L) | ((v6 & 0b111111L) << 9)); + output[outputOffset + 26] = (short) ((v6 >>> 6) & 0b111111111111111L); + output[outputOffset + 27] = (short) ((v6 >>> 21) & 0b111111111111111L); + output[outputOffset + 28] = (short) ((v6 >>> 36) & 0b111111111111111L); + output[outputOffset + 29] = (short) (((v6 >>> 51) & 0b1111111111111L) | ((v7 & 0b11L) << 13)); + output[outputOffset + 30] = (short) ((v7 >>> 2) & 0b111111111111111L); + output[outputOffset + 31] = (short) ((v7 >>> 17) & 0b111111111111111L); + } + } + + private static final class Unpacker16 + implements ShortBitUnpacker + { + @Override + public void unpack(short[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + input.readShorts(output, outputOffset, length); + } + } + + private static final class Unpacker17 + implements ShortBitUnpacker + { + @Override + public void unpack(short[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + while (length >= 32) { + unpack32(output, outputOffset, input); + outputOffset += 32; + length -= 32; + } + } + + private static void unpack32(short[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + long v2 = input.readLong(); + long v3 = input.readLong(); + long v4 = input.readLong(); + long v5 = input.readLong(); + long v6 = input.readLong(); + long v7 = input.readLong(); + int v8 = input.readInt(); + output[outputOffset] = (short) (v0 & 0b11111111111111111L); + output[outputOffset + 1] = (short) ((v0 >>> 17) & 0b11111111111111111L); + output[outputOffset + 2] = (short) ((v0 >>> 34) & 0b11111111111111111L); + output[outputOffset + 3] = (short) (((v0 >>> 51) & 0b1111111111111L) | ((v1 & 0b1111L) << 13)); + output[outputOffset + 4] = (short) ((v1 >>> 4) & 0b11111111111111111L); + output[outputOffset + 5] = (short) ((v1 >>> 21) & 0b11111111111111111L); + output[outputOffset + 6] = (short) ((v1 >>> 38) & 0b11111111111111111L); + output[outputOffset + 7] = (short) (((v1 >>> 55) & 0b111111111L) | ((v2 & 0b11111111L) << 9)); + output[outputOffset + 8] = (short) ((v2 >>> 8) & 0b11111111111111111L); + output[outputOffset 
+ 9] = (short) ((v2 >>> 25) & 0b11111111111111111L); + output[outputOffset + 10] = (short) ((v2 >>> 42) & 0b11111111111111111L); + output[outputOffset + 11] = (short) (((v2 >>> 59) & 0b11111L) | ((v3 & 0b111111111111L) << 5)); + output[outputOffset + 12] = (short) ((v3 >>> 12) & 0b11111111111111111L); + output[outputOffset + 13] = (short) ((v3 >>> 29) & 0b11111111111111111L); + output[outputOffset + 14] = (short) ((v3 >>> 46) & 0b11111111111111111L); + output[outputOffset + 15] = (short) (((v3 >>> 63) & 0b1L) | ((v4 & 0b1111111111111111L) << 1)); + output[outputOffset + 16] = (short) ((v4 >>> 16) & 0b11111111111111111L); + output[outputOffset + 17] = (short) ((v4 >>> 33) & 0b11111111111111111L); + output[outputOffset + 18] = (short) (((v4 >>> 50) & 0b11111111111111L) | ((v5 & 0b111L) << 14)); + output[outputOffset + 19] = (short) ((v5 >>> 3) & 0b11111111111111111L); + output[outputOffset + 20] = (short) ((v5 >>> 20) & 0b11111111111111111L); + output[outputOffset + 21] = (short) ((v5 >>> 37) & 0b11111111111111111L); + output[outputOffset + 22] = (short) (((v5 >>> 54) & 0b1111111111L) | ((v6 & 0b1111111L) << 10)); + output[outputOffset + 23] = (short) ((v6 >>> 7) & 0b11111111111111111L); + output[outputOffset + 24] = (short) ((v6 >>> 24) & 0b11111111111111111L); + output[outputOffset + 25] = (short) ((v6 >>> 41) & 0b11111111111111111L); + output[outputOffset + 26] = (short) (((v6 >>> 58) & 0b111111L) | ((v7 & 0b11111111111L) << 6)); + output[outputOffset + 27] = (short) ((v7 >>> 11) & 0b11111111111111111L); + output[outputOffset + 28] = (short) ((v7 >>> 28) & 0b11111111111111111L); + output[outputOffset + 29] = (short) ((v7 >>> 45) & 0b11111111111111111L); + output[outputOffset + 30] = (short) (((v7 >>> 62) & 0b11L) | ((v8 & 0b111111111111111L) << 2)); + output[outputOffset + 31] = (short) ((v8 >>> 15) & 0b11111111111111111L); + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/ShortDecimalFixedWidthByteArrayBatchDecoder.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/ShortDecimalFixedWidthByteArrayBatchDecoder.java new file mode 100644 index 000000000000..707d7425d443 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/ShortDecimalFixedWidthByteArrayBatchDecoder.java @@ -0,0 +1,282 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.parquet.reader.decoders; + +import io.trino.parquet.reader.SimpleSliceInputStream; + +import java.lang.invoke.VarHandle; + +import static com.google.common.base.Preconditions.checkArgument; +import static io.trino.parquet.ParquetReaderUtils.propagateSignBit; +import static java.lang.invoke.MethodHandles.byteArrayViewVarHandle; +import static java.nio.ByteOrder.BIG_ENDIAN; + +public class ShortDecimalFixedWidthByteArrayBatchDecoder +{ + private static final ShortDecimalDecoder[] VALUE_DECODERS = new ShortDecimalDecoder[] { + new BigEndianReader1(), + new BigEndianReader2(), + new BigEndianReader3(), + new BigEndianReader4(), + new BigEndianReader5(), + new BigEndianReader6(), + new BigEndianReader7(), + new BigEndianReader8() + }; + private static final VarHandle LONG_HANDLE_BIG_ENDIAN = byteArrayViewVarHandle(long[].class, BIG_ENDIAN); + private static final VarHandle INT_HANDLE_BIG_ENDIAN = byteArrayViewVarHandle(int[].class, BIG_ENDIAN); + + public interface ShortDecimalDecoder + { + void decode(SimpleSliceInputStream input, long[] values, int offset, int length); + } + + private final ShortDecimalDecoder decoder; + + public ShortDecimalFixedWidthByteArrayBatchDecoder(int length) + { + checkArgument( + length > 0 && length <= 8, + "Short decimal length %s must be in range 1-8", + length); + decoder = VALUE_DECODERS[length - 1]; + // Unscaled number is encoded as two's complement using big-endian byte order + // (the most significant byte is the zeroth element) + } + + public void getShortDecimalValues(SimpleSliceInputStream input, long[] values, int offset, int length) + { + decoder.decode(input, values, offset, length); + } + + public static final class BigEndianReader8 + implements ShortDecimalDecoder + { + @Override + public void decode(SimpleSliceInputStream input, long[] values, int offset, int length) + { + byte[] inputArray = input.getByteArray(); + int inputOffset = input.getByteArrayOffset(); + for (int i = offset; i < offset + length; i++) { + values[i] = (long) LONG_HANDLE_BIG_ENDIAN.get(inputArray, inputOffset); + inputOffset += Long.BYTES; + } + input.skip(length * Long.BYTES); + } + } + + private static final class BigEndianReader7 + implements ShortDecimalDecoder + { + @Override + public void decode(SimpleSliceInputStream input, long[] values, int offset, int length) + { + if (length == 0) { + return; + } + byte[] inputArray = input.getByteArray(); + int inputOffset = input.getByteArrayOffset(); + int inputBytesRead = 0; + int endOffset = offset + length; + for (int i = offset; i < endOffset - 1; i++) { + // We read redundant bytes and then ignore them. 
Sign bit is propagated by `>>` operator + values[i] = (long) LONG_HANDLE_BIG_ENDIAN.get(inputArray, inputOffset + inputBytesRead) >> 8; + inputBytesRead += 7; + } + // Decode the last one "normally" as it would read data out of bounds + values[endOffset - 1] = decode(input, inputBytesRead); + input.skip(inputBytesRead + 7); + } + + private long decode(SimpleSliceInputStream input, int index) + { + long value = (input.getByteUnchecked(index + 6) & 0xFFL) + | (input.getByteUnchecked(index + 5) & 0xFFL) << 8 + | (input.getByteUnchecked(index + 4) & 0xFFL) << 16 + | (Integer.reverseBytes(input.getIntUnchecked(index)) & 0xFFFFFFFFL) << 24; + return propagateSignBit(value, 8); + } + } + + private static final class BigEndianReader6 + implements ShortDecimalDecoder + { + @Override + public void decode(SimpleSliceInputStream input, long[] values, int offset, int length) + { + if (length == 0) { + return; + } + byte[] inputArray = input.getByteArray(); + int inputOffset = input.getByteArrayOffset(); + int inputBytesRead = 0; + int endOffset = offset + length; + for (int i = offset; i < endOffset - 1; i++) { + // We read redundant bytes and then ignore them. Sign bit is propagated by `>>` operator + values[i] = (long) LONG_HANDLE_BIG_ENDIAN.get(inputArray, inputOffset + inputBytesRead) >> 16; + inputBytesRead += 6; + } + // Decode the last one "normally" as it would read data out of bounds + values[endOffset - 1] = decode(input, inputBytesRead); + input.skip(inputBytesRead + 6); + } + + private long decode(SimpleSliceInputStream input, int index) + { + long value = (input.getByteUnchecked(index + 5) & 0xFFL) + | (input.getByteUnchecked(index + 4) & 0xFFL) << 8 + | (Integer.reverseBytes(input.getIntUnchecked(index)) & 0xFFFFFFFFL) << 16; + return propagateSignBit(value, 16); + } + } + + private static final class BigEndianReader5 + implements ShortDecimalDecoder + { + @Override + public void decode(SimpleSliceInputStream input, long[] values, int offset, int length) + { + if (length == 0) { + return; + } + byte[] inputArray = input.getByteArray(); + int inputOffset = input.getByteArrayOffset(); + int inputBytesRead = 0; + int endOffset = offset + length; + for (int i = offset; i < endOffset - 1; i++) { + // We read redundant bytes and then ignore them. 
Sign bit is propagated by `>>` operator + values[i] = (long) LONG_HANDLE_BIG_ENDIAN.get(inputArray, inputOffset + inputBytesRead) >> 24; + inputBytesRead += 5; + } + // Decode the last one "normally" as it would read data out of bounds + values[endOffset - 1] = decode(input, inputBytesRead); + input.skip(inputBytesRead + 5); + } + + private long decode(SimpleSliceInputStream input, int index) + { + long value = (input.getByteUnchecked(index + 4) & 0xFFL) + | (Integer.reverseBytes(input.getIntUnchecked(index)) & 0xFFFFFFFFL) << 8; + return propagateSignBit(value, 24); + } + } + + private static final class BigEndianReader4 + implements ShortDecimalDecoder + { + @Override + public void decode(SimpleSliceInputStream input, long[] values, int offset, int length) + { + byte[] inputArray = input.getByteArray(); + int inputOffset = input.getByteArrayOffset(); + for (int i = offset; i < offset + length; i++) { + values[i] = (int) INT_HANDLE_BIG_ENDIAN.get(inputArray, inputOffset); + inputOffset += Integer.BYTES; + } + input.skip(length * Integer.BYTES); + } + } + + private static final class BigEndianReader3 + implements ShortDecimalDecoder + { + @Override + public void decode(SimpleSliceInputStream input, long[] values, int offset, int length) + { + byte[] inputArray = input.getByteArray(); + int inputOffset = input.getByteArrayOffset(); + int inputBytesRead = 0; + while (length > 2) { + // We read redundant bytes and then ignore them. Sign bit is propagated by `>>` operator + long value = (long) LONG_HANDLE_BIG_ENDIAN.get(inputArray, inputOffset + inputBytesRead); + inputBytesRead += 6; + + values[offset] = value >> 40; + values[offset + 1] = value << 24 >> 40; + + offset += 2; + length -= 2; + } + // Decode the last values "normally" as it would read data out of bounds + while (length > 0) { + values[offset++] = decode(input, inputBytesRead); + length--; + inputBytesRead += 3; + } + input.skip(inputBytesRead); + } + + private long decode(SimpleSliceInputStream input, int index) + { + long value = (input.getByteUnchecked(index + 2) & 0xFFL) + | (input.getByteUnchecked(index + 1) & 0xFFL) << 8 + | (input.getByteUnchecked(index) & 0xFFL) << 16; + return propagateSignBit(value, 40); + } + } + + private static final class BigEndianReader2 + implements ShortDecimalDecoder + { + @Override + public void decode(SimpleSliceInputStream input, long[] values, int offset, int length) + { + byte[] inputArray = input.getByteArray(); + int inputOffset = input.getByteArrayOffset(); + int inputBytesRead = 0; + while (length > 3) { + // Reverse all bytes at once + long value = (long) LONG_HANDLE_BIG_ENDIAN.get(inputArray, inputOffset + inputBytesRead); + inputBytesRead += Long.BYTES; + + // We first shift the byte as left as possible. Then, when shifting back right, + // the sign bit will get propagated + values[offset] = value >> 48; + values[offset + 1] = value << 16 >> 48; + values[offset + 2] = value << 32 >> 48; + values[offset + 3] = value << 48 >> 48; + + offset += 4; + length -= 4; + } + input.skip(inputBytesRead); + + while (length > 0) { + // Implicit cast will propagate the sign bit correctly, as it is performed after the byte reversal. 
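// Worked example (not part of this change): readShort() consumes the two bytes little-endian,
// Short.reverseBytes() restores the big-endian order of the encoding, and the implicit
// short -> long widening then sign-extends. For the byte sequence {0x80, 0x01}, readShort()
// yields 0x0180, reverseBytes() gives (short) 0x8001 = -32767, and the value is stored as
// -32767L, matching the big-endian two's-complement interpretation of those bytes.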
+ values[offset++] = Short.reverseBytes(input.readShort()); + length--; + } + } + } + + private static final class BigEndianReader1 + implements ShortDecimalDecoder + { + @Override + public void decode(SimpleSliceInputStream input, long[] values, int offset, int length) + { + byte[] inputArr = input.getByteArray(); + int inputOffset = input.getByteArrayOffset(); + int inputBytesRead = 0; + int outputOffset = offset; + while (length > 0) { + // Implicit cast will propagate the sign bit correctly + values[outputOffset++] = inputArr[inputOffset + inputBytesRead]; + inputBytesRead++; + length--; + } + input.skip(inputBytesRead); + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/ValueDecoder.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/ValueDecoder.java new file mode 100644 index 000000000000..22beb7d54130 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/ValueDecoder.java @@ -0,0 +1,59 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.reader.decoders; + +import io.trino.parquet.ParquetEncoding; +import io.trino.parquet.reader.SimpleSliceInputStream; + +import static org.apache.parquet.bytes.BytesUtils.getWidthFromMaxInt; + +public interface ValueDecoder +{ + void init(SimpleSliceInputStream input); + + void read(T values, int offset, int length); + + void skip(int n); + + class EmptyValueDecoder + implements ValueDecoder + { + @Override + public void init(SimpleSliceInputStream input) {} + + @Override + public void read(T values, int offset, int length) {} + + @Override + public void skip(int n) {} + } + + interface ValueDecodersProvider + { + ValueDecoder create(ParquetEncoding encoding); + } + + interface LevelsDecoderProvider + { + ValueDecoder create(int maxLevel); + } + + static ValueDecoder createLevelsDecoder(int maxLevel, boolean vectorizedDecodingEnabled) + { + if (maxLevel == 0) { + return new ValueDecoder.EmptyValueDecoder<>(); + } + return new RleBitPackingHybridDecoder(getWidthFromMaxInt(maxLevel), vectorizedDecodingEnabled); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/ValueDecoders.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/ValueDecoders.java new file mode 100644 index 000000000000..7430aa0c9b26 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/ValueDecoders.java @@ -0,0 +1,1363 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.reader.decoders; + +import io.airlift.slice.Slice; +import io.trino.parquet.ParquetEncoding; +import io.trino.parquet.PrimitiveField; +import io.trino.parquet.reader.SimpleSliceInputStream; +import io.trino.parquet.reader.flat.BinaryBuffer; +import io.trino.spi.TrinoException; +import io.trino.spi.type.CharType; +import io.trino.spi.type.DecimalConversions; +import io.trino.spi.type.DecimalType; +import io.trino.spi.type.Decimals; +import io.trino.spi.type.Int128; +import io.trino.spi.type.TimeType; +import io.trino.spi.type.TimestampType; +import io.trino.spi.type.TimestampWithTimeZoneType; +import io.trino.spi.type.Type; +import io.trino.spi.type.VarcharType; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.values.ValuesReader; +import org.apache.parquet.io.ParquetDecodingException; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.PrimitiveType; +import org.joda.time.DateTimeZone; + +import static com.google.common.base.Preconditions.checkArgument; +import static io.trino.parquet.ParquetEncoding.BYTE_STREAM_SPLIT; +import static io.trino.parquet.ParquetEncoding.DELTA_BYTE_ARRAY; +import static io.trino.parquet.ParquetEncoding.PLAIN; +import static io.trino.parquet.ParquetReaderUtils.toByteExact; +import static io.trino.parquet.ParquetReaderUtils.toShortExact; +import static io.trino.parquet.ParquetTypeUtils.checkBytesFitInShortDecimal; +import static io.trino.parquet.ParquetTypeUtils.getShortDecimalValue; +import static io.trino.parquet.ValuesType.VALUES; +import static io.trino.parquet.reader.decoders.ApacheParquetValueDecoders.BooleanApacheParquetValueDecoder; +import static io.trino.parquet.reader.decoders.ApacheParquetValueDecoders.DoubleApacheParquetValueDecoder; +import static io.trino.parquet.reader.decoders.ApacheParquetValueDecoders.FloatApacheParquetValueDecoder; +import static io.trino.parquet.reader.decoders.BooleanPlainValueDecoders.createBooleanPlainValueDecoder; +import static io.trino.parquet.reader.decoders.DeltaBinaryPackedDecoders.DeltaBinaryPackedByteDecoder; +import static io.trino.parquet.reader.decoders.DeltaBinaryPackedDecoders.DeltaBinaryPackedIntDecoder; +import static io.trino.parquet.reader.decoders.DeltaBinaryPackedDecoders.DeltaBinaryPackedLongDecoder; +import static io.trino.parquet.reader.decoders.DeltaBinaryPackedDecoders.DeltaBinaryPackedShortDecoder; +import static io.trino.parquet.reader.decoders.DeltaByteArrayDecoders.BinaryDeltaByteArrayDecoder; +import static io.trino.parquet.reader.decoders.DeltaByteArrayDecoders.BoundedVarcharDeltaByteArrayDecoder; +import static io.trino.parquet.reader.decoders.DeltaByteArrayDecoders.CharDeltaByteArrayDecoder; +import static io.trino.parquet.reader.decoders.DeltaLengthByteArrayDecoders.BinaryDeltaLengthDecoder; +import static io.trino.parquet.reader.decoders.DeltaLengthByteArrayDecoders.BoundedVarcharDeltaLengthDecoder; +import static io.trino.parquet.reader.decoders.DeltaLengthByteArrayDecoders.CharDeltaLengthDecoder; +import static io.trino.parquet.reader.decoders.PlainByteArrayDecoders.BinaryPlainValueDecoder; +import static io.trino.parquet.reader.decoders.PlainByteArrayDecoders.BoundedVarcharPlainValueDecoder; +import static io.trino.parquet.reader.decoders.PlainByteArrayDecoders.CharPlainValueDecoder; +import static 
io.trino.parquet.reader.decoders.PlainValueDecoders.FixedLengthPlainValueDecoder; +import static io.trino.parquet.reader.decoders.PlainValueDecoders.Int96TimestampPlainValueDecoder; +import static io.trino.parquet.reader.decoders.PlainValueDecoders.IntPlainValueDecoder; +import static io.trino.parquet.reader.decoders.PlainValueDecoders.IntToBytePlainValueDecoder; +import static io.trino.parquet.reader.decoders.PlainValueDecoders.IntToShortPlainValueDecoder; +import static io.trino.parquet.reader.decoders.PlainValueDecoders.LongDecimalPlainValueDecoder; +import static io.trino.parquet.reader.decoders.PlainValueDecoders.LongPlainValueDecoder; +import static io.trino.parquet.reader.decoders.PlainValueDecoders.ShortDecimalFixedLengthByteArrayDecoder; +import static io.trino.parquet.reader.decoders.PlainValueDecoders.UuidPlainValueDecoder; +import static io.trino.spi.StandardErrorCode.INVALID_CAST_ARGUMENT; +import static io.trino.spi.block.Fixed12Block.decodeFixed12First; +import static io.trino.spi.block.Fixed12Block.decodeFixed12Second; +import static io.trino.spi.block.Fixed12Block.encodeFixed12; +import static io.trino.spi.type.DateTimeEncoding.packDateTimeWithZone; +import static io.trino.spi.type.Decimals.longTenToNth; +import static io.trino.spi.type.Decimals.overflows; +import static io.trino.spi.type.Decimals.rescale; +import static io.trino.spi.type.TimeZoneKey.UTC_KEY; +import static io.trino.spi.type.Timestamps.MICROSECONDS_PER_MILLISECOND; +import static io.trino.spi.type.Timestamps.MICROSECONDS_PER_SECOND; +import static io.trino.spi.type.Timestamps.MILLISECONDS_PER_SECOND; +import static io.trino.spi.type.Timestamps.NANOSECONDS_PER_MICROSECOND; +import static io.trino.spi.type.Timestamps.NANOSECONDS_PER_MILLISECOND; +import static io.trino.spi.type.Timestamps.PICOSECONDS_PER_DAY; +import static io.trino.spi.type.Timestamps.PICOSECONDS_PER_MICROSECOND; +import static io.trino.spi.type.Timestamps.PICOSECONDS_PER_MILLISECOND; +import static io.trino.spi.type.Timestamps.PICOSECONDS_PER_NANOSECOND; +import static io.trino.spi.type.Timestamps.round; +import static java.lang.Math.floorDiv; +import static java.lang.Math.floorMod; +import static java.lang.Math.toIntExact; +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; +import static org.apache.parquet.schema.LogicalTypeAnnotation.DecimalLogicalTypeAnnotation; + +/** + * This class provides API for creating value decoders for given fields and encodings. + *

+ * This class is to replace most of the logic contained in ParquetEncoding enum + */ +public final class ValueDecoders +{ + private final PrimitiveField field; + private final boolean vectorizedDecodingEnabled; + + public ValueDecoders(PrimitiveField field) + { + this(field, false); + } + + public ValueDecoders(PrimitiveField field, boolean vectorizedDecodingEnabled) + { + this.field = requireNonNull(field, "field is null"); + this.vectorizedDecodingEnabled = vectorizedDecodingEnabled; + } + + public ValueDecoder getDoubleDecoder(ParquetEncoding encoding) + { + if (PLAIN.equals(encoding)) { + return new LongPlainValueDecoder(); + } + else if (BYTE_STREAM_SPLIT.equals(encoding)) { + return new DoubleApacheParquetValueDecoder(getApacheParquetReader(encoding)); + } + throw wrongEncoding(encoding); + } + + public ValueDecoder getRealDecoder(ParquetEncoding encoding) + { + if (PLAIN.equals(encoding)) { + return new IntPlainValueDecoder(); + } + else if (BYTE_STREAM_SPLIT.equals(encoding)) { + return new FloatApacheParquetValueDecoder(getApacheParquetReader(encoding)); + } + throw wrongEncoding(encoding); + } + + public ValueDecoder getShortDecimalDecoder(ParquetEncoding encoding) + { + PrimitiveType primitiveType = field.getDescriptor().getPrimitiveType(); + checkArgument( + primitiveType.getLogicalTypeAnnotation() instanceof DecimalLogicalTypeAnnotation, + "Column %s is not annotated as a decimal", + field); + return switch (primitiveType.getPrimitiveTypeName()) { + case INT64 -> getLongDecoder(encoding); + case INT32 -> getInt32ToLongDecoder(encoding); + case FIXED_LEN_BYTE_ARRAY -> getFixedWidthShortDecimalDecoder(encoding); + case BINARY -> getBinaryShortDecimalDecoder(encoding); + default -> throw wrongEncoding(encoding); + }; + } + + public ValueDecoder getLongDecimalDecoder(ParquetEncoding encoding) + { + return switch (field.getDescriptor().getPrimitiveType().getPrimitiveTypeName()) { + case FIXED_LEN_BYTE_ARRAY -> getFixedWidthLongDecimalDecoder(encoding); + case BINARY -> getBinaryLongDecimalDecoder(encoding); + default -> throw wrongEncoding(encoding); + }; + } + + public ValueDecoder getUuidDecoder(ParquetEncoding encoding) + { + return switch (encoding) { + case PLAIN -> new UuidPlainValueDecoder(); + case DELTA_BYTE_ARRAY -> getDeltaUuidDecoder(encoding); + default -> throw wrongEncoding(encoding); + }; + } + + public ValueDecoder getLongDecoder(ParquetEncoding encoding) + { + return switch (encoding) { + case PLAIN -> new LongPlainValueDecoder(); + case DELTA_BINARY_PACKED -> new DeltaBinaryPackedLongDecoder(); + default -> throw wrongEncoding(encoding); + }; + } + + public ValueDecoder getIntDecoder(ParquetEncoding encoding) + { + return switch (field.getDescriptor().getPrimitiveType().getPrimitiveTypeName()) { + case INT64 -> getInt64ToIntDecoder(encoding); + case INT32 -> getInt32Decoder(encoding); + default -> throw wrongEncoding(encoding); + }; + } + + public ValueDecoder getShortDecoder(ParquetEncoding encoding) + { + return switch (field.getDescriptor().getPrimitiveType().getPrimitiveTypeName()) { + case INT64 -> getInt64ToShortDecoder(encoding); + case INT32 -> getInt32ToShortDecoder(encoding); + default -> throw wrongEncoding(encoding); + }; + } + + public ValueDecoder getByteDecoder(ParquetEncoding encoding) + { + return switch (field.getDescriptor().getPrimitiveType().getPrimitiveTypeName()) { + case INT64 -> getInt64ToByteDecoder(encoding); + case INT32 -> getInt32ToByteDecoder(encoding); + default -> throw wrongEncoding(encoding); + }; + } + + public ValueDecoder 
getBooleanDecoder(ParquetEncoding encoding) + { + return switch (encoding) { + case PLAIN -> createBooleanPlainValueDecoder(vectorizedDecodingEnabled); + case RLE -> new RleBitPackingHybridBooleanDecoder(vectorizedDecodingEnabled); + // BIT_PACKED is a deprecated encoding which should not be used anymore as per + // https://github.com/apache/parquet-format/blob/master/Encodings.md#bit-packed-deprecated-bit_packed--4 + // An unoptimized decoder for this encoding is provided here for compatibility with old files or non-compliant writers + case BIT_PACKED -> new BooleanApacheParquetValueDecoder(getApacheParquetReader(encoding)); + default -> throw wrongEncoding(encoding); + }; + } + + public ValueDecoder getInt96TimestampDecoder(ParquetEncoding encoding) + { + if (PLAIN.equals(encoding)) { + // INT96 type has been deprecated as per https://github.com/apache/parquet-format/blob/master/Encodings.md#plain-plain--0 + // However, this encoding is still commonly encountered in parquet files. + return new Int96TimestampPlainValueDecoder(); + } + throw wrongEncoding(encoding); + } + + public ValueDecoder getFixedWidthShortDecimalDecoder(ParquetEncoding encoding) + { + return switch (encoding) { + case PLAIN -> new ShortDecimalFixedLengthByteArrayDecoder(field.getDescriptor()); + case DELTA_BYTE_ARRAY -> getDeltaFixedWidthShortDecimalDecoder(encoding); + default -> throw wrongEncoding(encoding); + }; + } + + public ValueDecoder getFixedWidthLongDecimalDecoder(ParquetEncoding encoding) + { + return switch (encoding) { + case PLAIN -> new LongDecimalPlainValueDecoder(field.getDescriptor().getPrimitiveType().getTypeLength()); + case DELTA_BYTE_ARRAY -> getDeltaFixedWidthLongDecimalDecoder(encoding); + default -> throw wrongEncoding(encoding); + }; + } + + public ValueDecoder getFixedWidthBinaryDecoder(ParquetEncoding encoding) + { + return switch (encoding) { + case PLAIN -> new FixedLengthPlainValueDecoder(field.getDescriptor().getPrimitiveType().getTypeLength()); + case DELTA_BYTE_ARRAY -> new BinaryDeltaByteArrayDecoder(); + default -> throw wrongEncoding(encoding); + }; + } + + public ValueDecoder getBoundedVarcharBinaryDecoder(ParquetEncoding encoding) + { + Type trinoType = field.getType(); + checkArgument( + trinoType instanceof VarcharType varcharType && !varcharType.isUnbounded(), + "Trino type %s is not a bounded varchar", + trinoType); + return switch (encoding) { + case PLAIN -> new BoundedVarcharPlainValueDecoder((VarcharType) trinoType); + case DELTA_LENGTH_BYTE_ARRAY -> new BoundedVarcharDeltaLengthDecoder((VarcharType) trinoType); + case DELTA_BYTE_ARRAY -> new BoundedVarcharDeltaByteArrayDecoder((VarcharType) trinoType); + default -> throw wrongEncoding(encoding); + }; + } + + public ValueDecoder getCharBinaryDecoder(ParquetEncoding encoding) + { + Type trinoType = field.getType(); + checkArgument( + trinoType instanceof CharType, + "Trino type %s is not a char", + trinoType); + return switch (encoding) { + case PLAIN -> new CharPlainValueDecoder((CharType) trinoType); + case DELTA_LENGTH_BYTE_ARRAY -> new CharDeltaLengthDecoder((CharType) trinoType); + case DELTA_BYTE_ARRAY -> new CharDeltaByteArrayDecoder((CharType) trinoType); + default -> throw wrongEncoding(encoding); + }; + } + + public ValueDecoder getBinaryDecoder(ParquetEncoding encoding) + { + return switch (encoding) { + case PLAIN -> new BinaryPlainValueDecoder(); + case DELTA_LENGTH_BYTE_ARRAY -> new BinaryDeltaLengthDecoder(); + case DELTA_BYTE_ARRAY -> new BinaryDeltaByteArrayDecoder(); + default -> throw 
wrongEncoding(encoding); + }; + } + + public ValueDecoder getInt32Decoder(ParquetEncoding encoding) + { + return switch (encoding) { + case PLAIN -> new IntPlainValueDecoder(); + case DELTA_BINARY_PACKED -> new DeltaBinaryPackedIntDecoder(); + default -> throw wrongEncoding(encoding); + }; + } + + private ValueDecoder getInt32ToShortDecoder(ParquetEncoding encoding) + { + return switch (encoding) { + case PLAIN -> new IntToShortPlainValueDecoder(); + case DELTA_BINARY_PACKED -> new DeltaBinaryPackedShortDecoder(); + default -> throw wrongEncoding(encoding); + }; + } + + private ValueDecoder getInt32ToByteDecoder(ParquetEncoding encoding) + { + return switch (encoding) { + case PLAIN -> new IntToBytePlainValueDecoder(); + case DELTA_BINARY_PACKED -> new DeltaBinaryPackedByteDecoder(); + default -> throw wrongEncoding(encoding); + }; + } + + public ValueDecoder getTimeMicrosDecoder(ParquetEncoding encoding) + { + return new InlineTransformDecoder<>( + getLongDecoder(encoding), + (values, offset, length) -> { + for (int i = offset; i < offset + length; i++) { + values[i] = values[i] * PICOSECONDS_PER_MICROSECOND; + } + }); + } + + public ValueDecoder getTimeMillisDecoder(ParquetEncoding encoding) + { + int precision = ((TimeType) field.getType()).getPrecision(); + if (precision < 3) { + return new InlineTransformDecoder<>( + getInt32ToLongDecoder(encoding), + (values, offset, length) -> { + // decoded values are millis, round to lower precision and convert to picos + // modulo PICOSECONDS_PER_DAY is applied for the case when a value is rounded up to PICOSECONDS_PER_DAY + for (int i = offset; i < offset + length; i++) { + values[i] = (round(values[i], 3 - precision) * PICOSECONDS_PER_MILLISECOND) % PICOSECONDS_PER_DAY; + } + }); + } + return new InlineTransformDecoder<>( + getInt32ToLongDecoder(encoding), + (values, offset, length) -> { + for (int i = offset; i < offset + length; i++) { + values[i] = values[i] * PICOSECONDS_PER_MILLISECOND; + } + }); + } + + public ValueDecoder getInt96ToShortTimestampDecoder(ParquetEncoding encoding, DateTimeZone timeZone) + { + checkArgument( + field.getType() instanceof TimestampType timestampType && timestampType.isShort(), + "Trino type %s is not a short timestamp", + field.getType()); + int precision = ((TimestampType) field.getType()).getPrecision(); + ValueDecoder delegate = getInt96TimestampDecoder(encoding); + return new ValueDecoder<>() + { + @Override + public void init(SimpleSliceInputStream input) + { + delegate.init(input); + } + + @Override + public void read(long[] values, int offset, int length) + { + int[] int96Buffer = new int[length * 3]; + delegate.read(int96Buffer, 0, length); + for (int i = 0; i < length; i++) { + long epochSeconds = decodeFixed12First(int96Buffer, i); + long epochMicros; + if (timeZone == DateTimeZone.UTC) { + epochMicros = epochSeconds * MICROSECONDS_PER_SECOND; + } + else { + epochMicros = timeZone.convertUTCToLocal(epochSeconds * MILLISECONDS_PER_SECOND) * MICROSECONDS_PER_MILLISECOND; + } + int nanosOfSecond = (int) round(decodeFixed12Second(int96Buffer, i), 9 - precision); + values[offset + i] = epochMicros + nanosOfSecond / NANOSECONDS_PER_MICROSECOND; + } + } + + @Override + public void skip(int n) + { + delegate.skip(n); + } + }; + } + + public ValueDecoder getInt96ToLongTimestampDecoder(ParquetEncoding encoding, DateTimeZone timeZone) + { + checkArgument( + field.getType() instanceof TimestampType timestampType && !timestampType.isShort(), + "Trino type %s is not a long timestamp", + field.getType()); + int 
precision = ((TimestampType) field.getType()).getPrecision(); + return new InlineTransformDecoder<>( + getInt96TimestampDecoder(encoding), + (values, offset, length) -> { + for (int i = offset; i < offset + length; i++) { + long epochSeconds = decodeFixed12First(values, i); + int nanosOfSecond = decodeFixed12Second(values, i); + if (timeZone != DateTimeZone.UTC) { + epochSeconds = timeZone.convertUTCToLocal(epochSeconds * MILLISECONDS_PER_SECOND) / MILLISECONDS_PER_SECOND; + } + if (precision < 9) { + nanosOfSecond = (int) round(nanosOfSecond, 9 - precision); + } + encodeFixed12( + epochSeconds * MICROSECONDS_PER_SECOND + (nanosOfSecond / NANOSECONDS_PER_MICROSECOND), // epochMicros + (nanosOfSecond % NANOSECONDS_PER_MICROSECOND) * PICOSECONDS_PER_NANOSECOND, // picosOfMicro + values, + i); + } + }); + } + + public ValueDecoder getInt96ToShortTimestampWithTimeZoneDecoder(ParquetEncoding encoding) + { + checkArgument( + field.getType() instanceof TimestampWithTimeZoneType timestampWithTimeZoneType && timestampWithTimeZoneType.isShort(), + "Trino type %s is not a short timestamp with timezone", + field.getType()); + ValueDecoder delegate = getInt96TimestampDecoder(encoding); + return new ValueDecoder<>() + { + @Override + public void init(SimpleSliceInputStream input) + { + delegate.init(input); + } + + @Override + public void read(long[] values, int offset, int length) + { + int[] int96Buffer = new int[length * 3]; + delegate.read(int96Buffer, 0, length); + for (int i = 0; i < length; i++) { + long epochSeconds = decodeFixed12First(int96Buffer, i); + int nanosOfSecond = decodeFixed12Second(int96Buffer, i); + long utcMillis = epochSeconds * MILLISECONDS_PER_SECOND + (nanosOfSecond / NANOSECONDS_PER_MILLISECOND); + values[offset + i] = packDateTimeWithZone(utcMillis, UTC_KEY); + } + } + + @Override + public void skip(int n) + { + delegate.skip(n); + } + }; + } + + public ValueDecoder getInt96ToLongTimestampWithTimeZoneDecoder(ParquetEncoding encoding) + { + checkArgument( + field.getType() instanceof TimestampWithTimeZoneType timestampType && !timestampType.isShort(), + "Trino type %s is not a long timestamp", + field.getType()); + int precision = ((TimestampWithTimeZoneType) field.getType()).getPrecision(); + return new InlineTransformDecoder<>( + getInt96TimestampDecoder(encoding), + (values, offset, length) -> { + for (int i = offset; i < offset + length; i++) { + long epochSeconds = decodeFixed12First(values, i); + int nanosOfSecond = decodeFixed12Second(values, i); + if (precision < 9) { + nanosOfSecond = (int) round(nanosOfSecond, 9 - precision); + } + long utcMillis = epochSeconds * MILLISECONDS_PER_SECOND + (nanosOfSecond / NANOSECONDS_PER_MILLISECOND); + encodeFixed12( + packDateTimeWithZone(utcMillis, UTC_KEY), + (nanosOfSecond % NANOSECONDS_PER_MILLISECOND) * PICOSECONDS_PER_NANOSECOND, + values, + i); + } + }); + } + + public ValueDecoder getInt64TimestampMillisToShortTimestampDecoder(ParquetEncoding encoding, DateTimeZone timeZone) + { + checkArgument( + field.getType() instanceof TimestampType timestampType && timestampType.isShort(), + "Trino type %s is not a short timestamp", + field.getType()); + int precision = ((TimestampType) field.getType()).getPrecision(); + ValueDecoder valueDecoder = getLongDecoder(encoding); + if (precision < 3) { + return new InlineTransformDecoder<>( + valueDecoder, + (values, offset, length) -> { + // decoded values are epochMillis, round to lower precision and convert to epochMicros + for (int i = offset; i < offset + length; i++) { + long 
epochMillis = round(values[i], 3 - precision); + if (timeZone == DateTimeZone.UTC) { + values[i] = epochMillis * MICROSECONDS_PER_MILLISECOND; + } + else { + values[i] = timeZone.convertUTCToLocal(epochMillis) * MICROSECONDS_PER_MILLISECOND; + } + } + }); + } + return new InlineTransformDecoder<>( + valueDecoder, + (values, offset, length) -> { + // decoded values are epochMillis, convert to epochMicros + for (int i = offset; i < offset + length; i++) { + if (timeZone == DateTimeZone.UTC) { + values[i] = values[i] * MICROSECONDS_PER_MILLISECOND; + } + else { + values[i] = timeZone.convertUTCToLocal(values[i]) * MICROSECONDS_PER_MILLISECOND; + } + } + }); + } + + public ValueDecoder getInt64TimestampMillsToShortTimestampWithTimeZoneDecoder(ParquetEncoding encoding) + { + checkArgument( + field.getType() instanceof TimestampWithTimeZoneType timestampWithTimeZoneType && timestampWithTimeZoneType.isShort(), + "Trino type %s is not a short timestamp", + field.getType()); + int precision = ((TimestampWithTimeZoneType) field.getType()).getPrecision(); + ValueDecoder valueDecoder = getLongDecoder(encoding); + if (precision < 3) { + return new InlineTransformDecoder<>( + valueDecoder, + (values, offset, length) -> { + // decoded values are epochMillis, round to lower precision and convert to packed millis utc value + for (int i = offset; i < offset + length; i++) { + values[i] = packDateTimeWithZone(round(values[i], 3 - precision), UTC_KEY); + } + }); + } + return new InlineTransformDecoder<>( + valueDecoder, + (values, offset, length) -> { + // decoded values are epochMillis, convert to packed millis utc value + for (int i = offset; i < offset + length; i++) { + values[i] = packDateTimeWithZone(values[i], UTC_KEY); + } + }); + } + + public ValueDecoder getInt64TimestampMicrosToShortTimestampDecoder(ParquetEncoding encoding, DateTimeZone timeZone) + { + checkArgument( + field.getType() instanceof TimestampType timestampType && timestampType.isShort(), + "Trino type %s is not a short timestamp", + field.getType()); + int precision = ((TimestampType) field.getType()).getPrecision(); + ValueDecoder valueDecoder = getLongDecoder(encoding); + if (precision == 6) { + if (timeZone == DateTimeZone.UTC) { + return valueDecoder; + } + return new InlineTransformDecoder<>( + valueDecoder, + (values, offset, length) -> { + for (int i = offset; i < offset + length; i++) { + long epochMicros = values[i]; + long localMillis = timeZone.convertUTCToLocal(floorDiv(epochMicros, MICROSECONDS_PER_MILLISECOND)); + values[i] = (localMillis * MICROSECONDS_PER_MILLISECOND) + floorMod(epochMicros, MICROSECONDS_PER_MILLISECOND); + } + }); + } + return new InlineTransformDecoder<>( + valueDecoder, + (values, offset, length) -> { + // decoded values are epochMicros, round to lower precision + for (int i = offset; i < offset + length; i++) { + long epochMicros = round(values[i], 6 - precision); + if (timeZone == DateTimeZone.UTC) { + values[i] = epochMicros; + } + else { + long localMillis = timeZone.convertUTCToLocal(floorDiv(epochMicros, MICROSECONDS_PER_MILLISECOND)); + values[i] = (localMillis * MICROSECONDS_PER_MILLISECOND) + floorMod(epochMicros, MICROSECONDS_PER_MILLISECOND); + } + } + }); + } + + public ValueDecoder getInt64TimestampMicrosToShortTimestampWithTimeZoneDecoder(ParquetEncoding encoding) + { + checkArgument( + field.getType() instanceof TimestampWithTimeZoneType timestampWithTimeZoneType && timestampWithTimeZoneType.isShort(), + "Trino type %s is not a short timestamp", + field.getType()); + int precision = 
((TimestampWithTimeZoneType) field.getType()).getPrecision(); + return new InlineTransformDecoder<>( + getLongDecoder(encoding), + (values, offset, length) -> { + // decoded values are epochMicros, round to lower precision and convert to packed millis utc value + for (int i = offset; i < offset + length; i++) { + values[i] = packDateTimeWithZone(round(values[i], 6 - precision) / MICROSECONDS_PER_MILLISECOND, UTC_KEY); + } + }); + } + + public ValueDecoder getInt64TimestampNanosToShortTimestampDecoder(ParquetEncoding encoding, DateTimeZone timeZone) + { + checkArgument( + field.getType() instanceof TimestampType timestampType && timestampType.isShort(), + "Trino type %s is not a short timestamp", + field.getType()); + int precision = ((TimestampType) field.getType()).getPrecision(); + return new InlineTransformDecoder<>( + getLongDecoder(encoding), + (values, offset, length) -> { + // decoded values are epochNanos, round to lower precision and convert to epochMicros + for (int i = offset; i < offset + length; i++) { + long epochNanos = round(values[i], 9 - precision); + if (timeZone == DateTimeZone.UTC) { + values[i] = epochNanos / NANOSECONDS_PER_MICROSECOND; + } + else { + long localMillis = timeZone.convertUTCToLocal(floorDiv(epochNanos, NANOSECONDS_PER_MILLISECOND)); + values[i] = (localMillis * MICROSECONDS_PER_MILLISECOND) + floorDiv(floorMod(epochNanos, NANOSECONDS_PER_MILLISECOND), NANOSECONDS_PER_MICROSECOND); + } + } + }); + } + + public ValueDecoder getInt64TimestampMillisToLongTimestampDecoder(ParquetEncoding encoding, DateTimeZone timeZone) + { + ValueDecoder delegate = getLongDecoder(encoding); + return new ValueDecoder<>() + { + @Override + public void init(SimpleSliceInputStream input) + { + delegate.init(input); + } + + @Override + public void read(int[] values, int offset, int length) + { + long[] buffer = new long[length]; + delegate.read(buffer, 0, length); + // decoded values are epochMillis, convert to epochMicros + for (int i = 0; i < length; i++) { + if (timeZone == DateTimeZone.UTC) { + encodeFixed12(buffer[i] * MICROSECONDS_PER_MILLISECOND, 0, values, i + offset); + } + else { + encodeFixed12(timeZone.convertUTCToLocal(buffer[i]) * MICROSECONDS_PER_MILLISECOND, 0, values, i + offset); + } + } + } + + @Override + public void skip(int n) + { + delegate.skip(n); + } + }; + } + + public ValueDecoder getInt64TimestampMicrosToLongTimestampDecoder(ParquetEncoding encoding, DateTimeZone timeZone) + { + ValueDecoder delegate = getLongDecoder(encoding); + return new ValueDecoder<>() + { + @Override + public void init(SimpleSliceInputStream input) + { + delegate.init(input); + } + + @Override + public void read(int[] values, int offset, int length) + { + long[] buffer = new long[length]; + delegate.read(buffer, 0, length); + // decoded values are epochMicros + for (int i = 0; i < length; i++) { + long epochMicros = buffer[i]; + if (timeZone == DateTimeZone.UTC) { + encodeFixed12(epochMicros, 0, values, i + offset); + } + else { + long localMillis = timeZone.convertUTCToLocal(floorDiv(epochMicros, MICROSECONDS_PER_MILLISECOND)); + encodeFixed12((localMillis * MICROSECONDS_PER_MILLISECOND) + floorMod(epochMicros, MICROSECONDS_PER_MILLISECOND), + 0, + values, + i + offset); + } + } + } + + @Override + public void skip(int n) + { + delegate.skip(n); + } + }; + } + + public ValueDecoder getInt64TimestampMicrosToLongTimestampWithTimeZoneDecoder(ParquetEncoding encoding) + { + ValueDecoder delegate = getLongDecoder(encoding); + return new ValueDecoder<>() + { + @Override + public 
void init(SimpleSliceInputStream input) + { + delegate.init(input); + } + + @Override + public void read(int[] values, int offset, int length) + { + long[] buffer = new long[length]; + delegate.read(buffer, 0, length); + // decoded values are epochMicros, convert to (packed epochMillisUtc, picosOfMilli) + for (int i = 0; i < length; i++) { + long epochMicros = buffer[i]; + encodeFixed12( + packDateTimeWithZone(floorDiv(epochMicros, MICROSECONDS_PER_MILLISECOND), UTC_KEY), + floorMod(epochMicros, MICROSECONDS_PER_MILLISECOND) * PICOSECONDS_PER_MICROSECOND, + values, + i + offset); + } + } + + @Override + public void skip(int n) + { + delegate.skip(n); + } + }; + } + + public ValueDecoder getInt64TimestampNanosToLongTimestampDecoder(ParquetEncoding encoding, DateTimeZone timeZone) + { + ValueDecoder delegate = getLongDecoder(encoding); + return new ValueDecoder<>() + { + @Override + public void init(SimpleSliceInputStream input) + { + delegate.init(input); + } + + @Override + public void read(int[] values, int offset, int length) + { + long[] buffer = new long[length]; + delegate.read(buffer, 0, length); + // decoded values are epochNanos, convert to (epochMicros, picosOfNanos) + for (int i = 0; i < length; i++) { + long epochNanos = buffer[i]; + int picosOfNanos = floorMod(epochNanos, NANOSECONDS_PER_MICROSECOND) * PICOSECONDS_PER_NANOSECOND; + if (timeZone == DateTimeZone.UTC) { + encodeFixed12( + floorDiv(epochNanos, NANOSECONDS_PER_MICROSECOND), + picosOfNanos, + values, + i + offset); + } + else { + long localMillis = timeZone.convertUTCToLocal(floorDiv(epochNanos, NANOSECONDS_PER_MILLISECOND)); + long microsFromNanos = floorMod(epochNanos, NANOSECONDS_PER_MILLISECOND) / NANOSECONDS_PER_MICROSECOND; + encodeFixed12( + (localMillis * MICROSECONDS_PER_MILLISECOND) + microsFromNanos, + picosOfNanos, + values, + i + offset); + } + } + } + + @Override + public void skip(int n) + { + delegate.skip(n); + } + }; + } + + public ValueDecoder getFloatToDoubleDecoder(ParquetEncoding encoding) + { + ValueDecoder delegate = getRealDecoder(encoding); + return new ValueDecoder<>() + { + @Override + public void init(SimpleSliceInputStream input) + { + delegate.init(input); + } + + @Override + public void read(long[] values, int offset, int length) + { + int[] buffer = new int[length]; + delegate.read(buffer, 0, length); + for (int i = 0; i < length; i++) { + values[offset + i] = Double.doubleToLongBits(Float.intBitsToFloat(buffer[i])); + } + } + + @Override + public void skip(int n) + { + delegate.skip(n); + } + }; + } + + public ValueDecoder getBinaryLongDecimalDecoder(ParquetEncoding encoding) + { + return new BinaryToLongDecimalTransformDecoder(getBinaryDecoder(encoding)); + } + + public ValueDecoder getDeltaFixedWidthLongDecimalDecoder(ParquetEncoding encoding) + { + checkArgument(encoding.equals(DELTA_BYTE_ARRAY), "encoding %s is not DELTA_BYTE_ARRAY", encoding); + ColumnDescriptor descriptor = field.getDescriptor(); + LogicalTypeAnnotation logicalTypeAnnotation = descriptor.getPrimitiveType().getLogicalTypeAnnotation(); + checkArgument( + logicalTypeAnnotation instanceof DecimalLogicalTypeAnnotation decimalAnnotation + && decimalAnnotation.getPrecision() > Decimals.MAX_SHORT_PRECISION, + "Column %s is not a long decimal", + descriptor); + return new BinaryToLongDecimalTransformDecoder(new BinaryDeltaByteArrayDecoder()); + } + + public ValueDecoder getBinaryShortDecimalDecoder(ParquetEncoding encoding) + { + ValueDecoder delegate = getBinaryDecoder(encoding); + return new ValueDecoder<>() + { + 
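+            // Reads variable-length BINARY decimal values through the delegate and packs each
+            // big-endian two's complement value (at most 8 bytes per position) into a single
+            // long, producing short decimal output.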
@Override + public void init(SimpleSliceInputStream input) + { + delegate.init(input); + } + + @Override + public void read(long[] values, int offset, int length) + { + BinaryBuffer buffer = new BinaryBuffer(length); + delegate.read(buffer, 0, length); + int[] offsets = buffer.getOffsets(); + byte[] inputBytes = buffer.asSlice().byteArray(); + + for (int i = 0; i < length; i++) { + int positionOffset = offsets[i]; + int positionLength = offsets[i + 1] - positionOffset; + if (positionLength > 8) { + throw new ParquetDecodingException("Unable to read BINARY type decimal of size " + positionLength + " as a short decimal"); + } + // No need for checkBytesFitInShortDecimal as the standard requires variable binary decimals + // to be stored in minimum possible number of bytes + values[offset + i] = getShortDecimalValue(inputBytes, positionOffset, positionLength); + } + } + + @Override + public void skip(int n) + { + delegate.skip(n); + } + }; + } + + public ValueDecoder getDeltaFixedWidthShortDecimalDecoder(ParquetEncoding encoding) + { + checkArgument(encoding.equals(DELTA_BYTE_ARRAY), "encoding %s is not DELTA_BYTE_ARRAY", encoding); + ColumnDescriptor descriptor = field.getDescriptor(); + LogicalTypeAnnotation logicalTypeAnnotation = descriptor.getPrimitiveType().getLogicalTypeAnnotation(); + checkArgument( + logicalTypeAnnotation instanceof DecimalLogicalTypeAnnotation decimalAnnotation + && decimalAnnotation.getPrecision() <= Decimals.MAX_SHORT_PRECISION, + "Column %s is not a short decimal", + descriptor); + int typeLength = descriptor.getPrimitiveType().getTypeLength(); + checkArgument(typeLength > 0 && typeLength <= 16, "Expected column %s to have type length in range (1-16)", descriptor); + return new ValueDecoder<>() + { + private final ValueDecoder delegate = new BinaryDeltaByteArrayDecoder(); + + @Override + public void init(SimpleSliceInputStream input) + { + delegate.init(input); + } + + @Override + public void read(long[] values, int offset, int length) + { + BinaryBuffer buffer = new BinaryBuffer(length); + delegate.read(buffer, 0, length); + + // Each position in FIXED_LEN_BYTE_ARRAY has fixed length + int bytesOffset = 0; + int bytesLength = typeLength; + if (typeLength > Long.BYTES) { + bytesOffset = typeLength - Long.BYTES; + bytesLength = Long.BYTES; + } + + byte[] inputBytes = buffer.asSlice().byteArray(); + int[] offsets = buffer.getOffsets(); + for (int i = 0; i < length; i++) { + int inputOffset = offsets[i]; + checkBytesFitInShortDecimal(inputBytes, inputOffset, bytesOffset, descriptor); + values[offset + i] = getShortDecimalValue(inputBytes, inputOffset + bytesOffset, bytesLength); + } + } + + @Override + public void skip(int n) + { + delegate.skip(n); + } + }; + } + + public ValueDecoder getRescaledLongDecimalDecoder(ParquetEncoding encoding) + { + DecimalType decimalType = (DecimalType) field.getType(); + DecimalLogicalTypeAnnotation decimalAnnotation = (DecimalLogicalTypeAnnotation) field.getDescriptor().getPrimitiveType().getLogicalTypeAnnotation(); + if (decimalAnnotation.getPrecision() <= Decimals.MAX_SHORT_PRECISION) { + ValueDecoder delegate = getShortDecimalDecoder(encoding); + return new ValueDecoder<>() + { + @Override + public void init(SimpleSliceInputStream input) + { + delegate.init(input); + } + + @Override + public void read(long[] values, int offset, int length) + { + long[] buffer = new long[length]; + delegate.read(buffer, 0, length); + for (int i = 0; i < length; i++) { + Int128 rescaled = DecimalConversions.shortToLongCast( + buffer[i], + 
decimalAnnotation.getPrecision(), + decimalAnnotation.getScale(), + decimalType.getPrecision(), + decimalType.getScale()); + + values[2 * (offset + i)] = rescaled.getHigh(); + values[2 * (offset + i) + 1] = rescaled.getLow(); + } + } + + @Override + public void skip(int n) + { + delegate.skip(n); + } + }; + } + return new InlineTransformDecoder<>( + getLongDecimalDecoder(encoding), + (values, offset, length) -> { + int endOffset = (offset + length) * 2; + for (int currentOffset = offset * 2; currentOffset < endOffset; currentOffset += 2) { + Int128 rescaled = DecimalConversions.longToLongCast( + Int128.valueOf(values[currentOffset], values[currentOffset + 1]), + decimalAnnotation.getPrecision(), + decimalAnnotation.getScale(), + decimalType.getPrecision(), + decimalType.getScale()); + + values[currentOffset] = rescaled.getHigh(); + values[currentOffset + 1] = rescaled.getLow(); + } + }); + } + + public ValueDecoder getRescaledShortDecimalDecoder(ParquetEncoding encoding) + { + DecimalType decimalType = (DecimalType) field.getType(); + DecimalLogicalTypeAnnotation decimalAnnotation = (DecimalLogicalTypeAnnotation) field.getDescriptor().getPrimitiveType().getLogicalTypeAnnotation(); + if (decimalAnnotation.getPrecision() <= Decimals.MAX_SHORT_PRECISION) { + long rescale = longTenToNth(Math.abs(decimalType.getScale() - decimalAnnotation.getScale())); + return new InlineTransformDecoder<>( + getShortDecimalDecoder(encoding), + (values, offset, length) -> { + for (int i = offset; i < offset + length; i++) { + values[i] = DecimalConversions.shortToShortCast( + values[i], + decimalAnnotation.getPrecision(), + decimalAnnotation.getScale(), + decimalType.getPrecision(), + decimalType.getScale(), + rescale, + rescale / 2); + } + }); + } + ValueDecoder delegate = getLongDecimalDecoder(encoding); + return new ValueDecoder<>() + { + @Override + public void init(SimpleSliceInputStream input) + { + delegate.init(input); + } + + @Override + public void read(long[] values, int offset, int length) + { + long[] buffer = new long[2 * length]; + delegate.read(buffer, 0, length); + for (int i = 0; i < length; i++) { + values[offset + i] = DecimalConversions.longToShortCast( + Int128.valueOf(buffer[2 * i], buffer[2 * i + 1]), + decimalAnnotation.getPrecision(), + decimalAnnotation.getScale(), + decimalType.getPrecision(), + decimalType.getScale()); + } + } + + @Override + public void skip(int n) + { + delegate.skip(n); + } + }; + } + + public ValueDecoder getInt32ToShortDecimalDecoder(ParquetEncoding encoding) + { + DecimalType decimalType = (DecimalType) field.getType(); + ValueDecoder delegate = getInt32Decoder(encoding); + return new ValueDecoder<>() + { + @Override + public void init(SimpleSliceInputStream input) + { + delegate.init(input); + } + + @Override + public void read(long[] values, int offset, int length) + { + int[] buffer = new int[length]; + delegate.read(buffer, 0, length); + for (int i = 0; i < length; i++) { + if (overflows(buffer[i], decimalType.getPrecision())) { + throw new TrinoException( + INVALID_CAST_ARGUMENT, + format("Cannot read parquet INT32 value '%s' as DECIMAL(%s, %s)", buffer[i], decimalType.getPrecision(), decimalType.getScale())); + } + values[i + offset] = rescale(buffer[i], 0, decimalType.getScale()); + } + } + + @Override + public void skip(int n) + { + delegate.skip(n); + } + }; + } + + public ValueDecoder getInt32ToLongDecoder(ParquetEncoding encoding) + { + ValueDecoder delegate = getInt32Decoder(encoding); + return new ValueDecoder<>() + { + @Override + public void 
init(SimpleSliceInputStream input) + { + delegate.init(input); + } + + @Override + public void read(long[] values, int offset, int length) + { + int[] buffer = new int[length]; + delegate.read(buffer, 0, length); + for (int i = 0; i < length; i++) { + values[i + offset] = buffer[i]; + } + } + + @Override + public void skip(int n) + { + delegate.skip(n); + } + }; + } + + public ValueDecoder getInt64ToIntDecoder(ParquetEncoding encoding) + { + return new LongToIntTransformDecoder(getLongDecoder(encoding)); + } + + public ValueDecoder getShortDecimalToIntDecoder(ParquetEncoding encoding) + { + return new LongToIntTransformDecoder(getShortDecimalDecoder(encoding)); + } + + public ValueDecoder getInt64ToShortDecoder(ParquetEncoding encoding) + { + return new LongToShortTransformDecoder(getLongDecoder(encoding)); + } + + public ValueDecoder getShortDecimalToShortDecoder(ParquetEncoding encoding) + { + return new LongToShortTransformDecoder(getShortDecimalDecoder(encoding)); + } + + public ValueDecoder getInt64ToByteDecoder(ParquetEncoding encoding) + { + return new LongToByteTransformDecoder(getLongDecoder(encoding)); + } + + public ValueDecoder getShortDecimalToByteDecoder(ParquetEncoding encoding) + { + return new LongToByteTransformDecoder(getShortDecimalDecoder(encoding)); + } + + public ValueDecoder getDeltaUuidDecoder(ParquetEncoding encoding) + { + checkArgument(encoding.equals(DELTA_BYTE_ARRAY), "encoding %s is not DELTA_BYTE_ARRAY", encoding); + ValueDecoder delegate = new BinaryDeltaByteArrayDecoder(); + return new ValueDecoder<>() + { + @Override + public void init(SimpleSliceInputStream input) + { + delegate.init(input); + } + + @Override + public void read(long[] values, int offset, int length) + { + BinaryBuffer buffer = new BinaryBuffer(length); + delegate.read(buffer, 0, length); + SimpleSliceInputStream binaryInput = new SimpleSliceInputStream(buffer.asSlice()); + + int endOffset = (offset + length) * 2; + for (int outputOffset = offset * 2; outputOffset < endOffset; outputOffset += 2) { + values[outputOffset] = binaryInput.readLong(); + values[outputOffset + 1] = binaryInput.readLong(); + } + } + + @Override + public void skip(int n) + { + delegate.skip(n); + } + }; + } + + private static class LongToIntTransformDecoder + implements ValueDecoder + { + private final ValueDecoder delegate; + + private LongToIntTransformDecoder(ValueDecoder delegate) + { + this.delegate = delegate; + } + + @Override + public void init(SimpleSliceInputStream input) + { + delegate.init(input); + } + + @Override + public void read(int[] values, int offset, int length) + { + long[] buffer = new long[length]; + delegate.read(buffer, 0, length); + for (int i = 0; i < length; i++) { + values[offset + i] = toIntExact(buffer[i]); + } + } + + @Override + public void skip(int n) + { + delegate.skip(n); + } + } + + private static class LongToShortTransformDecoder + implements ValueDecoder + { + private final ValueDecoder delegate; + + private LongToShortTransformDecoder(ValueDecoder delegate) + { + this.delegate = delegate; + } + + @Override + public void init(SimpleSliceInputStream input) + { + delegate.init(input); + } + + @Override + public void read(short[] values, int offset, int length) + { + long[] buffer = new long[length]; + delegate.read(buffer, 0, length); + for (int i = 0; i < length; i++) { + values[offset + i] = toShortExact(buffer[i]); + } + } + + @Override + public void skip(int n) + { + delegate.skip(n); + } + } + + private static class LongToByteTransformDecoder + implements ValueDecoder + { 
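+        // Narrowing transform: reads long values from the delegate into a temporary buffer
+        // and converts each one with toByteExact, rejecting values that do not fit in a byte.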
+        private final ValueDecoder<long[]> delegate;
+
+        private LongToByteTransformDecoder(ValueDecoder<long[]> delegate)
+        {
+            this.delegate = delegate;
+        }
+
+        @Override
+        public void init(SimpleSliceInputStream input)
+        {
+            delegate.init(input);
+        }
+
+        @Override
+        public void read(byte[] values, int offset, int length)
+        {
+            long[] buffer = new long[length];
+            delegate.read(buffer, 0, length);
+            for (int i = 0; i < length; i++) {
+                values[offset + i] = toByteExact(buffer[i]);
+            }
+        }
+
+        @Override
+        public void skip(int n)
+        {
+            delegate.skip(n);
+        }
+    }
+
+    private static class BinaryToLongDecimalTransformDecoder
+            implements ValueDecoder<long[]>
+    {
+        private final ValueDecoder<BinaryBuffer> delegate;
+
+        private BinaryToLongDecimalTransformDecoder(ValueDecoder<BinaryBuffer> delegate)
+        {
+            this.delegate = delegate;
+        }
+
+        @Override
+        public void init(SimpleSliceInputStream input)
+        {
+            delegate.init(input);
+        }
+
+        @Override
+        public void read(long[] values, int offset, int length)
+        {
+            BinaryBuffer buffer = new BinaryBuffer(length);
+            delegate.read(buffer, 0, length);
+            int[] offsets = buffer.getOffsets();
+            Slice binaryInput = buffer.asSlice();
+
+            for (int i = 0; i < length; i++) {
+                int positionOffset = offsets[i];
+                int positionLength = offsets[i + 1] - positionOffset;
+                Int128 value = Int128.fromBigEndian(binaryInput.getBytes(positionOffset, positionLength));
+                values[2 * (offset + i)] = value.getHigh();
+                values[2 * (offset + i) + 1] = value.getLow();
+            }
+        }
+
+        @Override
+        public void skip(int n)
+        {
+            delegate.skip(n);
+        }
+    }
+
+    private static class InlineTransformDecoder<T>
+            implements ValueDecoder<T>
+    {
+        private final ValueDecoder<T> valueDecoder;
+        private final TypeTransform<T> typeTransform;
+
+        private InlineTransformDecoder(ValueDecoder<T> valueDecoder, TypeTransform<T> typeTransform)
+        {
+            this.valueDecoder = requireNonNull(valueDecoder, "valueDecoder is null");
+            this.typeTransform = requireNonNull(typeTransform, "typeTransform is null");
+        }
+
+        @Override
+        public void init(SimpleSliceInputStream input)
+        {
+            valueDecoder.init(input);
+        }
+
+        @Override
+        public void read(T values, int offset, int length)
+        {
+            valueDecoder.read(values, offset, length);
+            typeTransform.process(values, offset, length);
+        }
+
+        @Override
+        public void skip(int n)
+        {
+            valueDecoder.skip(n);
+        }
+    }
+
+    private interface TypeTransform<T>
+    {
+        void process(T values, int offset, int length);
+    }
+
+    private ValuesReader getApacheParquetReader(ParquetEncoding encoding)
+    {
+        return encoding.getValuesReader(field.getDescriptor(), VALUES);
+    }
+
+    private IllegalArgumentException wrongEncoding(ParquetEncoding encoding)
+    {
+        return new IllegalArgumentException("Wrong encoding " + encoding + " for column " + field.getDescriptor());
+    }
+}
diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/VectorIntBitUnpackers.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/VectorIntBitUnpackers.java
new file mode 100644
index 000000000000..c8aa8f39c8c6
--- /dev/null
+++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/decoders/VectorIntBitUnpackers.java
@@ -0,0 +1,1476 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.reader.decoders; + +import io.trino.parquet.reader.SimpleSliceInputStream; +import jdk.incubator.vector.ByteVector; +import jdk.incubator.vector.IntVector; +import jdk.incubator.vector.ShortVector; +import jdk.incubator.vector.VectorOperators; +import jdk.incubator.vector.VectorShuffle; + +import static io.trino.parquet.reader.decoders.IntBitUnpackers.getIntBitUnpacker; + +public final class VectorIntBitUnpackers +{ + private static final IntBitUnpacker[] UNPACKERS = { + new Unpacker0(), + new Unpacker1(), + new Unpacker2(), + new Unpacker3(), + new Unpacker4(), + new Unpacker5(), + new Unpacker6(), + new Unpacker7(), + new Unpacker8(), + new Unpacker9(), + new Unpacker10(), + new Unpacker11(), + new Unpacker12(), + new Unpacker13(), + new Unpacker14(), + new Unpacker15(), + new Unpacker16(), + new Unpacker17(), + new Unpacker18(), + new Unpacker19(), + new Unpacker20()}; + + public static IntBitUnpacker getVectorIntBitUnpacker(int bitWidth) + { + // This encoding is used for dictionary ids, repetition and definition levels in V1 encodings + // and additionally for delta encoded integers in V2 encodings. + // We vectorize encodings upto 20 bit width as that is more than enough for real world usage of V1 encodings. + // The remaining bit widths can be vectorized when there is a need to do so. + if (bitWidth > 20) { + return getIntBitUnpacker(bitWidth); + } + return UNPACKERS[bitWidth]; + } + + private VectorIntBitUnpackers() {} + + private static final class Unpacker0 + implements IntBitUnpacker + { + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, + int length) + { + // Do nothing + } + } + + private static final class Unpacker1 + implements IntBitUnpacker + { + private static final ByteVector MASK_1 = ByteVector.broadcast(ByteVector.SPECIES_64, 1); + private static final ByteVector LSHR_BYTE_VECTOR = ByteVector.fromArray(ByteVector.SPECIES_64, new byte[] {0, 1, 2, 3, 4, 5, 6, 7}, 0); + + private static void unpack8(int[] output, int outputOffset, byte input) + { + ByteVector.broadcast(ByteVector.SPECIES_64, input) + .lanewise(VectorOperators.LSHR, LSHR_BYTE_VECTOR) + .and(MASK_1) + .castShape(IntVector.SPECIES_256, 0) + .reinterpretAsInts() + .intoArray(output, outputOffset); + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + byte[] inputArray = input.getByteArray(); + int inputOffset = input.getByteArrayOffset(); + int inputBytesRead = 0; + while (length >= 8) { + unpack8(output, outputOffset, inputArray[inputOffset + inputBytesRead]); + outputOffset += 8; + length -= 8; + inputBytesRead++; + } + input.skip(inputBytesRead); + } + } + + private static final class Unpacker2 + implements IntBitUnpacker + { + private static final ByteVector MASK_2 = ByteVector.broadcast(ByteVector.SPECIES_64, (1 << 2) - 1); + private static final ByteVector LSHR_BYTE_VECTOR = ByteVector.fromArray(ByteVector.SPECIES_64, new byte[] {0, 2, 4, 6, 0, 2, 4, 6}, 0); + private static final VectorShuffle SHUFFLE = 
VectorShuffle.fromArray(ByteVector.SPECIES_64, new int[] {0, 0, 0, 0, 1, 1, 1, 1}, 0); + + private static void unpack8(int[] output, int outputOffset, byte[] input, int inputOffset) + { + ByteVector.fromArray(ByteVector.SPECIES_64, input, inputOffset) + .rearrange(SHUFFLE) + .lanewise(VectorOperators.LSHR, LSHR_BYTE_VECTOR) + .and(MASK_2) + .castShape(IntVector.SPECIES_256, 0) + .reinterpretAsInts() + .intoArray(output, outputOffset); + } + + private static void unpack8Scalar(int[] output, int outputOffset, byte[] input, int inputOffset) + { + byte v0 = input[inputOffset]; + byte v1 = input[inputOffset + 1]; + + output[outputOffset] = v0 & 0b11; + output[outputOffset + 1] = (v0 >>> 2) & 0b11; + output[outputOffset + 2] = (v0 >>> 4) & 0b11; + output[outputOffset + 3] = (v0 >>> 6) & 0b11; + + output[outputOffset + 4] = v1 & 0b11; + output[outputOffset + 5] = (v1 >>> 2) & 0b11; + output[outputOffset + 6] = (v1 >>> 4) & 0b11; + output[outputOffset + 7] = (v1 >>> 6) & 0b11; + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + byte[] inputArray = input.getByteArray(); + int inputOffset = input.getByteArrayOffset(); + int inputBytesRead = 0; + while (length >= 32) { + unpack8(output, outputOffset, inputArray, inputOffset + inputBytesRead); + outputOffset += 8; + length -= 8; + inputBytesRead += 2; + } + + switch (length) { + case 24: + unpack8Scalar(output, outputOffset, inputArray, inputOffset + inputBytesRead); + outputOffset += 8; + inputBytesRead += 2; + // fall through + case 16: + unpack8Scalar(output, outputOffset, inputArray, inputOffset + inputBytesRead); + outputOffset += 8; + inputBytesRead += 2; + // fall through + case 8: + unpack8Scalar(output, outputOffset, inputArray, inputOffset + inputBytesRead); + inputBytesRead += 2; + } + input.skip(inputBytesRead); + } + } + + private static final class Unpacker3 + implements IntBitUnpacker + { + private static final ByteVector MASK_3 = ByteVector.broadcast(ByteVector.SPECIES_64, (1 << 3) - 1); + private static final ByteVector MASK_BEFORE_LSHL = ByteVector.fromArray( + ByteVector.SPECIES_64, + new byte[] {0, 0, 0b1, 0, 0, 0b11, 0, 0}, + 0); + private static final ByteVector LSHR_BYTE_VECTOR = ByteVector.fromArray( + ByteVector.SPECIES_64, + new byte[] {0, 3, 6, 1, 4, 7, 2, 5}, + 0); + private static final ByteVector LSHL_BYTE_VECTOR = ByteVector.fromArray( + ByteVector.SPECIES_64, + new byte[] {0, 0, 2, 0, 0, 1, 0, 0}, + 0); + private static final VectorShuffle SHUFFLE_BEFORE_LSHR = VectorShuffle.fromArray( + ByteVector.SPECIES_64, + new int[] {0, 0, 0, 1, 1, 1, 2, 2}, + 0); + private static final VectorShuffle SHUFFLE_BEFORE_LSHL = VectorShuffle.fromArray( + ByteVector.SPECIES_64, + new int[] {0, 0, 1, 1, 1, 2, 2, 2}, + 0); + + private static void unpack8(int[] output, int outputOffset, byte[] input, int inputOffset) + { + ByteVector byteVector = ByteVector.fromArray(ByteVector.SPECIES_64, input, inputOffset); + + ByteVector shiftRightResult = byteVector.rearrange(SHUFFLE_BEFORE_LSHR) + .lanewise(VectorOperators.LSHR, LSHR_BYTE_VECTOR) + .and(MASK_3); + + ByteVector shiftLeftResult = byteVector.rearrange(SHUFFLE_BEFORE_LSHL) + .and(MASK_BEFORE_LSHL) + .lanewise(VectorOperators.LSHL, LSHL_BYTE_VECTOR); + + shiftRightResult.or(shiftLeftResult) + .castShape(IntVector.SPECIES_256, 0) + .reinterpretAsInts() + .intoArray(output, outputOffset); + } + + private static void unpack8(int[] output, int outputOffset, SimpleSliceInputStream input) + { + short v0 = input.readShort(); + 
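+            // 8 values of 3 bits each occupy 24 bits, read as one little-endian short (v0)
+            // followed by a single byte (v1); the value at index 5 straddles the boundary,
+            // taking its low bit from the top of v0 and its upper two bits from the bottom of v1.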
byte v1 = input.readByte(); + output[outputOffset] = v0 & 0b111; + output[outputOffset + 1] = (v0 >>> 3) & 0b111; + output[outputOffset + 2] = (v0 >>> 6) & 0b111; + output[outputOffset + 3] = (v0 >>> 9) & 0b111; + output[outputOffset + 4] = (v0 >>> 12) & 0b111; + output[outputOffset + 5] = ((v0 >>> 15) & 0b1) | ((v1 & 0b11) << 1); + output[outputOffset + 6] = (v1 >>> 2) & 0b111; + output[outputOffset + 7] = (v1 >>> 5) & 0b111; + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + byte[] inputArray = input.getByteArray(); + int inputOffset = input.getByteArrayOffset(); + int inputBytesRead = 0; + while (length >= 24) { + unpack8(output, outputOffset, inputArray, inputOffset + inputBytesRead); + outputOffset += 8; + length -= 8; + inputBytesRead += 3; + } + input.skip(inputBytesRead); + + switch (length) { + case 16: + unpack8(output, outputOffset, input); + outputOffset += 8; + // fall through + case 8: + unpack8(output, outputOffset, input); + } + } + } + + private static final class Unpacker4 + implements IntBitUnpacker + { + private static final ByteVector MASK_4 = ByteVector.broadcast(ByteVector.SPECIES_64, (1 << 4) - 1); + private static final ByteVector LSHR_BYTE_VECTOR = ByteVector.fromArray(ByteVector.SPECIES_64, new byte[] {0, 4, 0, 4, 0, 4, 0, 4}, 0); + private static final VectorShuffle SHUFFLE = VectorShuffle.fromArray(ByteVector.SPECIES_64, new int[] {0, 0, 1, 1, 2, 2, 3, 3}, 0); + + private static void unpack8(int[] output, int outputOffset, byte[] input, int inputOffset) + { + ByteVector.fromArray(ByteVector.SPECIES_64, input, inputOffset) + .rearrange(SHUFFLE) + .lanewise(VectorOperators.LSHR, LSHR_BYTE_VECTOR) + .and(MASK_4) + .castShape(IntVector.SPECIES_256, 0) + .reinterpretAsInts() + .intoArray(output, outputOffset); + } + + private static void unpack8Scalar(int[] output, int outputOffset, byte[] input, int inputOffset) + { + byte v0 = input[inputOffset]; + byte v1 = input[inputOffset + 1]; + byte v2 = input[inputOffset + 2]; + byte v3 = input[inputOffset + 3]; + + output[outputOffset] = v0 & 0b1111; + output[outputOffset + 1] = (v0 >>> 4) & 0b1111; + output[outputOffset + 2] = v1 & 0b1111; + output[outputOffset + 3] = (v1 >>> 4) & 0b1111; + output[outputOffset + 4] = v2 & 0b1111; + output[outputOffset + 5] = (v2 >>> 4) & 0b1111; + output[outputOffset + 6] = v3 & 0b1111; + output[outputOffset + 7] = (v3 >>> 4) & 0b1111; + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + byte[] inputArray = input.getByteArray(); + int inputOffset = input.getByteArrayOffset(); + int inputBytesRead = 0; + while (length >= 16) { + unpack8(output, outputOffset, inputArray, inputOffset + inputBytesRead); + outputOffset += 8; + length -= 8; + inputBytesRead += 4; + } + + if (length >= 8) { + unpack8Scalar(output, outputOffset, inputArray, inputOffset + inputBytesRead); + inputBytesRead += 4; + } + input.skip(inputBytesRead); + } + } + + private static final class Unpacker5 + implements IntBitUnpacker + { + private static final ByteVector MASK_5 = ByteVector.broadcast(ByteVector.SPECIES_64, (1 << 5) - 1); + private static final ByteVector MASK_BEFORE_LSHL = ByteVector.fromArray( + ByteVector.SPECIES_64, + new byte[] {0, 0b11, 0, 0b1111, 0b1, 0, 0b111, 0}, + 0); + private static final ByteVector LSHR_BYTE_VECTOR = ByteVector.fromArray( + ByteVector.SPECIES_64, + new byte[] {0, 5, 2, 7, 4, 1, 6, 3}, + 0); + private static final ByteVector LSHL_BYTE_VECTOR = 
ByteVector.fromArray( + ByteVector.SPECIES_64, + new byte[] {0, 3, 0, 1, 4, 0, 2, 0}, + 0); + private static final VectorShuffle SHUFFLE_BEFORE_LSHR = VectorShuffle.fromArray( + ByteVector.SPECIES_64, + new int[] {0, 0, 1, 1, 2, 3, 3, 4}, + 0); + private static final VectorShuffle SHUFFLE_BEFORE_LSHL = VectorShuffle.fromArray( + ByteVector.SPECIES_64, + new int[] {0, 1, 1, 2, 3, 3, 4, 4}, + 0); + + private static void unpack8(int[] output, int outputOffset, byte[] input, int inputOffset) + { + ByteVector byteVector = ByteVector.fromArray(ByteVector.SPECIES_64, input, inputOffset); + + ByteVector shiftRightResult = byteVector.rearrange(SHUFFLE_BEFORE_LSHR) + .lanewise(VectorOperators.LSHR, LSHR_BYTE_VECTOR) + .and(MASK_5); + + ByteVector shiftLeftResult = byteVector.rearrange(SHUFFLE_BEFORE_LSHL) + .and(MASK_BEFORE_LSHL) + .lanewise(VectorOperators.LSHL, LSHL_BYTE_VECTOR); + + shiftRightResult.or(shiftLeftResult) + .castShape(IntVector.SPECIES_256, 0) + .reinterpretAsInts() + .intoArray(output, outputOffset); + } + + private static void unpack8(int[] output, int outputOffset, SimpleSliceInputStream input) + { + int v0 = input.readInt(); + byte v1 = input.readByte(); + output[outputOffset] = (int) (v0 & 0b11111L); + output[outputOffset + 1] = (int) ((v0 >>> 5) & 0b11111L); + output[outputOffset + 2] = (int) ((v0 >>> 10) & 0b11111L); + output[outputOffset + 3] = (int) ((v0 >>> 15) & 0b11111L); + output[outputOffset + 4] = (int) ((v0 >>> 20) & 0b11111L); + output[outputOffset + 5] = (int) ((v0 >>> 25) & 0b11111L); + output[outputOffset + 6] = (int) (((v0 >>> 30) & 0b11L) | ((v1 & 0b111L) << 2)); + output[outputOffset + 7] = (int) ((v1 >>> 3) & 0b11111L); + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + byte[] inputArray = input.getByteArray(); + int inputOffset = input.getByteArrayOffset(); + int inputBytesRead = 0; + while (length >= 16) { + unpack8(output, outputOffset, inputArray, inputOffset + inputBytesRead); + outputOffset += 8; + length -= 8; + inputBytesRead += 5; + } + input.skip(inputBytesRead); + + if (length >= 8) { + unpack8(output, outputOffset, input); + } + } + } + + private static final class Unpacker6 + implements IntBitUnpacker + { + private static final ShortVector MASK_6 = ShortVector.broadcast(ShortVector.SPECIES_128, (1 << 6) - 1); + private static final ShortVector MASK_BEFORE_LSHL = ShortVector.fromArray( + ShortVector.SPECIES_128, + new short[] {0, 0, 0b11, 0, 0, 0b1111, 0, 0}, + 0); + private static final ShortVector LSHR_SHORT_VECTOR = ShortVector.fromArray( + ShortVector.SPECIES_128, + new short[] {0, 6, 12, 2, 8, 14, 4, 10}, + 0); + private static final ShortVector LSHL_SHORT_VECTOR = ShortVector.fromArray( + ShortVector.SPECIES_128, + new short[] {0, 0, 4, 0, 0, 2, 0, 0}, + 0); + private static final VectorShuffle SHUFFLE_BEFORE_LSHR = VectorShuffle.fromArray( + ShortVector.SPECIES_128, + new int[] {0, 0, 0, 1, 1, 1, 2, 2}, + 0); + private static final VectorShuffle SHUFFLE_BEFORE_LSHL = VectorShuffle.fromArray( + ShortVector.SPECIES_128, + new int[] {0, 0, 1, 1, 1, 2, 2, 2}, + 0); + + private static void unpack8(int[] output, int outputOffset, byte[] input, int inputOffset) + { + ShortVector shortVector = ByteVector.fromArray(ByteVector.SPECIES_64, input, inputOffset) + .castShape(ByteVector.SPECIES_128, 0) + .reinterpretAsShorts(); + + ShortVector shiftRightResult = shortVector.rearrange(SHUFFLE_BEFORE_LSHR) + .lanewise(VectorOperators.LSHR, LSHR_SHORT_VECTOR) + .and(MASK_6); + + ShortVector 
shiftLeftResult = shortVector.rearrange(SHUFFLE_BEFORE_LSHL) + .and(MASK_BEFORE_LSHL) + .lanewise(VectorOperators.LSHL, LSHL_SHORT_VECTOR); + + shiftRightResult.or(shiftLeftResult) + .castShape(IntVector.SPECIES_256, 0) + .reinterpretAsInts() + .intoArray(output, outputOffset); + } + + private static void unpack8(int[] output, int outputOffset, SimpleSliceInputStream input) + { + int v0 = input.readInt(); + short v1 = input.readShort(); + output[outputOffset] = (int) (v0 & 0b111111L); + output[outputOffset + 1] = (int) ((v0 >>> 6) & 0b111111L); + output[outputOffset + 2] = (int) ((v0 >>> 12) & 0b111111L); + output[outputOffset + 3] = (int) ((v0 >>> 18) & 0b111111L); + output[outputOffset + 4] = (int) ((v0 >>> 24) & 0b111111L); + output[outputOffset + 5] = (int) (((v0 >>> 30) & 0b11L) | ((v1 & 0b1111L) << 2)); + output[outputOffset + 6] = (int) ((v1 >>> 4) & 0b111111L); + output[outputOffset + 7] = (int) ((v1 >>> 10) & 0b111111L); + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + byte[] inputArray = input.getByteArray(); + int inputOffset = input.getByteArrayOffset(); + int inputBytesRead = 0; + while (length >= 16) { + unpack8(output, outputOffset, inputArray, inputOffset + inputBytesRead); + outputOffset += 8; + length -= 8; + inputBytesRead += 6; + } + input.skip(inputBytesRead); + + if (length >= 8) { + unpack8(output, outputOffset, input); + } + } + } + + private static final class Unpacker7 + implements IntBitUnpacker + { + private static final ShortVector MASK_7 = ShortVector.broadcast(ShortVector.SPECIES_128, (1 << 7) - 1); + private static final ShortVector MASK_BEFORE_LSHL = ShortVector.fromArray( + ShortVector.SPECIES_128, + new short[] {0, 0, 0b11111, 0, 0b111, 0, 0b1, 0}, + 0); + private static final ShortVector LSHR_SHORT_VECTOR = ShortVector.fromArray( + ShortVector.SPECIES_128, + new short[] {0, 7, 14, 5, 12, 3, 10, 1}, + 0); + private static final ShortVector LSHL_SHORT_VECTOR = ShortVector.fromArray( + ShortVector.SPECIES_128, + new short[] {0, 0, 2, 0, 4, 0, 6, 0}, + 0); + private static final VectorShuffle SHUFFLE_BEFORE_LSHR = VectorShuffle.fromArray( + ShortVector.SPECIES_128, + new int[] {0, 0, 0, 1, 1, 2, 2, 3}, + 0); + private static final VectorShuffle SHUFFLE_BEFORE_LSHL = VectorShuffle.fromArray( + ShortVector.SPECIES_128, + new int[] {0, 0, 1, 1, 2, 2, 3, 3}, + 0); + + private static void unpack8(int[] output, int outputOffset, byte[] input, int inputOffset) + { + ShortVector shortVector = ByteVector.fromArray(ByteVector.SPECIES_64, input, inputOffset) + .castShape(ByteVector.SPECIES_128, 0) + .reinterpretAsShorts(); + + ShortVector shiftRightResult = shortVector.rearrange(SHUFFLE_BEFORE_LSHR) + .lanewise(VectorOperators.LSHR, LSHR_SHORT_VECTOR) + .and(MASK_7); + + ShortVector shiftLeftResult = shortVector.rearrange(SHUFFLE_BEFORE_LSHL) + .and(MASK_BEFORE_LSHL) + .lanewise(VectorOperators.LSHL, LSHL_SHORT_VECTOR); + + shiftRightResult.or(shiftLeftResult) + .castShape(IntVector.SPECIES_256, 0) + .reinterpretAsInts() + .intoArray(output, outputOffset); + } + + private static void unpack8(int[] output, int outputOffset, SimpleSliceInputStream input) + { + int v0 = input.readInt(); + short v1 = input.readShort(); + byte v2 = input.readByte(); + output[outputOffset] = (int) (v0 & 0b1111111L); + output[outputOffset + 1] = (int) ((v0 >>> 7) & 0b1111111L); + output[outputOffset + 2] = (int) ((v0 >>> 14) & 0b1111111L); + output[outputOffset + 3] = (int) ((v0 >>> 21) & 0b1111111L); + output[outputOffset 
+ 4] = (int) (((v0 >>> 28) & 0b1111L) | ((v1 & 0b111L) << 4)); + output[outputOffset + 5] = (int) ((v1 >>> 3) & 0b1111111L); + output[outputOffset + 6] = (int) (((v1 >>> 10) & 0b111111L) | ((v2 & 0b1L) << 6)); + output[outputOffset + 7] = (int) ((v2 >>> 1) & 0b1111111L); + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, + int length) + { + byte[] inputArray = input.getByteArray(); + int inputOffset = input.getByteArrayOffset(); + int inputBytesRead = 0; + while (length >= 16) { + unpack8(output, outputOffset, inputArray, inputOffset + inputBytesRead); + outputOffset += 8; + length -= 8; + inputBytesRead += 7; + } + input.skip(inputBytesRead); + + if (length >= 8) { + unpack8(output, outputOffset, input); + } + } + } + + private static final class Unpacker8 + implements IntBitUnpacker + { + private static final IntVector MASK_8 = IntVector.broadcast(IntVector.SPECIES_256, (1 << 8) - 1); + + private static void unpack8(int[] output, int outputOffset, byte[] input, int inputOffset) + { + ByteVector.fromArray(ByteVector.SPECIES_64, input, inputOffset) + .castShape(IntVector.SPECIES_256, 0) + .reinterpretAsInts() + .and(MASK_8) + .intoArray(output, outputOffset); + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + byte[] inputArray = input.getByteArray(); + int inputOffset = input.getByteArrayOffset(); + int inputBytesRead = 0; + while (length >= 8) { + unpack8(output, outputOffset, inputArray, inputOffset + inputBytesRead); + outputOffset += 8; + length -= 8; + inputBytesRead += 8; + } + input.skip(inputBytesRead); + } + } + + private static final class Unpacker9 + implements IntBitUnpacker + { + private static final ShortVector MASK_9 = ShortVector.broadcast(ShortVector.SPECIES_128, (1 << 9) - 1); + private static final ShortVector MASK_BEFORE_LSHL = ShortVector.fromArray( + ShortVector.SPECIES_128, + new short[] {0, 0b11, 0, 0b1111, 0, 0b111111, 0, 0b11111111}, + 0); + private static final ShortVector LSHR_SHORT_VECTOR = ShortVector.fromArray( + ShortVector.SPECIES_128, + new short[] {0, 9, 2, 11, 4, 13, 6, 15}, + 0); + private static final ShortVector LSHL_SHORT_VECTOR = ShortVector.fromArray( + ShortVector.SPECIES_128, + new short[] {0, 7, 0, 5, 0, 3, 0, 1}, + 0); + private static final VectorShuffle SHUFFLE_BEFORE_LSHR = VectorShuffle.fromArray( + ShortVector.SPECIES_128, + new int[] {0, 0, 1, 1, 2, 2, 3, 3}, + 0); + private static final VectorShuffle SHUFFLE_BEFORE_LSHL = VectorShuffle.fromArray( + ShortVector.SPECIES_128, + new int[] {0, 1, 1, 2, 2, 3, 3, 4}, + 0); + + private static void unpack8(int[] output, int outputOffset, byte[] input, int inputOffset) + { + ShortVector shortVector = ByteVector.fromArray(ByteVector.SPECIES_128, input, inputOffset) + .reinterpretAsShorts(); + + ShortVector shiftRightResult = shortVector.rearrange(SHUFFLE_BEFORE_LSHR) + .lanewise(VectorOperators.LSHR, LSHR_SHORT_VECTOR) + .and(MASK_9); + + ShortVector shiftLeftResult = shortVector.rearrange(SHUFFLE_BEFORE_LSHL) + .and(MASK_BEFORE_LSHL) + .lanewise(VectorOperators.LSHL, LSHL_SHORT_VECTOR); + + shiftRightResult.or(shiftLeftResult) + .castShape(IntVector.SPECIES_256, 0) + .reinterpretAsInts() + .intoArray(output, outputOffset); + } + + private static void unpack8(int[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + byte v1 = input.readByte(); + output[outputOffset] = (int) (v0 & 0b111111111L); + output[outputOffset + 1] = (int) ((v0 >>> 9) & 
0b111111111L); + output[outputOffset + 2] = (int) ((v0 >>> 18) & 0b111111111L); + output[outputOffset + 3] = (int) ((v0 >>> 27) & 0b111111111L); + output[outputOffset + 4] = (int) ((v0 >>> 36) & 0b111111111L); + output[outputOffset + 5] = (int) ((v0 >>> 45) & 0b111111111L); + output[outputOffset + 6] = (int) ((v0 >>> 54) & 0b111111111L); + output[outputOffset + 7] = (int) (((v0 >>> 63) & 0b1L) | ((v1 & 0b11111111L) << 1)); + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + byte[] inputArray = input.getByteArray(); + int inputOffset = input.getByteArrayOffset(); + int inputBytesRead = 0; + while (length >= 16) { + unpack8(output, outputOffset, inputArray, inputOffset + inputBytesRead); + outputOffset += 8; + length -= 8; + inputBytesRead += 9; + } + input.skip(inputBytesRead); + + if (length >= 8) { + unpack8(output, outputOffset, input); + } + } + } + + private static final class Unpacker10 + implements IntBitUnpacker + { + private static final ShortVector MASK_10 = ShortVector.broadcast(ShortVector.SPECIES_128, (1 << 10) - 1); + private static final ShortVector MASK_BEFORE_LSHL = ShortVector.fromArray( + ShortVector.SPECIES_128, + new short[] {0, 0b1111, 0, 0b11111111, 0b11, 0, 0b111111, 0}, + 0); + private static final ShortVector LSHR_SHORT_VECTOR = ShortVector.fromArray( + ShortVector.SPECIES_128, + new short[] {0, 10, 4, 14, 8, 2, 12, 6}, + 0); + private static final ShortVector LSHL_SHORT_VECTOR = ShortVector.fromArray( + ShortVector.SPECIES_128, + new short[] {0, 6, 0, 2, 8, 0, 4, 0}, + 0); + private static final VectorShuffle SHUFFLE_BEFORE_LSHR = VectorShuffle.fromArray( + ShortVector.SPECIES_128, + new int[] {0, 0, 1, 1, 2, 3, 3, 4}, + 0); + private static final VectorShuffle SHUFFLE_BEFORE_LSHL = VectorShuffle.fromArray( + ShortVector.SPECIES_128, + new int[] {0, 1, 1, 2, 3, 3, 4, 4}, + 0); + + private static void unpack8(int[] output, int outputOffset, byte[] input, int inputOffset) + { + ShortVector shortVector = ByteVector.fromArray(ByteVector.SPECIES_128, input, inputOffset) + .reinterpretAsShorts(); + + ShortVector shiftRightResult = shortVector.rearrange(SHUFFLE_BEFORE_LSHR) + .lanewise(VectorOperators.LSHR, LSHR_SHORT_VECTOR) + .and(MASK_10); + + ShortVector shiftLeftResult = shortVector.rearrange(SHUFFLE_BEFORE_LSHL) + .and(MASK_BEFORE_LSHL) + .lanewise(VectorOperators.LSHL, LSHL_SHORT_VECTOR); + + shiftRightResult.or(shiftLeftResult) + .castShape(IntVector.SPECIES_256, 0) + .reinterpretAsInts() + .intoArray(output, outputOffset); + } + + private static void unpack8(int[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + short v1 = input.readShort(); + output[outputOffset] = (int) (v0 & 0b1111111111L); + output[outputOffset + 1] = (int) ((v0 >>> 10) & 0b1111111111L); + output[outputOffset + 2] = (int) ((v0 >>> 20) & 0b1111111111L); + output[outputOffset + 3] = (int) ((v0 >>> 30) & 0b1111111111L); + output[outputOffset + 4] = (int) ((v0 >>> 40) & 0b1111111111L); + output[outputOffset + 5] = (int) ((v0 >>> 50) & 0b1111111111L); + output[outputOffset + 6] = (int) (((v0 >>> 60) & 0b1111L) | ((v1 & 0b111111L) << 4)); + output[outputOffset + 7] = (int) ((v1 >>> 6) & 0b1111111111L); + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + byte[] inputArray = input.getByteArray(); + int inputOffset = input.getByteArrayOffset(); + int inputBytesRead = 0; + while (length >= 16) { + unpack8(output, outputOffset, 
inputArray, inputOffset + inputBytesRead); + outputOffset += 8; + length -= 8; + inputBytesRead += 10; + } + input.skip(inputBytesRead); + + if (length >= 8) { + unpack8(output, outputOffset, input); + } + } + } + + private static final class Unpacker11 + implements IntBitUnpacker + { + private static final ShortVector MASK_11 = ShortVector.broadcast(ShortVector.SPECIES_128, (1 << 11) - 1); + private static final ShortVector MASK_BEFORE_LSHL = ShortVector.fromArray( + ShortVector.SPECIES_128, + new short[] {0, 0b111111, 0b1, 0, 0b1111111, 0b11, 0, 0b11111111}, + 0); + private static final ShortVector LSHR_SHORT_VECTOR = ShortVector.fromArray( + ShortVector.SPECIES_128, + new short[] {0, 11, 6, 1, 12, 7, 2, 13}, + 0); + private static final ShortVector LSHL_SHORT_VECTOR = ShortVector.fromArray( + ShortVector.SPECIES_128, + new short[] {0, 5, 10, 0, 4, 9, 0, 3}, + 0); + private static final VectorShuffle SHUFFLE_BEFORE_LSHR = VectorShuffle.fromArray( + ShortVector.SPECIES_128, + new int[] {0, 0, 1, 2, 2, 3, 4, 4}, + 0); + private static final VectorShuffle SHUFFLE_BEFORE_LSHL = VectorShuffle.fromArray( + ShortVector.SPECIES_128, + new int[] {0, 1, 2, 2, 3, 4, 4, 5}, + 0); + + private static void unpack8(int[] output, int outputOffset, byte[] input, int inputOffset) + { + ShortVector shortVector = ByteVector.fromArray(ByteVector.SPECIES_128, input, inputOffset) + .reinterpretAsShorts(); + + ShortVector shiftRightResult = shortVector.rearrange(SHUFFLE_BEFORE_LSHR) + .lanewise(VectorOperators.LSHR, LSHR_SHORT_VECTOR) + .and(MASK_11); + + ShortVector shiftLeftResult = shortVector.rearrange(SHUFFLE_BEFORE_LSHL) + .and(MASK_BEFORE_LSHL) + .lanewise(VectorOperators.LSHL, LSHL_SHORT_VECTOR); + + shiftRightResult.or(shiftLeftResult) + .castShape(IntVector.SPECIES_256, 0) + .reinterpretAsInts() + .intoArray(output, outputOffset); + } + + private static void unpack8(int[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + short v1 = input.readShort(); + byte v2 = input.readByte(); + output[outputOffset] = (int) (v0 & 0b11111111111L); + output[outputOffset + 1] = (int) ((v0 >>> 11) & 0b11111111111L); + output[outputOffset + 2] = (int) ((v0 >>> 22) & 0b11111111111L); + output[outputOffset + 3] = (int) ((v0 >>> 33) & 0b11111111111L); + output[outputOffset + 4] = (int) ((v0 >>> 44) & 0b11111111111L); + output[outputOffset + 5] = (int) (((v0 >>> 55) & 0b111111111L) | ((v1 & 0b11L) << 9)); + output[outputOffset + 6] = (int) ((v1 >>> 2) & 0b11111111111L); + output[outputOffset + 7] = (int) (((v1 >>> 13) & 0b111L) | ((v2 & 0b11111111L) << 3)); + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + byte[] inputArray = input.getByteArray(); + int inputOffset = input.getByteArrayOffset(); + int inputBytesRead = 0; + while (length >= 16) { + unpack8(output, outputOffset, inputArray, inputOffset + inputBytesRead); + outputOffset += 8; + length -= 8; + inputBytesRead += 11; + } + input.skip(inputBytesRead); + + if (length >= 8) { + unpack8(output, outputOffset, input); + } + } + } + + private static final class Unpacker12 + implements IntBitUnpacker + { + private static final ShortVector MASK_12 = ShortVector.broadcast(ShortVector.SPECIES_128, (1 << 12) - 1); + private static final ShortVector MASK_BEFORE_LSHL = ShortVector.fromArray( + ShortVector.SPECIES_128, + new short[] {0, 0b11111111, 0b1111, 0, 0, 0b11111111, 0b1111, 0}, + 0); + private static final ShortVector LSHR_SHORT_VECTOR = ShortVector.fromArray( + 
ShortVector.SPECIES_128, + new short[] {0, 12, 8, 4, 0, 12, 8, 4}, + 0); + private static final ShortVector LSHL_SHORT_VECTOR = ShortVector.fromArray( + ShortVector.SPECIES_128, + new short[] {0, 4, 8, 0, 0, 4, 8, 0}, + 0); + private static final VectorShuffle SHUFFLE_BEFORE_LSHR = VectorShuffle.fromArray( + ShortVector.SPECIES_128, + new int[] {0, 0, 1, 2, 3, 3, 4, 5}, + 0); + private static final VectorShuffle SHUFFLE_BEFORE_LSHL = VectorShuffle.fromArray( + ShortVector.SPECIES_128, + new int[] {0, 1, 2, 2, 3, 4, 5, 5}, + 0); + + private static void unpack8(int[] output, int outputOffset, byte[] input, int inputOffset) + { + ShortVector shortVector = ByteVector.fromArray(ByteVector.SPECIES_128, input, inputOffset) + .reinterpretAsShorts(); + + ShortVector shiftRightResult = shortVector.rearrange(SHUFFLE_BEFORE_LSHR) + .lanewise(VectorOperators.LSHR, LSHR_SHORT_VECTOR) + .and(MASK_12); + + ShortVector shiftLeftResult = shortVector.rearrange(SHUFFLE_BEFORE_LSHL) + .and(MASK_BEFORE_LSHL) + .lanewise(VectorOperators.LSHL, LSHL_SHORT_VECTOR); + + shiftRightResult.or(shiftLeftResult) + .castShape(IntVector.SPECIES_256, 0) + .reinterpretAsInts() + .intoArray(output, outputOffset); + } + + private static void unpack8(int[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + int v1 = input.readInt(); + output[outputOffset] = (int) (v0 & 0b111111111111L); + output[outputOffset + 1] = (int) ((v0 >>> 12) & 0b111111111111L); + output[outputOffset + 2] = (int) ((v0 >>> 24) & 0b111111111111L); + output[outputOffset + 3] = (int) ((v0 >>> 36) & 0b111111111111L); + output[outputOffset + 4] = (int) ((v0 >>> 48) & 0b111111111111L); + output[outputOffset + 5] = (int) (((v0 >>> 60) & 0b1111L) | ((v1 & 0b11111111L) << 4)); + output[outputOffset + 6] = (int) ((v1 >>> 8) & 0b111111111111L); + output[outputOffset + 7] = (int) ((v1 >>> 20) & 0b111111111111L); + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + byte[] inputArray = input.getByteArray(); + int inputOffset = input.getByteArrayOffset(); + int inputBytesRead = 0; + while (length >= 16) { + unpack8(output, outputOffset, inputArray, inputOffset + inputBytesRead); + outputOffset += 8; + length -= 8; + inputBytesRead += 12; + } + input.skip(inputBytesRead); + + if (length >= 8) { + unpack8(output, outputOffset, input); + } + } + } + + private static final class Unpacker13 + implements IntBitUnpacker + { + private static final ShortVector MASK_13 = ShortVector.broadcast(ShortVector.SPECIES_128, (1 << 13) - 1); + private static final ShortVector MASK_BEFORE_LSHL = ShortVector.fromArray( + ShortVector.SPECIES_128, + new short[] {0, 0b1111111111, 0b1111111, 0b1111, 0b1, 0, 0b11111111111, 0b11111111}, + 0); + private static final ShortVector LSHR_SHORT_VECTOR = ShortVector.fromArray( + ShortVector.SPECIES_128, + new short[] {0, 13, 10, 7, 4, 1, 14, 11}, + 0); + private static final ShortVector LSHL_SHORT_VECTOR = ShortVector.fromArray( + ShortVector.SPECIES_128, + new short[] {0, 3, 6, 9, 12, 0, 2, 5}, + 0); + private static final VectorShuffle SHUFFLE_BEFORE_LSHR = VectorShuffle.fromArray( + ShortVector.SPECIES_128, + new int[] {0, 0, 1, 2, 3, 4, 4, 5}, + 0); + private static final VectorShuffle SHUFFLE_BEFORE_LSHL = VectorShuffle.fromArray( + ShortVector.SPECIES_128, + new int[] {0, 1, 2, 3, 4, 4, 5, 6}, + 0); + + private static void unpack8(int[] output, int outputOffset, byte[] input, int inputOffset) + { + ShortVector shortVector = 
ByteVector.fromArray(ByteVector.SPECIES_128, input, inputOffset) + .reinterpretAsShorts(); + + ShortVector shiftRightResult = shortVector.rearrange(SHUFFLE_BEFORE_LSHR) + .lanewise(VectorOperators.LSHR, LSHR_SHORT_VECTOR) + .and(MASK_13); + + ShortVector shiftLeftResult = shortVector.rearrange(SHUFFLE_BEFORE_LSHL) + .and(MASK_BEFORE_LSHL) + .lanewise(VectorOperators.LSHL, LSHL_SHORT_VECTOR); + + shiftRightResult.or(shiftLeftResult) + .castShape(IntVector.SPECIES_256, 0) + .reinterpretAsInts() + .intoArray(output, outputOffset); + } + + private static void unpack8(int[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + int v1 = input.readInt(); + byte v2 = input.readByte(); + output[outputOffset] = (int) (v0 & 0b1111111111111L); + output[outputOffset + 1] = (int) ((v0 >>> 13) & 0b1111111111111L); + output[outputOffset + 2] = (int) ((v0 >>> 26) & 0b1111111111111L); + output[outputOffset + 3] = (int) ((v0 >>> 39) & 0b1111111111111L); + output[outputOffset + 4] = (int) (((v0 >>> 52) & 0b111111111111L) | ((v1 & 0b1L) << 12)); + output[outputOffset + 5] = (int) ((v1 >>> 1) & 0b1111111111111L); + output[outputOffset + 6] = (int) ((v1 >>> 14) & 0b1111111111111L); + output[outputOffset + 7] = (int) (((v1 >>> 27) & 0b11111L) | ((v2 & 0b11111111L) << 5)); + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + byte[] inputArray = input.getByteArray(); + int inputOffset = input.getByteArrayOffset(); + int inputBytesRead = 0; + while (length >= 16) { + unpack8(output, outputOffset, inputArray, inputOffset + inputBytesRead); + outputOffset += 8; + length -= 8; + inputBytesRead += 13; + } + input.skip(inputBytesRead); + + if (length >= 8) { + unpack8(output, outputOffset, input); + } + } + } + + private static final class Unpacker14 + implements IntBitUnpacker + { + private static final ShortVector MASK_14 = ShortVector.broadcast(ShortVector.SPECIES_128, (1 << 14) - 1); + private static final ShortVector MASK_BEFORE_LSHL = ShortVector.fromArray( + ShortVector.SPECIES_128, + new short[] {0, 0b111111111111, 0b1111111111, 0b11111111, 0b111111, 0b1111, 0b11, 0}, + 0); + private static final ShortVector LSHR_SHORT_VECTOR = ShortVector.fromArray( + ShortVector.SPECIES_128, + new short[] {0, 14, 12, 10, 8, 6, 4, 2}, + 0); + private static final ShortVector LSHL_SHORT_VECTOR = ShortVector.fromArray( + ShortVector.SPECIES_128, + new short[] {0, 2, 4, 6, 8, 10, 12, 0}, + 0); + private static final VectorShuffle SHUFFLE_BEFORE_LSHR = VectorShuffle.fromArray( + ShortVector.SPECIES_128, + new int[] {0, 0, 1, 2, 3, 4, 5, 6}, + 0); + private static final VectorShuffle SHUFFLE_BEFORE_LSHL = VectorShuffle.fromArray( + ShortVector.SPECIES_128, + new int[] {0, 1, 2, 3, 4, 5, 6, 6}, + 0); + + private static void unpack8(int[] output, int outputOffset, byte[] input, int inputOffset) + { + ShortVector shortVector = ByteVector.fromArray(ByteVector.SPECIES_128, input, inputOffset) + .reinterpretAsShorts(); + + ShortVector shiftRightResult = shortVector.rearrange(SHUFFLE_BEFORE_LSHR) + .lanewise(VectorOperators.LSHR, LSHR_SHORT_VECTOR) + .and(MASK_14); + + ShortVector shiftLeftResult = shortVector.rearrange(SHUFFLE_BEFORE_LSHL) + .and(MASK_BEFORE_LSHL) + .lanewise(VectorOperators.LSHL, LSHL_SHORT_VECTOR); + + shiftRightResult.or(shiftLeftResult) + .castShape(IntVector.SPECIES_256, 0) + .reinterpretAsInts() + .intoArray(output, outputOffset); + } + + private static void unpack8(int[] output, int outputOffset, 
SimpleSliceInputStream input) + { + long v0 = input.readLong(); + int v1 = input.readInt(); + short v2 = input.readShort(); + output[outputOffset] = (int) (v0 & 0b11111111111111L); + output[outputOffset + 1] = (int) ((v0 >>> 14) & 0b11111111111111L); + output[outputOffset + 2] = (int) ((v0 >>> 28) & 0b11111111111111L); + output[outputOffset + 3] = (int) ((v0 >>> 42) & 0b11111111111111L); + output[outputOffset + 4] = (int) (((v0 >>> 56) & 0b11111111L) | ((v1 & 0b111111L) << 8)); + output[outputOffset + 5] = (int) ((v1 >>> 6) & 0b11111111111111L); + output[outputOffset + 6] = (int) (((v1 >>> 20) & 0b111111111111L) | ((v2 & 0b11L) << 12)); + output[outputOffset + 7] = (int) ((v2 >>> 2) & 0b11111111111111L); + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + byte[] inputArray = input.getByteArray(); + int inputOffset = input.getByteArrayOffset(); + int inputBytesRead = 0; + while (length >= 16) { + unpack8(output, outputOffset, inputArray, inputOffset + inputBytesRead); + outputOffset += 8; + length -= 8; + inputBytesRead += 14; + } + input.skip(inputBytesRead); + + if (length >= 8) { + unpack8(output, outputOffset, input); + } + } + } + + private static final class Unpacker15 + implements IntBitUnpacker + { + private static final ShortVector MASK_15 = ShortVector.broadcast(ShortVector.SPECIES_128, (1 << 15) - 1); + private static final ShortVector MASK_BEFORE_LSHL = ShortVector.fromArray( + ShortVector.SPECIES_128, + new short[] {0, 0b11111111111111, 0b1111111111111, 0b111111111111, 0b11111111111, 0b1111111111, 0b111111111, 0b11111111}, + 0); + private static final ShortVector LSHR_SHORT_VECTOR = ShortVector.fromArray( + ShortVector.SPECIES_128, + new short[] {0, 15, 14, 13, 12, 11, 10, 9}, + 0); + private static final ShortVector LSHL_SHORT_VECTOR = ShortVector.fromArray( + ShortVector.SPECIES_128, + new short[] {0, 1, 2, 3, 4, 5, 6, 7}, + 0); + private static final VectorShuffle SHUFFLE_BEFORE_LSHR = VectorShuffle.fromArray( + ShortVector.SPECIES_128, + new int[] {0, 0, 1, 2, 3, 4, 5, 6}, + 0); + private static final VectorShuffle SHUFFLE_BEFORE_LSHL = VectorShuffle.fromArray( + ShortVector.SPECIES_128, + new int[] {0, 1, 2, 3, 4, 5, 6, 7}, + 0); + + private static void unpack8(int[] output, int outputOffset, byte[] input, int inputOffset) + { + ShortVector shortVector = ByteVector.fromArray(ByteVector.SPECIES_128, input, inputOffset) + .reinterpretAsShorts(); + + ShortVector shiftRightResult = shortVector.rearrange(SHUFFLE_BEFORE_LSHR) + .lanewise(VectorOperators.LSHR, LSHR_SHORT_VECTOR) + .and(MASK_15); + + ShortVector shiftLeftResult = shortVector.rearrange(SHUFFLE_BEFORE_LSHL) + .and(MASK_BEFORE_LSHL) + .lanewise(VectorOperators.LSHL, LSHL_SHORT_VECTOR); + + shiftRightResult.or(shiftLeftResult) + .castShape(IntVector.SPECIES_256, 0) + .reinterpretAsInts() + .intoArray(output, outputOffset); + } + + private static void unpack8(int[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + int v1 = input.readInt(); + short v2 = input.readShort(); + byte v3 = input.readByte(); + output[outputOffset] = (int) (v0 & 0b111111111111111L); + output[outputOffset + 1] = (int) ((v0 >>> 15) & 0b111111111111111L); + output[outputOffset + 2] = (int) ((v0 >>> 30) & 0b111111111111111L); + output[outputOffset + 3] = (int) ((v0 >>> 45) & 0b111111111111111L); + output[outputOffset + 4] = (int) (((v0 >>> 60) & 0b1111L) | ((v1 & 0b11111111111L) << 4)); + output[outputOffset + 5] = (int) ((v1 >>> 11) & 
0b111111111111111L); + output[outputOffset + 6] = (int) (((v1 >>> 26) & 0b111111L) | ((v2 & 0b111111111L) << 6)); + output[outputOffset + 7] = (int) (((v2 >>> 9) & 0b1111111L) | ((v3 & 0b11111111L) << 7)); + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + byte[] inputArray = input.getByteArray(); + int inputOffset = input.getByteArrayOffset(); + int inputBytesRead = 0; + while (length >= 16) { + unpack8(output, outputOffset, inputArray, inputOffset + inputBytesRead); + outputOffset += 8; + length -= 8; + inputBytesRead += 15; + } + input.skip(inputBytesRead); + + if (length >= 8) { + unpack8(output, outputOffset, input); + } + } + } + + private static final class Unpacker16 + implements IntBitUnpacker + { + private static final IntVector MASK_16 = IntVector.broadcast(IntVector.SPECIES_256, (1 << 16) - 1); + + private static void unpack8(int[] output, int outputOffset, byte[] input, int inputOffset) + { + ByteVector.fromArray(ByteVector.SPECIES_128, input, inputOffset) + .reinterpretAsShorts() + .castShape(IntVector.SPECIES_256, 0) + .reinterpretAsInts() + .and(MASK_16) + .intoArray(output, outputOffset); + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + byte[] inputArray = input.getByteArray(); + int inputOffset = input.getByteArrayOffset(); + int inputBytesRead = 0; + while (length >= 8) { + unpack8(output, outputOffset, inputArray, inputOffset + inputBytesRead); + outputOffset += 8; + length -= 8; + inputBytesRead += 16; + } + input.skip(inputBytesRead); + } + } + + private static final class Unpacker17 + implements IntBitUnpacker + { + private static final IntVector MASK_17 = IntVector.broadcast(IntVector.SPECIES_256, (1 << 17) - 1); + private static final IntVector MASK_BEFORE_LSHL = IntVector.fromArray( + IntVector.SPECIES_256, + new int[] {0, 0b11, 0, 0b1111, 0, 0b111111, 0, 0b11111111}, + 0); + private static final IntVector LSHR_INT_VECTOR = IntVector.fromArray( + IntVector.SPECIES_256, + new int[] {0, 17, 2, 19, 4, 21, 6, 23}, + 0); + private static final IntVector LSHL_INT_VECTOR = IntVector.fromArray( + IntVector.SPECIES_256, + new int[] {0, 15, 0, 13, 0, 11, 0, 9}, + 0); + private static final VectorShuffle SHUFFLE_BEFORE_LSHR = VectorShuffle.fromArray( + IntVector.SPECIES_256, + new int[] {0, 0, 1, 1, 2, 2, 3, 3}, + 0); + private static final VectorShuffle SHUFFLE_BEFORE_LSHL = VectorShuffle.fromArray( + IntVector.SPECIES_256, + new int[] {0, 1, 1, 2, 2, 3, 3, 4}, + 0); + + private static void unpack8(int[] output, int outputOffset, byte[] input, int inputOffset) + { + IntVector intVector = ByteVector.fromArray(ByteVector.SPECIES_256, input, inputOffset) + .reinterpretAsInts(); + + IntVector shiftRightResult = intVector.rearrange(SHUFFLE_BEFORE_LSHR) + .lanewise(VectorOperators.LSHR, LSHR_INT_VECTOR) + .and(MASK_17); + + IntVector shiftLeftResult = intVector.rearrange(SHUFFLE_BEFORE_LSHL) + .and(MASK_BEFORE_LSHL) + .lanewise(VectorOperators.LSHL, LSHL_INT_VECTOR); + + shiftRightResult.or(shiftLeftResult) + .castShape(IntVector.SPECIES_256, 0) + .reinterpretAsInts() + .intoArray(output, outputOffset); + } + + private static void unpack8(int[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + byte v2 = input.readByte(); + output[outputOffset] = (int) (v0 & 0b11111111111111111L); + output[outputOffset + 1] = (int) ((v0 >>> 17) & 0b11111111111111111L); + output[outputOffset 
+ 2] = (int) ((v0 >>> 34) & 0b11111111111111111L); + output[outputOffset + 3] = (int) (((v0 >>> 51) & 0b1111111111111L) | ((v1 & 0b1111L) << 13)); + output[outputOffset + 4] = (int) ((v1 >>> 4) & 0b11111111111111111L); + output[outputOffset + 5] = (int) ((v1 >>> 21) & 0b11111111111111111L); + output[outputOffset + 6] = (int) ((v1 >>> 38) & 0b11111111111111111L); + output[outputOffset + 7] = (int) (((v1 >>> 55) & 0b111111111L) | ((v2 & 0b11111111L) << 9)); + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + byte[] inputArray = input.getByteArray(); + int inputOffset = input.getByteArrayOffset(); + int inputBytesRead = 0; + while (length >= 16) { + unpack8(output, outputOffset, inputArray, inputOffset + inputBytesRead); + outputOffset += 8; + length -= 8; + inputBytesRead += 17; + } + input.skip(inputBytesRead); + + if (length >= 8) { + unpack8(output, outputOffset, input); + } + } + } + + private static final class Unpacker18 + implements IntBitUnpacker + { + private static final IntVector MASK_18 = IntVector.broadcast(IntVector.SPECIES_256, (1 << 18) - 1); + private static final IntVector MASK_BEFORE_LSHL = IntVector.fromArray( + IntVector.SPECIES_256, + new int[] {0, 0b1111, 0, 0b11111111, 0, 0b111111111111, 0, 0b1111111111111111}, + 0); + private static final IntVector LSHR_INT_VECTOR = IntVector.fromArray( + IntVector.SPECIES_256, + new int[] {0, 18, 4, 22, 8, 26, 12, 30}, + 0); + private static final IntVector LSHL_INT_VECTOR = IntVector.fromArray( + IntVector.SPECIES_256, + new int[] {0, 14, 0, 10, 0, 6, 0, 2}, + 0); + private static final VectorShuffle SHUFFLE_BEFORE_LSHR = VectorShuffle.fromArray( + IntVector.SPECIES_256, + new int[] {0, 0, 1, 1, 2, 2, 3, 3}, + 0); + private static final VectorShuffle SHUFFLE_BEFORE_LSHL = VectorShuffle.fromArray( + IntVector.SPECIES_256, + new int[] {0, 1, 1, 2, 2, 3, 3, 4}, + 0); + + private static void unpack8(int[] output, int outputOffset, byte[] input, int inputOffset) + { + IntVector intVector = ByteVector.fromArray(ByteVector.SPECIES_256, input, inputOffset) + .reinterpretAsInts(); + + IntVector shiftRightResult = intVector.rearrange(SHUFFLE_BEFORE_LSHR) + .lanewise(VectorOperators.LSHR, LSHR_INT_VECTOR) + .and(MASK_18); + + IntVector shiftLeftResult = intVector.rearrange(SHUFFLE_BEFORE_LSHL) + .and(MASK_BEFORE_LSHL) + .lanewise(VectorOperators.LSHL, LSHL_INT_VECTOR); + + shiftRightResult.or(shiftLeftResult) + .castShape(IntVector.SPECIES_256, 0) + .reinterpretAsInts() + .intoArray(output, outputOffset); + } + + private static void unpack8(int[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + short v2 = input.readShort(); + output[outputOffset] = (int) (v0 & 0b111111111111111111L); + output[outputOffset + 1] = (int) ((v0 >>> 18) & 0b111111111111111111L); + output[outputOffset + 2] = (int) ((v0 >>> 36) & 0b111111111111111111L); + output[outputOffset + 3] = (int) (((v0 >>> 54) & 0b1111111111L) | ((v1 & 0b11111111L) << 10)); + output[outputOffset + 4] = (int) ((v1 >>> 8) & 0b111111111111111111L); + output[outputOffset + 5] = (int) ((v1 >>> 26) & 0b111111111111111111L); + output[outputOffset + 6] = (int) ((v1 >>> 44) & 0b111111111111111111L); + output[outputOffset + 7] = (int) (((v1 >>> 62) & 0b11L) | ((v2 & 0b1111111111111111L) << 2)); + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + byte[] inputArray = input.getByteArray(); + int 
inputOffset = input.getByteArrayOffset(); + int inputBytesRead = 0; + while (length >= 16) { + unpack8(output, outputOffset, inputArray, inputOffset + inputBytesRead); + outputOffset += 8; + length -= 8; + inputBytesRead += 18; + } + input.skip(inputBytesRead); + + if (length >= 8) { + unpack8(output, outputOffset, input); + } + } + } + + private static final class Unpacker19 + implements IntBitUnpacker + { + private static final IntVector MASK_19 = IntVector.broadcast(IntVector.SPECIES_256, (1 << 19) - 1); + private static final IntVector MASK_BEFORE_LSHL = IntVector.fromArray( + IntVector.SPECIES_256, + new int[] {0, 0b111111, 0, 0b111111111111, 0, 0b111111111111111111, 0b11111, 0}, + 0); + private static final IntVector LSHR_INT_VECTOR = IntVector.fromArray( + IntVector.SPECIES_256, + new int[] {0, 19, 6, 25, 12, 31, 18, 5}, + 0); + private static final IntVector LSHL_INT_VECTOR = IntVector.fromArray( + IntVector.SPECIES_256, + new int[] {0, 13, 0, 7, 0, 1, 14, 0}, + 0); + private static final VectorShuffle SHUFFLE_BEFORE_LSHR = VectorShuffle.fromArray( + IntVector.SPECIES_256, + new int[] {0, 0, 1, 1, 2, 2, 3, 4}, + 0); + private static final VectorShuffle SHUFFLE_BEFORE_LSHL = VectorShuffle.fromArray( + IntVector.SPECIES_256, + new int[] {0, 1, 1, 2, 2, 3, 4, 4}, + 0); + + private static void unpack8(int[] output, int outputOffset, byte[] input, int inputOffset) + { + IntVector intVector = ByteVector.fromArray(ByteVector.SPECIES_256, input, inputOffset) + .reinterpretAsInts(); + + IntVector shiftRightResult = intVector.rearrange(SHUFFLE_BEFORE_LSHR) + .lanewise(VectorOperators.LSHR, LSHR_INT_VECTOR) + .and(MASK_19); + + IntVector shiftLeftResult = intVector.rearrange(SHUFFLE_BEFORE_LSHL) + .and(MASK_BEFORE_LSHL) + .lanewise(VectorOperators.LSHL, LSHL_INT_VECTOR); + + shiftRightResult.or(shiftLeftResult) + .castShape(IntVector.SPECIES_256, 0) + .reinterpretAsInts() + .intoArray(output, outputOffset); + } + + private static void unpack8(int[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + short v2 = input.readShort(); + byte v3 = input.readByte(); + output[outputOffset] = (int) (v0 & 0b1111111111111111111L); + output[outputOffset + 1] = (int) ((v0 >>> 19) & 0b1111111111111111111L); + output[outputOffset + 2] = (int) ((v0 >>> 38) & 0b1111111111111111111L); + output[outputOffset + 3] = (int) (((v0 >>> 57) & 0b1111111L) | ((v1 & 0b111111111111L) << 7)); + output[outputOffset + 4] = (int) ((v1 >>> 12) & 0b1111111111111111111L); + output[outputOffset + 5] = (int) ((v1 >>> 31) & 0b1111111111111111111L); + output[outputOffset + 6] = (int) (((v1 >>> 50) & 0b11111111111111L) | ((v2 & 0b11111L) << 14)); + output[outputOffset + 7] = (int) (((v2 >>> 5) & 0b11111111111L) | ((v3 & 0b11111111L) << 11)); + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + byte[] inputArray = input.getByteArray(); + int inputOffset = input.getByteArrayOffset(); + int inputBytesRead = 0; + while (length >= 16) { + unpack8(output, outputOffset, inputArray, inputOffset + inputBytesRead); + outputOffset += 8; + length -= 8; + inputBytesRead += 19; + } + input.skip(inputBytesRead); + + if (length >= 8) { + unpack8(output, outputOffset, input); + } + } + } + + private static final class Unpacker20 + implements IntBitUnpacker + { + private static final IntVector MASK_20 = IntVector.broadcast(IntVector.SPECIES_256, (1 << 20) - 1); + private static final IntVector MASK_BEFORE_LSHL = 
IntVector.fromArray( + IntVector.SPECIES_256, + new int[] {0, 0b11111111, 0, 0b1111111111111111, 0b1111, 0, 0b111111111111, 0}, + 0); + private static final IntVector LSHR_INT_VECTOR = IntVector.fromArray( + IntVector.SPECIES_256, + new int[] {0, 20, 8, 28, 16, 4, 24, 12}, + 0); + private static final IntVector LSHL_INT_VECTOR = IntVector.fromArray( + IntVector.SPECIES_256, + new int[] {0, 12, 0, 4, 16, 0, 8, 0}, + 0); + private static final VectorShuffle SHUFFLE_BEFORE_LSHR = VectorShuffle.fromArray( + IntVector.SPECIES_256, + new int[] {0, 0, 1, 1, 2, 3, 3, 4}, + 0); + private static final VectorShuffle SHUFFLE_BEFORE_LSHL = VectorShuffle.fromArray( + IntVector.SPECIES_256, + new int[] {0, 1, 1, 2, 3, 3, 4, 4}, + 0); + + private static void unpack8(int[] output, int outputOffset, byte[] input, int inputOffset) + { + IntVector intVector = ByteVector.fromArray(ByteVector.SPECIES_256, input, inputOffset) + .reinterpretAsInts(); + + IntVector shiftRightResult = intVector.rearrange(SHUFFLE_BEFORE_LSHR) + .lanewise(VectorOperators.LSHR, LSHR_INT_VECTOR) + .and(MASK_20); + + IntVector shiftLeftResult = intVector.rearrange(SHUFFLE_BEFORE_LSHL) + .and(MASK_BEFORE_LSHL) + .lanewise(VectorOperators.LSHL, LSHL_INT_VECTOR); + + shiftRightResult.or(shiftLeftResult) + .castShape(IntVector.SPECIES_256, 0) + .reinterpretAsInts() + .intoArray(output, outputOffset); + } + + private static void unpack8(int[] output, int outputOffset, SimpleSliceInputStream input) + { + long v0 = input.readLong(); + long v1 = input.readLong(); + int v2 = input.readInt(); + output[outputOffset] = (int) (v0 & 0b11111111111111111111L); + output[outputOffset + 1] = (int) ((v0 >>> 20) & 0b11111111111111111111L); + output[outputOffset + 2] = (int) ((v0 >>> 40) & 0b11111111111111111111L); + output[outputOffset + 3] = (int) (((v0 >>> 60) & 0b1111L) | ((v1 & 0b1111111111111111L) << 4)); + output[outputOffset + 4] = (int) ((v1 >>> 16) & 0b11111111111111111111L); + output[outputOffset + 5] = (int) ((v1 >>> 36) & 0b11111111111111111111L); + output[outputOffset + 6] = (int) (((v1 >>> 56) & 0b11111111L) | ((v2 & 0b111111111111L) << 8)); + output[outputOffset + 7] = (int) ((v2 >>> 12) & 0b11111111111111111111L); + } + + @Override + public void unpack(int[] output, int outputOffset, SimpleSliceInputStream input, int length) + { + byte[] inputArray = input.getByteArray(); + int inputOffset = input.getByteArrayOffset(); + int inputBytesRead = 0; + while (length >= 16) { + unpack8(output, outputOffset, inputArray, inputOffset + inputBytesRead); + outputOffset += 8; + length -= 8; + inputBytesRead += 20; + } + input.skip(inputBytesRead); + + if (length >= 8) { + unpack8(output, outputOffset, input); + } + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/BinaryBuffer.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/BinaryBuffer.java new file mode 100644 index 000000000000..707036f769ab --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/BinaryBuffer.java @@ -0,0 +1,119 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
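The Unpacker13 through Unpacker20 classes above each pair a vectorized unpack8 (Java Vector API) with a scalar variant, hard-coding the shifts and masks for one bit width. As a rough reference only (not part of this change; the method name is illustrative), a generic little-endian scalar unpacker for widths up to 20 bits could look like the sketch below; it should produce the same eight values as, for example, Unpacker13's scalar unpack8 for a given 13-byte group, so it is handy for sanity-checking the hard-coded constants.

// Illustrative sketch, not Trino code: scalar unpacking of `length` values of
// `bitWidth` bits each (bitWidth <= 20), packed little-endian starting at inputOffset.
static void unpackGeneric(int[] output, int outputOffset, byte[] input, int inputOffset, int length, int bitWidth)
{
    long bitPosition = 0;
    long mask = (1L << bitWidth) - 1;
    for (int i = 0; i < length; i++) {
        int byteIndex = inputOffset + (int) (bitPosition >>> 3);
        int bitShift = (int) (bitPosition & 7);
        // 4 bytes always cover bitShift (<= 7) + bitWidth (<= 20) = at most 27 bits
        long word = 0;
        for (int b = 0; b < 4 && byteIndex + b < input.length; b++) {
            word |= (input[byteIndex + b] & 0xFFL) << (8 * b);
        }
        output[outputOffset + i] = (int) ((word >>> bitShift) & mask);
        bitPosition += bitWidth;
    }
}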
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.reader.flat; + +import io.airlift.slice.Slice; +import io.airlift.slice.Slices; + +import java.util.ArrayList; +import java.util.List; + +import static io.airlift.slice.SizeOf.sizeOf; +import static java.util.Objects.requireNonNull; + +/** + * A structure holding lazily populated binary data and offsets array. + *

+ * The data is stored as a list of Slices that are joined together when needed.
+ * This approach performs better if the slices are as big as possible. That is
+ * why, in cases where the size of the resulting array is known beforehand, it is
+ * more performant to add a single big byte array to this object rather than to
+ * add slices one by one.
+ * The offsets array is compatible with the VariableWidthBlock.offsets field, i.e., the
+ * first value is always 0 and the last (number of positions + 1) is equal to
+ * the last offset + last position length.
+ */
+public class BinaryBuffer
+{
+    private final List<Slice> chunks;
+    private final int[] offsets;
+
+    public BinaryBuffer(int valueCount)
+    {
+        this(new int[valueCount + 1], new ArrayList<>());
+    }
+
+    private BinaryBuffer(int[] offsets, List<Slice> chunks)
+    {
+        this.offsets = requireNonNull(offsets, "offsets is null");
+        this.chunks = requireNonNull(chunks, "chunks is null");
+    }
+
+    /**
+     * Returns a shallow copy of this buffer with an empty offsets array.
+     * The first offset is set to the value of the last one in the original buffer.
+     * It can be used to add data to the original object while offsets land in a temporary array.
+     */
+    public BinaryBuffer withTemporaryOffsets(int offset, int offsetCount)
+    {
+        int[] tmpOffsets = new int[offsetCount + 1];
+        tmpOffsets[0] = offsets[offset];
+        return new BinaryBuffer(tmpOffsets, chunks);
+    }
+
+    public void add(byte[] source, int offset)
+    {
+        add(Slices.wrappedBuffer(source), offset);
+    }
+
+    public void add(Slice slice, int offset)
+    {
+        chunks.add(slice);
+        offsets[offset + 1] = offsets[offset] + slice.length();
+    }
+
+    public void addChunk(Slice slice)
+    {
+        chunks.add(slice);
+    }
+
+    public Slice asSlice()
+    {
+        if (chunks.size() == 1) {
+            return chunks.get(0);
+        }
+        int totalLength = 0;
+        for (Slice chunk : chunks) {
+            totalLength += chunk.length();
+        }
+        Slice slice = Slices.allocate(totalLength);
+        int offset = 0;
+        for (Slice chunk : chunks) {
+            slice.setBytes(offset, chunk);
+            offset += chunk.length();
+        }
+        chunks.clear();
+        chunks.add(slice);
+        return slice;
+    }
+
+    public int[] getOffsets()
+    {
+        return offsets;
+    }
+
+    public int getValueCount()
+    {
+        return offsets.length - 1;
+    }
+
+    public long getRetainedSize()
+    {
+        long chunksSizeInBytes = 0;
+        for (Slice slice : chunks) {
+            chunksSizeInBytes += slice.getRetainedSize();
+        }
+        return sizeOf(offsets) + chunksSizeInBytes;
+    }
+}
diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/BinaryColumnAdapter.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/BinaryColumnAdapter.java
new file mode 100644
index 000000000000..1542ec40d1f4
--- /dev/null
+++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/BinaryColumnAdapter.java
@@ -0,0 +1,164 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
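As a small illustration of the offsets contract described in the BinaryBuffer Javadoc above (hypothetical values, not part of the patch, assuming positions are filled in order as the reader does):

BinaryBuffer buffer = new BinaryBuffer(3);     // 3 positions -> offsets array of length 4, all zeros
buffer.add(new byte[] {'a', 'b'}, 0);          // offsets: [0, 2, 0, 0]
buffer.add(new byte[] {'c'}, 1);               // offsets: [0, 2, 3, 0]
buffer.add(new byte[0], 2);                    // offsets: [0, 2, 3, 3] - empty value at the last position
Slice data = buffer.asSlice();                 // the three chunks joined into a single Slice of length 3
int[] offsets = buffer.getOffsets();           // the last offset (3) equals data.length()

The last offset always equals the total data length, which is exactly what VariableWidthBlock expects.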
+ */ +package io.trino.parquet.reader.flat; + +import io.airlift.slice.Slice; +import io.airlift.slice.Slices; +import io.trino.spi.block.Block; +import io.trino.spi.block.VariableWidthBlock; + +import java.util.List; +import java.util.Optional; + +import static com.google.common.base.Preconditions.checkArgument; +import static io.trino.parquet.ParquetReaderUtils.castToByteNegate; + +public class BinaryColumnAdapter + implements ColumnAdapter +{ + public static final BinaryColumnAdapter BINARY_ADAPTER = new BinaryColumnAdapter(); + + @Override + public BinaryBuffer createBuffer(int batchSize) + { + return new BinaryBuffer(batchSize); + } + + @Override + public BinaryBuffer createTemporaryBuffer(int currentOffset, int size, BinaryBuffer buffer) + { + return buffer.withTemporaryOffsets(currentOffset, size); + } + + @Override + public void copyValue(BinaryBuffer source, int sourceIndex, BinaryBuffer destination, int destinationIndex) + { + // ignore as unpackNullValues is overridden + throw new UnsupportedOperationException(); + } + + @Override + public Block createNullableBlock(boolean[] nulls, BinaryBuffer values) + { + return new VariableWidthBlock(values.getValueCount(), values.asSlice(), values.getOffsets(), Optional.of(nulls)); + } + + @Override + public Block createNullableDictionaryBlock(BinaryBuffer dictionary, int nonNullsCount) + { + checkArgument( + dictionary.getValueCount() == nonNullsCount + 1, + "Dictionary buffer size %s did not match the expected value of %s", + dictionary.getValueCount(), + nonNullsCount + 1); + boolean[] nulls = new boolean[nonNullsCount + 1]; + nulls[nonNullsCount] = true; + // Overwrite the next after last position with an empty value. This will be used as null. + int[] offsets = dictionary.getOffsets(); + offsets[nonNullsCount + 1] = offsets[nonNullsCount]; + return new VariableWidthBlock(dictionary.getValueCount(), dictionary.asSlice(), offsets, Optional.of(nulls)); + } + + @Override + public Block createNonNullBlock(BinaryBuffer values) + { + return new VariableWidthBlock(values.getValueCount(), values.asSlice(), values.getOffsets(), Optional.empty()); + } + + @Override + public void unpackNullValues(BinaryBuffer sourceBuffer, BinaryBuffer destinationBuffer, boolean[] isNull, int destOffset, int nonNullCount, int totalValuesCount) + { + int endOffset = destOffset + totalValuesCount; + int srcOffset = 0; + int[] destination = destinationBuffer.getOffsets(); + int[] source = sourceBuffer.getOffsets(); + + while (srcOffset < nonNullCount) { + destination[destOffset] = source[srcOffset]; + srcOffset += castToByteNegate(isNull[destOffset]); + destOffset++; + } + // The last+1 offset is always a sentinel value equal to last offset + last position length. 
+ // In case of null values at the end, the last offset value needs to be repeated for every null position + while (destOffset <= endOffset) { + destination[destOffset++] = source[nonNullCount]; + } + } + + @Override + public void decodeDictionaryIds(BinaryBuffer values, int offset, int length, int[] ids, BinaryBuffer dictionary) + { + Slice dictionarySlice = dictionary.asSlice(); + int[] outputOffsets = values.getOffsets(); + int[] dictionaryOffsets = dictionary.getOffsets(); + int outputLength = 0; + for (int i = 0; i < length; i++) { + int id = ids[i]; + int positionLength = dictionaryOffsets[id + 1] - dictionaryOffsets[id]; + outputLength += positionLength; + outputOffsets[offset + i + 1] = outputOffsets[offset + i] + positionLength; + } + byte[] outputChunk = new byte[outputLength]; + int outputIndex = 0; + for (int i = 0; i < length; i++) { + int id = ids[i]; + int startIndex = dictionaryOffsets[id]; + int endIndex = dictionaryOffsets[id + 1]; + int positionLength = endIndex - startIndex; + dictionarySlice.getBytes(startIndex, outputChunk, outputIndex, positionLength); + outputIndex += positionLength; + } + + values.addChunk(Slices.wrappedBuffer(outputChunk)); + } + + @Override + public long getSizeInBytes(BinaryBuffer values) + { + return values.getRetainedSize(); + } + + @Override + public BinaryBuffer merge(List buffers) + { + if (buffers.isEmpty()) { + return new BinaryBuffer(0); + } + + int valueCount = 0; + for (BinaryBuffer binaryBuffer : buffers) { + valueCount += binaryBuffer.getValueCount(); + } + BinaryBuffer result = new BinaryBuffer(valueCount); + for (BinaryBuffer binaryBuffer : buffers) { + result.addChunk(binaryBuffer.asSlice()); + } + int[] resultOffsets = result.getOffsets(); + int[] firstOffsets = buffers.get(0).getOffsets(); + System.arraycopy(firstOffsets, 0, resultOffsets, 0, firstOffsets.length); + + int dataOffset = firstOffsets[firstOffsets.length - 1]; + int outputArrayOffset = firstOffsets.length; + for (int i = 1; i < buffers.size(); i++) { + int[] currentOffsets = buffers.get(i).getOffsets(); + for (int j = 1; j < currentOffsets.length; j++) { + resultOffsets[outputArrayOffset + j - 1] = dataOffset + currentOffsets[j]; + } + outputArrayOffset += currentOffsets.length - 1; + dataOffset = resultOffsets[outputArrayOffset - 1]; + } + + return result; + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/BitPackingUtils.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/BitPackingUtils.java new file mode 100644 index 000000000000..83962998f92d --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/BitPackingUtils.java @@ -0,0 +1,78 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
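To make the offset rebasing in BinaryColumnAdapter.merge above easier to follow, here is a tiny worked sketch (illustrative data only, assuming java.util.List is imported):

BinaryBuffer first = new BinaryBuffer(2);
first.add(new byte[] {1, 2}, 0);          // offsets [0, 2, 0]
first.add(new byte[] {3, 4, 5}, 1);       // offsets [0, 2, 5]

BinaryBuffer second = new BinaryBuffer(1);
second.add(new byte[] {6, 7, 8, 9}, 0);   // offsets [0, 4]

BinaryBuffer merged = BinaryColumnAdapter.BINARY_ADAPTER.merge(List.of(first, second));
// merged.getOffsets() == [0, 2, 5, 9]: the second buffer's offsets are shifted by the
// 5 bytes already accumulated, and both slices are kept as chunks until asSlice() is called.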
+ */ +package io.trino.parquet.reader.flat; + +import static io.trino.parquet.ParquetReaderUtils.castToByteNegate; + +public class BitPackingUtils +{ + private BitPackingUtils() {} + + /** + * @return number of bits equal to 0 (non-nulls) + */ + public static int unpack(boolean[] values, int offset, byte packedByte, int startBit, int endBit) + { + int nonNullCount = 0; + for (int i = 0; i < endBit - startBit; i++) { + // We need to negate the value as we convert the "does exist" to "is null", hence '== 0' instead of '== 1' + boolean value = (((packedByte >>> (startBit + i)) & 1) == 1); + nonNullCount += castToByteNegate(value); + values[offset + i] = value; + } + + return nonNullCount; + } + + /** + * @return number of bits equal to 0 (non-nulls) + */ + public static int unpack(boolean[] values, int offset, byte packedByte) + { + values[offset] = (packedByte & 1) == 1; + values[offset + 1] = ((packedByte >>> 1) & 1) == 1; + values[offset + 2] = ((packedByte >>> 2) & 1) == 1; + values[offset + 3] = ((packedByte >>> 3) & 1) == 1; + values[offset + 4] = ((packedByte >>> 4) & 1) == 1; + values[offset + 5] = ((packedByte >>> 5) & 1) == 1; + values[offset + 6] = ((packedByte >>> 6) & 1) == 1; + values[offset + 7] = ((packedByte >>> 7) & 1) == 1; + + return Byte.SIZE - bitCount(packedByte); + } + + public static void unpack(byte[] values, int offset, byte packedByte, int startBit, int endBit) + { + for (int i = 0; i < endBit - startBit; i++) { + values[offset + i] = (byte) ((packedByte >>> (startBit + i)) & 1); + } + } + + public static void unpack8FromByte(byte[] values, int offset, byte packedByte) + { + values[offset] = (byte) (packedByte & 1); + values[offset + 1] = (byte) ((packedByte >>> 1) & 1); + values[offset + 2] = (byte) ((packedByte >>> 2) & 1); + values[offset + 3] = (byte) ((packedByte >>> 3) & 1); + values[offset + 4] = (byte) ((packedByte >>> 4) & 1); + values[offset + 5] = (byte) ((packedByte >>> 5) & 1); + values[offset + 6] = (byte) ((packedByte >>> 6) & 1); + values[offset + 7] = (byte) ((packedByte >>> 7) & 1); + } + + public static int bitCount(byte value) + { + return Integer.bitCount(value & 0xFF); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/ByteColumnAdapter.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/ByteColumnAdapter.java new file mode 100644 index 000000000000..76a272eac511 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/ByteColumnAdapter.java @@ -0,0 +1,83 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
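A quick illustration of the boolean unpack variant defined in BitPackingUtils above, purely as the code is written: a set bit in packedByte becomes true in the output array, and the return value counts the zero bits (the positions treated as non-nulls). Values are hypothetical:

boolean[] isNull = new boolean[8];
// 0b00000101 has bits 0 and 2 set
int nonNullCount = BitPackingUtils.unpack(isNull, 0, (byte) 0b00000101);
// isNull       == [true, false, true, false, false, false, false, false]
// nonNullCount == 6  (Byte.SIZE - bitCount, i.e. the number of zero bits)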
+ */ +package io.trino.parquet.reader.flat; + +import io.trino.spi.block.Block; +import io.trino.spi.block.ByteArrayBlock; + +import java.util.List; +import java.util.Optional; + +import static io.airlift.slice.SizeOf.sizeOf; +import static java.lang.Math.toIntExact; + +public class ByteColumnAdapter + implements ColumnAdapter +{ + public static final ByteColumnAdapter BYTE_ADAPTER = new ByteColumnAdapter(); + + @Override + public byte[] createBuffer(int size) + { + return new byte[size]; + } + + @Override + public Block createNonNullBlock(byte[] values) + { + return new ByteArrayBlock(values.length, Optional.empty(), values); + } + + @Override + public Block createNullableBlock(boolean[] nulls, byte[] values) + { + return new ByteArrayBlock(values.length, Optional.of(nulls), values); + } + + @Override + public void copyValue(byte[] source, int sourceIndex, byte[] destination, int destinationIndex) + { + destination[destinationIndex] = source[sourceIndex]; + } + + @Override + public void decodeDictionaryIds(byte[] values, int offset, int length, int[] ids, byte[] dictionary) + { + for (int i = 0; i < length; i++) { + values[offset + i] = dictionary[ids[i]]; + } + } + + @Override + public long getSizeInBytes(byte[] values) + { + return sizeOf(values); + } + + @Override + public byte[] merge(List buffers) + { + long resultSize = 0; + for (byte[] buffer : buffers) { + resultSize += buffer.length; + } + byte[] result = new byte[toIntExact(resultSize)]; + int offset = 0; + for (byte[] buffer : buffers) { + System.arraycopy(buffer, 0, result, offset, buffer.length); + offset += buffer.length; + } + return result; + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/ColumnAdapter.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/ColumnAdapter.java new file mode 100644 index 000000000000..38259a935ba6 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/ColumnAdapter.java @@ -0,0 +1,63 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
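ByteColumnAdapter.decodeDictionaryIds above is a plain gather from the dictionary array; for instance (illustrative values only):

byte[] dictionary = {10, 20, 30};
int[] ids = {2, 0, 0, 1};
byte[] values = ByteColumnAdapter.BYTE_ADAPTER.createBuffer(4);
ByteColumnAdapter.BYTE_ADAPTER.decodeDictionaryIds(values, 0, ids.length, ids, dictionary);
// values == {30, 10, 10, 20}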
+ */ +package io.trino.parquet.reader.flat; + +import io.trino.spi.block.Block; + +import java.util.List; + +import static io.trino.parquet.ParquetReaderUtils.castToByteNegate; + +public interface ColumnAdapter +{ + /** + * Temporary buffer used for null unpacking + */ + default BufferType createTemporaryBuffer(int currentOffset, int size, BufferType buffer) + { + return createBuffer(size); + } + + BufferType createBuffer(int size); + + void copyValue(BufferType source, int sourceIndex, BufferType destination, int destinationIndex); + + Block createNullableBlock(boolean[] nulls, BufferType values); + + default Block createNullableDictionaryBlock(BufferType dictionary, int nonNullsCount) + { + boolean[] nulls = new boolean[nonNullsCount + 1]; + nulls[nonNullsCount] = true; + return createNullableBlock(nulls, dictionary); + } + + Block createNonNullBlock(BufferType values); + + default void unpackNullValues(BufferType source, BufferType destination, boolean[] isNull, int destOffset, int nonNullCount, int totalValuesCount) + { + int srcOffset = 0; + while (srcOffset < nonNullCount) { + copyValue(source, srcOffset, destination, destOffset); + // Avoid branching + srcOffset += castToByteNegate(isNull[destOffset]); + destOffset++; + } + } + + void decodeDictionaryIds(BufferType values, int offset, int length, int[] ids, BufferType dictionary); + + long getSizeInBytes(BufferType values); + + BufferType merge(List buffers); +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/DictionaryDecoder.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/DictionaryDecoder.java new file mode 100644 index 000000000000..aac5aa9afe4b --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/DictionaryDecoder.java @@ -0,0 +1,121 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
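The default unpackNullValues in ColumnAdapter above relies on a branch-free trick: every output position receives a copy of the current source value, but the source index only advances on non-null positions, so null slots end up holding a stale copy that is later hidden by the block's null flags. A sketch with int buffers, assuming castToByteNegate(b) behaves like (b ? 0 : 1), which is how it is used here:

int[] source = {10, 20};                   // densely packed non-null values
boolean[] isNull = {false, true, false};   // three output positions, one null
int[] destination = new int[3];
int srcOffset = 0;
int destOffset = 0;
while (srcOffset < source.length) {
    destination[destOffset] = source[srcOffset];
    srcOffset += isNull[destOffset] ? 0 : 1;   // the real code uses castToByteNegate to avoid this branch
    destOffset++;
}
// destination == [10, 20, 20]; the duplicate at index 1 is masked out by the null flags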
+ */ +package io.trino.parquet.reader.flat; + +import io.trino.parquet.DictionaryPage; +import io.trino.parquet.reader.SimpleSliceInputStream; +import io.trino.parquet.reader.decoders.RleBitPackingHybridDecoder; +import io.trino.parquet.reader.decoders.ValueDecoder; +import io.trino.spi.block.Block; +import jakarta.annotation.Nullable; + +import static java.util.Objects.requireNonNull; + +public final class DictionaryDecoder + implements ValueDecoder +{ + private final T dictionary; + private final ColumnAdapter columnAdapter; + private final int dictionarySize; + private final boolean isNonNull; + private final boolean vectorizedDecodingEnabled; + private final long retainedSizeInBytes; + + private ValueDecoder dictionaryIdsReader; + @Nullable + private Block dictionaryBlock; + + public DictionaryDecoder(T dictionary, ColumnAdapter columnAdapter, int dictionarySize, boolean isNonNull, boolean vectorizedDecodingEnabled) + { + this.columnAdapter = requireNonNull(columnAdapter, "columnAdapter is null"); + this.dictionary = requireNonNull(dictionary, "dictionary is null"); + this.dictionarySize = dictionarySize; + this.isNonNull = isNonNull; + this.vectorizedDecodingEnabled = vectorizedDecodingEnabled; + this.retainedSizeInBytes = columnAdapter.getSizeInBytes(dictionary); + } + + @Override + public void init(SimpleSliceInputStream input) + { + int bitWidth = input.readByte(); + dictionaryIdsReader = new RleBitPackingHybridDecoder(bitWidth, vectorizedDecodingEnabled); + dictionaryIdsReader.init(input); + } + + @Override + public void read(T values, int offset, int length) + { + int[] ids = new int[length]; + dictionaryIdsReader.read(ids, 0, length); + columnAdapter.decodeDictionaryIds(values, offset, length, ids, dictionary); + } + + @Override + public void skip(int n) + { + dictionaryIdsReader.skip(n); + } + + public long getRetainedSizeInBytes() + { + return retainedSizeInBytes; + } + + public void readDictionaryIds(int[] ids, int offset, int length) + { + dictionaryIdsReader.read(ids, offset, length); + } + + public Block getDictionaryBlock() + { + if (dictionaryBlock == null) { + if (isNonNull) { + dictionaryBlock = columnAdapter.createNonNullBlock(dictionary); + } + else { + dictionaryBlock = columnAdapter.createNullableDictionaryBlock(dictionary, dictionarySize); + } + } + // Avoid creation of new Block objects for dictionary, since the engine currently + // uses identity equality to test if dictionaries are the same + return dictionaryBlock; + } + + public int getDictionarySize() + { + return dictionarySize; + } + + public interface DictionaryDecoderProvider + { + DictionaryDecoder create(DictionaryPage dictionaryPage, boolean isNonNull); + } + + public static DictionaryDecoder getDictionaryDecoder( + DictionaryPage dictionaryPage, + ColumnAdapter columnAdapter, + ValueDecoder plainValuesDecoder, + boolean isNonNull, + boolean vectorizedDecodingEnabled) + { + int size = dictionaryPage.getDictionarySize(); + // Extra value is added to the end of the dictionary for nullable columns because + // parquet dictionary page does not include null but Trino DictionaryBlock's dictionary does + BufferType dictionary = columnAdapter.createBuffer(size + (isNonNull ? 
0 : 1)); + plainValuesDecoder.init(new SimpleSliceInputStream(dictionaryPage.getSlice())); + plainValuesDecoder.read(dictionary, 0, size); + return new DictionaryDecoder<>(dictionary, columnAdapter, size, isNonNull, vectorizedDecodingEnabled); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/FilteredRowRangesIterator.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/FilteredRowRangesIterator.java new file mode 100644 index 000000000000..1051f13c8ba5 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/FilteredRowRangesIterator.java @@ -0,0 +1,224 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.reader.flat; + +import io.trino.parquet.reader.FilteredRowRanges; + +import java.util.Iterator; +import java.util.OptionalLong; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkState; +import static com.google.common.base.Verify.verify; +import static io.trino.parquet.reader.FilteredRowRanges.RowRange; +import static java.lang.Math.min; +import static java.lang.Math.toIntExact; +import static java.util.Objects.requireNonNull; + +/** + * When filtering using column indexes we might skip reading some pages for different columns. Because the rows are + * not aligned between the pages of the different columns it might be required to skip some values. The values (and the + * related rl and dl) are skipped based on the iterator of the required row indexes and the first row index of each + * page. + * For example: + * + *

+ * rows   col1   col2   col3
+ *      ┌──────┬──────┬──────┐
+ *   0  │  p0  │      │      │
+ *      ╞══════╡  p0  │  p0  │
+ *  20  │ p1(X)│------│------│
+ *      ╞══════╪══════╡      │
+ *  40  │ p2(X)│      │------│
+ *      ╞══════╡ p1(X)╞══════╡
+ *  60  │ p3(X)│      │------│
+ *      ╞══════╪══════╡      │
+ *  80  │  p4  │      │  p1  │
+ *      ╞══════╡  p2  │      │
+ * 100  │  p5  │      │      │
+ *      └──────┴──────┴──────┘
+ * 
+ * + * The pages 1, 2, 3 in col1 are skipped, so we have to skip the rows [20, 79]. Because page 1 in col2 contains values + * only for the rows [40, 79] we skip this entire page as well. To synchronize the row reading we have to skip the + * values (and the related rl and dl) for the rows [20, 39] in the end of the page 0 for col2. Similarly, we have to + * skip values while reading page0 and page1 for col3. + */ +public class FilteredRowRangesIterator + implements RowRangesIterator +{ + private final Iterator rowRangeIterator; + + // The current row range + private RowRange currentRange; + + private long pageFirstRowIndex = -1; + private int pageValuesConsumed; + + public FilteredRowRangesIterator(FilteredRowRanges rowRanges) + { + requireNonNull(rowRanges, "rowRanges is null"); + this.rowRangeIterator = rowRanges.getRowRanges().iterator(); + // We don't handle the empty rowRanges case because that should result in all pages getting eliminated + // and nothing should be read from file for that particular row group + checkArgument(this.rowRangeIterator.hasNext(), "rowRanges is empty"); + nextRange(); + } + + /** + * @return Size of the next read within current range, bounded by chunkSize. + */ + @Override + public int getRowsLeftInCurrentRange() + { + return toIntExact(currentRange.end() - pageFirstRowIndex) - pageValuesConsumed + 1; + } + + /** + * @return Size of the next read within current range, bounded by chunkSize. + * When all the rows of the current range have been read, advance to the next range. + */ + @Override + public int advanceRange(int chunkSize) + { + checkState(pageFirstRowIndex >= 0, "pageFirstRowIndex %s cannot be negative", pageFirstRowIndex); + long rangeEnd = currentRange.end(); + int rowsLeftInRange = toIntExact(rangeEnd - pageFirstRowIndex) - pageValuesConsumed + 1; + if (rowsLeftInRange > chunkSize) { + pageValuesConsumed += chunkSize; + return chunkSize; + } + pageValuesConsumed += rowsLeftInRange; + if (rowRangeIterator.hasNext()) { + nextRange(); + } + else { + checkState( + rowsLeftInRange == chunkSize, + "Reached end of filtered rowRanges with chunkSize %s, rowsLeftInRange %s, pageFirstRowIndex %s, pageValuesConsumed %s", + chunkSize, + rowsLeftInRange, + pageFirstRowIndex, + pageValuesConsumed); + } + return rowsLeftInRange; + } + + /** + * Seek forward in the page by chunkSize. + * Advance rowRanges if we seek beyond currentRange. 
+ * + * @return number of values skipped within rowRanges + */ + @Override + public int seekForward(int chunkSize) + { + checkState(pageFirstRowIndex >= 0, "pageFirstRowIndex %s cannot be negative", pageFirstRowIndex); + long currentIndex = pageFirstRowIndex + pageValuesConsumed; + int skippedInRange = 0; + while (chunkSize > 0) { + // Before currentRange + if (currentIndex < currentRange.start()) { + int stepSize = min(chunkSize, toIntExact(currentRange.start() - currentIndex)); + currentIndex += stepSize; + pageValuesConsumed += stepSize; + chunkSize -= stepSize; + } + // Within currentRange + else if (currentIndex <= currentRange.end()) { + int stepSize = min(chunkSize, toIntExact(currentRange.end() - currentIndex) + 1); + currentIndex += stepSize; + skippedInRange += stepSize; + pageValuesConsumed += stepSize; + chunkSize -= stepSize; + } + // After currentRange + else { + // chunkSize can never go beyond rowRanges end + checkState( + rowRangeIterator.hasNext(), + "Reached end of filtered rowRanges with chunkSize %s, currentIndex %s, pageFirstRowIndex %s, pageValuesConsumed %s", + chunkSize, + currentIndex, + pageFirstRowIndex, + pageValuesConsumed); + nextRange(); + } + } + return skippedInRange; + } + + /** + * @return Count of values to be skipped when current range start + * is after current position in the page + */ + @Override + public long skipToRangeStart() + { + checkState(pageFirstRowIndex >= 0, "pageFirstRowIndex %s cannot be negative", pageFirstRowIndex); + long rangeStart = currentRange.start(); + long currentIndex = pageFirstRowIndex + pageValuesConsumed; + if (rangeStart <= currentIndex) { + return 0; + } + int skipCount = toIntExact(rangeStart - currentIndex); + pageValuesConsumed += skipCount; + return skipCount; + } + + /** + * Must be called at the beginning of reading a new page. + * Advances rowRanges if current range has no overlap with the new page. + */ + @Override + public void resetForNewPage(OptionalLong firstRowIndex) + { + checkArgument(firstRowIndex.isPresent(), "Missing firstRowIndex for selecting rowRanges"); + checkArgument(firstRowIndex.getAsLong() >= 0, "firstRowIndex %s cannot be negative", firstRowIndex.getAsLong()); + checkArgument( + firstRowIndex.getAsLong() >= pageFirstRowIndex, + "firstRowIndex %s cannot be less than current pageFirstRowIndex %s", + firstRowIndex.getAsLong(), + pageFirstRowIndex); + + pageFirstRowIndex = firstRowIndex.getAsLong(); + pageValuesConsumed = 0; + long rangeEnd = currentRange.end(); + if (pageFirstRowIndex > rangeEnd) { + nextRange(); + rangeEnd = currentRange.end(); + } + // We only read pages which contain some rows matched by rowRanges. + // At the end of reading previous page, one of 2 cases can happen: + // 1. Current range was not fully read, so firstRowIndex must be <= rangeEnd. + // 2. Current range was fully read, the next range must have some overlap with new page. + verify(pageFirstRowIndex <= rangeEnd); + } + + /** + * Returns whether the current page with the provided value count + * is fully contained within the current row range. 
+ */ + @Override + public boolean isPageFullyConsumed(int pageValueCount) + { + return pageFirstRowIndex >= currentRange.start() + && (pageFirstRowIndex + pageValueCount) <= currentRange.end() + 1; + } + + private void nextRange() + { + currentRange = rowRangeIterator.next(); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/Fixed12ColumnAdapter.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/Fixed12ColumnAdapter.java new file mode 100644 index 000000000000..abf25c79d8d3 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/Fixed12ColumnAdapter.java @@ -0,0 +1,78 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.reader.flat; + +import io.trino.spi.block.Block; +import io.trino.spi.block.Fixed12Block; + +import java.util.List; +import java.util.Optional; + +import static io.airlift.slice.SizeOf.sizeOf; + +public class Fixed12ColumnAdapter + implements ColumnAdapter +{ + public static final Fixed12ColumnAdapter FIXED12_ADAPTER = new Fixed12ColumnAdapter(); + + @Override + public int[] createBuffer(int size) + { + return new int[size * 3]; + } + + @Override + public Block createNonNullBlock(int[] values) + { + return new Fixed12Block(values.length / 3, Optional.empty(), values); + } + + @Override + public Block createNullableBlock(boolean[] nulls, int[] values) + { + return new Fixed12Block(values.length / 3, Optional.of(nulls), values); + } + + @Override + public void copyValue(int[] source, int sourceIndex, int[] destination, int destinationIndex) + { + destination[destinationIndex * 3] = source[sourceIndex * 3]; + destination[(destinationIndex * 3) + 1] = source[(sourceIndex * 3) + 1]; + destination[(destinationIndex * 3) + 2] = source[(sourceIndex * 3) + 2]; + } + + @Override + public void decodeDictionaryIds(int[] values, int offset, int length, int[] ids, int[] dictionary) + { + for (int i = 0; i < length; i++) { + int id = 3 * ids[i]; + int destinationIndex = 3 * (offset + i); + values[destinationIndex] = dictionary[id]; + values[destinationIndex + 1] = dictionary[id + 1]; + values[destinationIndex + 2] = dictionary[id + 2]; + } + } + + @Override + public long getSizeInBytes(int[] values) + { + return sizeOf(values); + } + + @Override + public int[] merge(List buffers) + { + return IntColumnAdapter.concatIntArrays(buffers); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/FlatColumnReader.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/FlatColumnReader.java new file mode 100644 index 000000000000..0556c225dd0a --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/FlatColumnReader.java @@ -0,0 +1,478 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
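Fixed12ColumnAdapter above lays out each position as three consecutive ints (12 bytes), which is why buffer sizes and indexes are multiplied by 3. For example (illustrative values only):

int[] dictionary = {1, 2, 3, 4, 5, 6};   // two dictionary entries of 12 bytes each
int[] ids = {1, 0};
int[] values = Fixed12ColumnAdapter.FIXED12_ADAPTER.createBuffer(2);   // int[6]
Fixed12ColumnAdapter.FIXED12_ADAPTER.decodeDictionaryIds(values, 0, 2, ids, dictionary);
// values == {4, 5, 6, 1, 2, 3}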
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.reader.flat; + +import com.google.common.annotations.VisibleForTesting; +import io.airlift.log.Logger; +import io.airlift.slice.Slice; +import io.trino.memory.context.LocalMemoryContext; +import io.trino.parquet.DataPage; +import io.trino.parquet.DataPageV1; +import io.trino.parquet.DataPageV2; +import io.trino.parquet.ParquetEncoding; +import io.trino.parquet.PrimitiveField; +import io.trino.parquet.reader.AbstractColumnReader; +import io.trino.parquet.reader.ColumnChunk; +import io.trino.parquet.reader.decoders.ValueDecoder; +import io.trino.spi.block.RunLengthEncodedBlock; +import io.trino.spi.type.Type; + +import java.util.Arrays; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkState; +import static io.trino.parquet.ParquetEncoding.RLE; +import static io.trino.parquet.reader.decoders.ValueDecoder.ValueDecodersProvider; +import static io.trino.parquet.reader.flat.DictionaryDecoder.DictionaryDecoderProvider; +import static io.trino.parquet.reader.flat.FlatDefinitionLevelDecoder.DefinitionLevelDecoderProvider; +import static java.lang.Math.toIntExact; +import static java.util.Objects.requireNonNull; + +public class FlatColumnReader + extends AbstractColumnReader +{ + private static final Logger log = Logger.get(FlatColumnReader.class); + + private static final int[] EMPTY_DEFINITION_LEVELS = new int[0]; + private static final int[] EMPTY_REPETITION_LEVELS = new int[0]; + + private final DefinitionLevelDecoderProvider definitionLevelDecoderProvider; + private final LocalMemoryContext memoryContext; + + private int remainingPageValueCount; + private FlatDefinitionLevelDecoder definitionLevelDecoder; + private ValueDecoder valueDecoder; + private int readOffset; + private int nextBatchSize; + + public FlatColumnReader( + PrimitiveField field, + ValueDecodersProvider decodersProvider, + DefinitionLevelDecoderProvider definitionLevelDecoderProvider, + DictionaryDecoderProvider dictionaryDecoderProvider, + ColumnAdapter columnAdapter, + LocalMemoryContext memoryContext) + { + super(field, decodersProvider, dictionaryDecoderProvider, columnAdapter); + this.definitionLevelDecoderProvider = requireNonNull(definitionLevelDecoderProvider, "definitionLevelDecoderProvider is null"); + this.memoryContext = requireNonNull(memoryContext, "memoryContext is null"); + } + + @Override + public boolean hasPageReader() + { + return pageReader != null; + } + + @Override + protected boolean isNonNull() + { + return field.isRequired() || pageReader.hasNoNulls(); + } + + @Override + public ColumnChunk readPrimitive() + { + seek(); + ColumnChunk columnChunk; + if (isNonNull()) { + columnChunk = readNonNull(); + } + else { + columnChunk = readNullable(); + } + + readOffset = 0; + nextBatchSize = 0; + return columnChunk; + } + + @Override + public void prepareNextRead(int batchSize) + { + readOffset += nextBatchSize; + nextBatchSize = batchSize; + } + + private void seek() + { + if (readOffset > 0) { + log.debug("seek field %s, readOffset %d, remainingPageValueCount %d", field, readOffset, 
remainingPageValueCount); + } + int remainingInBatch = readOffset; + while (remainingInBatch > 0) { + if (remainingPageValueCount == 0) { + remainingInBatch = seekToNextPage(remainingInBatch); + if (remainingInBatch == 0) { + break; + } + if (remainingPageValueCount == 0) { + throwEndOfBatchException(remainingInBatch); + } + } + + int chunkSize = Math.min(remainingPageValueCount, remainingInBatch); + int nonNullCount; + if (isNonNull()) { + nonNullCount = chunkSize; + } + else { + nonNullCount = definitionLevelDecoder.skip(chunkSize); + } + valueDecoder.skip(nonNullCount); + remainingInBatch -= rowRanges.seekForward(chunkSize); + remainingPageValueCount -= chunkSize; + } + } + + @VisibleForTesting + ColumnChunk readNullable() + { + log.debug("readNullable field %s, nextBatchSize %d, remainingPageValueCount %d", field, nextBatchSize, remainingPageValueCount); + NullableValuesBuffer valuesBuffer = createNullableValuesBuffer(nextBatchSize); + boolean[] isNull = new boolean[nextBatchSize]; + int remainingInBatch = nextBatchSize; + int offset = 0; + while (remainingInBatch > 0) { + if (remainingPageValueCount == 0) { + if (!readNextPage()) { + throwEndOfBatchException(remainingInBatch); + } + } + + if (skipToRowRangesStart()) { + continue; + } + int chunkSize = rowRanges.advanceRange(Math.min(remainingPageValueCount, remainingInBatch)); + int nonNullCount = definitionLevelDecoder.readNext(isNull, offset, chunkSize); + + valuesBuffer.readNullableValues(valueDecoder, isNull, offset, nonNullCount, chunkSize); + + offset += chunkSize; + remainingInBatch -= chunkSize; + remainingPageValueCount -= chunkSize; + } + return valuesBuffer.createNullableBlock(isNull, field.getType()); + } + + @VisibleForTesting + ColumnChunk readNonNull() + { + log.debug("readNonNull field %s, nextBatchSize %d, remainingPageValueCount %d", field, nextBatchSize, remainingPageValueCount); + NonNullValuesBuffer valuesBuffer = createNonNullValuesBuffer(nextBatchSize); + int remainingInBatch = nextBatchSize; + int offset = 0; + while (remainingInBatch > 0) { + if (remainingPageValueCount == 0) { + if (!readNextPage()) { + throwEndOfBatchException(remainingInBatch); + } + } + + if (skipToRowRangesStart()) { + continue; + } + int chunkSize = rowRanges.advanceRange(Math.min(remainingPageValueCount, remainingInBatch)); + + valuesBuffer.readNonNullValues(valueDecoder, offset, chunkSize); + offset += chunkSize; + remainingInBatch -= chunkSize; + remainingPageValueCount -= chunkSize; + } + return valuesBuffer.createNonNullBlock(field.getType()); + } + + /** + * Finds the number of values to be skipped in the current page to reach + * the start of the current row range and uses that to skip ValueDecoder + * and DefinitionLevelDecoder to the appropriate position. 
+ * + * @return Whether to skip the entire remaining page + */ + private boolean skipToRowRangesStart() + { + int skipCount = toIntExact(rowRanges.skipToRangeStart()); + if (skipCount > 0) { + log.debug("skipCount %d, remainingPageValueCount %d", skipCount, remainingPageValueCount); + } + if (skipCount >= remainingPageValueCount) { + remainingPageValueCount = 0; + return true; + } + if (skipCount > 0) { + int nonNullsCount; + if (isNonNull()) { + nonNullsCount = skipCount; + } + else { + nonNullsCount = definitionLevelDecoder.skip(skipCount); + } + valueDecoder.skip(nonNullsCount); + remainingPageValueCount -= skipCount; + } + return false; + } + + private boolean readNextPage() + { + if (!pageReader.hasNext()) { + return false; + } + DataPage page = readPage(); + rowRanges.resetForNewPage(page.getFirstRowIndex()); + return true; + } + + // When a large enough number of rows are skipped due to `seek` operation, + // it is possible to skip decompressing and decoding parquet pages entirely. + private int seekToNextPage(int remainingInBatch) + { + while (remainingInBatch > 0 && pageReader.hasNext()) { + DataPage page = pageReader.getNextPage(); + rowRanges.resetForNewPage(page.getFirstRowIndex()); + if (remainingInBatch < page.getValueCount() || !rowRanges.isPageFullyConsumed(page.getValueCount())) { + readPage(); + return remainingInBatch; + } + remainingInBatch -= page.getValueCount(); + remainingPageValueCount = 0; + pageReader.skipNextPage(); + } + return remainingInBatch; + } + + private DataPage readPage() + { + DataPage page = pageReader.readPage(); + requireNonNull(page, "page is null"); + log.debug("readNextPage field %s, page %s", field, page); + if (page instanceof DataPageV1 dataPageV1) { + readFlatPageV1(dataPageV1); + } + else if (page instanceof DataPageV2 dataPageV2) { + readFlatPageV2(dataPageV2); + } + // For a compressed data page, the memory used by the decompressed values data needs to be accounted + // for separately as ParquetCompressionUtils#decompress allocates a new byte array for the decompressed result. + // For an uncompressed data page, we read directly from input Slices whose memory usage is already accounted + // for in AbstractParquetDataSource#ReferenceCountedReader. + int dataPageSizeInBytes = pageReader.arePagesCompressed() ? page.getUncompressedSize() : 0; + long dictionarySizeInBytes = dictionaryDecoder == null ? 0 : dictionaryDecoder.getRetainedSizeInBytes(); + memoryContext.setBytes(dataPageSizeInBytes + dictionarySizeInBytes); + + remainingPageValueCount = page.getValueCount(); + return page; + } + + private void readFlatPageV1(DataPageV1 page) + { + Slice buffer = page.getSlice(); + ParquetEncoding definitionEncoding = page.getDefinitionLevelEncoding(); + + checkArgument(isNonNull() || definitionEncoding == RLE, "Invalid definition level encoding: %s", definitionEncoding); + int alreadyRead = 0; + if (definitionEncoding == RLE) { + // Definition levels are skipped from file when the max definition level is 0 as the bit-width required to store them is 0. + // This can happen for non-null (required) fields or nullable fields where all values are null. + // See org.apache.parquet.column.Encoding.RLE.getValuesReader for reference. 
+ int maxDefinitionLevel = field.getDescriptor().getMaxDefinitionLevel(); + definitionLevelDecoder = definitionLevelDecoderProvider.create(maxDefinitionLevel); + if (maxDefinitionLevel > 0) { + int bufferSize = buffer.getInt(0); // We need to read the size even if nulls are absent + definitionLevelDecoder.init(buffer.slice(Integer.BYTES, bufferSize)); + alreadyRead = bufferSize + Integer.BYTES; + } + } + + valueDecoder = createValueDecoder(decodersProvider, page.getValueEncoding(), buffer.slice(alreadyRead, buffer.length() - alreadyRead)); + } + + private void readFlatPageV2(DataPageV2 page) + { + definitionLevelDecoder = definitionLevelDecoderProvider.create(field.getDescriptor().getMaxDefinitionLevel()); + definitionLevelDecoder.init(page.getDefinitionLevels()); + valueDecoder = createValueDecoder(decodersProvider, page.getDataEncoding(), page.getSlice()); + } + + private NonNullValuesBuffer createNonNullValuesBuffer(int batchSize) + { + if (produceDictionaryBlock()) { + return new DictionaryValuesBuffer<>(field, dictionaryDecoder, batchSize); + } + return new DataValuesBuffer<>(field, columnAdapter, batchSize); + } + + private NullableValuesBuffer createNullableValuesBuffer(int batchSize) + { + if (produceDictionaryBlock()) { + return new DictionaryValuesBuffer<>(field, dictionaryDecoder, batchSize); + } + return new DataValuesBuffer<>(field, columnAdapter, batchSize); + } + + private interface NonNullValuesBuffer + { + void readNonNullValues(ValueDecoder valueDecoder, int offset, int valuesCount); + + ColumnChunk createNonNullBlock(Type type); + } + + private interface NullableValuesBuffer + { + void readNullableValues(ValueDecoder valueDecoder, boolean[] isNull, int offset, int nonNullCount, int valuesCount); + + ColumnChunk createNullableBlock(boolean[] isNull, Type type); + } + + private static final class DataValuesBuffer + implements NonNullValuesBuffer, NullableValuesBuffer + { + private final PrimitiveField field; + private final ColumnAdapter columnAdapter; + private final T values; + private final int batchSize; + private int totalNullsCount; + + private DataValuesBuffer(PrimitiveField field, ColumnAdapter columnAdapter, int batchSize) + { + this.field = field; + this.values = columnAdapter.createBuffer(batchSize); + this.columnAdapter = columnAdapter; + this.batchSize = batchSize; + } + + @Override + public void readNonNullValues(ValueDecoder valueDecoder, int offset, int valuesCount) + { + valueDecoder.read(values, offset, valuesCount); + } + + @Override + public void readNullableValues(ValueDecoder valueDecoder, boolean[] isNull, int offset, int nonNullCount, int valuesCount) + { + // Only nulls + if (nonNullCount == 0) { + // Unpack empty null table. This is almost always a no-op. 
However, in binary type + // the last position offset needs to be propagated + T tmpBuffer = columnAdapter.createTemporaryBuffer(offset, 0, values); + columnAdapter.unpackNullValues(tmpBuffer, values, isNull, offset, 0, valuesCount); + } + // No nulls + else if (nonNullCount == valuesCount) { + valueDecoder.read(values, offset, nonNullCount); + } + else { + // Read data values to a temporary array and unpack the nulls to the actual destination + T tmpBuffer = columnAdapter.createTemporaryBuffer(offset, nonNullCount, values); + valueDecoder.read(tmpBuffer, 0, nonNullCount); + columnAdapter.unpackNullValues(tmpBuffer, values, isNull, offset, nonNullCount, valuesCount); + } + totalNullsCount += valuesCount - nonNullCount; + } + + @Override + public ColumnChunk createNonNullBlock(Type type) + { + checkState( + totalNullsCount == 0, + "totalNonNullsCount %s should be equal to 0 when creating non-null block", + totalNullsCount); + log.debug("DataValuesBuffer createNonNullBlock field %s, totalNullsCount %d", field, totalNullsCount); + return new ColumnChunk(columnAdapter.createNonNullBlock(values), EMPTY_DEFINITION_LEVELS, EMPTY_REPETITION_LEVELS); + } + + @Override + public ColumnChunk createNullableBlock(boolean[] isNull, Type type) + { + log.debug("DataValuesBuffer createNullableBlock field %s, totalNullsCount %d, batchSize %d", field, totalNullsCount, batchSize); + if (totalNullsCount == batchSize) { + return new ColumnChunk(RunLengthEncodedBlock.create(type, null, batchSize), EMPTY_DEFINITION_LEVELS, EMPTY_REPETITION_LEVELS); + } + if (totalNullsCount == 0) { + return new ColumnChunk(columnAdapter.createNonNullBlock(values), EMPTY_DEFINITION_LEVELS, EMPTY_REPETITION_LEVELS); + } + return new ColumnChunk(columnAdapter.createNullableBlock(isNull, values), EMPTY_DEFINITION_LEVELS, EMPTY_REPETITION_LEVELS); + } + } + + private static final class DictionaryValuesBuffer + implements NonNullValuesBuffer, NullableValuesBuffer + { + private final PrimitiveField field; + private final DictionaryDecoder decoder; + private final int[] ids; + private final int batchSize; + private int totalNullsCount; + + private DictionaryValuesBuffer(PrimitiveField field, DictionaryDecoder dictionaryDecoder, int batchSize) + { + this.field = field; + this.ids = new int[batchSize]; + this.decoder = dictionaryDecoder; + this.batchSize = batchSize; + } + + @Override + public void readNonNullValues(ValueDecoder valueDecoder, int offset, int chunkSize) + { + decoder.readDictionaryIds(ids, offset, chunkSize); + } + + @Override + public void readNullableValues(ValueDecoder valueDecoder, boolean[] isNull, int offset, int nonNullCount, int valuesCount) + { + // Parquet dictionary encodes only non-null values + // Dictionary size is used as the id to denote nulls for Trino dictionary block + if (nonNullCount == 0) { + // Only nulls were encountered in chunkSize, add empty values for nulls + Arrays.fill(ids, offset, offset + valuesCount, decoder.getDictionarySize()); + } + // No nulls + else if (nonNullCount == valuesCount) { + decoder.readDictionaryIds(ids, offset, valuesCount); + } + else { + // Read data values to a temporary array and unpack the nulls to the actual destination + int[] tmpBuffer = new int[nonNullCount]; + decoder.readDictionaryIds(tmpBuffer, 0, nonNullCount); + unpackDictionaryNullId(tmpBuffer, ids, isNull, offset, valuesCount, decoder.getDictionarySize()); + } + totalNullsCount += valuesCount - nonNullCount; + } + + @Override + public ColumnChunk createNonNullBlock(Type type) + { + // This will return a 
nullable dictionary even if we are returning a batch of non-null values + // for a nullable column. We avoid creating a new non-nullable dictionary to allow the engine + // to optimize for the unchanged dictionary case. + checkState( + totalNullsCount == 0, + "totalNonNullsCount %s should be equal to 0 when creating non-null block", + totalNullsCount); + log.debug("DictionaryValuesBuffer createNonNullBlock field %s, totalNullsCount %d", field, totalNullsCount); + return createDictionaryBlock(ids, decoder.getDictionaryBlock(), EMPTY_DEFINITION_LEVELS, EMPTY_REPETITION_LEVELS); + } + + @Override + public ColumnChunk createNullableBlock(boolean[] isNull, Type type) + { + log.debug("DictionaryValuesBuffer createNullableBlock field %s, totalNullsCount %d, batchSize %d", field, totalNullsCount, batchSize); + if (totalNullsCount == batchSize) { + return new ColumnChunk(RunLengthEncodedBlock.create(type, null, batchSize), EMPTY_DEFINITION_LEVELS, EMPTY_REPETITION_LEVELS); + } + return createDictionaryBlock(ids, decoder.getDictionaryBlock(), EMPTY_DEFINITION_LEVELS, EMPTY_REPETITION_LEVELS); + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/FlatDefinitionLevelDecoder.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/FlatDefinitionLevelDecoder.java new file mode 100644 index 000000000000..820d5332bf09 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/FlatDefinitionLevelDecoder.java @@ -0,0 +1,49 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.reader.flat; + +import io.airlift.slice.Slice; + +import static com.google.common.base.Preconditions.checkArgument; +import static io.trino.parquet.reader.flat.NullsDecoders.createNullsDecoder; + +public interface FlatDefinitionLevelDecoder +{ + void init(Slice input); + + /** + * Populate 'values' with true for nulls and return the number of non-nulls encountered. + * 'values' array is assumed to be empty at the start of reading a batch, i.e. contain only false values. 
+ */ + int readNext(boolean[] values, int offset, int length); + + /** + * Skip 'length' values and return the number of non-nulls encountered + */ + int skip(int length); + + interface DefinitionLevelDecoderProvider + { + FlatDefinitionLevelDecoder create(int maxDefinitionLevel); + } + + static FlatDefinitionLevelDecoder getFlatDefinitionLevelDecoder(int maxDefinitionLevel, boolean vectorizedDecodingEnabled) + { + checkArgument(maxDefinitionLevel >= 0 && maxDefinitionLevel <= 1, "Invalid max definition level: %s", maxDefinitionLevel); + if (maxDefinitionLevel == 0) { + return new ZeroDefinitionLevelDecoder(); + } + return createNullsDecoder(vectorizedDecodingEnabled); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/Int128ColumnAdapter.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/Int128ColumnAdapter.java new file mode 100644 index 000000000000..ee74dc2db5ff --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/Int128ColumnAdapter.java @@ -0,0 +1,76 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.reader.flat; + +import io.trino.spi.block.Block; +import io.trino.spi.block.Int128ArrayBlock; + +import java.util.List; +import java.util.Optional; + +import static io.airlift.slice.SizeOf.sizeOf; + +public class Int128ColumnAdapter + implements ColumnAdapter +{ + public static final Int128ColumnAdapter INT128_ADAPTER = new Int128ColumnAdapter(); + + @Override + public long[] createBuffer(int size) + { + return new long[size * 2]; + } + + @Override + public Block createNonNullBlock(long[] values) + { + return new Int128ArrayBlock(values.length / 2, Optional.empty(), values); + } + + @Override + public Block createNullableBlock(boolean[] nulls, long[] values) + { + return new Int128ArrayBlock(values.length / 2, Optional.of(nulls), values); + } + + @Override + public void copyValue(long[] source, int sourceIndex, long[] destination, int destinationIndex) + { + destination[destinationIndex * 2] = source[sourceIndex * 2]; + destination[(destinationIndex * 2) + 1] = source[(sourceIndex * 2) + 1]; + } + + @Override + public void decodeDictionaryIds(long[] values, int offset, int length, int[] ids, long[] dictionary) + { + for (int i = 0; i < length; i++) { + int id = 2 * ids[i]; + int destinationIndex = 2 * (offset + i); + values[destinationIndex] = dictionary[id]; + values[destinationIndex + 1] = dictionary[id + 1]; + } + } + + @Override + public long getSizeInBytes(long[] values) + { + return sizeOf(values); + } + + @Override + public long[] merge(List buffers) + { + return LongColumnAdapter.concatLongArrays(buffers); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/IntColumnAdapter.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/IntColumnAdapter.java new file mode 100644 index 000000000000..83bfdccc5668 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/IntColumnAdapter.java @@ 
-0,0 +1,88 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.reader.flat; + +import io.trino.spi.block.Block; +import io.trino.spi.block.IntArrayBlock; + +import java.util.List; +import java.util.Optional; + +import static io.airlift.slice.SizeOf.sizeOf; +import static java.lang.Math.toIntExact; + +public class IntColumnAdapter + implements ColumnAdapter +{ + public static final IntColumnAdapter INT_ADAPTER = new IntColumnAdapter(); + + @Override + public int[] createBuffer(int size) + { + return new int[size]; + } + + @Override + public Block createNonNullBlock(int[] values) + { + return new IntArrayBlock(values.length, Optional.empty(), values); + } + + @Override + public Block createNullableBlock(boolean[] nulls, int[] values) + { + return new IntArrayBlock(values.length, Optional.of(nulls), values); + } + + @Override + public void copyValue(int[] source, int sourceIndex, int[] destination, int destinationIndex) + { + destination[destinationIndex] = source[sourceIndex]; + } + + @Override + public void decodeDictionaryIds(int[] values, int offset, int length, int[] ids, int[] dictionary) + { + for (int i = 0; i < length; i++) { + values[offset + i] = dictionary[ids[i]]; + } + } + + @Override + public long getSizeInBytes(int[] values) + { + return sizeOf(values); + } + + @Override + public int[] merge(List buffers) + { + return concatIntArrays(buffers); + } + + static int[] concatIntArrays(List buffers) + { + long resultSize = 0; + for (int[] buffer : buffers) { + resultSize += buffer.length; + } + int[] result = new int[toIntExact(resultSize)]; + int offset = 0; + for (int[] buffer : buffers) { + System.arraycopy(buffer, 0, result, offset, buffer.length); + offset += buffer.length; + } + return result; + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/LongColumnAdapter.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/LongColumnAdapter.java new file mode 100644 index 000000000000..f96d4e1b0f2d --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/LongColumnAdapter.java @@ -0,0 +1,88 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.parquet.reader.flat; + +import io.trino.spi.block.Block; +import io.trino.spi.block.LongArrayBlock; + +import java.util.List; +import java.util.Optional; + +import static io.airlift.slice.SizeOf.sizeOf; +import static java.lang.Math.toIntExact; + +public class LongColumnAdapter + implements ColumnAdapter +{ + public static final LongColumnAdapter LONG_ADAPTER = new LongColumnAdapter(); + + @Override + public long[] createBuffer(int size) + { + return new long[size]; + } + + @Override + public Block createNonNullBlock(long[] values) + { + return new LongArrayBlock(values.length, Optional.empty(), values); + } + + @Override + public Block createNullableBlock(boolean[] nulls, long[] values) + { + return new LongArrayBlock(values.length, Optional.of(nulls), values); + } + + @Override + public void copyValue(long[] source, int sourceIndex, long[] destination, int destinationIndex) + { + destination[destinationIndex] = source[sourceIndex]; + } + + @Override + public void decodeDictionaryIds(long[] values, int offset, int length, int[] ids, long[] dictionary) + { + for (int i = 0; i < length; i++) { + values[offset + i] = dictionary[ids[i]]; + } + } + + @Override + public long getSizeInBytes(long[] values) + { + return sizeOf(values); + } + + @Override + public long[] merge(List buffers) + { + return concatLongArrays(buffers); + } + + static long[] concatLongArrays(List buffers) + { + long resultSize = 0; + for (long[] buffer : buffers) { + resultSize += buffer.length; + } + long[] result = new long[toIntExact(resultSize)]; + int offset = 0; + for (long[] buffer : buffers) { + System.arraycopy(buffer, 0, result, offset, buffer.length); + offset += buffer.length; + } + return result; + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/NullsDecoders.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/NullsDecoders.java new file mode 100644 index 000000000000..f659780753be --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/NullsDecoders.java @@ -0,0 +1,271 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.reader.flat; + +import io.airlift.slice.Slice; +import io.trino.parquet.reader.SimpleSliceInputStream; + +import java.util.Arrays; + +import static io.trino.parquet.ParquetReaderUtils.castToByteNegate; +import static io.trino.parquet.ParquetReaderUtils.readUleb128Int; +import static io.trino.parquet.reader.flat.BitPackingUtils.bitCount; +import static io.trino.parquet.reader.flat.BitPackingUtils.unpack; +import static io.trino.parquet.reader.flat.VectorBitPackingUtils.vectorUnpackAndInvert8; +import static java.lang.Math.min; +import static java.util.Objects.requireNonNull; + +/** + * The hybrid RLE/bit-packing encoding consists of multiple groups. + * Each group is either encoded as RLE or bit-packed + *

+ * For a primitive column, the definition level is always either 0 (null) or 1 (non-null). + * Therefore, every value is decoded from a single bit and stored into a boolean array + * which stores false for non-null and true for null. + */ +public class NullsDecoders +{ + private NullsDecoders() {} + + public static FlatDefinitionLevelDecoder createNullsDecoder(boolean vectorizedDecodingEnabled) + { + return vectorizedDecodingEnabled ? new VectorNullsDecoder() : new NullsDecoder(); + } + + private abstract static class AbstractNullsDecoder + implements FlatDefinitionLevelDecoder + { + protected SimpleSliceInputStream input; + // Encoding type if decoding stopped in the middle of the group + protected boolean isRle; + // Values left to decode in the current group + protected int valuesLeftInGroup; + // With RLE encoding - the current value + protected boolean rleValue; + // With bit-packing - the byte that has been partially read + protected byte bitPackedValue; + // Number of bits already read in the current byte while reading bit-packed values + protected int bitPackedValueOffset; + + @Override + public void init(Slice input) + { + this.input = new SimpleSliceInputStream(requireNonNull(input, "input is null")); + } + + /** + * Skip 'length' values and return the number of non-nulls encountered + */ + @Override + public int skip(int length) + { + int nonNullCount = 0; + while (length > 0) { + if (valuesLeftInGroup == 0) { + readGroupHeader(); + } + + if (isRle) { + int chunkSize = min(length, valuesLeftInGroup); + nonNullCount += castToByteNegate(rleValue) * chunkSize; + valuesLeftInGroup -= chunkSize; + + length -= chunkSize; + } + else if (bitPackedValueOffset != 0) { // bit-packed - read remaining bits of current byte + int remainingBits = Byte.SIZE - bitPackedValueOffset; + int chunkSize = min(remainingBits, length); + int remainingPackedValue = (bitPackedValue & 0xff) >>> bitPackedValueOffset; + // In bitPackedValue 1's are nulls, so the number of non-nulls is + // chunkSize - bitCount(remainingBits up to chunkSize) + nonNullCount += chunkSize - bitCount((byte) (remainingPackedValue & ((1 << chunkSize) - 1))); + valuesLeftInGroup -= chunkSize; + bitPackedValueOffset = (bitPackedValueOffset + chunkSize) % Byte.SIZE; + + length -= chunkSize; + } + else { // bit-packed + // At this point we have only full bytes to read and valuesLeft is a multiplication of 8 + int chunkSize = min(length, valuesLeftInGroup); + int leftToRead = chunkSize; + // Parquet uses 1 for non-null value + while (leftToRead >= Long.SIZE) { + nonNullCount += Long.bitCount(input.readLong()); + leftToRead -= Long.SIZE; + } + while (leftToRead >= Byte.SIZE) { + nonNullCount += bitCount(input.readByte()); + leftToRead -= Byte.SIZE; + } + if (leftToRead > 0) { + byte packedValue = input.readByte(); + nonNullCount += bitCount((byte) (packedValue & ((1 << leftToRead) - 1))); + + // Inverting packedValue as readNext expects 1 for null + bitPackedValue = (byte) ~packedValue; + bitPackedValueOffset += leftToRead; + } + valuesLeftInGroup -= chunkSize; + length -= chunkSize; + } + } + return nonNullCount; + } + + protected void readGroupHeader() + { + int header = readUleb128Int(input); + isRle = (header & 1) == 0; + valuesLeftInGroup = header >>> 1; + if (isRle) { + // We need to negate the value as we convert the "does exist" to "is null", hence "== 0" + rleValue = input.readByte() == 0; + } + else { + // Only full bytes are encoded + valuesLeftInGroup *= Byte.SIZE; + } + } + } + + private static final class NullsDecoder + 
extends AbstractNullsDecoder + { + /** + * 'values' array needs to be empty, i.e. contain only false values. + */ + @Override + public int readNext(boolean[] values, int offset, int length) + { + int nonNullCount = 0; + while (length > 0) { + if (valuesLeftInGroup == 0) { + readGroupHeader(); + } + + if (isRle) { + int chunkSize = min(length, valuesLeftInGroup); + // The contract of the method requires values array to be empty (i.e. filled with false) + // so action is required only if the value is equal to true + if (rleValue) { + Arrays.fill(values, offset, offset + chunkSize, true); + } + nonNullCount += castToByteNegate(rleValue) * chunkSize; + valuesLeftInGroup -= chunkSize; + + length -= chunkSize; + offset += chunkSize; + } + else if (bitPackedValueOffset != 0) { // bit-packed - read remaining bits of current byte + int remainingBits = Byte.SIZE - bitPackedValueOffset; + int chunkSize = min(remainingBits, length); + nonNullCount += unpack(values, offset, bitPackedValue, bitPackedValueOffset, bitPackedValueOffset + chunkSize); + valuesLeftInGroup -= chunkSize; + bitPackedValueOffset = (bitPackedValueOffset + chunkSize) % Byte.SIZE; + + offset += chunkSize; + length -= chunkSize; + } + else { // bit-packed + // At this point we have only full bytes to read and valuesLeft is a multiplication of 8 + int chunkSize = min(length, valuesLeftInGroup); + int leftToRead = chunkSize; + // All values read from input are inverted as Trino uses 1 for null but Parquet uses 1 for non-null value + while (leftToRead >= Byte.SIZE) { + nonNullCount += unpack(values, offset, (byte) ~input.readByte()); + offset += Byte.SIZE; + leftToRead -= Byte.SIZE; + } + if (leftToRead > 0) { + bitPackedValue = (byte) ~input.readByte(); + nonNullCount += unpack(values, offset, bitPackedValue, 0, leftToRead); + bitPackedValueOffset += leftToRead; + offset += leftToRead; + } + valuesLeftInGroup -= chunkSize; + length -= chunkSize; + } + } + return nonNullCount; + } + } + + private static final class VectorNullsDecoder + extends AbstractNullsDecoder + { + /** + * 'values' array needs to be empty, i.e. contain only false values. + */ + @Override + public int readNext(boolean[] values, int offset, int length) + { + int nonNullCount = 0; + while (length > 0) { + if (valuesLeftInGroup == 0) { + readGroupHeader(); + } + + if (isRle) { + int chunkSize = min(length, valuesLeftInGroup); + // The contract of the method requires values array to be empty (i.e. 
filled with false) + // so action is required only if the value is equal to true + if (rleValue) { + Arrays.fill(values, offset, offset + chunkSize, true); + } + nonNullCount += castToByteNegate(rleValue) * chunkSize; + valuesLeftInGroup -= chunkSize; + + length -= chunkSize; + offset += chunkSize; + } + else if (bitPackedValueOffset != 0) { // bit-packed - read remaining bits of current byte + int remainingBits = Byte.SIZE - bitPackedValueOffset; + int chunkSize = min(remainingBits, length); + nonNullCount += unpack(values, offset, bitPackedValue, bitPackedValueOffset, bitPackedValueOffset + chunkSize); + valuesLeftInGroup -= chunkSize; + bitPackedValueOffset = (bitPackedValueOffset + chunkSize) % Byte.SIZE; + + offset += chunkSize; + length -= chunkSize; + } + else { // bit-packed + // At this point we have only full bytes to read and valuesLeft is a multiplication of 8 + int chunkSize = min(length, valuesLeftInGroup); + int leftToRead = chunkSize; + // All values read from input are inverted as Trino uses 1 for null but Parquet uses 1 for non-null value + byte[] inputArray = input.getByteArray(); + int inputOffset = input.getByteArrayOffset(); + int inputBytesRead = 0; + while (leftToRead >= Byte.SIZE) { + nonNullCount += vectorUnpackAndInvert8(values, offset, inputArray[inputOffset + inputBytesRead]); + offset += Byte.SIZE; + leftToRead -= Byte.SIZE; + inputBytesRead++; + } + input.skip(inputBytesRead); + + if (leftToRead > 0) { + bitPackedValue = (byte) ~input.readByte(); + nonNullCount += unpack(values, offset, bitPackedValue, 0, leftToRead); + bitPackedValueOffset += leftToRead; + offset += leftToRead; + } + valuesLeftInGroup -= chunkSize; + length -= chunkSize; + } + } + return nonNullCount; + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/RowRangesIterator.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/RowRangesIterator.java new file mode 100644 index 000000000000..728826541155 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/RowRangesIterator.java @@ -0,0 +1,81 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.parquet.reader.flat; + +import io.trino.parquet.reader.FilteredRowRanges; + +import java.util.Optional; +import java.util.OptionalLong; + +public interface RowRangesIterator +{ + RowRangesIterator ALL_ROW_RANGES_ITERATOR = new AllRowRangesIterator(); + + int getRowsLeftInCurrentRange(); + + int advanceRange(int chunkSize); + + int seekForward(int chunkSize); + + long skipToRangeStart(); + + void resetForNewPage(OptionalLong firstRowIndex); + + boolean isPageFullyConsumed(int pageValueCount); + + class AllRowRangesIterator + implements RowRangesIterator + { + @Override + public int getRowsLeftInCurrentRange() + { + return Integer.MAX_VALUE; + } + + @Override + public int advanceRange(int chunkSize) + { + return chunkSize; + } + + @Override + public int seekForward(int chunkSize) + { + return chunkSize; + } + + @Override + public long skipToRangeStart() + { + return 0; + } + + @Override + public void resetForNewPage(OptionalLong firstRowIndex) {} + + @Override + public boolean isPageFullyConsumed(int pageValueCount) + { + return true; + } + } + + static RowRangesIterator createRowRangesIterator(Optional filteredRowRanges) + { + if (filteredRowRanges.isEmpty()) { + return ALL_ROW_RANGES_ITERATOR; + } + return new FilteredRowRangesIterator(filteredRowRanges.get()); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/ShortColumnAdapter.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/ShortColumnAdapter.java new file mode 100644 index 000000000000..a15897ddfb16 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/ShortColumnAdapter.java @@ -0,0 +1,83 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.parquet.reader.flat; + +import io.trino.spi.block.Block; +import io.trino.spi.block.ShortArrayBlock; + +import java.util.List; +import java.util.Optional; + +import static io.airlift.slice.SizeOf.sizeOf; +import static java.lang.Math.toIntExact; + +public class ShortColumnAdapter + implements ColumnAdapter +{ + public static final ShortColumnAdapter SHORT_ADAPTER = new ShortColumnAdapter(); + + @Override + public short[] createBuffer(int size) + { + return new short[size]; + } + + @Override + public Block createNonNullBlock(short[] values) + { + return new ShortArrayBlock(values.length, Optional.empty(), values); + } + + @Override + public Block createNullableBlock(boolean[] nulls, short[] values) + { + return new ShortArrayBlock(values.length, Optional.of(nulls), values); + } + + @Override + public void copyValue(short[] source, int sourceIndex, short[] destination, int destinationIndex) + { + destination[destinationIndex] = source[sourceIndex]; + } + + @Override + public void decodeDictionaryIds(short[] values, int offset, int length, int[] ids, short[] dictionary) + { + for (int i = 0; i < length; i++) { + values[offset + i] = dictionary[ids[i]]; + } + } + + @Override + public long getSizeInBytes(short[] values) + { + return sizeOf(values); + } + + @Override + public short[] merge(List buffers) + { + long resultSize = 0; + for (short[] buffer : buffers) { + resultSize += buffer.length; + } + short[] result = new short[toIntExact(resultSize)]; + int offset = 0; + for (short[] buffer : buffers) { + System.arraycopy(buffer, 0, result, offset, buffer.length); + offset += buffer.length; + } + return result; + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/VectorBitPackingUtils.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/VectorBitPackingUtils.java new file mode 100644 index 000000000000..1f29e93b1543 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/VectorBitPackingUtils.java @@ -0,0 +1,45 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.parquet.reader.flat; + +import jdk.incubator.vector.ByteVector; +import jdk.incubator.vector.VectorOperators; + +import static io.trino.parquet.reader.flat.BitPackingUtils.bitCount; + +public final class VectorBitPackingUtils +{ + private static final ByteVector MASK_1 = ByteVector.broadcast(ByteVector.SPECIES_64, 1); + private static final ByteVector LSHR_BYTE_VECTOR = ByteVector.fromArray(ByteVector.SPECIES_64, new byte[] {0, 1, 2, 3, 4, 5, 6, 7}, 0); + + private VectorBitPackingUtils() {} + + public static int vectorUnpackAndInvert8(boolean[] values, int offset, byte packedByte) + { + ByteVector.broadcast(ByteVector.SPECIES_64, packedByte) + .lanewise(VectorOperators.LSHR, LSHR_BYTE_VECTOR) + .and(MASK_1) + .lanewise(VectorOperators.NOT) + .intoBooleanArray(values, offset); + return bitCount(packedByte); + } + + public static void vectorUnpack8FromByte(byte[] values, int offset, byte packedByte) + { + ByteVector.broadcast(ByteVector.SPECIES_64, packedByte) + .lanewise(VectorOperators.LSHR, LSHR_BYTE_VECTOR) + .and(MASK_1) + .intoArray(values, offset); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/ZeroDefinitionLevelDecoder.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/ZeroDefinitionLevelDecoder.java new file mode 100644 index 000000000000..79634780a4f5 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/reader/flat/ZeroDefinitionLevelDecoder.java @@ -0,0 +1,35 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.reader.flat; + +import io.airlift.slice.Slice; + +public class ZeroDefinitionLevelDecoder + implements FlatDefinitionLevelDecoder +{ + @Override + public void init(Slice input) {} + + @Override + public int readNext(boolean[] values, int offset, int length) + { + return 0; + } + + @Override + public int skip(int length) + { + return 0; + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/spark/Variant.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/spark/Variant.java new file mode 100644 index 000000000000..12b7d6a69817 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/spark/Variant.java @@ -0,0 +1,172 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.parquet.spark; + +import com.fasterxml.jackson.core.JsonGenerator; + +import java.io.CharArrayWriter; +import java.io.IOException; +import java.time.Instant; +import java.time.LocalDate; +import java.time.ZoneId; +import java.time.ZoneOffset; +import java.time.format.DateTimeFormatter; +import java.time.format.DateTimeFormatterBuilder; +import java.util.Base64; +import java.util.Locale; + +import static com.google.common.base.Preconditions.checkArgument; +import static io.trino.parquet.spark.VariantUtil.SIZE_LIMIT; +import static io.trino.parquet.spark.VariantUtil.VERSION; +import static io.trino.parquet.spark.VariantUtil.VERSION_MASK; +import static io.trino.parquet.spark.VariantUtil.getBinary; +import static io.trino.parquet.spark.VariantUtil.getBoolean; +import static io.trino.parquet.spark.VariantUtil.getDecimal; +import static io.trino.parquet.spark.VariantUtil.getDouble; +import static io.trino.parquet.spark.VariantUtil.getFloat; +import static io.trino.parquet.spark.VariantUtil.getLong; +import static io.trino.parquet.spark.VariantUtil.getMetadataKey; +import static io.trino.parquet.spark.VariantUtil.getString; +import static io.trino.parquet.spark.VariantUtil.getType; +import static io.trino.parquet.spark.VariantUtil.handleArray; +import static io.trino.parquet.spark.VariantUtil.handleObject; +import static io.trino.parquet.spark.VariantUtil.readUnsigned; +import static io.trino.plugin.base.util.JsonUtils.jsonFactory; +import static java.time.format.DateTimeFormatter.ISO_LOCAL_DATE; +import static java.time.format.DateTimeFormatter.ISO_LOCAL_TIME; +import static java.time.temporal.ChronoUnit.MICROS; + +/** + * Copied from https://github.com/apache/spark/blob/53d65fd12dd9231139188227ef9040d40d759021/common/variant/src/main/java/org/apache/spark/types/variant/Variant.java + * and adjusted the code style. + */ +public final class Variant +{ + private static final DateTimeFormatter TIMESTAMP_NTZ_FORMATTER = new DateTimeFormatterBuilder() + .append(ISO_LOCAL_DATE) + .appendLiteral(' ') + .append(ISO_LOCAL_TIME) + .toFormatter(Locale.US); + + private static final DateTimeFormatter TIMESTAMP_FORMATTER = new DateTimeFormatterBuilder() + .append(TIMESTAMP_NTZ_FORMATTER) + .appendOffset("+HH:MM", "+00:00") + .toFormatter(Locale.US); + + private final byte[] value; + private final byte[] metadata; + // The variant value doesn't use the whole `value` binary, but starts from its `pos` index and + // spans a size of `valueSize(value, pos)`. This design avoids frequent copies of the value binary + // when reading a sub-variant in the array/object element. + private final int position; + + public Variant(byte[] value, byte[] metadata) + { + this(value, metadata, 0); + } + + private Variant(byte[] value, byte[] metadata, int position) + { + this.value = value; + this.metadata = metadata; + this.position = position; + checkArgument(metadata.length >= 1, "metadata must be present"); + checkArgument((metadata[0] & VERSION_MASK) == VERSION, "metadata version must be %s", VERSION); + // Don't attempt to use a Variant larger than 16 MiB. We'll never produce one, and it risks memory instability. + checkArgument(metadata.length <= SIZE_LIMIT, "max metadata size is %s: %s", SIZE_LIMIT, metadata.length); + checkArgument(value.length <= SIZE_LIMIT, "max value size is %s: %s", SIZE_LIMIT, value.length); + } + + // Stringify the variant in JSON format. 
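// Illustrative usage sketch (hypothetical byte values, not part of the patch): a primitive
// variant value is a single header byte ((typeInfo << 2) | basicType) followed by its content
// bytes, and the minimal metadata is the version byte plus an empty dictionary.
byte[] metadata = {0x01, 0x00, 0x00};   // version 1, dictionary size 0, single offset 0
byte[] longValue = {0x0C, 0x2A};        // header: INT1 primitive; content: the byte 42
String json = new Variant(longValue, metadata).toJson(java.time.ZoneOffset.UTC); // "42"
// A bare {0x00} value (the NULL primitive) with the same metadata would render as "null".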
+ public String toJson(ZoneId zoneId) + { + StringBuilder json = new StringBuilder(); + toJsonImpl(value, metadata, position, json, zoneId); + return json.toString(); + } + + private static void toJsonImpl(byte[] value, byte[] metadata, int position, StringBuilder json, ZoneId zoneId) + { + switch (getType(value, position)) { + case NULL -> json.append("null"); + case BOOLEAN -> json.append(getBoolean(value, position)); + case LONG -> json.append(getLong(value, position)); + case FLOAT -> json.append(getFloat(value, position)); + case DOUBLE -> json.append(getDouble(value, position)); + case DECIMAL -> json.append(getDecimal(value, position).toPlainString()); + case STRING -> json.append(escapeJson(getString(value, position))); + case BINARY -> appendQuoted(json, Base64.getEncoder().encodeToString(getBinary(value, position))); + case DATE -> appendQuoted(json, LocalDate.ofEpochDay(getLong(value, position)).toString()); + case TIMESTAMP -> appendQuoted(json, TIMESTAMP_FORMATTER.format(microsToInstant(getLong(value, position)).atZone(zoneId))); + case TIMESTAMP_NTZ -> appendQuoted(json, TIMESTAMP_NTZ_FORMATTER.format(microsToInstant(getLong(value, position)).atZone(ZoneOffset.UTC))); + case ARRAY -> handleArray(value, position, (size, offsetSize, offsetStart, dataStart) -> { + json.append('['); + for (int i = 0; i < size; ++i) { + int offset = readUnsigned(value, offsetStart + offsetSize * i, offsetSize); + int elementPos = dataStart + offset; + if (i != 0) { + json.append(','); + } + toJsonImpl(value, metadata, elementPos, json, zoneId); + } + json.append(']'); + return null; + }); + case OBJECT -> handleObject(value, position, (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> { + json.append('{'); + for (int i = 0; i < size; ++i) { + int id = readUnsigned(value, idStart + idSize * i, idSize); + int offset = readUnsigned(value, offsetStart + offsetSize * i, offsetSize); + int elementPosition = dataStart + offset; + if (i != 0) { + json.append(','); + } + json.append(escapeJson(getMetadataKey(metadata, id))); + json.append(':'); + toJsonImpl(value, metadata, elementPosition, json, zoneId); + } + json.append('}'); + return null; + }); + } + } + + private static Instant microsToInstant(long timestamp) + { + return Instant.EPOCH.plus(timestamp, MICROS); + } + + // A simplified and more performant version of `sb.append(escapeJson(value))`. It is used when we + // know `value` doesn't contain any special character that needs escaping. + private static void appendQuoted(StringBuilder json, String value) + { + json.append('"').append(value).append('"'); + } + + // Escape a string so that it can be pasted into JSON structure. + // For example, if `str` only contains a new-line character, then the result content is "\n" + // (4 characters). 
+ private static String escapeJson(String value) + { + try (CharArrayWriter writer = new CharArrayWriter(); + JsonGenerator generator = jsonFactory().createGenerator(writer)) { + generator.writeString(value); + generator.flush(); + return writer.toString(); + } + catch (IOException e) { + throw new RuntimeException(e); + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/spark/VariantUtil.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/spark/VariantUtil.java new file mode 100644 index 000000000000..c17c33f647f0 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/spark/VariantUtil.java @@ -0,0 +1,480 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.spark; + +import java.math.BigDecimal; +import java.math.BigInteger; +import java.util.Arrays; + +import static java.nio.charset.StandardCharsets.UTF_8; + +/** + * Copied from https://github.com/apache/spark/blob/53d65fd12dd9231139188227ef9040d40d759021/common/variant/src/main/java/org/apache/spark/types/variant/VariantUtil.java + * + * This class defines constants related to the variant format and provides functions for + * manipulating variant binaries. + + * A variant is made up of 2 binaries: value and metadata. A variant value consists of a one-byte + * header and a number of content bytes (can be zero). The header byte is divided into upper 6 bits + * (called "type info") and lower 2 bits (called "basic type"). The content format is explained in + * the below constants for all possible basic type and type info values. + + * The variant metadata includes a version id and a dictionary of distinct strings (case-sensitive). + * Its binary format is: + * - Version: 1-byte unsigned integer. The only acceptable value is 1 currently. + * - Dictionary size: 4-byte little-endian unsigned integer. The number of keys in the + * dictionary. + * - Offsets: (size + 1) * 4-byte little-endian unsigned integers. `offsets[i]` represents the + * starting position of string i, counting starting from the address of `offsets[0]`. Strings + * must be stored contiguously, so we don’t need to store the string size, instead, we compute it + * with `offset[i + 1] - offset[i]`. + * - UTF-8 string data. + */ +public final class VariantUtil +{ + public static final int BASIC_TYPE_BITS = 2; + public static final int BASIC_TYPE_MASK = 0x3; + public static final int TYPE_INFO_MASK = 0x3F; + + // Below is all possible basic type values. + // Primitive value. The type info value must be one of the values in the below section. + public static final int PRIMITIVE = 0; + // Short string value. The type info value is the string size, which must be in `[0, + // kMaxShortStrSize]`. + // The string content bytes directly follow the header byte. + public static final int SHORT_STR = 1; + // Object value. The content contains a size, a list of field ids, a list of field offsets, and + // the actual field data. 
The length of the id list is `size`, while the length of the offset + // list is `size + 1`, where the last offset represent the total size of the field data. The + // fields in an object must be sorted by the field name in alphabetical order. Duplicate field + // names in one object are not allowed. + // We use 5 bits in the type info to specify the integer type of the object header: it should + // be 0_b4_b3b2_b1b0 (MSB is 0), where: + // - b4 specifies the type of size. When it is 0/1, `size` is a little-endian 1/4-byte + // unsigned integer. + // - b3b2/b1b0 specifies the integer type of id and offset. When the 2 bits are 0/1/2, the + // list contains 1/2/3-byte little-endian unsigned integers. + public static final int OBJECT = 2; + // Array value. The content contains a size, a list of field offsets, and the actual element + // data. It is similar to an object without the id list. The length of the offset list + // is `size + 1`, where the last offset represent the total size of the element data. + // Its type info should be: 000_b2_b1b0: + // - b2 specifies the type of size. + // - b1b0 specifies the integer type of offset. + public static final int ARRAY = 3; + + // Below is all possible type info values for `PRIMITIVE`. + // JSON Null value. Empty content. + public static final int NULL = 0; + // True value. Empty content. + public static final int TRUE = 1; + // False value. Empty content. + public static final int FALSE = 2; + // 1-byte little-endian signed integer. + public static final int INT1 = 3; + // 2-byte little-endian signed integer. + public static final int INT2 = 4; + // 4-byte little-endian signed integer. + public static final int INT4 = 5; + // 4-byte little-endian signed integer. + public static final int INT8 = 6; + // 8-byte IEEE double. + public static final int DOUBLE = 7; + // 4-byte decimal. Content is 1-byte scale + 4-byte little-endian signed integer. + public static final int DECIMAL4 = 8; + // 8-byte decimal. Content is 1-byte scale + 8-byte little-endian signed integer. + public static final int DECIMAL8 = 9; + // 16-byte decimal. Content is 1-byte scale + 16-byte little-endian signed integer. + public static final int DECIMAL16 = 10; + // Date value. Content is 4-byte little-endian signed integer that represents the number of days + // from the Unix epoch. + public static final int DATE = 11; + // Timestamp value. Content is 8-byte little-endian signed integer that represents the number of + // microseconds elapsed since the Unix epoch, 1970-01-01 00:00:00 UTC. It is displayed to users in + // their local time zones and may be displayed differently depending on the execution environment. + public static final int TIMESTAMP = 12; + // Timestamp_ntz value. It has the same content as `TIMESTAMP` but should always be interpreted + // as if the local time zone is UTC. + public static final int TIMESTAMP_NTZ = 13; + // 4-byte IEEE float. + public static final int FLOAT = 14; + // Binary value. The content is (4-byte little-endian unsigned integer representing the binary + // size) + (size bytes of binary content). + public static final int BINARY = 15; + // Long string value. The content is (4-byte little-endian unsigned integer representing the + // string size) + (size bytes of string content). + public static final int LONG_STR = 16; + + public static final byte VERSION = 1; + // The lower 4 bits of the first metadata byte contain the version. 
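// Worked example (illustrative, not part of the patch): for a metadata header byte of 0x41
// (binary 0100_0001), the lower 4 bits give version 1 and the upper 2 bits select the offset
// width used by getMetadataKey further below.
byte metadataHeader = 0x41;
int version = metadataHeader & 0x0F;                  // 1
int offsetWidth = ((metadataHeader >> 6) & 0x3) + 1;  // 0b01 + 1 = 2 bytes per offset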
+ public static final byte VERSION_MASK = 0x0F; + + public static final int U24_MAX = 0xFFFFFF; + public static final int U32_SIZE = 4; + + // Both variant value and variant metadata need to be no longer than 16MiB. + public static final int SIZE_LIMIT = U24_MAX + 1; + + public static final int MAX_DECIMAL4_PRECISION = 9; + public static final int MAX_DECIMAL8_PRECISION = 18; + public static final int MAX_DECIMAL16_PRECISION = 38; + + private VariantUtil() {} + + // Check the validity of an array index `position`. Throw `MALFORMED_VARIANT` if it is out of bound, + // meaning that the variant is malformed. + static void checkIndex(int position, int length) + { + if (position < 0 || position >= length) { + throw new IllegalArgumentException("Index out of bound: %s (length: %s)".formatted(position, length)); + } + } + + // Read a little-endian signed long value from `bytes[position, position + numBytes)`. + static long readLong(byte[] bytes, int position, int numBytes) + { + checkIndex(position, bytes.length); + checkIndex(position + numBytes - 1, bytes.length); + long result = 0; + // All bytes except the most significant byte should be unsign-extended and shifted (so we need + // `& 0xFF`). The most significant byte should be sign-extended and is handled after the loop. + for (int i = 0; i < numBytes - 1; ++i) { + long unsignedByteValue = bytes[position + i] & 0xFF; + result |= unsignedByteValue << (8 * i); + } + long signedByteValue = bytes[position + numBytes - 1]; + result |= signedByteValue << (8 * (numBytes - 1)); + return result; + } + + // Read a little-endian unsigned int value from `bytes[position, position + numBytes)`. The value must fit + // into a non-negative int (`[0, Integer.MAX_VALUE]`). + static int readUnsigned(byte[] bytes, int position, int numBytes) + { + checkIndex(position, bytes.length); + checkIndex(position + numBytes - 1, bytes.length); + int result = 0; + // Similar to the `readLong` loop, but all bytes should be unsign-extended. + for (int i = 0; i < numBytes; ++i) { + int unsignedByteValue = bytes[position + i] & 0xFF; + result |= unsignedByteValue << (8 * i); + } + if (result < 0) { + throw new IllegalArgumentException("Value out of bound: %s".formatted(result)); + } + return result; + } + + // The value type of variant value. It is determined by the header byte but not a 1:1 mapping + // (for example, INT1/2/4/8 all maps to `Type.LONG`). + public enum Type + { + NULL, + BOOLEAN, + LONG, + FLOAT, + DOUBLE, + DECIMAL, + STRING, + BINARY, + DATE, + TIMESTAMP, + TIMESTAMP_NTZ, + ARRAY, + OBJECT, + } + + // Get the value type of variant value `value[position...]`. It is only legal to call `get*` if + // `getType` returns this type (for example, it is only legal to call `getLong` if `getType` + // returns `Type.Long`). + // Throw `MALFORMED_VARIANT` if the variant is malformed. 
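// Worked example (illustrative; readLong and readUnsigned are package-private, so this assumes
// same-package access): the same two little-endian bytes decode differently depending on
// whether the most significant byte is sign-extended.
byte[] twoBytes = {(byte) 0xFE, (byte) 0xFF};
long signedValue = readLong(twoBytes, 0, 2);      // -2
int unsignedValue = readUnsigned(twoBytes, 0, 2); // 65534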
+ public static Type getType(byte[] value, int position) + { + checkIndex(position, value.length); + int basicType = value[position] & BASIC_TYPE_MASK; + int typeInfo = (value[position] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + return switch (basicType) { + case SHORT_STR -> Type.STRING; + case OBJECT -> Type.OBJECT; + case ARRAY -> Type.ARRAY; + default -> switch (typeInfo) { + case NULL -> Type.NULL; + case TRUE, FALSE -> Type.BOOLEAN; + case INT1, INT2, INT4, INT8 -> Type.LONG; + case DOUBLE -> Type.DOUBLE; + case DECIMAL4, DECIMAL8, DECIMAL16 -> Type.DECIMAL; + case DATE -> Type.DATE; + case TIMESTAMP -> Type.TIMESTAMP; + case TIMESTAMP_NTZ -> Type.TIMESTAMP_NTZ; + case FLOAT -> Type.FLOAT; + case BINARY -> Type.BINARY; + case LONG_STR -> Type.STRING; + default -> throw new IllegalArgumentException("Unexpected type: " + typeInfo); + }; + }; + } + + private static IllegalStateException unexpectedType(Type type) + { + return new IllegalStateException("Expect type to be " + type); + } + + // Get a boolean value from variant value `value[position...]`. + // Throw `MALFORMED_VARIANT` if the variant is malformed. + public static boolean getBoolean(byte[] value, int position) + { + checkIndex(position, value.length); + int basicType = value[position] & BASIC_TYPE_MASK; + int typeInfo = (value[position] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + if (basicType != PRIMITIVE || (typeInfo != TRUE && typeInfo != FALSE)) { + throw unexpectedType(Type.BOOLEAN); + } + return typeInfo == TRUE; + } + + // Get a long value from variant value `value[position...]`. + // It is only legal to call it if `getType` returns one of `Type.LONG/DATE/TIMESTAMP/ + // TIMESTAMP_NTZ`. If the type is `DATE`, the return value is guaranteed to fit into an int and + // represents the number of days from the Unix epoch. If the type is `TIMESTAMP/TIMESTAMP_NTZ`, + // the return value represents the number of microseconds from the Unix epoch. + // Throw `MALFORMED_VARIANT` if the variant is malformed. + public static long getLong(byte[] value, int position) + { + checkIndex(position, value.length); + int basicType = value[position] & BASIC_TYPE_MASK; + int typeInfo = (value[position] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + String exceptionMessage = "Expect type to be LONG/DATE/TIMESTAMP/TIMESTAMP_NTZ"; + if (basicType != PRIMITIVE) { + throw new IllegalStateException(exceptionMessage); + } + return switch (typeInfo) { + case INT1 -> readLong(value, position + 1, 1); + case INT2 -> readLong(value, position + 1, 2); + case INT4, DATE -> readLong(value, position + 1, 4); + case INT8, TIMESTAMP, TIMESTAMP_NTZ -> readLong(value, position + 1, 8); + default -> throw new IllegalStateException(exceptionMessage); + }; + } + + // Get a double value from variant value `value[position...]`. + // Throw `MALFORMED_VARIANT` if the variant is malformed. + public static double getDouble(byte[] value, int position) + { + checkIndex(position, value.length); + int basicType = value[position] & BASIC_TYPE_MASK; + int typeInfo = (value[position] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + if (basicType != PRIMITIVE || typeInfo != DOUBLE) { + throw unexpectedType(Type.DOUBLE); + } + return Double.longBitsToDouble(readLong(value, position + 1, 8)); + } + + // Check whether the precision and scale of the decimal are within the limit. 
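// Worked example (illustrative, hypothetical bytes): a DECIMAL4 value is encoded as
// header 0x20 = (DECIMAL4 << 2) | PRIMITIVE, a 1-byte scale, and a 4-byte little-endian
// unscaled integer; scale 2 with unscaled value 12345 decodes to 123.45.
byte[] decimalValue = {0x20, 0x02, 0x39, 0x30, 0x00, 0x00};
// getDecimal(decimalValue, 0) returns new BigDecimal("123.45")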
+ private static void checkDecimal(BigDecimal decimal, int maxPrecision) + { + if (decimal.precision() > maxPrecision || decimal.scale() > maxPrecision) { + throw new IllegalArgumentException("Decimal out of bound: " + decimal); + } + } + + // Get a decimal value from variant value `value[position...]`. + // Throw `MALFORMED_VARIANT` if the variant is malformed. + public static BigDecimal getDecimal(byte[] value, int position) + { + checkIndex(position, value.length); + int basicType = value[position] & BASIC_TYPE_MASK; + int typeInfo = (value[position] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + if (basicType != PRIMITIVE) { + throw unexpectedType(Type.DECIMAL); + } + // Interpret the scale byte as unsigned. If it is a negative byte, the unsigned value must be + // greater than `MAX_DECIMAL16_PRECISION` and will trigger an error in `checkDecimal`. + int scale = value[position + 1] & 0xFF; + BigDecimal result; + switch (typeInfo) { + case DECIMAL4: + result = BigDecimal.valueOf(readLong(value, position + 2, 4), scale); + checkDecimal(result, MAX_DECIMAL4_PRECISION); + break; + case DECIMAL8: + result = BigDecimal.valueOf(readLong(value, position + 2, 8), scale); + checkDecimal(result, MAX_DECIMAL8_PRECISION); + break; + case DECIMAL16: + checkIndex(position + 17, value.length); + byte[] bytes = new byte[16]; + // Copy the bytes reversely because the `BigInteger` constructor expects a big-endian + // representation. + for (int i = 0; i < 16; ++i) { + bytes[i] = value[position + 17 - i]; + } + result = new BigDecimal(new BigInteger(bytes), scale); + checkDecimal(result, MAX_DECIMAL16_PRECISION); + break; + default: + throw unexpectedType(Type.DECIMAL); + } + return result.stripTrailingZeros(); + } + + // Get a float value from variant value `value[position...]`. + // Throw `MALFORMED_VARIANT` if the variant is malformed. + public static float getFloat(byte[] value, int position) + { + checkIndex(position, value.length); + int basicType = value[position] & BASIC_TYPE_MASK; + int typeInfo = (value[position] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + if (basicType != PRIMITIVE || typeInfo != FLOAT) { + throw unexpectedType(Type.FLOAT); + } + return Float.intBitsToFloat((int) readLong(value, position + 1, 4)); + } + + // Get a binary value from variant value `value[position...]`. + // Throw `MALFORMED_VARIANT` if the variant is malformed. + public static byte[] getBinary(byte[] value, int position) + { + checkIndex(position, value.length); + int basicType = value[position] & BASIC_TYPE_MASK; + int typeInfo = (value[position] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + if (basicType != PRIMITIVE || typeInfo != BINARY) { + throw unexpectedType(Type.BINARY); + } + int start = position + 1 + U32_SIZE; + int length = readUnsigned(value, position + 1, U32_SIZE); + checkIndex(start + length - 1, value.length); + return Arrays.copyOfRange(value, start, start + length); + } + + // Get a string value from variant value `value[position...]`. + // Throw `MALFORMED_VARIANT` if the variant is malformed. 
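// Worked example (illustrative, hypothetical bytes): a short string packs its length into the
// type info bits, so "hi" is encoded as header 0x09 = (length 2 << 2) | SHORT_STR followed by
// the UTF-8 bytes; getString(shortString, 0) returns "hi".
byte[] shortString = {0x09, 'h', 'i'};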
+ public static String getString(byte[] value, int position) + { + checkIndex(position, value.length); + int basicType = value[position] & BASIC_TYPE_MASK; + int typeInfo = (value[position] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + if (basicType == SHORT_STR || (basicType == PRIMITIVE && typeInfo == LONG_STR)) { + int start; + int length; + if (basicType == SHORT_STR) { + start = position + 1; + length = typeInfo; + } + else { + start = position + 1 + U32_SIZE; + length = readUnsigned(value, position + 1, U32_SIZE); + } + checkIndex(start + length - 1, value.length); + return new String(value, start, length, UTF_8); + } + throw unexpectedType(Type.STRING); + } + + public interface ObjectHandler + { + /** + * @param size Number of object fields. + * @param idSize The integer size of the field id list. + * @param offsetSize The integer size of the offset list. + * @param idStart The starting index of the field id list in the variant value array. + * @param offsetStart The starting index of the offset list in the variant value array. + * @param dataStart The starting index of field data in the variant value array. + */ + T apply(int size, int idSize, int offsetSize, int idStart, int offsetStart, int dataStart); + } + + // A helper function to access a variant object. It provides `handler` with its required + // parameters and returns what it returns. + public static T handleObject(byte[] value, int position, ObjectHandler handler) + { + checkIndex(position, value.length); + int basicType = value[position] & BASIC_TYPE_MASK; + int typeInfo = (value[position] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + if (basicType != OBJECT) { + throw unexpectedType(Type.OBJECT); + } + // Refer to the comment of the `OBJECT` constant for the details of the object header encoding. + // Suppose `typeInfo` has a bit representation of 0_b4_b3b2_b1b0, the following line extracts + // b4 to determine whether the object uses a 1/4-byte size. + boolean largeSize = ((typeInfo >> 4) & 0x1) != 0; + int sizeBytes = (largeSize ? U32_SIZE : 1); + int size = readUnsigned(value, position + 1, sizeBytes); + // Extracts b3b2 to determine the integer size of the field id list. + int idSize = ((typeInfo >> 2) & 0x3) + 1; + // Extracts b1b0 to determine the integer size of the offset list. + int offsetSize = (typeInfo & 0x3) + 1; + int idStart = position + 1 + sizeBytes; + int offsetStart = idStart + size * idSize; + int dataStart = offsetStart + (size + 1) * offsetSize; + return handler.apply(size, idSize, offsetSize, idStart, offsetStart, dataStart); + } + + public interface ArrayHandler + { + /** + * @param size Number of array elements. + * @param offsetSize The integer size of the offset list. + * @param offsetStart The starting index of the offset list in the variant value array. + * @param dataStart The starting index of element data in the variant value array. + */ + T apply(int size, int offsetSize, int offsetStart, int dataStart); + } + + // A helper function to access a variant array. + public static T handleArray(byte[] value, int position, ArrayHandler handler) + { + checkIndex(position, value.length); + int basicType = value[position] & BASIC_TYPE_MASK; + int typeInfo = (value[position] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK; + if (basicType != ARRAY) { + throw unexpectedType(Type.ARRAY); + } + // Refer to the comment of the `ARRAY` constant for the details of the object header encoding. 
+ // Suppose `typeInfo` has a bit representation of 000_b2_b1b0, the following line extracts + // b2 to determine whether the object uses a 1/4-byte size. + boolean largeSize = ((typeInfo >> 2) & 0x1) != 0; + int sizeBytes = (largeSize ? U32_SIZE : 1); + int size = readUnsigned(value, position + 1, sizeBytes); + // Extracts b1b0 to determine the integer size of the offset list. + int offsetSize = (typeInfo & 0x3) + 1; + int offsetStart = position + 1 + sizeBytes; + int dataStart = offsetStart + (size + 1) * offsetSize; + return handler.apply(size, offsetSize, offsetStart, dataStart); + } + + // Get a key at `id` in the variant metadata. + // Throw `MALFORMED_VARIANT` if the variant is malformed. An out-of-bound `id` is also considered + // a malformed variant because it is read from the corresponding variant value. + public static String getMetadataKey(byte[] metadata, int id) + { + checkIndex(0, metadata.length); + // Extracts the highest 2 bits in the metadata header to determine the integer size of the + // offset list. + int offsetSize = ((metadata[0] >> 6) & 0x3) + 1; + int dictSize = readUnsigned(metadata, 1, offsetSize); + if (id >= dictSize) { + throw new IllegalArgumentException("Index out of bound: %s (size: %s)".formatted(id, dictSize)); + } + // There are a header byte, a `dictSize` with `offsetSize` bytes, and `(dictSize + 1)` offsets + // before the string data. + int stringStart = 1 + (dictSize + 2) * offsetSize; + int offset = readUnsigned(metadata, 1 + (id + 1) * offsetSize, offsetSize); + int nextOffset = readUnsigned(metadata, 1 + (id + 2) * offsetSize, offsetSize); + if (offset > nextOffset) { + throw new IllegalArgumentException("Invalid offset: %s > %s".formatted(offset, nextOffset)); + } + checkIndex(stringStart + nextOffset - 1, metadata.length); + return new String(metadata, stringStart + offset, nextOffset - offset, UTF_8); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/ArrayColumnWriter.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/ArrayColumnWriter.java new file mode 100644 index 000000000000..4cc187bf8298 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/ArrayColumnWriter.java @@ -0,0 +1,86 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
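// An illustrative metadata layout (assumed example values, not taken from this change) showing
// what getMetadataKey above reads for offsetSize = 1 and a two-entry dictionary ["a", "bc"]:
//
//   byte 0      : 0x01            header (low bits: version; bits 7-6 = 00 => offsetSize = 1)
//   byte 1      : 0x02            dictSize = 2
//   bytes 2..4  : 0x00 0x01 0x03  offsets [0, 1, 3]
//   bytes 5..7  : 'a' 'b' 'c'     string data (stringStart = 1 + (2 + 2) * 1 = 5)
//
// getMetadataKey(metadata, 0) returns "a" (offset 0 to 1), and getMetadataKey(metadata, 1)
// returns "bc" (offset 1 to 3).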
+ */ +package io.trino.parquet.writer; + +import com.google.common.collect.ImmutableList; +import io.trino.parquet.writer.repdef.DefLevelWriterProvider; +import io.trino.parquet.writer.repdef.DefLevelWriterProviders; +import io.trino.parquet.writer.repdef.RepLevelWriterProvider; +import io.trino.parquet.writer.repdef.RepLevelWriterProviders; +import io.trino.spi.block.ColumnarArray; + +import java.io.IOException; +import java.util.List; + +import static io.airlift.slice.SizeOf.instanceSize; +import static java.util.Objects.requireNonNull; + +public class ArrayColumnWriter + implements ColumnWriter +{ + private static final int INSTANCE_SIZE = instanceSize(ArrayColumnWriter.class); + + private final ColumnWriter elementWriter; + private final int maxDefinitionLevel; + private final int maxRepetitionLevel; + + public ArrayColumnWriter(ColumnWriter elementWriter, int maxDefinitionLevel, int maxRepetitionLevel) + { + this.elementWriter = requireNonNull(elementWriter, "elementWriter is null"); + this.maxDefinitionLevel = maxDefinitionLevel; + this.maxRepetitionLevel = maxRepetitionLevel; + } + + @Override + public void writeBlock(ColumnChunk columnChunk) + throws IOException + { + ColumnarArray columnarArray = ColumnarArray.toColumnarArray(columnChunk.getBlock()); + elementWriter.writeBlock( + new ColumnChunk(columnarArray.getElementsBlock(), + ImmutableList.builder() + .addAll(columnChunk.getDefLevelWriterProviders()) + .add(DefLevelWriterProviders.of(columnarArray, maxDefinitionLevel)) + .build(), + ImmutableList.builder() + .addAll(columnChunk.getRepLevelWriterProviders()) + .add(RepLevelWriterProviders.of(columnarArray, maxRepetitionLevel)) + .build())); + } + + @Override + public void close() + { + elementWriter.close(); + } + + @Override + public List getBuffer() + throws IOException + { + return ImmutableList.copyOf(elementWriter.getBuffer()); + } + + @Override + public long getBufferedBytes() + { + return elementWriter.getBufferedBytes(); + } + + @Override + public long getRetainedBytes() + { + return INSTANCE_SIZE + elementWriter.getRetainedBytes(); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/ColumnChunk.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/ColumnChunk.java new file mode 100644 index 000000000000..32912ebabb52 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/ColumnChunk.java @@ -0,0 +1,57 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
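// An assumed worked example of the level bookkeeping that ArrayColumnWriter above sets up.
// For a nullable array(integer) column with maxDefinitionLevel = 3 and maxRepetitionLevel = 1,
// the three rows [1, 2], NULL, [] conceptually produce:
//
//   values            : 1, 2
//   definition levels : 3, 3, 0, 1
//   repetition levels : 0, 1, 0, 0
//
// ArrayColumnWriter does not emit these numbers itself; it appends a DefLevelWriterProvider and a
// RepLevelWriterProvider for the ColumnarArray, and the leaf writer combines all providers when
// the element block is finally written.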
+ */
+package io.trino.parquet.writer;
+
+import com.google.common.collect.ImmutableList;
+import io.trino.parquet.writer.repdef.DefLevelWriterProvider;
+import io.trino.parquet.writer.repdef.RepLevelWriterProvider;
+import io.trino.spi.block.Block;
+
+import java.util.List;
+
+import static java.util.Objects.requireNonNull;
+
+public class ColumnChunk
+{
+    private final Block block;
+    private final List<DefLevelWriterProvider> defLevelWriterProviders;
+    private final List<RepLevelWriterProvider> repLevelWriterProviders;
+
+    ColumnChunk(Block block)
+    {
+        this(block, ImmutableList.of(), ImmutableList.of());
+    }
+
+    ColumnChunk(Block block, List<DefLevelWriterProvider> defLevelWriterProviders, List<RepLevelWriterProvider> repLevelWriterProviders)
+    {
+        this.block = requireNonNull(block, "block is null");
+        this.defLevelWriterProviders = ImmutableList.copyOf(defLevelWriterProviders);
+        this.repLevelWriterProviders = ImmutableList.copyOf(repLevelWriterProviders);
+    }
+
+    List<DefLevelWriterProvider> getDefLevelWriterProviders()
+    {
+        return defLevelWriterProviders;
+    }
+
+    public List<RepLevelWriterProvider> getRepLevelWriterProviders()
+    {
+        return repLevelWriterProviders;
+    }
+
+    public Block getBlock()
+    {
+        return block;
+    }
+}
diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/ColumnWriter.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/ColumnWriter.java
new file mode 100644
index 000000000000..1e25b0da2631
--- /dev/null
+++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/ColumnWriter.java
@@ -0,0 +1,75 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +package io.trino.parquet.writer; + +import org.apache.parquet.column.values.bloomfilter.BloomFilter; +import org.apache.parquet.format.ColumnMetaData; + +import java.io.IOException; +import java.util.List; +import java.util.Optional; +import java.util.OptionalInt; + +import static java.util.Objects.requireNonNull; + +public interface ColumnWriter +{ + void writeBlock(ColumnChunk columnChunk) + throws IOException; + + void close(); + + List getBuffer() + throws IOException; + + long getBufferedBytes(); + + long getRetainedBytes(); + + class BufferData + { + private final ColumnMetaData metaData; + private final List data; + private final OptionalInt dictionaryPageSize; + private final Optional bloomFilter; + + public BufferData(List data, OptionalInt dictionaryPageSize, Optional bloomFilter, ColumnMetaData metaData) + { + this.data = requireNonNull(data, "data is null"); + this.dictionaryPageSize = requireNonNull(dictionaryPageSize, "dictionaryPageSize is null"); + this.bloomFilter = requireNonNull(bloomFilter, "bloomFilter is null"); + this.metaData = requireNonNull(metaData, "metaData is null"); + } + + public ColumnMetaData getMetaData() + { + return metaData; + } + + public List getData() + { + return data; + } + + public OptionalInt getDictionaryPageSize() + { + return dictionaryPageSize; + } + + public Optional getBloomFilter() + { + return bloomFilter; + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/MapColumnWriter.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/MapColumnWriter.java new file mode 100644 index 000000000000..e3c05d9040f8 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/MapColumnWriter.java @@ -0,0 +1,90 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.parquet.writer; + +import com.google.common.collect.ImmutableList; +import io.trino.parquet.writer.repdef.DefLevelWriterProvider; +import io.trino.parquet.writer.repdef.DefLevelWriterProviders; +import io.trino.parquet.writer.repdef.RepLevelWriterProvider; +import io.trino.parquet.writer.repdef.RepLevelWriterProviders; +import io.trino.spi.block.ColumnarMap; + +import java.io.IOException; +import java.util.List; + +import static io.airlift.slice.SizeOf.instanceSize; +import static java.util.Objects.requireNonNull; + +public class MapColumnWriter + implements ColumnWriter +{ + private static final int INSTANCE_SIZE = instanceSize(MapColumnWriter.class); + + private final ColumnWriter keyWriter; + private final ColumnWriter valueWriter; + private final int maxDefinitionLevel; + private final int maxRepetitionLevel; + + public MapColumnWriter(ColumnWriter keyWriter, ColumnWriter valueWriter, int maxDefinitionLevel, int maxRepetitionLevel) + { + this.keyWriter = requireNonNull(keyWriter, "keyWriter is null"); + this.valueWriter = requireNonNull(valueWriter, "valueWriter is null"); + this.maxDefinitionLevel = maxDefinitionLevel; + this.maxRepetitionLevel = maxRepetitionLevel; + } + + @Override + public void writeBlock(ColumnChunk columnChunk) + throws IOException + { + ColumnarMap columnarMap = ColumnarMap.toColumnarMap(columnChunk.getBlock()); + + List defLevelWriterProviders = ImmutableList.builder() + .addAll(columnChunk.getDefLevelWriterProviders()) + .add(DefLevelWriterProviders.of(columnarMap, maxDefinitionLevel)).build(); + + List repLevelIterables = ImmutableList.builder() + .addAll(columnChunk.getRepLevelWriterProviders()) + .add(RepLevelWriterProviders.of(columnarMap, maxRepetitionLevel)).build(); + + keyWriter.writeBlock(new ColumnChunk(columnarMap.getKeysBlock(), defLevelWriterProviders, repLevelIterables)); + valueWriter.writeBlock(new ColumnChunk(columnarMap.getValuesBlock(), defLevelWriterProviders, repLevelIterables)); + } + + @Override + public void close() + { + keyWriter.close(); + valueWriter.close(); + } + + @Override + public List getBuffer() + throws IOException + { + return ImmutableList.builder().addAll(keyWriter.getBuffer()).addAll(valueWriter.getBuffer()).build(); + } + + @Override + public long getBufferedBytes() + { + return keyWriter.getBufferedBytes() + valueWriter.getBufferedBytes(); + } + + @Override + public long getRetainedBytes() + { + return INSTANCE_SIZE + keyWriter.getRetainedBytes() + valueWriter.getRetainedBytes(); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/MessageTypeConverter.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/MessageTypeConverter.java new file mode 100644 index 000000000000..b0a98dcf657c --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/MessageTypeConverter.java @@ -0,0 +1,173 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.parquet.writer; + +import org.apache.parquet.format.ConvertedType; +import org.apache.parquet.format.FieldRepetitionType; +import org.apache.parquet.format.SchemaElement; +import org.apache.parquet.format.Type; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.TypeVisitor; + +import java.util.ArrayList; +import java.util.List; + +import static io.trino.parquet.ParquetMetadataConverter.convertToLogicalType; + +class MessageTypeConverter +{ + private MessageTypeConverter() {} + + static List toParquetSchema(MessageType schema) + { + List result = new ArrayList<>(); + addToList(result, schema); + return result; + } + + private static void addToList(List result, org.apache.parquet.schema.Type field) + { + field.accept(new TypeVisitor() + { + @Override + public void visit(PrimitiveType primitiveType) + { + SchemaElement element = new SchemaElement(primitiveType.getName()); + element.setRepetition_type(toParquetRepetition(primitiveType.getRepetition())); + element.setType(getType(primitiveType.getPrimitiveTypeName())); + if (primitiveType.getOriginalType() != null) { + element.setConverted_type(getConvertedType(primitiveType.getOriginalType())); + } + if (primitiveType.getLogicalTypeAnnotation() != null) { + element.setLogicalType(convertToLogicalType(primitiveType.getLogicalTypeAnnotation())); + } + if (primitiveType.getDecimalMetadata() != null) { + element.setPrecision(primitiveType.getDecimalMetadata().getPrecision()); + element.setScale(primitiveType.getDecimalMetadata().getScale()); + } + if (primitiveType.getTypeLength() > 0) { + element.setType_length(primitiveType.getTypeLength()); + } + if (primitiveType.getId() != null) { + element.setField_id(primitiveType.getId().intValue()); + } + result.add(element); + } + + @Override + public void visit(MessageType messageType) + { + SchemaElement element = new SchemaElement(messageType.getName()); + if (messageType.getId() != null) { + element.setField_id(messageType.getId().intValue()); + } + visitChildren(result, messageType.asGroupType(), element); + } + + @Override + public void visit(GroupType groupType) + { + SchemaElement element = new SchemaElement(groupType.getName()); + element.setRepetition_type(toParquetRepetition(groupType.getRepetition())); + if (groupType.getOriginalType() != null) { + element.setConverted_type(getConvertedType(groupType.getOriginalType())); + } + if (groupType.getLogicalTypeAnnotation() != null) { + element.setLogicalType(convertToLogicalType(groupType.getLogicalTypeAnnotation())); + } + if (groupType.getId() != null) { + element.setField_id(groupType.getId().intValue()); + } + visitChildren(result, groupType, element); + } + + private void visitChildren(List result, + GroupType groupType, SchemaElement element) + { + element.setNum_children(groupType.getFieldCount()); + result.add(element); + for (org.apache.parquet.schema.Type field : groupType.getFields()) { + addToList(result, field); + } + } + }); + } + + private static FieldRepetitionType toParquetRepetition(org.apache.parquet.schema.Type.Repetition repetition) + { + return FieldRepetitionType.valueOf(repetition.name()); + } + + private static org.apache.parquet.format.Type getType(PrimitiveType.PrimitiveTypeName type) + { + switch (type) { + case INT64 -> { + return Type.INT64; + } + case INT32 -> { + return Type.INT32; + } + case BOOLEAN -> { + return 
Type.BOOLEAN; + } + case BINARY -> { + return Type.BYTE_ARRAY; + } + case FLOAT -> { + return Type.FLOAT; + } + case DOUBLE -> { + return Type.DOUBLE; + } + case INT96 -> { + return Type.INT96; + } + case FIXED_LEN_BYTE_ARRAY -> { + return Type.FIXED_LEN_BYTE_ARRAY; + } + } + throw new RuntimeException("Unknown primitive type " + type); + } + + private static ConvertedType getConvertedType(OriginalType type) + { + return switch (type) { + case UTF8 -> ConvertedType.UTF8; + case MAP -> ConvertedType.MAP; + case MAP_KEY_VALUE -> ConvertedType.MAP_KEY_VALUE; + case LIST -> ConvertedType.LIST; + case ENUM -> ConvertedType.ENUM; + case DECIMAL -> ConvertedType.DECIMAL; + case DATE -> ConvertedType.DATE; + case TIME_MICROS -> ConvertedType.TIME_MICROS; + case TIME_MILLIS -> ConvertedType.TIME_MILLIS; + case TIMESTAMP_MILLIS -> ConvertedType.TIMESTAMP_MILLIS; + case TIMESTAMP_MICROS -> ConvertedType.TIMESTAMP_MICROS; + case INTERVAL -> ConvertedType.INTERVAL; + case INT_8 -> ConvertedType.INT_8; + case INT_16 -> ConvertedType.INT_16; + case INT_32 -> ConvertedType.INT_32; + case INT_64 -> ConvertedType.INT_64; + case UINT_8 -> ConvertedType.UINT_8; + case UINT_16 -> ConvertedType.UINT_16; + case UINT_32 -> ConvertedType.UINT_32; + case UINT_64 -> ConvertedType.UINT_64; + case JSON -> ConvertedType.JSON; + case BSON -> ConvertedType.BSON; + }; + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/ParquetCompressor.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/ParquetCompressor.java new file mode 100644 index 000000000000..a716de82ec97 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/ParquetCompressor.java @@ -0,0 +1,97 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
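// An assumed, illustrative example of the depth-first flattening performed by toParquetSchema
// above (the schema itself is made up). The message
//
//   message trino_schema {
//     optional group tags (LIST) {
//       repeated group list {
//         optional binary element (STRING);
//       }
//     }
//   }
//
// is emitted as the SchemaElement sequence
//   trino_schema (num_children = 1)
//   tags         (OPTIONAL, converted_type = LIST, num_children = 1)
//   list         (REPEATED, num_children = 1)
//   element      (OPTIONAL, type = BYTE_ARRAY, converted_type = UTF8)
// i.e. every group records its child count and only primitive leaves carry a physical type.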
+ */ +package io.trino.parquet.writer; + +import io.airlift.compress.v3.Compressor; +import io.airlift.compress.v3.lz4.Lz4Compressor; +import io.airlift.compress.v3.snappy.SnappyCompressor; +import io.airlift.compress.v3.zstd.ZstdCompressor; +import io.airlift.slice.Slices; +import org.apache.parquet.format.CompressionCodec; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.util.zip.GZIPOutputStream; + +import static io.trino.parquet.writer.ParquetDataOutput.createDataOutput; +import static java.util.Objects.requireNonNull; + +interface ParquetCompressor +{ + ParquetDataOutput compress(byte[] input) + throws IOException; + + static ParquetCompressor getCompressor(CompressionCodec codec) + { + switch (codec) { + case GZIP: + return new GzipCompressor(); + case SNAPPY: + return new AirLiftCompressor(SnappyCompressor.create()); + case ZSTD: + return new AirLiftCompressor(ZstdCompressor.create()); + case LZ4: + return new AirLiftCompressor(Lz4Compressor.create()); + case UNCOMPRESSED: + return null; + case LZO: + case LZ4_RAW: + // TODO Support LZO and LZ4_RAW compression + // Note: LZ4 compression scheme has been deprecated by parquet-format in favor of LZ4_RAW + // When using airlift LZO or LZ4 compressor, decompressing page in reader throws exception. + break; + case BROTLI: + // unsupported + break; + } + throw new RuntimeException("Unsupported codec: " + codec); + } + + class GzipCompressor + implements ParquetCompressor + { + @Override + public ParquetDataOutput compress(byte[] input) + throws IOException + { + ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); + try (GZIPOutputStream outputStream = new GZIPOutputStream(byteArrayOutputStream)) { + outputStream.write(input, 0, input.length); + } + return createDataOutput(byteArrayOutputStream); + } + } + + class AirLiftCompressor + implements ParquetCompressor + { + private final Compressor compressor; + + AirLiftCompressor(Compressor compressor) + { + this.compressor = requireNonNull(compressor, "compressor is null"); + } + + @Override + public ParquetDataOutput compress(byte[] input) + throws IOException + { + int minCompressionBufferSize = compressor.maxCompressedLength(input.length); + byte[] compressionBuffer = new byte[minCompressionBufferSize]; + // TODO compressedDataSize > bytes.length? + int compressedDataSize = compressor.compress(input, 0, input.length, compressionBuffer, 0, compressionBuffer.length); + return createDataOutput(Slices.wrappedBuffer(compressionBuffer, 0, compressedDataSize)); + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/ParquetDataOutput.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/ParquetDataOutput.java new file mode 100644 index 000000000000..24be9c5dbc17 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/ParquetDataOutput.java @@ -0,0 +1,99 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
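// A minimal, assumed usage sketch of the ParquetCompressor factory above; the page bytes are
// arbitrary. Note that getCompressor returns null for UNCOMPRESSED, so callers skip compression
// entirely in that case.
import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.apache.parquet.format.CompressionCodec;

class CompressorSketch
{
    public static void main(String[] args)
            throws IOException
    {
        byte[] page = "example page bytes".getBytes(StandardCharsets.UTF_8);
        ParquetCompressor compressor = ParquetCompressor.getCompressor(CompressionCodec.ZSTD);
        ParquetDataOutput compressed = compressor.compress(page);
        System.out.println("compressed to " + compressed.size() + " bytes");
    }
}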
+ */ +package io.trino.parquet.writer; + +import io.airlift.slice.Slice; +import io.airlift.slice.SliceOutput; +import io.trino.plugin.base.io.ChunkedSliceOutput; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; + +import static java.util.Objects.requireNonNull; + +public interface ParquetDataOutput +{ + static ParquetDataOutput createDataOutput(Slice slice) + { + requireNonNull(slice, "slice is null"); + return new ParquetDataOutput() + { + @Override + public int size() + { + return slice.length(); + } + + @Override + public void writeData(SliceOutput sliceOutput) + { + sliceOutput.writeBytes(slice); + } + }; + } + + static ParquetDataOutput createDataOutput(ChunkedSliceOutput chunkedSliceOutput) + { + requireNonNull(chunkedSliceOutput, "chunkedSliceOutput is null"); + return new ParquetDataOutput() + { + @Override + public int size() + { + return chunkedSliceOutput.size(); + } + + @Override + public void writeData(SliceOutput sliceOutput) + { + chunkedSliceOutput.getSlices().forEach(sliceOutput::writeBytes); + } + }; + } + + static ParquetDataOutput createDataOutput(ByteArrayOutputStream byteArrayOutputStream) + { + requireNonNull(byteArrayOutputStream, "byteArrayOutputStream is null"); + return new ParquetDataOutput() + { + @Override + public int size() + { + return byteArrayOutputStream.size(); + } + + @Override + public void writeData(SliceOutput sliceOutput) + { + try { + byteArrayOutputStream.writeTo(sliceOutput); + } + catch (IOException e) { + throw new RuntimeException(e); + } + } + }; + } + + /** + * Number of bytes that will be written. + */ + int size(); + + /** + * Writes data to the output. The output must be exactly + * {@link #size()} bytes. + */ + void writeData(SliceOutput sliceOutput); +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/ParquetSchemaConverter.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/ParquetSchemaConverter.java new file mode 100644 index 000000000000..d4944def35ec --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/ParquetSchemaConverter.java @@ -0,0 +1,277 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
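// A small sketch (assumed usage, not part of the change) of how a ParquetDataOutput wraps a Slice
// and is later drained into a SliceOutput; the literal "PAR1" mirrors the magic bytes written by
// the Parquet writer.
import io.airlift.slice.DynamicSliceOutput;
import io.airlift.slice.Slices;

class DataOutputSketch
{
    public static void main(String[] args)
    {
        ParquetDataOutput magic = ParquetDataOutput.createDataOutput(Slices.utf8Slice("PAR1"));
        DynamicSliceOutput output = new DynamicSliceOutput(16);
        magic.writeData(output);
        System.out.println(magic.size() + " bytes written: " + output.slice().toStringUtf8());
    }
}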
+ */ +package io.trino.parquet.writer; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import io.trino.spi.TrinoException; +import io.trino.spi.type.ArrayType; +import io.trino.spi.type.CharType; +import io.trino.spi.type.DecimalType; +import io.trino.spi.type.MapType; +import io.trino.spi.type.RealType; +import io.trino.spi.type.RowType; +import io.trino.spi.type.TimestampType; +import io.trino.spi.type.Type; +import io.trino.spi.type.VarbinaryType; +import io.trino.spi.type.VarcharType; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type.Repetition; +import org.apache.parquet.schema.Types; + +import java.util.List; +import java.util.Map; +import java.util.function.BiConsumer; + +import static com.google.common.base.Preconditions.checkArgument; +import static io.trino.spi.StandardErrorCode.NOT_SUPPORTED; +import static io.trino.spi.type.BigintType.BIGINT; +import static io.trino.spi.type.BooleanType.BOOLEAN; +import static io.trino.spi.type.DateType.DATE; +import static io.trino.spi.type.Decimals.MAX_PRECISION; +import static io.trino.spi.type.DoubleType.DOUBLE; +import static io.trino.spi.type.IntegerType.INTEGER; +import static io.trino.spi.type.SmallintType.SMALLINT; +import static io.trino.spi.type.StandardTypes.ARRAY; +import static io.trino.spi.type.StandardTypes.MAP; +import static io.trino.spi.type.StandardTypes.ROW; +import static io.trino.spi.type.TinyintType.TINYINT; +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; +import static org.apache.parquet.schema.LogicalTypeAnnotation.decimalType; +import static org.apache.parquet.schema.LogicalTypeAnnotation.intType; +import static org.apache.parquet.schema.Type.Repetition.OPTIONAL; +import static org.apache.parquet.schema.Type.Repetition.REQUIRED; + +public class ParquetSchemaConverter +{ + // Map precision to the number bytes needed for binary conversion. + // Based on org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe + private static final int[] PRECISION_TO_BYTE_COUNT = new int[MAX_PRECISION + 1]; + + static { + for (int precision = 1; precision <= MAX_PRECISION; precision++) { + // Estimated number of bytes needed. 
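// For example (worked out against the formula on the next line): precision 10 needs
// ceil((log2(10^10 - 1) + 1) / 8) = ceil(34.2 / 8) = 5 bytes, and precision 18 needs
// ceil((log2(10^18 - 1) + 1) / 8) = ceil(60.8 / 8) = 8 bytes.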
+ PRECISION_TO_BYTE_COUNT[precision] = (int) Math.ceil((Math.log(Math.pow(10, precision) - 1) / Math.log(2) + 1) / 8); + } + } + + public static final boolean HIVE_PARQUET_USE_LEGACY_DECIMAL_ENCODING = true; + public static final boolean HIVE_PARQUET_USE_INT96_TIMESTAMP_ENCODING = true; + + private final Map, Type> primitiveTypes; + private final MessageType messageType; + + public ParquetSchemaConverter(List types, List columnNames, boolean useLegacyDecimalEncoding, boolean useInt96TimestampEncoding) + { + requireNonNull(types, "types is null"); + requireNonNull(columnNames, "columnNames is null"); + checkArgument(types.size() == columnNames.size(), "types size not equals to columnNames size"); + ImmutableMap.Builder, Type> primitiveTypesBuilder = ImmutableMap.builder(); + messageType = convert(types, columnNames, useLegacyDecimalEncoding, useInt96TimestampEncoding, primitiveTypesBuilder::put); + primitiveTypes = primitiveTypesBuilder.buildOrThrow(); + } + + public Map, Type> getPrimitiveTypes() + { + return primitiveTypes; + } + + public MessageType getMessageType() + { + return messageType; + } + + private static MessageType convert( + List types, + List columnNames, + boolean useLegacyDecimalEncoding, + boolean useInt96TimestampEncoding, + BiConsumer, Type> primitiveTypesConsumer) + { + Types.MessageTypeBuilder builder = Types.buildMessage(); + for (int i = 0; i < types.size(); i++) { + builder.addField(convert(types.get(i), columnNames.get(i), ImmutableList.of(), OPTIONAL, useLegacyDecimalEncoding, useInt96TimestampEncoding, primitiveTypesConsumer)); + } + return builder.named("trino_schema"); + } + + private static org.apache.parquet.schema.Type convert( + Type type, + String name, + List parent, + Repetition repetition, + boolean useLegacyDecimalEncoding, + boolean useInt96TimestampEncoding, + BiConsumer, Type> primitiveTypesConsumer) + { + if (ROW.equals(type.getTypeSignature().getBase())) { + return getRowType((RowType) type, name, parent, repetition, useLegacyDecimalEncoding, useInt96TimestampEncoding, primitiveTypesConsumer); + } + if (MAP.equals(type.getTypeSignature().getBase())) { + return getMapType((MapType) type, name, parent, repetition, useLegacyDecimalEncoding, useInt96TimestampEncoding, primitiveTypesConsumer); + } + if (ARRAY.equals(type.getTypeSignature().getBase())) { + return getArrayType((ArrayType) type, name, parent, repetition, useLegacyDecimalEncoding, useInt96TimestampEncoding, primitiveTypesConsumer); + } + return getPrimitiveType(type, name, parent, repetition, useLegacyDecimalEncoding, useInt96TimestampEncoding, primitiveTypesConsumer); + } + + private static org.apache.parquet.schema.Type getPrimitiveType( + Type type, + String name, + List parent, + Repetition repetition, + boolean useLegacyDecimalEncoding, + boolean useInt96TimestampEncoding, + BiConsumer, Type> primitiveTypesConsumer) + { + List fullName = ImmutableList.builder().addAll(parent).add(name).build(); + primitiveTypesConsumer.accept(fullName, type); + if (BOOLEAN.equals(type)) { + return Types.primitive(PrimitiveType.PrimitiveTypeName.BOOLEAN, repetition).named(name); + } + // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#signed-integers + // INT(32, true) and INT(64, true) are implied by the int32 and int64 primitive types if no other annotation is present. + // Implementations may use these annotations to produce smaller in-memory representations when reading data. 
+ if (TINYINT.equals(type)) { + return Types.primitive(PrimitiveType.PrimitiveTypeName.INT32, repetition) + .as(intType(8, true)) + .named(name); + } + if (SMALLINT.equals(type)) { + return Types.primitive(PrimitiveType.PrimitiveTypeName.INT32, repetition) + .as(intType(16, true)) + .named(name); + } + if (INTEGER.equals(type)) { + return Types.primitive(PrimitiveType.PrimitiveTypeName.INT32, repetition) + .as(intType(32, true)) + .named(name); + } + if (type instanceof DecimalType decimalType) { + // Apache Hive version 3 or lower does not support reading decimals encoded as INT32/INT64 + if (!useLegacyDecimalEncoding) { + if (decimalType.getPrecision() <= 9) { + return Types.primitive(PrimitiveType.PrimitiveTypeName.INT32, repetition) + .as(decimalType(decimalType.getScale(), decimalType.getPrecision())) + .named(name); + } + if (decimalType.isShort()) { + return Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, repetition) + .as(decimalType(decimalType.getScale(), decimalType.getPrecision())) + .named(name); + } + } + return Types.primitive(PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY, repetition) + .length(PRECISION_TO_BYTE_COUNT[decimalType.getPrecision()]) + .as(decimalType(decimalType.getScale(), decimalType.getPrecision())) + .named(name); + } + if (DATE.equals(type)) { + return Types.primitive(PrimitiveType.PrimitiveTypeName.INT32, repetition).as(LogicalTypeAnnotation.dateType()).named(name); + } + if (BIGINT.equals(type)) { + return Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, repetition) + .as(intType(64, true)) + .named(name); + } + + if (type instanceof TimestampType timestampType) { + // Apache Hive version 3.x or lower does not support reading timestamps encoded as INT64 + if (useInt96TimestampEncoding) { + return Types.primitive(PrimitiveType.PrimitiveTypeName.INT96, repetition).named(name); + } + + if (timestampType.getPrecision() <= 3) { + return Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, repetition).as(LogicalTypeAnnotation.timestampType(false, LogicalTypeAnnotation.TimeUnit.MILLIS)).named(name); + } + if (timestampType.getPrecision() <= 6) { + return Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, repetition).as(LogicalTypeAnnotation.timestampType(false, LogicalTypeAnnotation.TimeUnit.MICROS)).named(name); + } + if (timestampType.getPrecision() <= 9) { + // Per https://github.com/apache/parquet-format/blob/master/LogicalTypes.md, nanosecond precision timestamp should be stored as INT64 + // even though it can only hold values within 1677-09-21 00:12:43 and 2262-04-11 23:47:16 range. 
+ return Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, repetition).as(LogicalTypeAnnotation.timestampType(false, LogicalTypeAnnotation.TimeUnit.NANOS)).named(name); + } + } + if (DOUBLE.equals(type)) { + return Types.primitive(PrimitiveType.PrimitiveTypeName.DOUBLE, repetition).named(name); + } + if (RealType.REAL.equals(type)) { + return Types.primitive(PrimitiveType.PrimitiveTypeName.FLOAT, repetition).named(name); + } + if (type instanceof VarcharType || type instanceof CharType) { + return Types.primitive(PrimitiveType.PrimitiveTypeName.BINARY, repetition).as(LogicalTypeAnnotation.stringType()).named(name); + } + if (type instanceof VarbinaryType) { + return Types.primitive(PrimitiveType.PrimitiveTypeName.BINARY, repetition).named(name); + } + throw new TrinoException(NOT_SUPPORTED, format("Unsupported primitive type: %s", type)); + } + + private static org.apache.parquet.schema.Type getArrayType( + ArrayType type, + String name, + List parent, + Repetition repetition, + boolean useLegacyDecimalEncoding, + boolean useInt96TimestampEncoding, + BiConsumer, Type> primitiveTypesConsumer) + { + Type elementType = type.getElementType(); + return Types.list(repetition) + .element(convert(elementType, "element", ImmutableList.builder().addAll(parent).add(name).add("list").build(), OPTIONAL, useLegacyDecimalEncoding, useInt96TimestampEncoding, primitiveTypesConsumer)) + .named(name); + } + + private static org.apache.parquet.schema.Type getMapType( + MapType type, + String name, + List parent, + Repetition repetition, + boolean useLegacyDecimalEncoding, + boolean useInt96TimestampEncoding, + BiConsumer, Type> primitiveTypesConsumer) + { + parent = ImmutableList.builder().addAll(parent).add(name).add("key_value").build(); + Type keyType = type.getKeyType(); + Type valueType = type.getValueType(); + return Types.map(repetition) + .key(convert(keyType, "key", parent, REQUIRED, useLegacyDecimalEncoding, useInt96TimestampEncoding, primitiveTypesConsumer)) + .value(convert(valueType, "value", parent, OPTIONAL, useLegacyDecimalEncoding, useInt96TimestampEncoding, primitiveTypesConsumer)) + .named(name); + } + + private static org.apache.parquet.schema.Type getRowType( + RowType type, + String name, + List parent, + Repetition repetition, + boolean useLegacyDecimalEncoding, + boolean useInt96TimestampEncoding, + BiConsumer, Type> primitiveTypesConsumer) + { + parent = ImmutableList.builder().addAll(parent).add(name).build(); + Types.GroupBuilder builder = Types.buildGroup(repetition); + for (RowType.Field field : type.getFields()) { + checkArgument(field.getName().isPresent(), "field in struct type doesn't have name"); + builder.addField(convert(field.getType(), field.getName().get(), parent, OPTIONAL, useLegacyDecimalEncoding, useInt96TimestampEncoding, primitiveTypesConsumer)); + } + return builder.named(name); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/ParquetTypeConverter.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/ParquetTypeConverter.java new file mode 100644 index 000000000000..12a4631bc2f7 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/ParquetTypeConverter.java @@ -0,0 +1,37 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
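// A brief, assumed usage sketch of the schema converter defined above; the column names and types
// are made up for illustration.
import com.google.common.collect.ImmutableList;

import org.apache.parquet.schema.MessageType;

import static io.trino.spi.type.IntegerType.INTEGER;
import static io.trino.spi.type.VarcharType.VARCHAR;

class SchemaConverterSketch
{
    public static void main(String[] args)
    {
        ParquetSchemaConverter converter = new ParquetSchemaConverter(
                ImmutableList.of(INTEGER, VARCHAR),
                ImmutableList.of("id", "name"),
                false,  // useLegacyDecimalEncoding
                false); // useInt96TimestampEncoding
        MessageType schema = converter.getMessageType();
        // Prints a "trino_schema" message with an optional INT32 "id" and an optional BINARY (STRING) "name"
        System.out.println(schema);
    }
}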
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.writer; + +import org.apache.parquet.format.Type; +import org.apache.parquet.schema.PrimitiveType; + +// Copy from parquet-mr +public class ParquetTypeConverter +{ + private ParquetTypeConverter() {} + + public static org.apache.parquet.format.Type getType(PrimitiveType.PrimitiveTypeName type) + { + return switch (type) { + case INT64 -> Type.INT64; + case INT32 -> Type.INT32; + case BOOLEAN -> Type.BOOLEAN; + case BINARY -> Type.BYTE_ARRAY; + case FLOAT -> Type.FLOAT; + case DOUBLE -> Type.DOUBLE; + case INT96 -> Type.INT96; + case FIXED_LEN_BYTE_ARRAY -> Type.FIXED_LEN_BYTE_ARRAY; + }; + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/ParquetTypeVisitor.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/ParquetTypeVisitor.java new file mode 100644 index 000000000000..a33010d4eab2 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/ParquetTypeVisitor.java @@ -0,0 +1,155 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.parquet.writer; + +import com.google.common.collect.Lists; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; + +import java.util.LinkedList; +import java.util.List; + +import static com.google.common.base.Preconditions.checkArgument; +import static org.apache.parquet.schema.Type.Repetition.REPEATED; + +// Code from iceberg +public class ParquetTypeVisitor +{ + protected LinkedList fieldNames = new LinkedList<>(); + + public static T visit(Type type, ParquetTypeVisitor visitor) + { + if (type instanceof MessageType messageType) { + return visitor.message(messageType, visitFields(type.asGroupType(), visitor)); + } + if (type.isPrimitive()) { + return visitor.primitive(type.asPrimitiveType()); + } + // if not a primitive, the typeId must be a group + GroupType group = type.asGroupType(); + LogicalTypeAnnotation annotation = group.getLogicalTypeAnnotation(); + if (LogicalTypeAnnotation.listType().equals(annotation)) { + checkArgument(!group.isRepetition(REPEATED), + "Invalid list: top-level group is repeated: %s", group); + checkArgument(group.getFieldCount() == 1, + "Invalid list: does not contain single repeated field: %s", group); + + GroupType repeatedElement = group.getFields().get(0).asGroupType(); + checkArgument(repeatedElement.isRepetition(REPEATED), + "Invalid list: inner group is not repeated"); + checkArgument(repeatedElement.getFieldCount() <= 1, + "Invalid list: repeated group is not a single field: %s", group); + + visitor.fieldNames.push(repeatedElement.getName()); + try { + T elementResult = null; + if (repeatedElement.getFieldCount() > 0) { + elementResult = visitField(repeatedElement.getType(0), visitor); + } + + return visitor.list(group, elementResult); + } + finally { + visitor.fieldNames.pop(); + } + } + if (LogicalTypeAnnotation.mapType().equals(annotation)) { + checkArgument(!group.isRepetition(REPEATED), + "Invalid map: top-level group is repeated: %s", group); + checkArgument(group.getFieldCount() == 1, + "Invalid map: does not contain single repeated field: %s", group); + + GroupType repeatedKeyValue = group.getType(0).asGroupType(); + checkArgument(repeatedKeyValue.isRepetition(REPEATED), + "Invalid map: inner group is not repeated"); + checkArgument(repeatedKeyValue.getFieldCount() <= 2, + "Invalid map: repeated group does not have 2 fields"); + + visitor.fieldNames.push(repeatedKeyValue.getName()); + try { + T keyResult = null; + T valueResult = null; + if (repeatedKeyValue.getFieldCount() == 2) { + keyResult = visitField(repeatedKeyValue.getType(0), visitor); + valueResult = visitField(repeatedKeyValue.getType(1), visitor); + } + else if (repeatedKeyValue.getFieldCount() == 1) { + Type keyOrValue = repeatedKeyValue.getType(0); + if (keyOrValue.getName().equalsIgnoreCase("key")) { + keyResult = visitField(keyOrValue, visitor); + // value result remains null + } + else { + valueResult = visitField(keyOrValue, visitor); + // key result remains null + } + } + return visitor.map(group, keyResult, valueResult); + } + finally { + visitor.fieldNames.pop(); + } + } + return visitor.struct(group, visitFields(group, visitor)); + } + + private static T visitField(Type field, ParquetTypeVisitor visitor) + { + visitor.fieldNames.push(field.getName()); + try { + return visit(field, visitor); + } + finally { + visitor.fieldNames.pop(); + } + } + + private static List 
visitFields(GroupType group, ParquetTypeVisitor visitor) + { + List results = Lists.newArrayListWithExpectedSize(group.getFieldCount()); + for (Type field : group.getFields()) { + results.add(visitField(field, visitor)); + } + + return results; + } + + public T message(MessageType message, List fields) + { + return null; + } + + public T struct(GroupType struct, List fields) + { + return null; + } + + public T list(GroupType array, T element) + { + return null; + } + + public T map(GroupType map, T key, T value) + { + return null; + } + + public T primitive(PrimitiveType primitive) + { + return null; + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/ParquetWriter.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/ParquetWriter.java new file mode 100644 index 000000000000..0251d897b905 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/ParquetWriter.java @@ -0,0 +1,495 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.writer; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableList; +import io.airlift.slice.DynamicSliceOutput; +import io.airlift.slice.OutputStreamSliceOutput; +import io.airlift.slice.Slice; +import io.airlift.slice.Slices; +import io.trino.parquet.Column; +import io.trino.parquet.ParquetCorruptionException; +import io.trino.parquet.ParquetDataSource; +import io.trino.parquet.ParquetReaderOptions; +import io.trino.parquet.ParquetWriteValidation; +import io.trino.parquet.metadata.BlockMetadata; +import io.trino.parquet.metadata.FileMetadata; +import io.trino.parquet.metadata.ParquetMetadata; +import io.trino.parquet.reader.MetadataReader; +import io.trino.parquet.reader.ParquetReader; +import io.trino.parquet.reader.RowGroupInfo; +import io.trino.parquet.writer.ColumnWriter.BufferData; +import io.trino.spi.Page; +import io.trino.spi.connector.SourcePage; +import io.trino.spi.type.Type; +import jakarta.annotation.Nullable; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.values.bloomfilter.BloomFilter; +import org.apache.parquet.format.BloomFilterAlgorithm; +import org.apache.parquet.format.BloomFilterCompression; +import org.apache.parquet.format.BloomFilterHash; +import org.apache.parquet.format.BloomFilterHeader; +import org.apache.parquet.format.ColumnMetaData; +import org.apache.parquet.format.CompressionCodec; +import org.apache.parquet.format.FileMetaData; +import org.apache.parquet.format.KeyValue; +import org.apache.parquet.format.RowGroup; +import org.apache.parquet.format.SplitBlockAlgorithm; +import org.apache.parquet.format.Uncompressed; +import org.apache.parquet.format.Util; +import org.apache.parquet.format.XxHash; +import org.apache.parquet.io.MessageColumnIO; +import org.apache.parquet.schema.MessageType; +import org.joda.time.DateTimeZone; + +import java.io.Closeable; +import java.io.IOException; +import java.io.OutputStream; +import java.io.UncheckedIOException; 
+import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.OptionalInt; +import java.util.function.Consumer; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkState; +import static com.google.common.base.Throwables.throwIfUnchecked; +import static com.google.common.base.Verify.verify; +import static com.google.common.collect.ImmutableList.toImmutableList; +import static io.airlift.slice.SizeOf.SIZE_OF_INT; +import static io.airlift.slice.SizeOf.instanceSize; +import static io.airlift.slice.Slices.wrappedBuffer; +import static io.trino.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext; +import static io.trino.parquet.ParquetTypeUtils.constructField; +import static io.trino.parquet.ParquetTypeUtils.getColumnIO; +import static io.trino.parquet.ParquetTypeUtils.getDescriptors; +import static io.trino.parquet.ParquetTypeUtils.lookupColumnByName; +import static io.trino.parquet.ParquetWriteValidation.ParquetWriteValidationBuilder; +import static io.trino.parquet.metadata.PrunedBlockMetadata.createPrunedColumnsMetadata; +import static io.trino.parquet.writer.ParquetDataOutput.createDataOutput; +import static io.trino.spi.type.BigintType.BIGINT; +import static io.trino.spi.type.DoubleType.DOUBLE; +import static io.trino.spi.type.IntegerType.INTEGER; +import static io.trino.spi.type.RealType.REAL; +import static io.trino.spi.type.UuidType.UUID; +import static io.trino.spi.type.VarbinaryType.VARBINARY; +import static io.trino.spi.type.VarcharType.VARCHAR; +import static java.lang.Math.max; +import static java.lang.Math.min; +import static java.nio.charset.StandardCharsets.US_ASCII; +import static java.util.Objects.requireNonNull; + +public class ParquetWriter + implements Closeable +{ + private static final int INSTANCE_SIZE = instanceSize(ParquetWriter.class); + public static final List SUPPORTED_BLOOM_FILTER_TYPES = ImmutableList.of(BIGINT, DOUBLE, INTEGER, REAL, UUID, VARBINARY, VARCHAR); + + private final OutputStreamSliceOutput outputStream; + private final ParquetWriterOptions writerOption; + private final MessageType messageType; + private final int chunkMaxBytes; + private final Map, Type> primitiveTypes; + private final CompressionCodec compressionCodec; + private final Optional parquetTimeZone; + private final FileFooter fileFooter; + private final ImmutableList.Builder>> bloomFilterGroups = ImmutableList.builder(); + private final Optional validationBuilder; + + private List columnWriters; + private int rows; + private long bufferedBytes; + private boolean closed; + private boolean writeHeader; + @Nullable + private FileMetaData fileMetaData; + + public static final Slice MAGIC = wrappedBuffer("PAR1".getBytes(US_ASCII)); + + public ParquetWriter( + OutputStream outputStream, + MessageType messageType, + Map, Type> primitiveTypes, + ParquetWriterOptions writerOption, + CompressionCodec compressionCodec, + String trinoVersion, + Optional parquetTimeZone, + Optional validationBuilder) + { + this.validationBuilder = requireNonNull(validationBuilder, "validationBuilder is null"); + this.outputStream = new OutputStreamSliceOutput(requireNonNull(outputStream, "outputStream is null")); + this.messageType = requireNonNull(messageType, "messageType is null"); + this.primitiveTypes = requireNonNull(primitiveTypes, "primitiveTypes is null"); + this.writerOption = requireNonNull(writerOption, "writerOption is null"); + this.compressionCodec = 
requireNonNull(compressionCodec, "compressionCodec is null"); + this.parquetTimeZone = requireNonNull(parquetTimeZone, "parquetTimeZone is null"); + String createdBy = formatCreatedBy(requireNonNull(trinoVersion, "trinoVersion is null")); + this.fileFooter = new FileFooter(messageType, createdBy, parquetTimeZone); + + recordValidation(validation -> validation.setTimeZone(parquetTimeZone.map(DateTimeZone::getID))); + recordValidation(validation -> validation.setColumns(messageType.getColumns())); + recordValidation(validation -> validation.setCreatedBy(createdBy)); + initColumnWriters(); + this.chunkMaxBytes = max(1, writerOption.getMaxRowGroupSize() / 2); + } + + public long getWrittenBytes() + { + return outputStream.longSize(); + } + + public long getBufferedBytes() + { + return bufferedBytes; + } + + public long getRetainedBytes() + { + return INSTANCE_SIZE + + outputStream.getRetainedSize() + + columnWriters.stream().mapToLong(ColumnWriter::getRetainedBytes).sum() + + validationBuilder.map(ParquetWriteValidationBuilder::getRetainedSize).orElse(0L); + } + + public void write(Page page) + throws IOException + { + requireNonNull(page, "page is null"); + checkState(!closed, "writer is closed"); + if (page.getPositionCount() == 0) { + return; + } + + checkArgument(page.getChannelCount() == columnWriters.size()); + + recordValidation(validation -> validation.addPage(page)); + + int writeOffset = 0; + while (writeOffset < page.getPositionCount()) { + Page chunk = page.getRegion(writeOffset, min(page.getPositionCount() - writeOffset, writerOption.getBatchSize())); + + // avoid chunk with huge logical size + while (chunk.getPositionCount() > 1 && chunk.getSizeInBytes() > chunkMaxBytes) { + chunk = page.getRegion(writeOffset, chunk.getPositionCount() / 2); + } + + writeOffset += chunk.getPositionCount(); + writeChunk(chunk); + } + } + + private void writeChunk(Page page) + throws IOException + { + bufferedBytes = 0; + for (int channel = 0; channel < page.getChannelCount(); channel++) { + ColumnWriter writer = columnWriters.get(channel); + writer.writeBlock(new ColumnChunk(page.getBlock(channel))); + bufferedBytes += writer.getBufferedBytes(); + } + rows += page.getPositionCount(); + + if (bufferedBytes >= writerOption.getMaxRowGroupSize()) { + columnWriters.forEach(ColumnWriter::close); + flush(); + initColumnWriters(); + rows = 0; + bufferedBytes = columnWriters.stream().mapToLong(ColumnWriter::getBufferedBytes).sum(); + } + } + + @Override + public void close() + throws IOException + { + if (closed) { + return; + } + closed = true; + + try (outputStream) { + columnWriters.forEach(ColumnWriter::close); + flush(); + columnWriters = ImmutableList.of(); + fileMetaData = fileFooter.createFileMetadata(); + writeBloomFilters(fileMetaData.getRow_groups(), bloomFilterGroups.build()); + writeFooter(); + } + bufferedBytes = 0; + } + + public void validate(ParquetDataSource input) + throws ParquetCorruptionException + { + checkState(validationBuilder.isPresent(), "validation is not enabled"); + ParquetWriteValidation writeValidation = validationBuilder.get().build(); + try { + ParquetMetadata parquetMetadata = MetadataReader.readFooter(input, Optional.empty(), Optional.of(writeValidation)); + try (ParquetReader parquetReader = createParquetReader(input, parquetMetadata, writeValidation)) { + for (SourcePage page = parquetReader.nextPage(); page != null; page = parquetReader.nextPage()) { + // fully load the page + page.getPage(); + } + } + } + catch (IOException e) { + if (e instanceof 
ParquetCorruptionException pce) { + throw pce; + } + throw new ParquetCorruptionException(input.getId(), "Validation failed with exception %s", e); + } + } + + public FileMetaData getFileMetaData() + { + checkState(closed, "fileMetaData is available only after writer is closed"); + return requireNonNull(fileMetaData, "fileMetaData is null"); + } + + private ParquetReader createParquetReader(ParquetDataSource input, ParquetMetadata parquetMetadata, ParquetWriteValidation writeValidation) + throws IOException + { + FileMetadata fileMetaData = parquetMetadata.getFileMetaData(); + MessageColumnIO messageColumnIO = getColumnIO(fileMetaData.getSchema(), fileMetaData.getSchema()); + ImmutableList.Builder columnFields = ImmutableList.builder(); + for (int i = 0; i < writeValidation.getTypes().size(); i++) { + columnFields.add(new Column( + messageColumnIO.getName(), + constructField( + writeValidation.getTypes().get(i), + lookupColumnByName(messageColumnIO, writeValidation.getColumnNames().get(i))) + .orElseThrow())); + } + Map, ColumnDescriptor> descriptorsByPath = getDescriptors(fileMetaData.getSchema(), fileMetaData.getSchema()); + long nextStart = 0; + ImmutableList.Builder rowGroupInfoBuilder = ImmutableList.builder(); + for (BlockMetadata block : parquetMetadata.getBlocks()) { + rowGroupInfoBuilder.add(new RowGroupInfo(createPrunedColumnsMetadata(block, input.getId(), descriptorsByPath), nextStart, Optional.empty())); + nextStart += block.rowCount(); + } + return new ParquetReader( + Optional.ofNullable(fileMetaData.getCreatedBy()), + columnFields.build(), + false, + rowGroupInfoBuilder.build(), + input, + parquetTimeZone.orElseThrow(), + newSimpleAggregatedMemoryContext(), + ParquetReaderOptions.defaultOptions(), + exception -> { + throwIfUnchecked(exception); + return new RuntimeException(exception); + }, + Optional.empty(), + Optional.of(writeValidation)); + } + + private void recordValidation(Consumer task) + { + validationBuilder.ifPresent(task); + } + + // Parquet File Layout: + // + // MAGIC + // variable: Data + // variable: Metadata + // 4 bytes: MetadataLength + // MAGIC + private void flush() + throws IOException + { + // write header + if (!writeHeader) { + createDataOutput(MAGIC).writeData(outputStream); + writeHeader = true; + } + + // get all data in buffer + ImmutableList.Builder builder = ImmutableList.builder(); + for (ColumnWriter columnWriter : columnWriters) { + columnWriter.getBuffer().forEach(builder::add); + } + List bufferDataList = builder.build(); + + if (rows == 0) { + // Avoid writing empty row groups as these are ignored by the reader + verify( + bufferDataList.stream() + .flatMap(bufferData -> bufferData.getData().stream()) + .allMatch(dataOutput -> dataOutput.size() == 0), + "Buffer should be empty when there are no rows"); + return; + } + + // update stats + long currentOffset = outputStream.longSize(); + ImmutableList.Builder columnMetaDataBuilder = ImmutableList.builder(); + for (BufferData bufferData : bufferDataList) { + ColumnMetaData columnMetaData = bufferData.getMetaData(); + OptionalInt dictionaryPageSize = bufferData.getDictionaryPageSize(); + if (dictionaryPageSize.isPresent()) { + columnMetaData.setDictionary_page_offset(currentOffset); + } + columnMetaData.setData_page_offset(currentOffset + dictionaryPageSize.orElse(0)); + columnMetaDataBuilder.add(columnMetaData); + currentOffset += columnMetaData.getTotal_compressed_size(); + } + updateRowGroups(columnMetaDataBuilder.build(), outputStream.longSize()); + + // flush pages + for (BufferData 
bufferData : bufferDataList) { + bufferData.getData() + .forEach(data -> data.writeData(outputStream)); + } + + bloomFilterGroups.add(bufferDataList.stream().map(BufferData::getBloomFilter).collect(toImmutableList())); + } + + private void writeFooter() + throws IOException + { + checkState(closed); + Slice footer = serializeFooter(fileMetaData); + recordValidation(validation -> validation.setRowGroups(fileMetaData.getRow_groups())); + createDataOutput(footer).writeData(outputStream); + + Slice footerSize = Slices.allocate(SIZE_OF_INT); + footerSize.setInt(0, footer.length()); + createDataOutput(footerSize).writeData(outputStream); + + createDataOutput(MAGIC).writeData(outputStream); + } + + private void writeBloomFilters(List rowGroups, List>> rowGroupBloomFilters) + { + checkArgument(rowGroups.size() == rowGroupBloomFilters.size(), "Row groups size %s should match row group Bloom filter size %s", rowGroups.size(), rowGroupBloomFilters.size()); + for (int group = 0; group < rowGroups.size(); group++) { + List columns = rowGroups.get(group).getColumns(); + List> bloomFilters = rowGroupBloomFilters.get(group); + for (int i = 0; i < columns.size(); i++) { + if (bloomFilters.get(i).isEmpty()) { + continue; + } + + BloomFilter bloomFilter = bloomFilters.get(i).orElseThrow(); + long bloomFilterOffset = outputStream.longSize(); + try { + Util.writeBloomFilterHeader( + new BloomFilterHeader( + bloomFilter.getBitsetSize(), + BloomFilterAlgorithm.BLOCK(new SplitBlockAlgorithm()), + BloomFilterHash.XXHASH(new XxHash()), + BloomFilterCompression.UNCOMPRESSED(new Uncompressed())), + outputStream, + null, + null); + bloomFilter.writeTo(outputStream); + columns.get(i).getMeta_data().setBloom_filter_offset(bloomFilterOffset); + } + catch (IOException e) { + throw new UncheckedIOException(e); + } + } + } + } + + private void updateRowGroups(List columnMetaData, long fileOffset) + { + long totalCompressedBytes = columnMetaData.stream().mapToLong(ColumnMetaData::getTotal_compressed_size).sum(); + long totalBytes = columnMetaData.stream().mapToLong(ColumnMetaData::getTotal_uncompressed_size).sum(); + List columnChunks = columnMetaData.stream().map(ParquetWriter::toColumnChunk).collect(toImmutableList()); + fileFooter.addRowGroup(new RowGroup(columnChunks, totalBytes, rows) + .setTotal_compressed_size(totalCompressedBytes) + .setFile_offset(fileOffset)); + } + + private static Slice serializeFooter(FileMetaData fileMetaData) + throws IOException + { + DynamicSliceOutput dynamicSliceOutput = new DynamicSliceOutput(40); + Util.writeFileMetaData(fileMetaData, dynamicSliceOutput); + return dynamicSliceOutput.slice(); + } + + private static org.apache.parquet.format.ColumnChunk toColumnChunk(ColumnMetaData metaData) + { + // TODO Not sure whether file_offset is used + org.apache.parquet.format.ColumnChunk columnChunk = new org.apache.parquet.format.ColumnChunk(0); + columnChunk.setMeta_data(metaData); + return columnChunk; + } + + @VisibleForTesting + static String formatCreatedBy(String trinoVersion) + { + // Add "(build n/a)" suffix to satisfy Parquet's VersionParser expectations + // Apache Hive will skip timezone conversion if createdBy does not start with parquet-mr + // https://github.com/apache/hive/blob/67ef629486ba38b1d3e0f400bee0073fa3c4e989/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java#L154 + return "parquet-mr-trino version " + trinoVersion + " (build n/a)"; + } + + private void initColumnWriters() + { + this.columnWriters = ParquetWriters.getColumnWriters( + 
messageType, + primitiveTypes, + compressionCodec, + writerOption, + parquetTimeZone); + } + + private static class FileFooter + { + private final MessageType messageType; + private final String createdBy; + private final Optional parquetTimeZone; + + @Nullable + private ImmutableList.Builder rowGroupBuilder = ImmutableList.builder(); + + private FileFooter(MessageType messageType, String createdBy, Optional parquetTimeZone) + { + this.messageType = messageType; + this.createdBy = createdBy; + this.parquetTimeZone = parquetTimeZone; + } + + public void addRowGroup(RowGroup rowGroup) + { + checkState(rowGroupBuilder != null, "rowGroupBuilder is null"); + rowGroupBuilder.add(rowGroup); + } + + public FileMetaData createFileMetadata() + { + checkState(rowGroupBuilder != null, "rowGroupBuilder is null"); + List rowGroups = rowGroupBuilder.build(); + rowGroupBuilder = null; + long totalRows = rowGroups.stream().mapToLong(RowGroup::getNum_rows).sum(); + FileMetaData fileMetaData = new FileMetaData( + 1, + MessageTypeConverter.toParquetSchema(messageType), + totalRows, + ImmutableList.copyOf(rowGroups)); + fileMetaData.setCreated_by(createdBy); + // Added based on org.apache.hadoop.hive.ql.io.parquet.write.DataWritableWriteSupport + parquetTimeZone.ifPresent(dateTimeZone -> fileMetaData.setKey_value_metadata( + ImmutableList.of(new KeyValue("writer.time.zone").setValue(dateTimeZone.getID())))); + return fileMetaData; + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/ParquetWriterOptions.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/ParquetWriterOptions.java new file mode 100644 index 000000000000..022051d015e3 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/ParquetWriterOptions.java @@ -0,0 +1,169 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.parquet.writer; + +import com.google.common.collect.ImmutableSet; +import com.google.common.primitives.Ints; +import io.airlift.units.DataSize; +import org.apache.parquet.column.ParquetProperties; + +import java.util.Set; + +import static com.google.common.base.Preconditions.checkArgument; +import static io.airlift.units.DataSize.Unit.MEGABYTE; + +public class ParquetWriterOptions +{ + private static final DataSize DEFAULT_MAX_ROW_GROUP_SIZE = DataSize.of(128, MEGABYTE); + private static final DataSize DEFAULT_MAX_PAGE_SIZE = DataSize.ofBytes(ParquetProperties.DEFAULT_PAGE_SIZE); + // org.apache.parquet.column.DEFAULT_PAGE_ROW_COUNT_LIMIT is 20_000 to improve selectivity of page indexes + // This value should be revisited when TODO https://github.com/trinodb/trino/issues/9359 is implemented + public static final int DEFAULT_MAX_PAGE_VALUE_COUNT = 60_000; + public static final int DEFAULT_BATCH_SIZE = 10_000; + public static final DataSize DEFAULT_MAX_BLOOM_FILTER_SIZE = DataSize.of(1, MEGABYTE); + public static final double DEFAULT_BLOOM_FILTER_FPP = 0.05; + + public static ParquetWriterOptions.Builder builder() + { + return new ParquetWriterOptions.Builder(); + } + + private final int maxRowGroupSize; + private final int maxPageSize; + private final int maxPageValueCount; + private final int batchSize; + private final int maxBloomFilterSize; + private final double bloomFilterFpp; + // Set of column dot paths to columns with bloom filters + private final Set bloomFilterColumns; + + private ParquetWriterOptions( + DataSize maxBlockSize, + DataSize maxPageSize, + int maxPageValueCount, + int batchSize, + DataSize maxBloomFilterSize, + double bloomFilterFpp, + Set bloomFilterColumns) + { + this.maxRowGroupSize = Ints.saturatedCast(maxBlockSize.toBytes()); + this.maxPageSize = Ints.saturatedCast(maxPageSize.toBytes()); + this.maxPageValueCount = maxPageValueCount; + this.batchSize = batchSize; + this.maxBloomFilterSize = Ints.saturatedCast(maxBloomFilterSize.toBytes()); + this.bloomFilterFpp = bloomFilterFpp; + this.bloomFilterColumns = ImmutableSet.copyOf(bloomFilterColumns); + checkArgument(this.bloomFilterFpp > 0.0 && this.bloomFilterFpp < 1.0, "bloomFilterFpp should be > 0.0 & < 1.0"); + } + + public int getMaxRowGroupSize() + { + return maxRowGroupSize; + } + + public int getMaxPageSize() + { + return maxPageSize; + } + + public int getMaxPageValueCount() + { + return maxPageValueCount; + } + + public int getBatchSize() + { + return batchSize; + } + + public int getMaxBloomFilterSize() + { + return maxBloomFilterSize; + } + + public Set getBloomFilterColumns() + { + return bloomFilterColumns; + } + + public double getBLoomFilterFpp() + { + return bloomFilterFpp; + } + + public static class Builder + { + private DataSize maxBlockSize = DEFAULT_MAX_ROW_GROUP_SIZE; + private DataSize maxPageSize = DEFAULT_MAX_PAGE_SIZE; + private int maxPageValueCount = DEFAULT_MAX_PAGE_VALUE_COUNT; + private int batchSize = DEFAULT_BATCH_SIZE; + private DataSize maxBloomFilterSize = DEFAULT_MAX_BLOOM_FILTER_SIZE; + private Set bloomFilterColumns = ImmutableSet.of(); + private double bloomFilterFpp = DEFAULT_BLOOM_FILTER_FPP; + + public Builder setMaxBlockSize(DataSize maxBlockSize) + { + this.maxBlockSize = maxBlockSize; + return this; + } + + public Builder setMaxPageSize(DataSize maxPageSize) + { + this.maxPageSize = maxPageSize; + return this; + } + + public Builder setMaxPageValueCount(int maxPageValueCount) + { + this.maxPageValueCount = maxPageValueCount; + return this; + } + + 
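        // The setters in this builder return this so callers can chain them and finish with build().
        // A sketch of a possible configuration (values and the "id" column dot path are illustrative
        // only, not taken from this change; defaults declared at the top of the class apply otherwise):
        //   ParquetWriterOptions.builder()
        //           .setMaxBlockSize(DataSize.of(64, MEGABYTE))
        //           .setMaxPageValueCount(20_000)
        //           .setBloomFilterColumns(ImmutableSet.of("id"))
        //           .build();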
public Builder setBatchSize(int batchSize) + { + this.batchSize = batchSize; + return this; + } + + public Builder setMaxBloomFilterSize(DataSize maxBloomFilterSize) + { + this.maxBloomFilterSize = maxBloomFilterSize; + return this; + } + + public Builder setBloomFilterFpp(double bloomFilterFpp) + { + this.bloomFilterFpp = bloomFilterFpp; + return this; + } + + public Builder setBloomFilterColumns(Set columns) + { + this.bloomFilterColumns = columns; + return this; + } + + public ParquetWriterOptions build() + { + return new ParquetWriterOptions( + maxBlockSize, + maxPageSize, + maxPageValueCount, + batchSize, + maxBloomFilterSize, + bloomFilterFpp, + bloomFilterColumns); + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/ParquetWriters.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/ParquetWriters.java new file mode 100644 index 000000000000..556ea467b670 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/ParquetWriters.java @@ -0,0 +1,330 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.writer; + +import com.google.common.base.Joiner; +import com.google.common.collect.ImmutableList; +import io.trino.parquet.writer.valuewriter.BigintValueWriter; +import io.trino.parquet.writer.valuewriter.BinaryValueWriter; +import io.trino.parquet.writer.valuewriter.BooleanValueWriter; +import io.trino.parquet.writer.valuewriter.DateValueWriter; +import io.trino.parquet.writer.valuewriter.DoubleValueWriter; +import io.trino.parquet.writer.valuewriter.FixedLenByteArrayLongDecimalValueWriter; +import io.trino.parquet.writer.valuewriter.FixedLenByteArrayShortDecimalValueWriter; +import io.trino.parquet.writer.valuewriter.Int32ShortDecimalValueWriter; +import io.trino.parquet.writer.valuewriter.Int64ShortDecimalValueWriter; +import io.trino.parquet.writer.valuewriter.Int96TimestampValueWriter; +import io.trino.parquet.writer.valuewriter.IntegerValueWriter; +import io.trino.parquet.writer.valuewriter.PrimitiveValueWriter; +import io.trino.parquet.writer.valuewriter.RealValueWriter; +import io.trino.parquet.writer.valuewriter.TimeMicrosValueWriter; +import io.trino.parquet.writer.valuewriter.TimestampMillisValueWriter; +import io.trino.parquet.writer.valuewriter.TimestampNanosValueWriter; +import io.trino.parquet.writer.valuewriter.TimestampTzMicrosValueWriter; +import io.trino.parquet.writer.valuewriter.TimestampTzMillisValueWriter; +import io.trino.parquet.writer.valuewriter.TrinoValuesWriterFactory; +import io.trino.parquet.writer.valuewriter.UuidValueWriter; +import io.trino.spi.TrinoException; +import io.trino.spi.type.CharType; +import io.trino.spi.type.DecimalType; +import io.trino.spi.type.TimestampType; +import io.trino.spi.type.Type; +import io.trino.spi.type.UuidType; +import io.trino.spi.type.VarbinaryType; +import io.trino.spi.type.VarcharType; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.values.ValuesWriter; +import 
org.apache.parquet.column.values.bloomfilter.BlockSplitBloomFilter; +import org.apache.parquet.column.values.bloomfilter.BloomFilter; +import org.apache.parquet.format.CompressionCodec; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.LogicalTypeAnnotation.TimeLogicalTypeAnnotation; +import org.apache.parquet.schema.LogicalTypeAnnotation.TimestampLogicalTypeAnnotation; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType; +import org.joda.time.DateTimeZone; + +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.function.Predicate; + +import static com.google.common.base.Preconditions.checkArgument; +import static io.trino.parquet.writer.ParquetWriter.SUPPORTED_BLOOM_FILTER_TYPES; +import static io.trino.parquet.writer.valuewriter.ColumnDescriptorValuesWriter.newDefinitionLevelWriter; +import static io.trino.parquet.writer.valuewriter.ColumnDescriptorValuesWriter.newRepetitionLevelWriter; +import static io.trino.spi.StandardErrorCode.NOT_SUPPORTED; +import static io.trino.spi.type.BigintType.BIGINT; +import static io.trino.spi.type.BooleanType.BOOLEAN; +import static io.trino.spi.type.DateType.DATE; +import static io.trino.spi.type.DoubleType.DOUBLE; +import static io.trino.spi.type.IntegerType.INTEGER; +import static io.trino.spi.type.RealType.REAL; +import static io.trino.spi.type.SmallintType.SMALLINT; +import static io.trino.spi.type.TimeType.TIME_MICROS; +import static io.trino.spi.type.TimestampType.TIMESTAMP_MICROS; +import static io.trino.spi.type.TimestampType.TIMESTAMP_MILLIS; +import static io.trino.spi.type.TimestampType.TIMESTAMP_NANOS; +import static io.trino.spi.type.TimestampWithTimeZoneType.TIMESTAMP_TZ_MICROS; +import static io.trino.spi.type.TimestampWithTimeZoneType.TIMESTAMP_TZ_MILLIS; +import static io.trino.spi.type.TinyintType.TINYINT; +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64; + +final class ParquetWriters +{ + private static final int DEFAULT_DICTIONARY_PAGE_SIZE = 1024 * 1024; + static final int BLOOM_FILTER_EXPECTED_ENTRIES = 100_000; + + private ParquetWriters() {} + + static PrimitiveValueWriter getValueWriter(ValuesWriter valuesWriter, Type type, PrimitiveType parquetType, Optional parquetTimeZone) + { + if (BOOLEAN.equals(type)) { + return new BooleanValueWriter(valuesWriter, parquetType); + } + if (INTEGER.equals(type) || SMALLINT.equals(type) || TINYINT.equals(type)) { + return new IntegerValueWriter(valuesWriter, type, parquetType); + } + if (BIGINT.equals(type)) { + return new BigintValueWriter(valuesWriter, type, parquetType); + } + if (type instanceof DecimalType decimalType) { + if (parquetType.getPrimitiveTypeName() == INT32) { + return new Int32ShortDecimalValueWriter(valuesWriter, type, parquetType); + } + if (parquetType.getPrimitiveTypeName() == INT64) { + return new Int64ShortDecimalValueWriter(valuesWriter, type, parquetType); + } + if (decimalType.isShort()) { + return new FixedLenByteArrayShortDecimalValueWriter(valuesWriter, type, parquetType); + } + return new FixedLenByteArrayLongDecimalValueWriter(valuesWriter, type, parquetType); + } + if (DATE.equals(type)) { + return new DateValueWriter(valuesWriter, parquetType); + } 
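        // The temporal branches below check the Parquet logical type annotation (time unit and
        // isAdjustedToUTC=false, i.e. local semantics) against the Trino type before picking a writer;
        // INT96 timestamps additionally require parquetTimeZone to be present.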
+ if (TIME_MICROS.equals(type)) { + verifyParquetType(type, parquetType, TimeLogicalTypeAnnotation.class, isTime(LogicalTypeAnnotation.TimeUnit.MICROS)); + return new TimeMicrosValueWriter(valuesWriter, parquetType); + } + if (type instanceof TimestampType) { + if (parquetType.getPrimitiveTypeName().equals(PrimitiveType.PrimitiveTypeName.INT96)) { + checkArgument(parquetTimeZone.isPresent(), "parquetTimeZone must be provided for INT96 timestamps"); + return new Int96TimestampValueWriter(valuesWriter, type, parquetType, parquetTimeZone.get()); + } + if (TIMESTAMP_MILLIS.equals(type)) { + verifyParquetType(type, parquetType, TimestampLogicalTypeAnnotation.class, isTimestamp(LogicalTypeAnnotation.TimeUnit.MILLIS)); + return new TimestampMillisValueWriter(valuesWriter, type, parquetType); + } + if (TIMESTAMP_MICROS.equals(type)) { + verifyParquetType(type, parquetType, TimestampLogicalTypeAnnotation.class, isTimestamp(LogicalTypeAnnotation.TimeUnit.MICROS)); + return new BigintValueWriter(valuesWriter, type, parquetType); + } + if (TIMESTAMP_NANOS.equals(type)) { + verifyParquetType(type, parquetType, TimestampLogicalTypeAnnotation.class, isTimestamp(LogicalTypeAnnotation.TimeUnit.NANOS)); + return new TimestampNanosValueWriter(valuesWriter, type, parquetType); + } + } + + if (TIMESTAMP_TZ_MILLIS.equals(type)) { + return new TimestampTzMillisValueWriter(valuesWriter, parquetType); + } + if (TIMESTAMP_TZ_MICROS.equals(type)) { + return new TimestampTzMicrosValueWriter(valuesWriter, parquetType); + } + if (DOUBLE.equals(type)) { + return new DoubleValueWriter(valuesWriter, parquetType); + } + if (REAL.equals(type)) { + return new RealValueWriter(valuesWriter, parquetType); + } + if (type instanceof VarcharType || type instanceof CharType || type instanceof VarbinaryType) { + // Binary writer is suitable also for char data, as UTF-8 encoding is used on both sides. 
+ return new BinaryValueWriter(valuesWriter, type, parquetType); + } + if (type instanceof UuidType) { + return new UuidValueWriter(valuesWriter, parquetType); + } + throw new TrinoException(NOT_SUPPORTED, format("Unsupported type for Parquet writer: %s", type)); + } + + static List getColumnWriters( + MessageType messageType, + Map, Type> trinoTypes, + CompressionCodec compressionCodec, + ParquetWriterOptions writerOptions, + Optional parquetTimeZone) + { + TrinoValuesWriterFactory valuesWriterFactory = new TrinoValuesWriterFactory(writerOptions.getMaxPageSize(), DEFAULT_DICTIONARY_PAGE_SIZE); + WriteBuilder writeBuilder = new WriteBuilder( + messageType, + trinoTypes, + valuesWriterFactory, + compressionCodec, + writerOptions, + parquetTimeZone); + ParquetTypeVisitor.visit(messageType, writeBuilder); + return writeBuilder.build(); + } + + private static class WriteBuilder + extends ParquetTypeVisitor + { + private final MessageType type; + private final Map, Type> trinoTypes; + private final TrinoValuesWriterFactory valuesWriterFactory; + private final CompressionCodec compressionCodec; + private final int maxPageSize; + private final int pageValueCountLimit; + private final Set bloomFilterColumns; + private final Optional parquetTimeZone; + private final ImmutableList.Builder builder = ImmutableList.builder(); + private final int maxBloomFilterSize; + private final double bloomFilterFpp; + + WriteBuilder( + MessageType messageType, + Map, Type> trinoTypes, + TrinoValuesWriterFactory valuesWriterFactory, + CompressionCodec compressionCodec, + ParquetWriterOptions writerOptions, + Optional parquetTimeZone) + { + this.type = requireNonNull(messageType, "messageType is null"); + this.trinoTypes = requireNonNull(trinoTypes, "trinoTypes is null"); + this.valuesWriterFactory = requireNonNull(valuesWriterFactory, "valuesWriterFactory is null"); + this.compressionCodec = requireNonNull(compressionCodec, "compressionCodec is null"); + this.maxPageSize = writerOptions.getMaxPageSize(); + this.pageValueCountLimit = writerOptions.getMaxPageValueCount(); + this.maxBloomFilterSize = writerOptions.getMaxBloomFilterSize(); + this.bloomFilterColumns = requireNonNull(writerOptions.getBloomFilterColumns(), "bloomFilterColumns is null"); + this.bloomFilterFpp = writerOptions.getBLoomFilterFpp(); + this.parquetTimeZone = requireNonNull(parquetTimeZone, "parquetTimeZone is null"); + } + + List build() + { + return builder.build(); + } + + @Override + public ColumnWriter message(MessageType message, List fields) + { + builder.addAll(fields); + return super.message(message, fields); + } + + @Override + public ColumnWriter struct(GroupType struct, List fields) + { + String[] path = currentPath(); + int fieldDefinitionLevel = type.getMaxDefinitionLevel(path); + return new StructColumnWriter(ImmutableList.copyOf(fields), fieldDefinitionLevel); + } + + @Override + public ColumnWriter list(GroupType array, ColumnWriter element) + { + String[] path = currentPath(); + int fieldDefinitionLevel = type.getMaxDefinitionLevel(path); + int fieldRepetitionLevel = type.getMaxRepetitionLevel(path); + return new ArrayColumnWriter(element, fieldDefinitionLevel, fieldRepetitionLevel); + } + + @Override + public ColumnWriter map(GroupType map, ColumnWriter key, ColumnWriter value) + { + String[] path = currentPath(); + int fieldDefinitionLevel = type.getMaxDefinitionLevel(path); + int fieldRepetitionLevel = type.getMaxRepetitionLevel(path); + return new MapColumnWriter(key, value, fieldDefinitionLevel, fieldRepetitionLevel); + } + 
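        // primitive() below creates the leaf writers: it resolves the Trino type for the current dot path,
        // attaches a BlockSplitBloomFilter only when that path is listed in bloomFilterColumns and the type
        // is supported, and wires the value writer together with definition- and repetition-level writers.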
+ @Override + public ColumnWriter primitive(PrimitiveType primitive) + { + String[] path = currentPath(); + int fieldDefinitionLevel = type.getMaxDefinitionLevel(path); + int fieldRepetitionLevel = type.getMaxRepetitionLevel(path); + ColumnDescriptor columnDescriptor = new ColumnDescriptor(path, primitive, fieldRepetitionLevel, fieldDefinitionLevel); + Type trinoType = requireNonNull(trinoTypes.get(ImmutableList.copyOf(path)), "Trino type is null"); + Optional bloomFilter = createBloomFilter(bloomFilterColumns, maxBloomFilterSize, bloomFilterFpp, columnDescriptor, trinoType); + return new PrimitiveColumnWriter( + columnDescriptor, + getValueWriter(valuesWriterFactory.newValuesWriter(columnDescriptor, bloomFilter), trinoType, columnDescriptor.getPrimitiveType(), parquetTimeZone), + newDefinitionLevelWriter(columnDescriptor, maxPageSize), + newRepetitionLevelWriter(columnDescriptor, maxPageSize), + compressionCodec, + maxPageSize, + pageValueCountLimit, + bloomFilter); + } + + private String[] currentPath() + { + String[] path = new String[fieldNames.size()]; + if (!fieldNames.isEmpty()) { + Iterator iter = fieldNames.descendingIterator(); + for (int i = 0; iter.hasNext(); i += 1) { + path[i] = iter.next(); + } + } + return path; + } + + private static Optional createBloomFilter(Set bloomFilterColumns, int maxBloomFilterSize, double bloomFilterFpp, ColumnDescriptor columnDescriptor, Type colummType) + { + if (!SUPPORTED_BLOOM_FILTER_TYPES.contains(colummType)) { + return Optional.empty(); + } + // TODO: Enable use of AdaptiveBlockSplitBloomFilter once parquet-mr 1.14.0 is released + String dotPath = Joiner.on('.').join(columnDescriptor.getPath()); + if (bloomFilterColumns.contains(dotPath)) { + int optimalNumOfBits = BlockSplitBloomFilter.optimalNumOfBits(BLOOM_FILTER_EXPECTED_ENTRIES, bloomFilterFpp); + return Optional.of(new BlockSplitBloomFilter(optimalNumOfBits / 8, maxBloomFilterSize)); + } + return Optional.empty(); + } + } + + private static void verifyParquetType(Type type, PrimitiveType parquetType, Class annotationType, Predicate predicate) + { + checkArgument( + annotationType.isInstance(parquetType.getLogicalTypeAnnotation()) && + predicate.test(annotationType.cast(parquetType.getLogicalTypeAnnotation())), + "Wrong Parquet type '%s' for Trino type '%s'", parquetType, type); + } + + private static Predicate isTime(LogicalTypeAnnotation.TimeUnit precision) + { + requireNonNull(precision, "precision is null"); + return annotation -> annotation.getUnit() == precision && + // isAdjustedToUTC=false indicates Local semantics (timestamps not normalized to UTC) + !annotation.isAdjustedToUTC(); + } + + private static Predicate isTimestamp(LogicalTypeAnnotation.TimeUnit precision) + { + requireNonNull(precision, "precision is null"); + return annotation -> annotation.getUnit() == precision && + // isAdjustedToUTC=false indicates Local semantics (timestamps not normalized to UTC) + !annotation.isAdjustedToUTC(); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/PrimitiveColumnWriter.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/PrimitiveColumnWriter.java new file mode 100644 index 000000000000..6715be46ecec --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/PrimitiveColumnWriter.java @@ -0,0 +1,383 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.writer; + +import com.google.common.collect.ImmutableList; +import io.airlift.slice.Slices; +import io.trino.parquet.ParquetMetadataConverter; +import io.trino.parquet.writer.repdef.DefLevelWriterProvider; +import io.trino.parquet.writer.repdef.DefLevelWriterProviders; +import io.trino.parquet.writer.repdef.RepLevelWriterProvider; +import io.trino.parquet.writer.repdef.RepLevelWriterProviders; +import io.trino.parquet.writer.valuewriter.ColumnDescriptorValuesWriter; +import io.trino.parquet.writer.valuewriter.PrimitiveValueWriter; +import io.trino.plugin.base.io.ChunkedSliceOutput; +import jakarta.annotation.Nullable; +import org.apache.parquet.bytes.BytesInput; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.Encoding; +import org.apache.parquet.column.EncodingStats; +import org.apache.parquet.column.page.DictionaryPage; +import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.column.values.bloomfilter.BloomFilter; +import org.apache.parquet.format.ColumnMetaData; +import org.apache.parquet.format.CompressionCodec; +import org.apache.parquet.format.DataPageHeader; +import org.apache.parquet.format.DictionaryPageHeader; +import org.apache.parquet.format.PageEncodingStats; +import org.apache.parquet.format.PageHeader; +import org.apache.parquet.format.PageType; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.OptionalInt; +import java.util.Set; + +import static com.google.common.base.Preconditions.checkState; +import static com.google.common.collect.ImmutableList.toImmutableList; +import static io.airlift.slice.SizeOf.instanceSize; +import static io.trino.parquet.ParquetMetadataConverter.convertEncodingStats; +import static io.trino.parquet.ParquetMetadataConverter.getEncoding; +import static io.trino.parquet.writer.ParquetCompressor.getCompressor; +import static io.trino.parquet.writer.ParquetDataOutput.createDataOutput; +import static io.trino.parquet.writer.repdef.DefLevelWriterProvider.DefinitionLevelWriter; +import static io.trino.parquet.writer.repdef.DefLevelWriterProvider.getRootDefinitionLevelWriter; +import static io.trino.parquet.writer.repdef.RepLevelWriterProvider.RepetitionLevelWriter; +import static io.trino.parquet.writer.repdef.RepLevelWriterProvider.getRootRepetitionLevelWriter; +import static java.util.Objects.requireNonNull; +import static org.apache.parquet.format.Util.writePageHeader; + +public class PrimitiveColumnWriter + implements ColumnWriter +{ + private static final int INSTANCE_SIZE = instanceSize(PrimitiveColumnWriter.class); + private static final int MINIMUM_OUTPUT_BUFFER_CHUNK_SIZE = 8 * 1024; + private static final int MAXIMUM_OUTPUT_BUFFER_CHUNK_SIZE = 2 * 1024 * 1024; + // ParquetMetadataConverter.MAX_STATS_SIZE is 4096, we need a value which would guarantee that min and max + // don't add up to 4096 (so less than 2048). Using 1K as that is big enough for most use cases. 
+ private static final int MAX_STATISTICS_LENGTH_IN_BYTES = 1024; + + private final ColumnDescriptor columnDescriptor; + private final CompressionCodec compressionCodec; + + private final PrimitiveValueWriter primitiveValueWriter; + private final ColumnDescriptorValuesWriter definitionLevelWriter; + private final ColumnDescriptorValuesWriter repetitionLevelWriter; + + private boolean closed; + private boolean getDataStreamsCalled; + + // current page stats + private int valueCount; + private int currentPageNullCounts; + + // column meta data stats + private final Set encodings = new HashSet<>(); + private final Map dataPagesWithEncoding = new HashMap<>(); + private final Map dictionaryPagesWithEncoding = new HashMap<>(); + private final Statistics columnStatistics; + private final Optional bloomFilter; + private long totalCompressedSize; + private long totalUnCompressedSize; + private long totalValues; + + private final int maxDefinitionLevel; + + private final ChunkedSliceOutput compressedOutputStream; + + @Nullable + private final ParquetCompressor compressor; + + private final int pageSizeThreshold; + private final int pageValueCountLimit; + + // Total size of compressed parquet pages and the current uncompressed page buffered in memory + // Used by ParquetWriter to decide when a row group is big enough to flush + private long bufferedBytes; + private long pageBufferedBytes; + + public PrimitiveColumnWriter( + ColumnDescriptor columnDescriptor, + PrimitiveValueWriter primitiveValueWriter, + ColumnDescriptorValuesWriter definitionLevelWriter, + ColumnDescriptorValuesWriter repetitionLevelWriter, + CompressionCodec compressionCodec, + int pageSizeThreshold, + int pageValueCountLimit, + Optional bloomFilter) + { + this.columnDescriptor = requireNonNull(columnDescriptor, "columnDescriptor is null"); + this.maxDefinitionLevel = columnDescriptor.getMaxDefinitionLevel(); + this.definitionLevelWriter = requireNonNull(definitionLevelWriter, "definitionLevelWriter is null"); + this.repetitionLevelWriter = requireNonNull(repetitionLevelWriter, "repetitionLevelWriter is null"); + this.primitiveValueWriter = requireNonNull(primitiveValueWriter, "primitiveValueWriter is null"); + this.compressionCodec = requireNonNull(compressionCodec, "compressionCodec is null"); + this.compressor = getCompressor(compressionCodec); + this.pageSizeThreshold = pageSizeThreshold; + this.pageValueCountLimit = pageValueCountLimit; + this.columnStatistics = Statistics.createStats(columnDescriptor.getPrimitiveType()); + this.compressedOutputStream = new ChunkedSliceOutput(MINIMUM_OUTPUT_BUFFER_CHUNK_SIZE, MAXIMUM_OUTPUT_BUFFER_CHUNK_SIZE); + this.bloomFilter = requireNonNull(bloomFilter, "bloomFilter is null"); + } + + @Override + public void writeBlock(ColumnChunk columnChunk) + throws IOException + { + checkState(!closed); + // write values + primitiveValueWriter.write(columnChunk.getBlock()); + + List defLevelWriterProviders = ImmutableList.builder() + .addAll(columnChunk.getDefLevelWriterProviders()) + .add(DefLevelWriterProviders.of(columnChunk.getBlock(), maxDefinitionLevel)) + .build(); + DefinitionLevelWriter rootDefinitionLevelWriter = getRootDefinitionLevelWriter(defLevelWriterProviders, definitionLevelWriter); + + DefLevelWriterProvider.ValuesCount valuesCount = rootDefinitionLevelWriter.writeDefinitionLevels(); + currentPageNullCounts += valuesCount.totalValuesCount() - valuesCount.maxDefinitionLevelValuesCount(); + valueCount += valuesCount.totalValuesCount(); + + if (columnDescriptor.getMaxRepetitionLevel() > 
0) { + // write repetition levels for nested types + List repLevelWriterProviders = ImmutableList.builder() + .addAll(columnChunk.getRepLevelWriterProviders()) + .add(RepLevelWriterProviders.of(columnChunk.getBlock())) + .build(); + RepetitionLevelWriter rootRepetitionLevelWriter = getRootRepetitionLevelWriter(repLevelWriterProviders, repetitionLevelWriter); + rootRepetitionLevelWriter.writeRepetitionLevels(0); + } + + long currentPageBufferedBytes = getCurrentPageBufferedBytes(); + if (valueCount >= pageValueCountLimit || currentPageBufferedBytes >= pageSizeThreshold) { + flushCurrentPageToBuffer(); + } + else { + updateBufferedBytes(currentPageBufferedBytes); + } + } + + @Override + public void close() + { + closed = true; + } + + @Override + public List getBuffer() + throws IOException + { + checkState(closed); + DataStreams dataStreams = getDataStreams(); + ColumnMetaData columnMetaData = getColumnMetaData(); + + EncodingStats stats = convertEncodingStats(columnMetaData.getEncoding_stats()); + boolean isOnlyDictionaryEncodingPages = stats.hasDictionaryPages() && !stats.hasNonDictionaryEncodedPages(); + + return ImmutableList.of(new BufferData( + dataStreams.data(), + dataStreams.dictionaryPageSize(), + isOnlyDictionaryEncodingPages ? Optional.empty() : dataStreams.bloomFilter(), + columnMetaData)); + } + + // Returns ColumnMetaData that offset is invalid + private ColumnMetaData getColumnMetaData() + { + checkState(getDataStreamsCalled); + + ColumnMetaData columnMetaData = new ColumnMetaData( + ParquetTypeConverter.getType(columnDescriptor.getPrimitiveType().getPrimitiveTypeName()), + encodings.stream().map(ParquetMetadataConverter::getEncoding).collect(toImmutableList()), + ImmutableList.copyOf(columnDescriptor.getPath()), + compressionCodec, + totalValues, + totalUnCompressedSize, + totalCompressedSize, + -1); + columnMetaData.setStatistics(ParquetMetadataConverter.toParquetStatistics(columnStatistics, MAX_STATISTICS_LENGTH_IN_BYTES)); + ImmutableList.Builder pageEncodingStats = ImmutableList.builder(); + dataPagesWithEncoding.entrySet().stream() + .map(encodingAndCount -> new PageEncodingStats(PageType.DATA_PAGE, encodingAndCount.getKey(), encodingAndCount.getValue())) + .forEach(pageEncodingStats::add); + dictionaryPagesWithEncoding.entrySet().stream() + .map(encodingAndCount -> new PageEncodingStats(PageType.DICTIONARY_PAGE, encodingAndCount.getKey(), encodingAndCount.getValue())) + .forEach(pageEncodingStats::add); + columnMetaData.setEncoding_stats(pageEncodingStats.build()); + return columnMetaData; + } + + // page header + // repetition levels + // definition levels + // data + private void flushCurrentPageToBuffer() + throws IOException + { + byte[] pageDataBytes = BytesInput.concat( + repetitionLevelWriter.getBytes(), + definitionLevelWriter.getBytes(), + primitiveValueWriter.getBytes()) + .toByteArray(); + int uncompressedSize = pageDataBytes.length; + ParquetDataOutput pageData = (compressor != null) + ? 
compressor.compress(pageDataBytes) + : createDataOutput(Slices.wrappedBuffer(pageDataBytes)); + int compressedSize = pageData.size(); + + Statistics statistics = primitiveValueWriter.getStatistics(); + statistics.incrementNumNulls(currentPageNullCounts); + columnStatistics.mergeStatistics(statistics); + + int writtenBytesSoFar = compressedOutputStream.size(); + PageHeader header = dataPageV1Header( + uncompressedSize, + compressedSize, + valueCount, + repetitionLevelWriter.getEncoding(), + definitionLevelWriter.getEncoding(), + primitiveValueWriter.getEncoding()); + writePageHeader(header, compressedOutputStream); + int pageHeaderSize = compressedOutputStream.size() - writtenBytesSoFar; + + dataPagesWithEncoding.merge(getEncoding(primitiveValueWriter.getEncoding()), 1, Integer::sum); + + // update total stats + totalUnCompressedSize += pageHeaderSize + uncompressedSize; + int pageCompressedSize = pageHeaderSize + compressedSize; + totalCompressedSize += pageCompressedSize; + totalValues += valueCount; + + pageData.writeData(compressedOutputStream); + pageBufferedBytes += pageCompressedSize; + + // Add encoding should be called after ValuesWriter#getBytes() and before ValuesWriter#reset() + encodings.add(repetitionLevelWriter.getEncoding()); + encodings.add(definitionLevelWriter.getEncoding()); + encodings.add(primitiveValueWriter.getEncoding()); + + // reset page stats + valueCount = 0; + currentPageNullCounts = 0; + + repetitionLevelWriter.reset(); + definitionLevelWriter.reset(); + primitiveValueWriter.reset(); + updateBufferedBytes(getCurrentPageBufferedBytes()); + } + + private DataStreams getDataStreams() + throws IOException + { + ImmutableList.Builder outputs = ImmutableList.builder(); + if (valueCount > 0) { + flushCurrentPageToBuffer(); + } + // write dict page if possible + DictionaryPage dictionaryPage = primitiveValueWriter.toDictPageAndClose(); + OptionalInt dictionaryPageSize = OptionalInt.empty(); + if (dictionaryPage != null) { + int uncompressedSize = dictionaryPage.getUncompressedSize(); + byte[] pageBytes = dictionaryPage.getBytes().toByteArray(); + ParquetDataOutput pageData = compressor != null + ? 
compressor.compress(pageBytes) + : createDataOutput(Slices.wrappedBuffer(pageBytes)); + int compressedSize = pageData.size(); + + ByteArrayOutputStream dictStream = new ByteArrayOutputStream(); + PageHeader header = dictionaryPageHeader( + uncompressedSize, + compressedSize, + dictionaryPage.getDictionarySize(), + dictionaryPage.getEncoding()); + writePageHeader(header, dictStream); + ParquetDataOutput pageHeader = createDataOutput(dictStream); + outputs.add(pageHeader); + outputs.add(pageData); + totalCompressedSize += pageHeader.size() + compressedSize; + totalUnCompressedSize += pageHeader.size() + uncompressedSize; + dictionaryPagesWithEncoding.merge(getEncoding(dictionaryPage.getEncoding()), 1, Integer::sum); + dictionaryPageSize = OptionalInt.of(pageHeader.size() + compressedSize); + + primitiveValueWriter.resetDictionary(); + } + getDataStreamsCalled = true; + + outputs.add(createDataOutput(compressedOutputStream)); + return new DataStreams(outputs.build(), dictionaryPageSize, bloomFilter); + } + + @Override + public long getBufferedBytes() + { + return bufferedBytes; + } + + @Override + public long getRetainedBytes() + { + return INSTANCE_SIZE + + compressedOutputStream.getRetainedSize() + + primitiveValueWriter.getAllocatedSize() + + definitionLevelWriter.getAllocatedSize() + + repetitionLevelWriter.getAllocatedSize(); + } + + private void updateBufferedBytes(long currentPageBufferedBytes) + { + bufferedBytes = pageBufferedBytes + currentPageBufferedBytes; + } + + private long getCurrentPageBufferedBytes() + { + return definitionLevelWriter.getBufferedSize() + + repetitionLevelWriter.getBufferedSize() + + primitiveValueWriter.getBufferedSize(); + } + + private static PageHeader dataPageV1Header( + int uncompressedSize, + int compressedSize, + int valueCount, + org.apache.parquet.column.Encoding rlEncoding, + org.apache.parquet.column.Encoding dlEncoding, + org.apache.parquet.column.Encoding valuesEncoding) + { + PageHeader header = new PageHeader(PageType.DATA_PAGE, uncompressedSize, compressedSize); + header.setData_page_header(new DataPageHeader( + valueCount, + getEncoding(valuesEncoding), + getEncoding(dlEncoding), + getEncoding(rlEncoding))); + return header; + } + + private static PageHeader dictionaryPageHeader( + int uncompressedSize, + int compressedSize, + int valueCount, + org.apache.parquet.column.Encoding valuesEncoding) + { + PageHeader header = new PageHeader(PageType.DICTIONARY_PAGE, uncompressedSize, compressedSize); + header.setDictionary_page_header(new DictionaryPageHeader(valueCount, getEncoding(valuesEncoding))); + return header; + } + + private record DataStreams(List data, OptionalInt dictionaryPageSize, Optional bloomFilter) {} +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/StructColumnWriter.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/StructColumnWriter.java new file mode 100644 index 000000000000..181c1942ea35 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/StructColumnWriter.java @@ -0,0 +1,103 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.writer; + +import com.google.common.collect.ImmutableList; +import io.trino.parquet.writer.repdef.DefLevelWriterProvider; +import io.trino.parquet.writer.repdef.DefLevelWriterProviders; +import io.trino.parquet.writer.repdef.RepLevelWriterProvider; +import io.trino.parquet.writer.repdef.RepLevelWriterProviders; +import io.trino.spi.block.Block; +import io.trino.spi.block.RowBlock; + +import java.io.IOException; +import java.util.List; + +import static io.airlift.slice.SizeOf.instanceSize; +import static java.util.Objects.requireNonNull; +import static org.apache.parquet.Preconditions.checkArgument; + +public class StructColumnWriter + implements ColumnWriter +{ + private static final int INSTANCE_SIZE = instanceSize(StructColumnWriter.class); + + private final List columnWriters; + private final int maxDefinitionLevel; + + public StructColumnWriter(List columnWriters, int maxDefinitionLevel) + { + this.columnWriters = requireNonNull(columnWriters, "columnWriters is null"); + this.maxDefinitionLevel = maxDefinitionLevel; + } + + @Override + public void writeBlock(ColumnChunk columnChunk) + throws IOException + { + Block block = columnChunk.getBlock(); + List fields = RowBlock.getNullSuppressedRowFieldsFromBlock(block); + checkArgument(fields.size() == columnWriters.size(), "Row field size %s is not equal to columnWriters size %s", fields.size(), columnWriters.size()); + + List defLevelWriterProviders = ImmutableList.builder() + .addAll(columnChunk.getDefLevelWriterProviders()) + .add(DefLevelWriterProviders.of(block, maxDefinitionLevel)) + .build(); + List repLevelWriterProviders = ImmutableList.builder() + .addAll(columnChunk.getRepLevelWriterProviders()) + .add(RepLevelWriterProviders.of(block)) + .build(); + + for (int i = 0; i < columnWriters.size(); ++i) { + ColumnWriter columnWriter = columnWriters.get(i); + Block field = fields.get(i); + columnWriter.writeBlock(new ColumnChunk(field, defLevelWriterProviders, repLevelWriterProviders)); + } + } + + @Override + public void close() + { + columnWriters.forEach(ColumnWriter::close); + } + + @Override + public List getBuffer() + throws IOException + { + ImmutableList.Builder builder = ImmutableList.builder(); + for (ColumnWriter columnWriter : columnWriters) { + builder.addAll(columnWriter.getBuffer()); + } + return builder.build(); + } + + @Override + public long getBufferedBytes() + { + // Avoid using streams here for performance reasons + long bufferedBytes = 0; + for (ColumnWriter columnWriter : columnWriters) { + bufferedBytes += columnWriter.getBufferedBytes(); + } + return bufferedBytes; + } + + @Override + public long getRetainedBytes() + { + return INSTANCE_SIZE + + columnWriters.stream().mapToLong(ColumnWriter::getRetainedBytes).sum(); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/repdef/DefLevelWriterProvider.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/repdef/DefLevelWriterProvider.java new file mode 100644 index 000000000000..447f7275d364 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/repdef/DefLevelWriterProvider.java @@ -0,0 +1,47 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.writer.repdef; + +import com.google.common.collect.Iterables; +import io.trino.parquet.writer.valuewriter.ColumnDescriptorValuesWriter; + +import java.util.List; +import java.util.Optional; + +public interface DefLevelWriterProvider +{ + DefinitionLevelWriter getDefinitionLevelWriter(Optional nestedWriter, ColumnDescriptorValuesWriter encoder); + + interface DefinitionLevelWriter + { + ValuesCount writeDefinitionLevels(int positionsCount); + + ValuesCount writeDefinitionLevels(); + } + + record ValuesCount(int totalValuesCount, int maxDefinitionLevelValuesCount) {} + + static DefinitionLevelWriter getRootDefinitionLevelWriter(List defLevelWriterProviders, ColumnDescriptorValuesWriter encoder) + { + // Constructs hierarchy of DefinitionLevelWriter from leaf to root + DefinitionLevelWriter rootDefinitionLevelWriter = Iterables.getLast(defLevelWriterProviders) + .getDefinitionLevelWriter(Optional.empty(), encoder); + for (int nestedLevel = defLevelWriterProviders.size() - 2; nestedLevel >= 0; nestedLevel--) { + DefinitionLevelWriter nestedWriter = rootDefinitionLevelWriter; + rootDefinitionLevelWriter = defLevelWriterProviders.get(nestedLevel) + .getDefinitionLevelWriter(Optional.of(nestedWriter), encoder); + } + return rootDefinitionLevelWriter; + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/repdef/DefLevelWriterProviders.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/repdef/DefLevelWriterProviders.java new file mode 100644 index 000000000000..ffd630ad2f63 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/repdef/DefLevelWriterProviders.java @@ -0,0 +1,344 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.parquet.writer.repdef; + +import io.trino.parquet.writer.valuewriter.ColumnDescriptorValuesWriter; +import io.trino.spi.block.ArrayBlock; +import io.trino.spi.block.Block; +import io.trino.spi.block.ColumnarArray; +import io.trino.spi.block.ColumnarMap; +import io.trino.spi.block.MapBlock; +import io.trino.spi.block.RowBlock; + +import java.util.Optional; + +import static com.google.common.base.Preconditions.checkArgument; +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; + +public class DefLevelWriterProviders +{ + private DefLevelWriterProviders() {} + + public static DefLevelWriterProvider of(Block block, int maxDefinitionLevel) + { + if (block.getUnderlyingValueBlock() instanceof RowBlock) { + return new RowDefLevelWriterProvider(block, maxDefinitionLevel); + } + return new PrimitiveDefLevelWriterProvider(block, maxDefinitionLevel); + } + + public static DefLevelWriterProvider of(ColumnarArray columnarArray, int maxDefinitionLevel) + { + return new ColumnArrayDefLevelWriterProvider(columnarArray, maxDefinitionLevel); + } + + public static DefLevelWriterProvider of(ColumnarMap columnarMap, int maxDefinitionLevel) + { + return new ColumnMapDefLevelWriterProvider(columnarMap, maxDefinitionLevel); + } + + static class PrimitiveDefLevelWriterProvider + implements DefLevelWriterProvider + { + private final Block block; + private final int maxDefinitionLevel; + + PrimitiveDefLevelWriterProvider(Block block, int maxDefinitionLevel) + { + this.block = requireNonNull(block, "block is null"); + this.maxDefinitionLevel = maxDefinitionLevel; + checkArgument(!(block.getUnderlyingValueBlock() instanceof RowBlock), "block is a row block"); + checkArgument(!(block.getUnderlyingValueBlock() instanceof ArrayBlock), "block is an array block"); + checkArgument(!(block.getUnderlyingValueBlock() instanceof MapBlock), "block is a map block"); + } + + @Override + public DefinitionLevelWriter getDefinitionLevelWriter(Optional nestedWriter, ColumnDescriptorValuesWriter encoder) + { + checkArgument(nestedWriter.isEmpty(), "nestedWriter should be empty for primitive definition level writer"); + return new DefinitionLevelWriter() + { + private int offset; + + @Override + public ValuesCount writeDefinitionLevels() + { + return writeDefinitionLevels(block.getPositionCount()); + } + + @Override + public ValuesCount writeDefinitionLevels(int positionsCount) + { + checkValidPosition(offset, positionsCount, block.getPositionCount()); + int nonNullsCount = 0; + if (!block.mayHaveNull()) { + encoder.writeRepeatInteger(maxDefinitionLevel, positionsCount); + nonNullsCount = positionsCount; + } + else { + for (int position = offset; position < offset + positionsCount; position++) { + int isNull = block.isNull(position) ? 
1 : 0; + encoder.writeInteger(maxDefinitionLevel - isNull); + nonNullsCount += isNull ^ 1; + } + } + offset += positionsCount; + return new ValuesCount(positionsCount, nonNullsCount); + } + }; + } + } + + static class RowDefLevelWriterProvider + implements DefLevelWriterProvider + { + private final Block block; + private final int maxDefinitionLevel; + + RowDefLevelWriterProvider(Block block, int maxDefinitionLevel) + { + this.block = requireNonNull(block, "block is null"); + this.maxDefinitionLevel = maxDefinitionLevel; + checkArgument(block.getUnderlyingValueBlock() instanceof RowBlock, "block is not a row block"); + } + + @Override + public DefinitionLevelWriter getDefinitionLevelWriter(Optional nestedWriterOptional, ColumnDescriptorValuesWriter encoder) + { + checkArgument(nestedWriterOptional.isPresent(), "nestedWriter should be present for column row definition level writer"); + return new DefinitionLevelWriter() + { + private final DefinitionLevelWriter nestedWriter = nestedWriterOptional.orElseThrow(); + + private int offset; + + @Override + public ValuesCount writeDefinitionLevels() + { + return writeDefinitionLevels(block.getPositionCount()); + } + + @Override + public ValuesCount writeDefinitionLevels(int positionsCount) + { + checkValidPosition(offset, positionsCount, block.getPositionCount()); + if (!block.mayHaveNull()) { + offset += positionsCount; + return nestedWriter.writeDefinitionLevels(positionsCount); + } + int maxDefinitionValuesCount = 0; + int totalValuesCount = 0; + for (int position = offset; position < offset + positionsCount; ) { + if (block.isNull(position)) { + encoder.writeInteger(maxDefinitionLevel - 1); + totalValuesCount++; + position++; + } + else { + int consecutiveNonNullsCount = 1; + position++; + while (position < offset + positionsCount && !block.isNull(position)) { + position++; + consecutiveNonNullsCount++; + } + ValuesCount valuesCount = nestedWriter.writeDefinitionLevels(consecutiveNonNullsCount); + maxDefinitionValuesCount += valuesCount.maxDefinitionLevelValuesCount(); + totalValuesCount += valuesCount.totalValuesCount(); + } + } + offset += positionsCount; + return new ValuesCount(totalValuesCount, maxDefinitionValuesCount); + } + }; + } + } + + static class ColumnMapDefLevelWriterProvider + implements DefLevelWriterProvider + { + private final ColumnarMap columnarMap; + private final int maxDefinitionLevel; + + ColumnMapDefLevelWriterProvider(ColumnarMap columnarMap, int maxDefinitionLevel) + { + this.columnarMap = requireNonNull(columnarMap, "columnarMap is null"); + this.maxDefinitionLevel = maxDefinitionLevel; + } + + @Override + public DefinitionLevelWriter getDefinitionLevelWriter(Optional nestedWriterOptional, ColumnDescriptorValuesWriter encoder) + { + checkArgument(nestedWriterOptional.isPresent(), "nestedWriter should be present for column map definition level writer"); + return new DefinitionLevelWriter() + { + private final DefinitionLevelWriter nestedWriter = nestedWriterOptional.orElseThrow(); + + private int offset; + + @Override + public ValuesCount writeDefinitionLevels() + { + return writeDefinitionLevels(columnarMap.getPositionCount()); + } + + @Override + public ValuesCount writeDefinitionLevels(int positionsCount) + { + checkValidPosition(offset, positionsCount, columnarMap.getPositionCount()); + int maxDefinitionValuesCount = 0; + int totalValuesCount = 0; + if (!columnarMap.mayHaveNull()) { + for (int position = offset; position < offset + positionsCount; ) { + int mapLength = columnarMap.getEntryCount(position); + if 
(mapLength == 0) { + encoder.writeInteger(maxDefinitionLevel - 1); + totalValuesCount++; + position++; + } + else { + int consecutiveNonEmptyArrayLength = mapLength; + position++; + while (position < offset + positionsCount) { + mapLength = columnarMap.getEntryCount(position); + if (mapLength == 0) { + break; + } + position++; + consecutiveNonEmptyArrayLength += mapLength; + } + ValuesCount valuesCount = nestedWriter.writeDefinitionLevels(consecutiveNonEmptyArrayLength); + maxDefinitionValuesCount += valuesCount.maxDefinitionLevelValuesCount(); + totalValuesCount += valuesCount.totalValuesCount(); + } + } + } + else { + for (int position = offset; position < offset + positionsCount; position++) { + if (columnarMap.isNull(position)) { + encoder.writeInteger(maxDefinitionLevel - 2); + totalValuesCount++; + continue; + } + int mapLength = columnarMap.getEntryCount(position); + if (mapLength == 0) { + encoder.writeInteger(maxDefinitionLevel - 1); + totalValuesCount++; + } + else { + ValuesCount valuesCount = nestedWriter.writeDefinitionLevels(mapLength); + maxDefinitionValuesCount += valuesCount.maxDefinitionLevelValuesCount(); + totalValuesCount += valuesCount.totalValuesCount(); + } + } + } + offset += positionsCount; + return new ValuesCount(totalValuesCount, maxDefinitionValuesCount); + } + }; + } + } + + static class ColumnArrayDefLevelWriterProvider + implements DefLevelWriterProvider + { + private final ColumnarArray columnarArray; + private final int maxDefinitionLevel; + + ColumnArrayDefLevelWriterProvider(ColumnarArray columnarArray, int maxDefinitionLevel) + { + this.columnarArray = requireNonNull(columnarArray, "columnarArray is null"); + this.maxDefinitionLevel = maxDefinitionLevel; + } + + @Override + public DefinitionLevelWriter getDefinitionLevelWriter(Optional nestedWriterOptional, ColumnDescriptorValuesWriter encoder) + { + checkArgument(nestedWriterOptional.isPresent(), "nestedWriter should be present for column map definition level writer"); + return new DefinitionLevelWriter() + { + private final DefinitionLevelWriter nestedWriter = nestedWriterOptional.orElseThrow(); + + private int offset; + + @Override + public ValuesCount writeDefinitionLevels() + { + return writeDefinitionLevels(columnarArray.getPositionCount()); + } + + @Override + public ValuesCount writeDefinitionLevels(int positionsCount) + { + checkValidPosition(offset, positionsCount, columnarArray.getPositionCount()); + int maxDefinitionValuesCount = 0; + int totalValuesCount = 0; + if (!columnarArray.mayHaveNull()) { + for (int position = offset; position < offset + positionsCount; ) { + int arrayLength = columnarArray.getLength(position); + if (arrayLength == 0) { + encoder.writeInteger(maxDefinitionLevel - 1); + totalValuesCount++; + position++; + } + else { + int consecutiveNonEmptyArrayLength = arrayLength; + position++; + while (position < offset + positionsCount) { + arrayLength = columnarArray.getLength(position); + if (arrayLength == 0) { + break; + } + position++; + consecutiveNonEmptyArrayLength += arrayLength; + } + ValuesCount valuesCount = nestedWriter.writeDefinitionLevels(consecutiveNonEmptyArrayLength); + maxDefinitionValuesCount += valuesCount.maxDefinitionLevelValuesCount(); + totalValuesCount += valuesCount.totalValuesCount(); + } + } + } + else { + for (int position = offset; position < offset + positionsCount; position++) { + if (columnarArray.isNull(position)) { + encoder.writeInteger(maxDefinitionLevel - 2); + totalValuesCount++; + continue; + } + int arrayLength = 
columnarArray.getLength(position); + if (arrayLength == 0) { + encoder.writeInteger(maxDefinitionLevel - 1); + totalValuesCount++; + } + else { + ValuesCount valuesCount = nestedWriter.writeDefinitionLevels(arrayLength); + maxDefinitionValuesCount += valuesCount.maxDefinitionLevelValuesCount(); + totalValuesCount += valuesCount.totalValuesCount(); + } + } + } + offset += positionsCount; + return new ValuesCount(totalValuesCount, maxDefinitionValuesCount); + } + }; + } + } + + private static void checkValidPosition(int offset, int positionsCount, int totalPositionsCount) + { + if (offset < 0 || positionsCount < 0 || offset + positionsCount > totalPositionsCount) { + throw new IndexOutOfBoundsException(format("Invalid offset %s and positionsCount %s in block with %s positions", offset, positionsCount, totalPositionsCount)); + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/repdef/RepLevelWriterProvider.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/repdef/RepLevelWriterProvider.java new file mode 100644 index 000000000000..941219ef1878 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/repdef/RepLevelWriterProvider.java @@ -0,0 +1,51 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.writer.repdef; + +import com.google.common.collect.Iterables; +import io.trino.parquet.writer.valuewriter.ColumnDescriptorValuesWriter; + +import java.util.List; +import java.util.Optional; + +public interface RepLevelWriterProvider +{ + RepetitionLevelWriter getRepetitionLevelWriter(Optional nestedWriter, ColumnDescriptorValuesWriter encoder); + + /** + * Parent repetition level marks at which level either: + * 1. A new collection starts + * 2. A collection is null or empty + * 3. 
A primitive column stays + */ + interface RepetitionLevelWriter + { + void writeRepetitionLevels(int parentLevel, int positionsCount); + + void writeRepetitionLevels(int parentLevel); + } + + static RepetitionLevelWriter getRootRepetitionLevelWriter(List repLevelWriterProviders, ColumnDescriptorValuesWriter encoder) + { + // Constructs hierarchy of RepetitionLevelWriter from leaf to root + RepetitionLevelWriter rootRepetitionLevelWriter = Iterables.getLast(repLevelWriterProviders) + .getRepetitionLevelWriter(Optional.empty(), encoder); + for (int nestedLevel = repLevelWriterProviders.size() - 2; nestedLevel >= 0; nestedLevel--) { + RepetitionLevelWriter nestedWriter = rootRepetitionLevelWriter; + rootRepetitionLevelWriter = repLevelWriterProviders.get(nestedLevel) + .getRepetitionLevelWriter(Optional.of(nestedWriter), encoder); + } + return rootRepetitionLevelWriter; + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/repdef/RepLevelWriterProviders.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/repdef/RepLevelWriterProviders.java new file mode 100644 index 000000000000..7fdc0f48e4d2 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/repdef/RepLevelWriterProviders.java @@ -0,0 +1,282 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
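The leaf-to-root chaining in getRootRepetitionLevelWriter is easiest to see with a concrete column. The sketch below wires the providers for a Trino array(bigint) column using the RepLevelWriterProviders factories defined just below and the ColumnDescriptorValuesWriter.newRepetitionLevelWriter factory introduced later in this change; the input block, descriptor, and page-size threshold are assumed to come from the surrounding column writer, and the class and method names here are illustrative, not part of the change.

```java
import com.google.common.collect.ImmutableList;
import io.trino.parquet.writer.valuewriter.ColumnDescriptorValuesWriter;
import io.trino.spi.block.Block;
import io.trino.spi.block.ColumnarArray;
import org.apache.parquet.column.ColumnDescriptor;

import java.util.List;

// Assumed to live in io.trino.parquet.writer.repdef, next to the providers
final class RepLevelWiringSketch
{
    static void writeRepetitionLevels(Block arrayBlock, ColumnDescriptor descriptor, int pageSizeThreshold)
    {
        ColumnarArray columnarArray = ColumnarArray.toColumnarArray(arrayBlock);
        // Providers are listed from the outermost level (the array) down to the leaf (the elements)
        List<RepLevelWriterProvider> providers = ImmutableList.of(
                RepLevelWriterProviders.of(columnarArray, descriptor.getMaxRepetitionLevel()),
                RepLevelWriterProviders.of(columnarArray.getElementsBlock()));

        ColumnDescriptorValuesWriter encoder = ColumnDescriptorValuesWriter.newRepetitionLevelWriter(descriptor, pageSizeThreshold);
        // getRootRepetitionLevelWriter builds the chain leaf-first, so the returned writer is the
        // array-level writer wrapping the primitive leaf writer
        RepLevelWriterProvider.RepetitionLevelWriter root =
                RepLevelWriterProvider.getRootRepetitionLevelWriter(providers, encoder);
        // Top-level records always start at parent repetition level 0
        root.writeRepetitionLevels(0);
    }
}
```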
+ */ +package io.trino.parquet.writer.repdef; + +import io.trino.parquet.writer.valuewriter.ColumnDescriptorValuesWriter; +import io.trino.spi.block.ArrayBlock; +import io.trino.spi.block.Block; +import io.trino.spi.block.ColumnarArray; +import io.trino.spi.block.ColumnarMap; +import io.trino.spi.block.MapBlock; +import io.trino.spi.block.RowBlock; + +import java.util.Optional; + +import static com.google.common.base.Preconditions.checkArgument; +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; + +public class RepLevelWriterProviders +{ + private RepLevelWriterProviders() {} + + public static RepLevelWriterProvider of(Block block) + { + if (block.getUnderlyingValueBlock() instanceof RowBlock) { + return new RowRepLevelWriterProvider(block); + } + return new PrimitiveRepLevelWriterProvider(block); + } + + public static RepLevelWriterProvider of(ColumnarArray columnarArray, int maxRepetitionLevel) + { + return new ColumnArrayRepLevelWriterProvider(columnarArray, maxRepetitionLevel); + } + + public static RepLevelWriterProvider of(ColumnarMap columnarMap, int maxRepetitionLevel) + { + return new ColumnMapRepLevelWriterProvider(columnarMap, maxRepetitionLevel); + } + + static class PrimitiveRepLevelWriterProvider + implements RepLevelWriterProvider + { + private final Block block; + + PrimitiveRepLevelWriterProvider(Block block) + { + this.block = requireNonNull(block, "block is null"); + checkArgument(!(block.getUnderlyingValueBlock() instanceof RowBlock), "block is a row block"); + checkArgument(!(block.getUnderlyingValueBlock() instanceof ArrayBlock), "block is an array block"); + checkArgument(!(block.getUnderlyingValueBlock() instanceof MapBlock), "block is a map block"); + } + + @Override + public RepetitionLevelWriter getRepetitionLevelWriter(Optional nestedWriter, ColumnDescriptorValuesWriter encoder) + { + checkArgument(nestedWriter.isEmpty(), "nestedWriter should be empty for primitive repetition level writer"); + return new RepetitionLevelWriter() + { + private int offset; + + @Override + public void writeRepetitionLevels(int parentLevel) + { + writeRepetitionLevels(parentLevel, block.getPositionCount()); + } + + @Override + public void writeRepetitionLevels(int parentLevel, int positionsCount) + { + checkValidPosition(offset, positionsCount, block.getPositionCount()); + encoder.writeRepeatInteger(parentLevel, positionsCount); + offset += positionsCount; + } + }; + } + } + + static class RowRepLevelWriterProvider + implements RepLevelWriterProvider + { + private final Block block; + + RowRepLevelWriterProvider(Block block) + { + this.block = requireNonNull(block, "block is null"); + checkArgument(block.getUnderlyingValueBlock() instanceof RowBlock, "block is not a row block"); + } + + @Override + public RepetitionLevelWriter getRepetitionLevelWriter(Optional nestedWriterOptional, ColumnDescriptorValuesWriter encoder) + { + checkArgument(nestedWriterOptional.isPresent(), "nestedWriter should be present for column row repetition level writer"); + return new RepetitionLevelWriter() + { + private final RepetitionLevelWriter nestedWriter = nestedWriterOptional.orElseThrow(); + + private int offset; + + @Override + public void writeRepetitionLevels(int parentLevel) + { + writeRepetitionLevels(parentLevel, block.getPositionCount()); + } + + @Override + public void writeRepetitionLevels(int parentLevel, int positionsCount) + { + checkValidPosition(offset, positionsCount, block.getPositionCount()); + if (!block.mayHaveNull()) { + 
nestedWriter.writeRepetitionLevels(parentLevel, positionsCount); + offset += positionsCount; + return; + } + + for (int position = offset; position < offset + positionsCount; ) { + if (block.isNull(position)) { + encoder.writeInteger(parentLevel); + position++; + } + else { + int consecutiveNonNullsCount = 1; + position++; + while (position < offset + positionsCount && !block.isNull(position)) { + position++; + consecutiveNonNullsCount++; + } + nestedWriter.writeRepetitionLevels(parentLevel, consecutiveNonNullsCount); + } + } + offset += positionsCount; + } + }; + } + } + + static class ColumnMapRepLevelWriterProvider + implements RepLevelWriterProvider + { + private final ColumnarMap columnarMap; + private final int maxRepetitionLevel; + + ColumnMapRepLevelWriterProvider(ColumnarMap columnarMap, int maxRepetitionLevel) + { + this.columnarMap = requireNonNull(columnarMap, "columnarMap is null"); + this.maxRepetitionLevel = maxRepetitionLevel; + } + + @Override + public RepetitionLevelWriter getRepetitionLevelWriter(Optional nestedWriterOptional, ColumnDescriptorValuesWriter encoder) + { + checkArgument(nestedWriterOptional.isPresent(), "nestedWriter should be present for column map repetition level writer"); + return new RepetitionLevelWriter() + { + private final RepetitionLevelWriter nestedWriter = nestedWriterOptional.orElseThrow(); + + private int offset; + + @Override + public void writeRepetitionLevels(int parentLevel) + { + writeRepetitionLevels(parentLevel, columnarMap.getPositionCount()); + } + + @Override + public void writeRepetitionLevels(int parentLevel, int positionsCount) + { + checkValidPosition(offset, positionsCount, columnarMap.getPositionCount()); + if (!columnarMap.mayHaveNull()) { + for (int position = offset; position < offset + positionsCount; position++) { + writeNonNullableLevels(parentLevel, position); + } + } + else { + for (int position = offset; position < offset + positionsCount; position++) { + if (columnarMap.isNull(position)) { + encoder.writeInteger(parentLevel); + continue; + } + writeNonNullableLevels(parentLevel, position); + } + } + offset += positionsCount; + } + + private void writeNonNullableLevels(int parentLevel, int position) + { + int entryLength = columnarMap.getEntryCount(position); + if (entryLength == 0) { + encoder.writeInteger(parentLevel); + } + else { + nestedWriter.writeRepetitionLevels(parentLevel, 1); + nestedWriter.writeRepetitionLevels(maxRepetitionLevel, entryLength - 1); + } + } + }; + } + } + + static class ColumnArrayRepLevelWriterProvider + implements RepLevelWriterProvider + { + private final ColumnarArray columnarArray; + private final int maxRepetitionLevel; + + ColumnArrayRepLevelWriterProvider(ColumnarArray columnarArray, int maxRepetitionLevel) + { + this.columnarArray = requireNonNull(columnarArray, "columnarArray is null"); + this.maxRepetitionLevel = maxRepetitionLevel; + } + + @Override + public RepetitionLevelWriter getRepetitionLevelWriter(Optional nestedWriterOptional, ColumnDescriptorValuesWriter encoder) + { + checkArgument(nestedWriterOptional.isPresent(), "nestedWriter should be present for column map repetition level writer"); + return new RepetitionLevelWriter() + { + private final RepetitionLevelWriter nestedWriter = nestedWriterOptional.orElseThrow(); + + private int offset; + + @Override + public void writeRepetitionLevels(int parentLevel) + { + writeRepetitionLevels(parentLevel, columnarArray.getPositionCount()); + } + + @Override + public void writeRepetitionLevels(int parentLevel, int positionsCount) + 
{ + checkValidPosition(offset, positionsCount, columnarArray.getPositionCount()); + if (!columnarArray.mayHaveNull()) { + for (int position = offset; position < offset + positionsCount; position++) { + writeNonNullableLevels(parentLevel, position); + } + } + else { + for (int position = offset; position < offset + positionsCount; position++) { + if (columnarArray.isNull(position)) { + encoder.writeInteger(parentLevel); + continue; + } + writeNonNullableLevels(parentLevel, position); + } + } + offset += positionsCount; + } + + private void writeNonNullableLevels(int parentLevel, int position) + { + int arrayLength = columnarArray.getLength(position); + if (arrayLength == 0) { + encoder.writeInteger(parentLevel); + } + else { + nestedWriter.writeRepetitionLevels(parentLevel, 1); + nestedWriter.writeRepetitionLevels(maxRepetitionLevel, arrayLength - 1); + } + } + }; + } + } + + private static void checkValidPosition(int offset, int positionsCount, int totalPositionsCount) + { + if (offset < 0 || positionsCount < 0 || offset + positionsCount > totalPositionsCount) { + throw new IndexOutOfBoundsException(format("Invalid offset %s and positionsCount %s in block with %s positions", offset, positionsCount, totalPositionsCount)); + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/BigintValueWriter.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/BigintValueWriter.java new file mode 100644 index 000000000000..918553abacf4 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/BigintValueWriter.java @@ -0,0 +1,49 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
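For reference, here is what the definition-level and repetition-level providers emit for a small nullable array(bigint) column. The assumed Parquet schema is optional group (LIST) { repeated group list { optional int64 element } }, giving maxRepetitionLevel = 1 and maxDefinitionLevel = 3; the numbers follow the standard Dremel shredding rules, while the exact split of maxDefinitionLevel across the array and element providers is an assumption about how the surrounding column writer wires them.

```java
// Rows:                     [1, 2]     null      []        [3, null]
Long[][] rows = {{1L, 2L}, null, {}, {3L, null}};

// One level entry per value or per null/empty marker, in row order:
int[] repetitionLevels = {0, 1, 0, 0, 0, 1};   // 0 starts a new row, 1 continues the current array
int[] definitionLevels = {3, 3, 0, 1, 3, 2};   // 0 = null array, 1 = empty array, 2 = null element, 3 = value

// The array-level definition writer emits the 0 and 1 entries itself and delegates the positions
// inside non-empty arrays to the element writer; the repetition-level writer emits parentLevel for
// the first element of each array and maxRepetitionLevel for the rest, as in writeNonNullableLevels above.
```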
+ */ +package io.trino.parquet.writer.valuewriter; + +import io.trino.spi.block.Block; +import io.trino.spi.type.Type; +import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.column.values.ValuesWriter; +import org.apache.parquet.schema.PrimitiveType; + +import static java.util.Objects.requireNonNull; + +public class BigintValueWriter + extends PrimitiveValueWriter +{ + private final Type type; + + public BigintValueWriter(ValuesWriter valuesWriter, Type type, PrimitiveType parquetType) + { + super(parquetType, valuesWriter); + this.type = requireNonNull(type, "type is null"); + } + + @Override + public void write(Block block) + { + ValuesWriter valuesWriter = requireNonNull(getValuesWriter(), "valuesWriter is null"); + Statistics statistics = requireNonNull(getStatistics(), "statistics is null"); + boolean mayHaveNull = block.mayHaveNull(); + for (int i = 0; i < block.getPositionCount(); i++) { + if (!mayHaveNull || !block.isNull(i)) { + long value = type.getLong(block, i); + valuesWriter.writeLong(value); + statistics.updateStats(value); + } + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/BinaryValueWriter.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/BinaryValueWriter.java new file mode 100644 index 000000000000..41b4b883c4cc --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/BinaryValueWriter.java @@ -0,0 +1,54 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
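The same null-handling convention applies to every value writer in this change: nulls are skipped in the value stream and surface only through definition levels. A minimal flat-column illustration (values chosen arbitrarily):

```java
// Assumed column: optional int64 (maxDefinitionLevel = 1, maxRepetitionLevel = 0)
Long[] blockValues = {1L, null, 3L};   // the Trino block handed to BigintValueWriter.write
long[] valueStream = {1, 3};           // what the wrapped ValuesWriter receives; the null is skipped
int[] definitionLevels = {1, 0, 1};    // written separately by the definition-level writer
// No repetition levels are encoded for a flat column: maxRepetitionLevel == 0, so the level writer
// resolves to DevNullValuesWriter (see ColumnDescriptorValuesWriter later in this change)
```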
+ */ +package io.trino.parquet.writer.valuewriter; + +import io.airlift.slice.Slice; +import io.trino.spi.block.Block; +import io.trino.spi.type.Type; +import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.column.values.ValuesWriter; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.PrimitiveType; + +import static java.util.Objects.requireNonNull; + +public class BinaryValueWriter + extends PrimitiveValueWriter +{ + private final Type type; + + public BinaryValueWriter(ValuesWriter valuesWriter, Type type, PrimitiveType parquetType) + { + super(parquetType, valuesWriter); + this.type = requireNonNull(type, "type is null"); + } + + @Override + public void write(Block block) + { + ValuesWriter valuesWriter = requireNonNull(getValuesWriter(), "valuesWriter is null"); + Statistics statistics = requireNonNull(getStatistics(), "statistics is null"); + boolean mayHaveNull = block.mayHaveNull(); + for (int i = 0; i < block.getPositionCount(); i++) { + if (!mayHaveNull || !block.isNull(i)) { + Slice slice = type.getSlice(block, i); + // fromReusedByteArray must be used instead of fromConstantByteArray to avoid retaining entire + // base byte array of the Slice in DictionaryValuesWriter.PlainBinaryDictionaryValuesWriter + Binary binary = Binary.fromReusedByteArray(slice.byteArray(), slice.byteArrayOffset(), slice.length()); + valuesWriter.writeBytes(binary); + statistics.updateStats(binary); + } + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/BloomFilterValuesWriter.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/BloomFilterValuesWriter.java new file mode 100644 index 000000000000..2f1d44cafc2a --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/BloomFilterValuesWriter.java @@ -0,0 +1,152 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
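The fromReusedByteArray choice in BinaryValueWriter above matters because a dictionary writer may retain the Binary it is given: a "reused" Binary is defensively copied via Binary.copy(), keeping only the referenced bytes, whereas a "constant" Binary may be retained as-is and would pin the slice's entire backing array. A small sketch of the conversion; the helper name is illustrative, not part of the change:

```java
// Illustrative helper, assuming io.airlift.slice.Slice and org.apache.parquet.io.api.Binary
static Binary toParquetBinary(Slice slice)
{
    // Signals to consumers such as DictionaryValuesWriter that the backing bytes may be overwritten,
    // so anything retaining the value copies just this range instead of the whole backing array
    return Binary.fromReusedByteArray(slice.byteArray(), slice.byteArrayOffset(), slice.length());
}
```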
+ */ +package io.trino.parquet.writer.valuewriter; + +import com.google.common.annotations.VisibleForTesting; +import org.apache.parquet.bytes.BytesInput; +import org.apache.parquet.column.Encoding; +import org.apache.parquet.column.page.DictionaryPage; +import org.apache.parquet.column.values.ValuesWriter; +import org.apache.parquet.column.values.bloomfilter.BloomFilter; +import org.apache.parquet.io.api.Binary; + +import java.util.Optional; + +public class BloomFilterValuesWriter + extends ValuesWriter +{ + private final ValuesWriter writer; + private final BloomFilter bloomFilter; + + public static ValuesWriter createBloomFilterValuesWriter(ValuesWriter writer, Optional bloomFilter) + { + if (bloomFilter.isPresent()) { + return new BloomFilterValuesWriter(writer, bloomFilter.orElseThrow()); + } + return writer; + } + + private BloomFilterValuesWriter(ValuesWriter writer, BloomFilter bloomFilter) + { + this.writer = writer; + this.bloomFilter = bloomFilter; + } + + @VisibleForTesting + public ValuesWriter getWriter() + { + return writer; + } + + @Override + public long getBufferedSize() + { + return writer.getBufferedSize(); + } + + @Override + public BytesInput getBytes() + { + return writer.getBytes(); + } + + @Override + public Encoding getEncoding() + { + return writer.getEncoding(); + } + + @Override + public void reset() + { + writer.reset(); + } + + @Override + public void close() + { + writer.close(); + } + + @Override + public DictionaryPage toDictPageAndClose() + { + return writer.toDictPageAndClose(); + } + + @Override + public void resetDictionary() + { + writer.resetDictionary(); + } + + @Override + public long getAllocatedSize() + { + return writer.getAllocatedSize() + bloomFilter.getBitsetSize(); + } + + @Override + public void writeByte(int value) + { + throw new UnsupportedOperationException(); + } + + @Override + public void writeBoolean(boolean v) + { + throw new UnsupportedOperationException(); + } + + @Override + public void writeBytes(Binary v) + { + writer.writeBytes(v); + bloomFilter.insertHash(bloomFilter.hash(v)); + } + + @Override + public void writeInteger(int v) + { + writer.writeInteger(v); + bloomFilter.insertHash(bloomFilter.hash(v)); + } + + @Override + public void writeLong(long v) + { + writer.writeLong(v); + bloomFilter.insertHash(bloomFilter.hash(v)); + } + + @Override + public void writeDouble(double v) + { + writer.writeDouble(v); + bloomFilter.insertHash(bloomFilter.hash(v)); + } + + @Override + public void writeFloat(float v) + { + writer.writeFloat(v); + bloomFilter.insertHash(bloomFilter.hash(v)); + } + + @Override + public String memUsageString(String s) + { + return writer.memUsageString(s); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/BooleanValueWriter.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/BooleanValueWriter.java new file mode 100644 index 000000000000..da90e02d1905 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/BooleanValueWriter.java @@ -0,0 +1,46 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
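A usage sketch for the wrapper above, assuming parquet-mr's BlockSplitBloomFilter, PlainValuesWriter, and HeapByteBufferAllocator for a plain-encoded BIGINT column; the sizes are illustrative:

```java
BloomFilter bloomFilter = new BlockSplitBloomFilter(1024 * 1024);
ValuesWriter plain = new PlainValuesWriter(64 * 1024, 1024 * 1024, new HeapByteBufferAllocator());
ValuesWriter writer = BloomFilterValuesWriter.createBloomFilterValuesWriter(plain, Optional.of(bloomFilter));

writer.writeLong(42L);                                              // forwarded to the plain writer
boolean maybePresent = bloomFilter.findHash(bloomFilter.hash(42L)); // true, the hash was inserted
// With Optional.empty() the factory returns the plain writer unchanged, so columns without a
// configured bloom filter pay no wrapping cost
```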
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.writer.valuewriter; + +import io.trino.spi.block.Block; +import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.column.values.ValuesWriter; +import org.apache.parquet.schema.PrimitiveType; + +import static io.trino.spi.type.BooleanType.BOOLEAN; +import static java.util.Objects.requireNonNull; + +public class BooleanValueWriter + extends PrimitiveValueWriter +{ + public BooleanValueWriter(ValuesWriter valuesWriter, PrimitiveType parquetType) + { + super(parquetType, valuesWriter); + } + + @Override + public void write(Block block) + { + ValuesWriter valuesWriter = requireNonNull(getValuesWriter(), "valuesWriter is null"); + Statistics statistics = requireNonNull(getStatistics(), "statistics is null"); + boolean mayHaveNull = block.mayHaveNull(); + for (int i = 0; i < block.getPositionCount(); i++) { + if (!mayHaveNull || !block.isNull(i)) { + boolean value = BOOLEAN.getBoolean(block, i); + valuesWriter.writeBoolean(value); + statistics.updateStats(value); + } + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/ColumnDescriptorValuesWriter.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/ColumnDescriptorValuesWriter.java new file mode 100644 index 000000000000..435a1e0ecfd2 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/ColumnDescriptorValuesWriter.java @@ -0,0 +1,82 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.parquet.writer.valuewriter; + +import org.apache.parquet.bytes.BytesInput; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.Encoding; + +import static org.apache.parquet.bytes.BytesUtils.getWidthFromMaxInt; + +/** + * Used for writing repetition and definition levels + */ +public interface ColumnDescriptorValuesWriter +{ + /** + * @param value the value to encode + */ + void writeInteger(int value); + + /** + * @param value the value to encode + * @param valueRepetitions number of times the input value is repeated in the input stream + */ + void writeRepeatInteger(int value, int valueRepetitions); + + /** + * used to decide if we want to work to the next page + * + * @return the size of the currently buffered data (in bytes) + */ + long getBufferedSize(); + + /** + * @return the allocated size of the buffer + */ + long getAllocatedSize(); + + /** + * @return the bytes buffered so far to write to the current page + */ + BytesInput getBytes(); + + /** + * @return the encoding that was used to encode the bytes + */ + Encoding getEncoding(); + + /** + * called after getBytes() to reset the current buffer and start writing the next page + */ + void reset(); + + static ColumnDescriptorValuesWriter newRepetitionLevelWriter(ColumnDescriptor path, int pageSizeThreshold) + { + return newColumnDescriptorValuesWriter(path.getMaxRepetitionLevel(), pageSizeThreshold); + } + + static ColumnDescriptorValuesWriter newDefinitionLevelWriter(ColumnDescriptor path, int pageSizeThreshold) + { + return newColumnDescriptorValuesWriter(path.getMaxDefinitionLevel(), pageSizeThreshold); + } + + private static ColumnDescriptorValuesWriter newColumnDescriptorValuesWriter(int maxLevel, int pageSizeThreshold) + { + if (maxLevel == 0) { + return new DevNullValuesWriter(); + } + return new RunLengthBitPackingHybridValuesWriter(getWidthFromMaxInt(maxLevel), pageSizeThreshold); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/DateValueWriter.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/DateValueWriter.java new file mode 100644 index 000000000000..4682fef21fc7 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/DateValueWriter.java @@ -0,0 +1,46 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
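The bit width passed to RunLengthBitPackingHybridValuesWriter is derived from the maximum level value, so even deeply nested columns need only a few bits per level, and flat required columns skip level encoding entirely:

```java
// org.apache.parquet.bytes.BytesUtils
int nested = BytesUtils.getWidthFromMaxInt(3); // maxDefinitionLevel 3 -> 2 bits per level
int flat = BytesUtils.getWidthFromMaxInt(0);   // max level 0 -> 0 bits; DevNullValuesWriter is used instead
```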
+ */ +package io.trino.parquet.writer.valuewriter; + +import io.trino.spi.block.Block; +import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.column.values.ValuesWriter; +import org.apache.parquet.schema.PrimitiveType; + +import static io.trino.spi.type.DateType.DATE; +import static java.util.Objects.requireNonNull; + +public class DateValueWriter + extends PrimitiveValueWriter +{ + public DateValueWriter(ValuesWriter valuesWriter, PrimitiveType parquetType) + { + super(parquetType, valuesWriter); + } + + @Override + public void write(Block block) + { + ValuesWriter valuesWriter = requireNonNull(getValuesWriter(), "valuesWriter is null"); + Statistics statistics = requireNonNull(getStatistics(), "statistics is null"); + boolean mayHaveNull = block.mayHaveNull(); + for (int position = 0; position < block.getPositionCount(); position++) { + if (!mayHaveNull || !block.isNull(position)) { + int value = DATE.getInt(block, position); + valuesWriter.writeInteger(value); + statistics.updateStats(value); + } + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/DevNullValuesWriter.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/DevNullValuesWriter.java new file mode 100644 index 000000000000..ed54be167d94 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/DevNullValuesWriter.java @@ -0,0 +1,60 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.writer.valuewriter; + +import org.apache.parquet.bytes.BytesInput; +import org.apache.parquet.column.Encoding; + +/** + * This is a special writer that doesn't write anything. The idea being that + * some columns will always be the same value, and this will capture that. An + * example is the set of repetition levels for a schema with no repeated fields. 
+ */ +public class DevNullValuesWriter + implements ColumnDescriptorValuesWriter +{ + @Override + public long getBufferedSize() + { + return 0; + } + + @Override + public void reset() {} + + @Override + public void writeInteger(int v) {} + + @Override + public void writeRepeatInteger(int value, int valueRepetitions) {} + + @Override + public BytesInput getBytes() + { + return BytesInput.empty(); + } + + @Override + public long getAllocatedSize() + { + return 0; + } + + @Override + @SuppressWarnings("deprecation") + public Encoding getEncoding() + { + return Encoding.BIT_PACKED; + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/DictionaryFallbackValuesWriter.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/DictionaryFallbackValuesWriter.java new file mode 100644 index 000000000000..305a591f4fea --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/DictionaryFallbackValuesWriter.java @@ -0,0 +1,234 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.writer.valuewriter; + +import com.google.common.annotations.VisibleForTesting; +import jakarta.annotation.Nullable; +import org.apache.parquet.bytes.BytesInput; +import org.apache.parquet.column.Encoding; +import org.apache.parquet.column.page.DictionaryPage; +import org.apache.parquet.column.values.ValuesWriter; +import org.apache.parquet.column.values.dictionary.DictionaryValuesWriter; +import org.apache.parquet.io.api.Binary; + +import static com.google.common.base.Verify.verify; +import static java.util.Objects.requireNonNull; + +/** + * Based on org.apache.parquet.column.values.fallback.FallbackValuesWriter + */ +public class DictionaryFallbackValuesWriter + extends ValuesWriter +{ + private final ValuesWriter fallBackWriter; + + private boolean fellBackAlready; + private ValuesWriter currentWriter; + @Nullable + private DictionaryValuesWriter initialWriter; + private boolean initialUsedAndHadDictionary; + /* size of raw data, even if dictionary is used, it will not have effect on raw data size, it is used to decide + * if fall back to plain encoding is better by comparing rawDataByteSize with Encoded data size + * It's also used in getBufferedSize, so the page will be written based on raw data size + */ + private long rawDataByteSize; + // indicates if this is the first page being processed + private boolean firstPage = true; + + public DictionaryFallbackValuesWriter(DictionaryValuesWriter initialWriter, ValuesWriter fallBackWriter) + { + super(); + this.initialWriter = initialWriter; + this.fallBackWriter = fallBackWriter; + this.currentWriter = initialWriter; + } + + @Override + public long getBufferedSize() + { + // use raw data size to decide if we want to flush the page + // so the actual size of the page written could be much more smaller + // due to dictionary encoding. This prevents page being too big when fallback happens. 
+ return rawDataByteSize; + } + + @Override + public BytesInput getBytes() + { + if (!fellBackAlready && firstPage) { + // we use the first page to decide if we're going to use this encoding + BytesInput bytes = initialWriter.getBytes(); + if (!initialWriter.isCompressionSatisfying(rawDataByteSize, bytes.size())) { + fallBack(); + // Since fallback happened on first page itself, we can drop the contents of initialWriter + initialWriter.close(); + initialWriter = null; + verify(!initialUsedAndHadDictionary, "initialUsedAndHadDictionary should be false when falling back to PLAIN in first page"); + } + else { + return bytes; + } + } + return currentWriter.getBytes(); + } + + @Override + public Encoding getEncoding() + { + Encoding encoding = currentWriter.getEncoding(); + if (!fellBackAlready && !initialUsedAndHadDictionary) { + initialUsedAndHadDictionary = encoding.usesDictionary(); + } + return encoding; + } + + @Override + public void reset() + { + rawDataByteSize = 0; + firstPage = false; + currentWriter.reset(); + } + + @Override + public void close() + { + if (initialWriter != null) { + initialWriter.close(); + } + fallBackWriter.close(); + } + + @Override + public DictionaryPage toDictPageAndClose() + { + if (initialUsedAndHadDictionary) { + return initialWriter.toDictPageAndClose(); + } + else { + return currentWriter.toDictPageAndClose(); + } + } + + @Override + public void resetDictionary() + { + if (initialUsedAndHadDictionary) { + initialWriter.resetDictionary(); + } + else { + currentWriter.resetDictionary(); + } + currentWriter = initialWriter; + fellBackAlready = false; + initialUsedAndHadDictionary = false; + firstPage = true; + } + + @Override + public long getAllocatedSize() + { + return fallBackWriter.getAllocatedSize() + (initialWriter != null ? initialWriter.getAllocatedSize() : 0); + } + + @Override + public String memUsageString(String prefix) + { + return String.format( + "%s FallbackValuesWriter{\n" + + "%s\n" + + "%s\n" + + "%s}\n", + prefix, + initialWriter != null ? 
initialWriter.memUsageString(prefix + " initial:") : "", + fallBackWriter.memUsageString(prefix + " fallback:"), + prefix); + } + + // passthrough writing the value + @Override + public void writeByte(int value) + { + rawDataByteSize += Byte.BYTES; + currentWriter.writeByte(value); + checkFallback(); + } + + @Override + public void writeBytes(Binary value) + { + // For raw data, length(4 bytes int) is stored, followed by the binary content itself + rawDataByteSize += value.length() + Integer.BYTES; + currentWriter.writeBytes(value); + checkFallback(); + } + + @Override + public void writeInteger(int value) + { + rawDataByteSize += Integer.BYTES; + currentWriter.writeInteger(value); + checkFallback(); + } + + @Override + public void writeLong(long value) + { + rawDataByteSize += Long.BYTES; + currentWriter.writeLong(value); + checkFallback(); + } + + @Override + public void writeFloat(float value) + { + rawDataByteSize += Float.BYTES; + currentWriter.writeFloat(value); + checkFallback(); + } + + @Override + public void writeDouble(double value) + { + rawDataByteSize += Double.BYTES; + currentWriter.writeDouble(value); + checkFallback(); + } + + @VisibleForTesting + public DictionaryValuesWriter getInitialWriter() + { + return requireNonNull(initialWriter, "initialWriter is null"); + } + + @VisibleForTesting + public ValuesWriter getFallBackWriter() + { + return fallBackWriter; + } + + private void checkFallback() + { + if (!fellBackAlready && initialWriter.shouldFallBack()) { + fallBack(); + } + } + + private void fallBack() + { + fellBackAlready = true; + initialWriter.fallBackAllValuesTo(fallBackWriter); + currentWriter = fallBackWriter; + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/DoubleValueWriter.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/DoubleValueWriter.java new file mode 100644 index 000000000000..16f60a10d110 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/DoubleValueWriter.java @@ -0,0 +1,46 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
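A construction sketch for the fallback writer above, pairing a parquet-mr dictionary writer with a plain writer. The dictionary size limit, encodings, and allocator are assumptions for illustration, not values taken from this change:

```java
// Dictionary encoding is attempted first; PLAIN takes over once shouldFallBack() reports the
// dictionary has grown past the limit, or when the first page compresses poorly (see getBytes())
DictionaryValuesWriter dictionaryWriter = new DictionaryValuesWriter.PlainLongDictionaryValuesWriter(
        1024 * 1024,                  // max dictionary byte size before shouldFallBack() triggers
        Encoding.RLE_DICTIONARY,      // data page encoding while the dictionary is in use
        Encoding.PLAIN,               // dictionary page encoding
        new HeapByteBufferAllocator());
ValuesWriter writer = new DictionaryFallbackValuesWriter(
        dictionaryWriter,
        new PlainValuesWriter(64 * 1024, 1024 * 1024, new HeapByteBufferAllocator()));
```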
+ */ +package io.trino.parquet.writer.valuewriter; + +import io.trino.spi.block.Block; +import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.column.values.ValuesWriter; +import org.apache.parquet.schema.PrimitiveType; + +import static io.trino.spi.type.DoubleType.DOUBLE; +import static java.util.Objects.requireNonNull; + +public class DoubleValueWriter + extends PrimitiveValueWriter +{ + public DoubleValueWriter(ValuesWriter valuesWriter, PrimitiveType parquetType) + { + super(parquetType, valuesWriter); + } + + @Override + public void write(Block block) + { + ValuesWriter valuesWriter = requireNonNull(getValuesWriter(), "valuesWriter is null"); + Statistics statistics = requireNonNull(getStatistics(), "statistics is null"); + boolean mayHaveNull = block.mayHaveNull(); + for (int i = 0; i < block.getPositionCount(); ++i) { + if (!mayHaveNull || !block.isNull(i)) { + double value = DOUBLE.getDouble(block, i); + valuesWriter.writeDouble(value); + statistics.updateStats(value); + } + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/FixedLenByteArrayLongDecimalValueWriter.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/FixedLenByteArrayLongDecimalValueWriter.java new file mode 100644 index 000000000000..2b7808649046 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/FixedLenByteArrayLongDecimalValueWriter.java @@ -0,0 +1,64 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.parquet.writer.valuewriter; + +import io.trino.spi.block.Block; +import io.trino.spi.type.DecimalType; +import io.trino.spi.type.Int128; +import io.trino.spi.type.Type; +import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.column.values.ValuesWriter; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.PrimitiveType; + +import java.math.BigInteger; + +import static com.google.common.base.Preconditions.checkArgument; +import static io.trino.parquet.ParquetTypeUtils.paddingBigInteger; +import static java.util.Objects.requireNonNull; + +public class FixedLenByteArrayLongDecimalValueWriter + extends PrimitiveValueWriter +{ + private final DecimalType decimalType; + + public FixedLenByteArrayLongDecimalValueWriter(ValuesWriter valuesWriter, Type type, PrimitiveType parquetType) + { + super(parquetType, valuesWriter); + this.decimalType = (DecimalType) requireNonNull(type, "type is null"); + checkArgument(!this.decimalType.isShort(), "type is not a long decimal"); + checkArgument( + parquetType.getTypeLength() > 0 && parquetType.getTypeLength() <= Int128.SIZE, + "Type length %s must be in range 1-%s", + parquetType.getTypeLength(), + Int128.SIZE); + } + + @Override + public void write(Block block) + { + ValuesWriter valuesWriter = requireNonNull(getValuesWriter(), "valuesWriter is null"); + Statistics statistics = requireNonNull(getStatistics(), "statistics is null"); + boolean mayHaveNull = block.mayHaveNull(); + for (int i = 0; i < block.getPositionCount(); ++i) { + if (!mayHaveNull || !block.isNull(i)) { + Int128 decimal = (Int128) decimalType.getObject(block, i); + BigInteger bigInteger = decimal.toBigInteger(); + Binary binary = Binary.fromConstantByteArray(paddingBigInteger(bigInteger, getTypeLength())); + valuesWriter.writeBytes(binary); + statistics.updateStats(binary); + } + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/FixedLenByteArrayShortDecimalValueWriter.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/FixedLenByteArrayShortDecimalValueWriter.java new file mode 100644 index 000000000000..b1cb3b2494fa --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/FixedLenByteArrayShortDecimalValueWriter.java @@ -0,0 +1,121 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
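FixedLenByteArrayLongDecimalValueWriter above relies on paddingBigInteger to produce exactly typeLength bytes. The standalone sketch below is an assumption about that helper's behaviour, shown for reference: the unscaled value is written big-endian, two's-complement, sign-extended to the fixed length.

```java
import java.math.BigInteger;
import java.util.Arrays;

final class LongDecimalPaddingSketch
{
    // Assumed equivalent of ParquetTypeUtils.paddingBigInteger, for illustration only
    static byte[] padBigEndian(BigInteger unscaled, int typeLength)
    {
        byte[] bytes = unscaled.toByteArray();                    // minimal big-endian two's complement
        byte[] padded = new byte[typeLength];
        byte sign = (byte) (unscaled.signum() < 0 ? 0xFF : 0x00);
        Arrays.fill(padded, 0, typeLength - bytes.length, sign);  // sign-extend the high bytes
        System.arraycopy(bytes, 0, padded, typeLength - bytes.length, bytes.length);
        return padded;
    }
}
// e.g. padBigEndian(BigInteger.valueOf(-1), 16) yields sixteen 0xFF bytes
```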
+ */ +package io.trino.parquet.writer.valuewriter; + +import io.trino.spi.block.Block; +import io.trino.spi.type.DecimalType; +import io.trino.spi.type.Type; +import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.column.values.ValuesWriter; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.PrimitiveType; + +import static com.google.common.base.Preconditions.checkArgument; +import static java.util.Objects.requireNonNull; + +public class FixedLenByteArrayShortDecimalValueWriter + extends PrimitiveValueWriter +{ + private final DecimalType decimalType; + + public FixedLenByteArrayShortDecimalValueWriter(ValuesWriter valuesWriter, Type type, PrimitiveType parquetType) + { + super(parquetType, valuesWriter); + this.decimalType = (DecimalType) requireNonNull(type, "type is null"); + checkArgument(this.decimalType.isShort(), "type is not a short decimal"); + checkArgument( + parquetType.getTypeLength() > 0 && parquetType.getTypeLength() <= Long.BYTES, + "Type length %s must be in range 1-%s", + parquetType.getTypeLength(), + Long.BYTES); + } + + @Override + public void write(Block block) + { + ValuesWriter valuesWriter = requireNonNull(getValuesWriter(), "valuesWriter is null"); + Statistics statistics = requireNonNull(getStatistics(), "statistics is null"); + boolean mayHaveNull = block.mayHaveNull(); + byte[] buffer = new byte[getTypeLength()]; + Binary reusedBinary = Binary.fromReusedByteArray(buffer); + for (int i = 0; i < block.getPositionCount(); i++) { + if (!mayHaveNull || !block.isNull(i)) { + long value = decimalType.getLong(block, i); + storeLongIntoBuffer(value, buffer); + valuesWriter.writeBytes(reusedBinary); + statistics.updateStats(reusedBinary); + } + } + } + + private static void storeLongIntoBuffer(long unscaledValue, byte[] buffer) + { + switch (buffer.length) { + case 1: + buffer[0] = (byte) unscaledValue; + break; + case 2: + buffer[0] = (byte) (unscaledValue >> 8); + buffer[1] = (byte) unscaledValue; + break; + case 3: + buffer[0] = (byte) (unscaledValue >> 16); + buffer[1] = (byte) (unscaledValue >> 8); + buffer[2] = (byte) unscaledValue; + break; + case 4: + buffer[0] = (byte) (unscaledValue >> 24); + buffer[1] = (byte) (unscaledValue >> 16); + buffer[2] = (byte) (unscaledValue >> 8); + buffer[3] = (byte) unscaledValue; + break; + case 5: + buffer[0] = (byte) (unscaledValue >> 32); + buffer[1] = (byte) (unscaledValue >> 24); + buffer[2] = (byte) (unscaledValue >> 16); + buffer[3] = (byte) (unscaledValue >> 8); + buffer[4] = (byte) unscaledValue; + break; + case 6: + buffer[0] = (byte) (unscaledValue >> 40); + buffer[1] = (byte) (unscaledValue >> 32); + buffer[2] = (byte) (unscaledValue >> 24); + buffer[3] = (byte) (unscaledValue >> 16); + buffer[4] = (byte) (unscaledValue >> 8); + buffer[5] = (byte) unscaledValue; + break; + case 7: + buffer[0] = (byte) (unscaledValue >> 48); + buffer[1] = (byte) (unscaledValue >> 40); + buffer[2] = (byte) (unscaledValue >> 32); + buffer[3] = (byte) (unscaledValue >> 24); + buffer[4] = (byte) (unscaledValue >> 16); + buffer[5] = (byte) (unscaledValue >> 8); + buffer[6] = (byte) unscaledValue; + break; + case 8: + buffer[0] = (byte) (unscaledValue >> 56); + buffer[1] = (byte) (unscaledValue >> 48); + buffer[2] = (byte) (unscaledValue >> 40); + buffer[3] = (byte) (unscaledValue >> 32); + buffer[4] = (byte) (unscaledValue >> 24); + buffer[5] = (byte) (unscaledValue >> 16); + buffer[6] = (byte) (unscaledValue >> 8); + buffer[7] = (byte) unscaledValue; + break; + default: + throw new 
IllegalArgumentException("Invalid number of bytes: " + buffer.length); + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/Int32ShortDecimalValueWriter.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/Int32ShortDecimalValueWriter.java new file mode 100644 index 000000000000..d443ff55a9d3 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/Int32ShortDecimalValueWriter.java @@ -0,0 +1,53 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.writer.valuewriter; + +import io.trino.spi.block.Block; +import io.trino.spi.type.DecimalType; +import io.trino.spi.type.Type; +import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.column.values.ValuesWriter; +import org.apache.parquet.schema.PrimitiveType; + +import static com.google.common.base.Preconditions.checkArgument; +import static java.lang.Math.toIntExact; +import static java.util.Objects.requireNonNull; + +public class Int32ShortDecimalValueWriter + extends PrimitiveValueWriter +{ + private final DecimalType decimalType; + + public Int32ShortDecimalValueWriter(ValuesWriter valuesWriter, Type type, PrimitiveType parquetType) + { + super(parquetType, valuesWriter); + this.decimalType = (DecimalType) requireNonNull(type, "type is null"); + checkArgument(this.decimalType.getPrecision() <= 9, "decimalType precision %s must be <= 9", this.decimalType.getPrecision()); + } + + @Override + public void write(Block block) + { + ValuesWriter valuesWriter = requireNonNull(getValuesWriter(), "valuesWriter is null"); + Statistics statistics = requireNonNull(getStatistics(), "statistics is null"); + boolean mayHaveNull = block.mayHaveNull(); + for (int i = 0; i < block.getPositionCount(); i++) { + if (!mayHaveNull || !block.isNull(i)) { + int value = toIntExact(decimalType.getLong(block, i)); + valuesWriter.writeInteger(value); + statistics.updateStats(value); + } + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/Int64ShortDecimalValueWriter.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/Int64ShortDecimalValueWriter.java new file mode 100644 index 000000000000..f15e1ae4571e --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/Int64ShortDecimalValueWriter.java @@ -0,0 +1,52 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
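The unrolled switch in storeLongIntoBuffer above writes the unscaled value big-endian into exactly typeLength bytes; the compact loop below is an equivalent formulation, shown only to make the byte layout explicit (the switch presumably exists to avoid the loop on the hot path):

```java
static void storeBigEndian(long unscaledValue, byte[] buffer)
{
    for (int i = 0; i < buffer.length; i++) {
        // highest-order byte first
        buffer[i] = (byte) (unscaledValue >> (8 * (buffer.length - 1 - i)));
    }
}
// For typeLength 3 and unscaledValue 0x0ABCDE the buffer becomes {0x0A, (byte) 0xBC, (byte) 0xDE}
```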
+ */ +package io.trino.parquet.writer.valuewriter; + +import io.trino.spi.block.Block; +import io.trino.spi.type.DecimalType; +import io.trino.spi.type.Type; +import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.column.values.ValuesWriter; +import org.apache.parquet.schema.PrimitiveType; + +import static com.google.common.base.Preconditions.checkArgument; +import static java.util.Objects.requireNonNull; + +public class Int64ShortDecimalValueWriter + extends PrimitiveValueWriter +{ + private final DecimalType decimalType; + + public Int64ShortDecimalValueWriter(ValuesWriter valuesWriter, Type type, PrimitiveType parquetType) + { + super(parquetType, valuesWriter); + this.decimalType = (DecimalType) requireNonNull(type, "type is null"); + checkArgument(this.decimalType.isShort(), "type is not a short decimal"); + } + + @Override + public void write(Block block) + { + ValuesWriter valuesWriter = requireNonNull(getValuesWriter(), "valuesWriter is null"); + Statistics statistics = requireNonNull(getStatistics(), "statistics is null"); + boolean mayHaveNull = block.mayHaveNull(); + for (int i = 0; i < block.getPositionCount(); i++) { + if (!mayHaveNull || !block.isNull(i)) { + long value = decimalType.getLong(block, i); + valuesWriter.writeLong(value); + statistics.updateStats(value); + } + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/Int96TimestampValueWriter.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/Int96TimestampValueWriter.java new file mode 100644 index 000000000000..4651b0b98bb3 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/Int96TimestampValueWriter.java @@ -0,0 +1,131 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.parquet.writer.valuewriter; + +import io.trino.spi.block.Block; +import io.trino.spi.type.LongTimestamp; +import io.trino.spi.type.TimestampType; +import io.trino.spi.type.Type; +import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.column.values.ValuesWriter; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.PrimitiveType; +import org.joda.time.DateTimeZone; + +import java.nio.ByteBuffer; +import java.nio.ByteOrder; + +import static com.google.common.base.Preconditions.checkArgument; +import static io.trino.parquet.ParquetTimestampUtils.JULIAN_EPOCH_OFFSET_DAYS; +import static io.trino.spi.type.Timestamps.MICROSECONDS_PER_MILLISECOND; +import static io.trino.spi.type.Timestamps.MILLISECONDS_PER_DAY; +import static io.trino.spi.type.Timestamps.NANOSECONDS_PER_MICROSECOND; +import static io.trino.spi.type.Timestamps.NANOSECONDS_PER_MILLISECOND; +import static io.trino.spi.type.Timestamps.PICOSECONDS_PER_NANOSECOND; +import static java.lang.Math.floorDiv; +import static java.lang.Math.floorMod; +import static java.lang.Math.toIntExact; +import static java.util.Objects.requireNonNull; + +public class Int96TimestampValueWriter + extends PrimitiveValueWriter +{ + private final TimestampType timestampType; + private final DateTimeZone parquetTimeZone; + + public Int96TimestampValueWriter(ValuesWriter valuesWriter, Type type, PrimitiveType parquetType, DateTimeZone parquetTimeZone) + { + super(parquetType, valuesWriter); + requireNonNull(type, "type is null"); + checkArgument( + type instanceof TimestampType timestampType && timestampType.getPrecision() <= 9, + "type %s is not a TimestampType with precision <= 9", + type); + this.timestampType = (TimestampType) type; + checkArgument( + parquetType.getPrimitiveTypeName().equals(PrimitiveType.PrimitiveTypeName.INT96), + "parquetType %s is not INT96", + parquetType); + this.parquetTimeZone = requireNonNull(parquetTimeZone, "parquetTimeZone is null"); + } + + @Override + public void write(Block block) + { + if (timestampType.isShort()) { + writeShortTimestamps(block); + } + else { + writeLongTimestamps(block); + } + } + + private void writeShortTimestamps(Block block) + { + ValuesWriter valuesWriter = requireNonNull(getValuesWriter(), "valuesWriter is null"); + Statistics statistics = requireNonNull(getStatistics(), "statistics is null"); + boolean mayHaveNull = block.mayHaveNull(); + byte[] buffer = new byte[Long.BYTES + Integer.BYTES]; + Binary reusedBinary = Binary.fromReusedByteArray(buffer); + + for (int position = 0; position < block.getPositionCount(); position++) { + if (!mayHaveNull || !block.isNull(position)) { + long epochMicros = timestampType.getLong(block, position); + long localEpochMillis = floorDiv(epochMicros, MICROSECONDS_PER_MILLISECOND); + int nanosOfMillis = floorMod(epochMicros, MICROSECONDS_PER_MILLISECOND) * NANOSECONDS_PER_MICROSECOND; + + convertAndWriteToBuffer(localEpochMillis, nanosOfMillis, buffer); + valuesWriter.writeBytes(reusedBinary); + statistics.updateStats(reusedBinary); + } + } + } + + private void writeLongTimestamps(Block block) + { + ValuesWriter valuesWriter = requireNonNull(getValuesWriter(), "valuesWriter is null"); + Statistics statistics = requireNonNull(getStatistics(), "statistics is null"); + boolean mayHaveNull = block.mayHaveNull(); + byte[] buffer = new byte[Long.BYTES + Integer.BYTES]; + Binary reusedBinary = Binary.fromReusedByteArray(buffer); + + for (int position = 0; position < block.getPositionCount(); position++) 
{ + if (!mayHaveNull || !block.isNull(position)) { + LongTimestamp timestamp = (LongTimestamp) timestampType.getObject(block, position); + long epochMicros = timestamp.getEpochMicros(); + // This should divide exactly because timestamp precision is <= 9 + int nanosOfMicro = timestamp.getPicosOfMicro() / PICOSECONDS_PER_NANOSECOND; + long localEpochMillis = floorDiv(epochMicros, MICROSECONDS_PER_MILLISECOND); + int nanosOfMillis = floorMod(epochMicros, MICROSECONDS_PER_MILLISECOND) * NANOSECONDS_PER_MICROSECOND + nanosOfMicro; + + convertAndWriteToBuffer(localEpochMillis, nanosOfMillis, buffer); + valuesWriter.writeBytes(reusedBinary); + statistics.updateStats(reusedBinary); + } + } + } + + private void convertAndWriteToBuffer(long localEpochMillis, int nanosOfMillis, byte[] buffer) + { + long epochMillis = parquetTimeZone.convertLocalToUTC(localEpochMillis, false); + long epochDay = floorDiv(epochMillis, MILLISECONDS_PER_DAY); + int julianDay = JULIAN_EPOCH_OFFSET_DAYS + toIntExact(epochDay); + + long nanosOfEpochDay = nanosOfMillis + ((long) floorMod(epochMillis, MILLISECONDS_PER_DAY) * NANOSECONDS_PER_MILLISECOND); + ByteBuffer.wrap(buffer) + .order(ByteOrder.LITTLE_ENDIAN) + .putLong(0, nanosOfEpochDay) + .putInt(Long.BYTES, julianDay); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/IntegerValueWriter.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/IntegerValueWriter.java new file mode 100644 index 000000000000..c55c995e3623 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/IntegerValueWriter.java @@ -0,0 +1,49 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
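The 12-byte INT96 value assembled in convertAndWriteToBuffer above is laid out as 8 little-endian bytes of nanoseconds within the day followed by 4 little-endian bytes of Julian day number, where JULIAN_EPOCH_OFFSET_DAYS = 2,440,588 is the Julian day of 1970-01-01. A worked example, assuming the parquet time zone is UTC (java.nio.ByteBuffer and ByteOrder assumed imported):

```java
// Timestamp 1970-01-02T00:00:00.000000001 UTC
int julianDay = 2_440_588 + 1;  // one calendar day after the epoch
long nanosOfDay = 1;            // one nanosecond past midnight

byte[] int96 = ByteBuffer.allocate(12)
        .order(ByteOrder.LITTLE_ENDIAN)
        .putLong(0, nanosOfDay)         // bytes 0-7: nanoseconds within the day
        .putInt(Long.BYTES, julianDay)  // bytes 8-11: Julian day number
        .array();
```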
+ */ +package io.trino.parquet.writer.valuewriter; + +import io.trino.spi.block.Block; +import io.trino.spi.type.Type; +import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.column.values.ValuesWriter; +import org.apache.parquet.schema.PrimitiveType; + +import static java.util.Objects.requireNonNull; + +public class IntegerValueWriter + extends PrimitiveValueWriter +{ + private final Type type; + + public IntegerValueWriter(ValuesWriter valuesWriter, Type type, PrimitiveType parquetType) + { + super(parquetType, valuesWriter); + this.type = requireNonNull(type, "type is null"); + } + + @Override + public void write(Block block) + { + ValuesWriter valuesWriter = requireNonNull(getValuesWriter(), "valuesWriter is null"); + Statistics statistics = requireNonNull(getStatistics(), "statistics is null"); + boolean mayHaveNull = block.mayHaveNull(); + for (int i = 0; i < block.getPositionCount(); ++i) { + if (!mayHaveNull || !block.isNull(i)) { + int value = (int) type.getLong(block, i); + valuesWriter.writeInteger(value); + statistics.updateStats(value); + } + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/PrimitiveValueWriter.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/PrimitiveValueWriter.java new file mode 100644 index 000000000000..511bcdb1eae9 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/PrimitiveValueWriter.java @@ -0,0 +1,111 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.parquet.writer.valuewriter; + +import io.trino.spi.block.Block; +import org.apache.parquet.bytes.BytesInput; +import org.apache.parquet.column.Encoding; +import org.apache.parquet.column.page.DictionaryPage; +import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.column.values.ValuesWriter; +import org.apache.parquet.schema.PrimitiveType; + +import static java.util.Objects.requireNonNull; + +public abstract class PrimitiveValueWriter + extends ValuesWriter +{ + private Statistics statistics; + private final PrimitiveType parquetType; + private final ValuesWriter valuesWriter; + + public PrimitiveValueWriter(PrimitiveType parquetType, ValuesWriter valuesWriter) + { + this.parquetType = requireNonNull(parquetType, "parquetType is null"); + this.valuesWriter = requireNonNull(valuesWriter, "valuesWriter is null"); + this.statistics = Statistics.createStats(parquetType); + } + + ValuesWriter getValuesWriter() + { + return valuesWriter; + } + + public Statistics getStatistics() + { + return statistics; + } + + protected int getTypeLength() + { + return parquetType.getTypeLength(); + } + + @Override + public long getBufferedSize() + { + return valuesWriter.getBufferedSize(); + } + + @Override + public BytesInput getBytes() + { + return valuesWriter.getBytes(); + } + + @Override + public Encoding getEncoding() + { + return valuesWriter.getEncoding(); + } + + @Override + public void reset() + { + valuesWriter.reset(); + this.statistics = Statistics.createStats(parquetType); + } + + @Override + public void close() + { + valuesWriter.close(); + } + + @Override + public DictionaryPage toDictPageAndClose() + { + return valuesWriter.toDictPageAndClose(); + } + + @Override + public void resetDictionary() + { + valuesWriter.resetDictionary(); + } + + @Override + public long getAllocatedSize() + { + return valuesWriter.getAllocatedSize(); + } + + @Override + public String memUsageString(String prefix) + { + return valuesWriter.memUsageString(prefix); + } + + public abstract void write(Block block); +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/RealValueWriter.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/RealValueWriter.java new file mode 100644 index 000000000000..e13e265e95b4 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/RealValueWriter.java @@ -0,0 +1,46 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.parquet.writer.valuewriter; + +import io.trino.spi.block.Block; +import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.column.values.ValuesWriter; +import org.apache.parquet.schema.PrimitiveType; + +import static io.trino.spi.type.RealType.REAL; +import static java.util.Objects.requireNonNull; + +public class RealValueWriter + extends PrimitiveValueWriter +{ + public RealValueWriter(ValuesWriter valuesWriter, PrimitiveType parquetType) + { + super(parquetType, valuesWriter); + } + + @Override + public void write(Block block) + { + ValuesWriter valuesWriter = requireNonNull(getValuesWriter(), "valuesWriter is null"); + Statistics statistics = requireNonNull(getStatistics(), "statistics is null"); + boolean mayHaveNull = block.mayHaveNull(); + for (int i = 0; i < block.getPositionCount(); i++) { + if (!mayHaveNull || !block.isNull(i)) { + float value = REAL.getFloat(block, i); + valuesWriter.writeFloat(value); + statistics.updateStats(value); + } + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/RunLengthBitPackingHybridEncoder.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/RunLengthBitPackingHybridEncoder.java new file mode 100644 index 000000000000..d3482ff0eaeb --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/RunLengthBitPackingHybridEncoder.java @@ -0,0 +1,318 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.writer.valuewriter; + +import org.apache.parquet.bytes.BytesInput; +import org.apache.parquet.bytes.BytesUtils; +import org.apache.parquet.bytes.CapacityByteArrayOutputStream; +import org.apache.parquet.bytes.HeapByteBufferAllocator; +import org.apache.parquet.column.values.bitpacking.BytePacker; +import org.apache.parquet.column.values.bitpacking.Packer; +import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridValuesWriter; + +import java.io.IOException; + +import static com.google.common.base.Preconditions.checkArgument; + +/** + * Encodes values using a combination of run length encoding and bit packing, + * according to the following grammar: + * + *

+ * <pre>
+ * {@code
+ * rle-bit-packed-hybrid: <length> <encoded-data>
+ * length := length of the <encoded-data> in bytes stored as 4 bytes little endian
+ * encoded-data := <run>*
+ * run := <bit-packed-run> | <rle-run>
+ * bit-packed-run := <bit-packed-header> <bit-packed-values>
+ * bit-packed-header := varint-encode(<bit-pack-count> << 1 | 1)
+ * // we always bit-pack a multiple of 8 values at a time, so we only store the number of values / 8
+ * bit-pack-count := (number of values in this run) / 8
+ * bit-packed-values := bit packed back to back, from LSB to MSB
+ * rle-run := <rle-header> <repeated-value>
+ * rle-header := varint-encode((number of times repeated) << 1)
+ * repeated-value := value that is repeated, using a fixed-width of round-up-to-next-byte(bit-width)
+ * }
+ * </pre>
+ * NOTE: this class is only responsible for creating and returning the {@code <encoded-data>}
+ * portion of the above grammar. The {@code <length>} portion is done by
+ * {@link RunLengthBitPackingHybridValuesWriter}
+ *

+ * Only supports positive values (including 0) + */ +public class RunLengthBitPackingHybridEncoder +{ + private static final int INITIAL_SLAB_SIZE = 64; + + private final BytePacker packer; + private final CapacityByteArrayOutputStream baos; + + /** + * The bit width used for bit-packing and for writing + * the repeated-value + */ + private final int bitWidth; + /** + * Values that are bit-packed 8 at a time are packed into this + * buffer, which is then written to baos + */ + private final byte[] packBuffer; + /** + * Previous value written, used to detect repeated values + */ + private int previousValue; + + /** + * We buffer 8 values at a time, and either bit pack them + * or discard them after writing a rle-run + */ + private final int[] bufferedValues; + private int numBufferedValues; + + /** + * How many times a value has been repeated + */ + private int repeatCount; + /** + * How many groups of 8 values have been written + * to the current bit-packed-run + */ + private int bitPackedGroupCount; + + /** + * A "pointer" to a single byte in baos, + * which we use as our bit-packed-header. It's really + * the logical index of the byte in baos. + *

+ * We are only using one byte for this header, + * which limits us to writing 504 values per bit-packed-run. + *

+ * MSB must be 0 for varint encoding, LSB must be 1 to signify + * that this is a bit-packed-header leaves 6 bits to write the + * number of 8-groups -> (2^6 - 1) * 8 = 504 + */ + private long bitPackedRunHeaderPointer; + private boolean toBytesCalled; + + public RunLengthBitPackingHybridEncoder(int bitWidth, int maxCapacityHint) + { + checkArgument(bitWidth >= 0 && bitWidth <= 32, "bitWidth must be >= 0 and <= 32"); + + this.bitWidth = bitWidth; + this.baos = new CapacityByteArrayOutputStream(INITIAL_SLAB_SIZE, maxCapacityHint, new HeapByteBufferAllocator()); + this.packBuffer = new byte[bitWidth]; + this.bufferedValues = new int[8]; + this.packer = Packer.LITTLE_ENDIAN.newBytePacker(bitWidth); + reset(false); + } + + private void reset(boolean resetBaos) + { + if (resetBaos) { + this.baos.reset(); + } + this.previousValue = 0; + this.numBufferedValues = 0; + this.repeatCount = 0; + this.bitPackedGroupCount = 0; + this.bitPackedRunHeaderPointer = -1; + this.toBytesCalled = false; + } + + public void writeInt(int value) + throws IOException + { + writeRepeatedInteger(value, 1); + } + + public void writeRepeatedInteger(int value, int valueRepetitions) + throws IOException + { + if (valueRepetitions == 0) { + return; + } + // Process 1st occurrence of new value + if (value != previousValue) { + // This is a new value, check if it signals the end of an rle-run + if (repeatCount >= 8) { + // it does! write an rle-run + writeRleRun(); + } + + // this is a new value so we've only seen it once + repeatCount = 1; + valueRepetitions--; + // start tracking this value for repeats + previousValue = value; + + bufferedValues[numBufferedValues++] = value; + if (numBufferedValues == 8) { + // we've encountered less than 8 repeated values, so + // either start a new bit-packed-run or append to the + // current bit-packed-run + writeOrAppendBitPackedRun(); + // we're going to see this value at least 8 times, so + // just count remaining repeats for an rle-run + if (valueRepetitions >= 8) { + repeatCount = valueRepetitions; + return; + } + } + } + + // Process remaining repetitions of value + while (valueRepetitions > 0) { + repeatCount++; + valueRepetitions--; + if (repeatCount >= 8) { + // we've seen this at least 8 times, we're + // certainly going to write an rle-run, + // so just keep on counting repeats for now + repeatCount += valueRepetitions; + return; + } + + bufferedValues[numBufferedValues++] = value; + if (numBufferedValues == 8) { + // we've encountered less than 8 repeated values, so + // either start a new bit-packed-run or append to the + // current bit-packed-run + writeOrAppendBitPackedRun(); + if (valueRepetitions >= 8) { + // we're going to see this value at least 8 times, so + // just count remaining repeats for an rle-run + repeatCount = valueRepetitions; + return; + } + } + } + } + + private void writeOrAppendBitPackedRun() + throws IOException + { + if (bitPackedGroupCount >= 63) { + // we've packed as many values as we can for this run, + // end it and start a new one + endPreviousBitPackedRun(); + } + + if (bitPackedRunHeaderPointer == -1) { + // this is a new bit-packed-run, allocate a byte for the header + // and keep a "pointer" to it so that it can be mutated later + baos.write(0); // write a sentinel value + bitPackedRunHeaderPointer = baos.getCurrentIndex(); + } + + packer.pack8Values(bufferedValues, 0, packBuffer, 0); + baos.write(packBuffer); + + // empty the buffer, they've all been written + numBufferedValues = 0; + + // clear the repeat count, as some repeated values + 
// may have just been bit packed into this run + repeatCount = 0; + + ++bitPackedGroupCount; + } + + /** + * If we are currently writing a bit-packed-run, update the + * bit-packed-header and consider this run to be over + *

+ * does nothing if we're not currently writing a bit-packed run + */ + private void endPreviousBitPackedRun() + { + if (bitPackedRunHeaderPointer == -1) { + // we're not currently in a bit-packed-run + return; + } + + // create bit-packed-header, which needs to fit in 1 byte + byte bitPackHeader = (byte) ((bitPackedGroupCount << 1) | 1); + + // update this byte + baos.setByte(bitPackedRunHeaderPointer, bitPackHeader); + + // mark that this run is over + bitPackedRunHeaderPointer = -1; + + // reset the number of groups + bitPackedGroupCount = 0; + } + + private void writeRleRun() + throws IOException + { + // we may have been working on a bit-packed-run + // so close that run if it exists before writing this + // rle-run + endPreviousBitPackedRun(); + + // write the rle-header (lsb of 0 signifies a rle run) + BytesUtils.writeUnsignedVarInt(repeatCount << 1, baos); + // write the repeated-value + BytesUtils.writeIntLittleEndianPaddedOnBitWidth(baos, previousValue, bitWidth); + + // reset the repeat count + repeatCount = 0; + + // throw away all the buffered values, they were just repeats and they've been written + numBufferedValues = 0; + } + + public BytesInput toBytes() + throws IOException + { + checkArgument(!toBytesCalled, "You cannot call toBytes() more than once without calling reset()"); + + // write anything that is buffered / queued up for an rle-run + if (repeatCount >= 8) { + writeRleRun(); + } + else if (numBufferedValues > 0) { + for (int i = numBufferedValues; i < 8; i++) { + bufferedValues[i] = 0; + } + writeOrAppendBitPackedRun(); + endPreviousBitPackedRun(); + } + else { + endPreviousBitPackedRun(); + } + + toBytesCalled = true; + return BytesInput.from(baos); + } + + /** + * Reset this encoder for re-use + */ + public void reset() + { + reset(true); + } + + public long getBufferedSize() + { + return baos.size(); + } + + public long getAllocatedSize() + { + return baos.getCapacity(); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/RunLengthBitPackingHybridValuesWriter.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/RunLengthBitPackingHybridValuesWriter.java new file mode 100644 index 000000000000..4129b72ce481 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/RunLengthBitPackingHybridValuesWriter.java @@ -0,0 +1,93 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.parquet.writer.valuewriter; + +import org.apache.parquet.bytes.BytesInput; +import org.apache.parquet.column.Encoding; +import org.apache.parquet.io.ParquetEncodingException; + +import java.io.IOException; + +import static java.lang.Math.toIntExact; +import static org.apache.parquet.column.Encoding.RLE; + +public class RunLengthBitPackingHybridValuesWriter + implements ColumnDescriptorValuesWriter +{ + private final RunLengthBitPackingHybridEncoder encoder; + + public RunLengthBitPackingHybridValuesWriter(int bitWidth, int maxCapacityHint) + { + this.encoder = new RunLengthBitPackingHybridEncoder(bitWidth, maxCapacityHint); + } + + @Override + public void writeInteger(int value) + { + try { + encoder.writeInt(value); + } + catch (IOException e) { + throw new ParquetEncodingException(e); + } + } + + @Override + public void writeRepeatInteger(int value, int valueRepetitions) + { + try { + encoder.writeRepeatedInteger(value, valueRepetitions); + } + catch (IOException e) { + throw new ParquetEncodingException(e); + } + } + + @Override + public long getBufferedSize() + { + return encoder.getBufferedSize(); + } + + @Override + public long getAllocatedSize() + { + return encoder.getAllocatedSize(); + } + + @Override + public BytesInput getBytes() + { + try { + // prepend the length of the column + BytesInput rle = encoder.toBytes(); + return BytesInput.concat(BytesInput.fromInt(toIntExact(rle.size())), rle); + } + catch (IOException e) { + throw new ParquetEncodingException(e); + } + } + + @Override + public Encoding getEncoding() + { + return RLE; + } + + @Override + public void reset() + { + encoder.reset(); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/TimeMicrosValueWriter.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/TimeMicrosValueWriter.java new file mode 100644 index 000000000000..8f49571779a7 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/TimeMicrosValueWriter.java @@ -0,0 +1,47 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
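For illustration only (not part of the patch): a minimal sketch of what RunLengthBitPackingHybridEncoder above emits for a single rle-run, using only methods defined in this patch; the byte values follow directly from the grammar in the class Javadoc.

```java
// Illustrative only: encoding the value 4 repeated 10 times with bit width 3.
RunLengthBitPackingHybridEncoder encoder = new RunLengthBitPackingHybridEncoder(3, 1024);
for (int i = 0; i < 10; i++) {
    encoder.writeInt(4);                           // throws IOException
}
byte[] encodedData = encoder.toBytes().toByteArray();
// A single rle-run per the grammar in the class Javadoc:
//   rle-header     = varint(10 << 1)                            -> 0x14
//   repeated-value = 4, padded to round-up-to-next-byte(3 bits) -> 0x04
// so encodedData is {0x14, 0x04}. RunLengthBitPackingHybridValuesWriter.getBytes()
// then prepends the 4-byte little-endian length of the encoded data: 02 00 00 00.
```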
+ */ +package io.trino.parquet.writer.valuewriter; + +import io.trino.spi.block.Block; +import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.column.values.ValuesWriter; +import org.apache.parquet.schema.PrimitiveType; + +import static io.trino.spi.type.TimeType.TIME_MICROS; +import static io.trino.spi.type.Timestamps.PICOSECONDS_PER_MICROSECOND; +import static java.util.Objects.requireNonNull; + +public class TimeMicrosValueWriter + extends PrimitiveValueWriter +{ + public TimeMicrosValueWriter(ValuesWriter valuesWriter, PrimitiveType parquetType) + { + super(parquetType, valuesWriter); + } + + @Override + public void write(Block block) + { + ValuesWriter valuesWriter = requireNonNull(getValuesWriter(), "valuesWriter is null"); + Statistics statistics = requireNonNull(getStatistics(), "statistics is null"); + boolean mayHaveNull = block.mayHaveNull(); + for (int i = 0; i < block.getPositionCount(); i++) { + if (!mayHaveNull || !block.isNull(i)) { + long scaledValue = TIME_MICROS.getLong(block, i) / PICOSECONDS_PER_MICROSECOND; + valuesWriter.writeLong(scaledValue); + statistics.updateStats(scaledValue); + } + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/TimestampMillisValueWriter.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/TimestampMillisValueWriter.java new file mode 100644 index 000000000000..730cea0efa54 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/TimestampMillisValueWriter.java @@ -0,0 +1,51 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.parquet.writer.valuewriter; + +import io.trino.spi.block.Block; +import io.trino.spi.type.Type; +import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.column.values.ValuesWriter; +import org.apache.parquet.schema.PrimitiveType; + +import static io.trino.spi.type.Timestamps.MICROSECONDS_PER_MILLISECOND; +import static java.lang.Math.floorDiv; +import static java.util.Objects.requireNonNull; + +public class TimestampMillisValueWriter + extends PrimitiveValueWriter +{ + private final Type type; + + public TimestampMillisValueWriter(ValuesWriter valuesWriter, Type type, PrimitiveType parquetType) + { + super(parquetType, valuesWriter); + this.type = requireNonNull(type, "type is null"); + } + + @Override + public void write(Block block) + { + ValuesWriter valuesWriter = requireNonNull(getValuesWriter(), "valuesWriter is null"); + Statistics statistics = requireNonNull(getStatistics(), "statistics is null"); + boolean mayHaveNull = block.mayHaveNull(); + for (int i = 0; i < block.getPositionCount(); i++) { + if (!mayHaveNull || !block.isNull(i)) { + long scaledValue = floorDiv(type.getLong(block, i), MICROSECONDS_PER_MILLISECOND); + valuesWriter.writeLong(scaledValue); + statistics.updateStats(scaledValue); + } + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/TimestampNanosValueWriter.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/TimestampNanosValueWriter.java new file mode 100644 index 000000000000..7bf953179a16 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/TimestampNanosValueWriter.java @@ -0,0 +1,57 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.parquet.writer.valuewriter; + +import com.google.common.math.LongMath; +import io.trino.spi.block.Block; +import io.trino.spi.type.LongTimestamp; +import io.trino.spi.type.Type; +import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.column.values.ValuesWriter; +import org.apache.parquet.schema.PrimitiveType; + +import static io.trino.spi.type.Timestamps.NANOSECONDS_PER_MICROSECOND; +import static io.trino.spi.type.Timestamps.PICOSECONDS_PER_NANOSECOND; +import static java.lang.Math.multiplyExact; +import static java.math.RoundingMode.UNNECESSARY; +import static java.util.Objects.requireNonNull; + +public class TimestampNanosValueWriter + extends PrimitiveValueWriter +{ + private final Type type; + + public TimestampNanosValueWriter(ValuesWriter valuesWriter, Type type, PrimitiveType parquetType) + { + super(parquetType, valuesWriter); + this.type = requireNonNull(type, "type is null"); + } + + @Override + public void write(Block block) + { + ValuesWriter valuesWriter = requireNonNull(getValuesWriter(), "valuesWriter is null"); + Statistics statistics = requireNonNull(getStatistics(), "statistics is null"); + boolean mayHaveNull = block.mayHaveNull(); + for (int i = 0; i < block.getPositionCount(); i++) { + if (!mayHaveNull || !block.isNull(i)) { + LongTimestamp value = (LongTimestamp) type.getObject(block, i); + long epochNanos = multiplyExact(value.getEpochMicros(), NANOSECONDS_PER_MICROSECOND) + + LongMath.divide(value.getPicosOfMicro(), PICOSECONDS_PER_NANOSECOND, UNNECESSARY); + valuesWriter.writeLong(epochNanos); + statistics.updateStats(epochNanos); + } + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/TimestampTzMicrosValueWriter.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/TimestampTzMicrosValueWriter.java new file mode 100644 index 000000000000..2a03a805f9e5 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/TimestampTzMicrosValueWriter.java @@ -0,0 +1,56 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.parquet.writer.valuewriter; + +import io.trino.spi.block.Block; +import io.trino.spi.type.LongTimestampWithTimeZone; +import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.column.values.ValuesWriter; +import org.apache.parquet.schema.PrimitiveType; + +import static io.trino.spi.type.TimestampWithTimeZoneType.TIMESTAMP_TZ_MICROS; +import static io.trino.spi.type.Timestamps.MICROSECONDS_PER_MILLISECOND; +import static io.trino.spi.type.Timestamps.PICOSECONDS_PER_MICROSECOND; +import static io.trino.spi.type.Timestamps.roundDiv; +import static java.util.Objects.requireNonNull; + +public class TimestampTzMicrosValueWriter + extends PrimitiveValueWriter +{ + public TimestampTzMicrosValueWriter(ValuesWriter valuesWriter, PrimitiveType parquetType) + { + super(parquetType, valuesWriter); + } + + @Override + public void write(Block block) + { + ValuesWriter valuesWriter = requireNonNull(getValuesWriter(), "valuesWriter is null"); + Statistics statistics = requireNonNull(getStatistics(), "statistics is null"); + boolean mayHaveNull = block.mayHaveNull(); + for (int i = 0; i < block.getPositionCount(); i++) { + if (!mayHaveNull || !block.isNull(i)) { + long micros = toMicros((LongTimestampWithTimeZone) TIMESTAMP_TZ_MICROS.getObject(block, i)); + valuesWriter.writeLong(micros); + statistics.updateStats(micros); + } + } + } + + private static long toMicros(LongTimestampWithTimeZone timestamp) + { + return (timestamp.getEpochMillis() * MICROSECONDS_PER_MILLISECOND) + + roundDiv(timestamp.getPicosOfMilli(), PICOSECONDS_PER_MICROSECOND); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/TimestampTzMillisValueWriter.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/TimestampTzMillisValueWriter.java new file mode 100644 index 000000000000..6a5fbb92ebec --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/TimestampTzMillisValueWriter.java @@ -0,0 +1,47 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
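For illustration only (not part of the patch): the arithmetic in TimestampTzMicrosValueWriter.toMicros above, spelled out for one example value (the numbers are arbitrary).

```java
// Illustrative only: combining epoch millis and picos-of-milli into epoch micros.
long epochMillis = 1_700_000_000_123L;            // UTC millis from LongTimestampWithTimeZone
int picosOfMilli = 456_000_000;                   // 456 microseconds, expressed in picoseconds
long micros = epochMillis * 1_000                 // MICROSECONDS_PER_MILLISECOND
        + (picosOfMilli + 500_000) / 1_000_000;   // roundDiv by PICOSECONDS_PER_MICROSECOND
// micros == 1_700_000_000_123_456L
```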
+ */ +package io.trino.parquet.writer.valuewriter; + +import io.trino.spi.block.Block; +import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.column.values.ValuesWriter; +import org.apache.parquet.schema.PrimitiveType; + +import static io.trino.spi.type.DateTimeEncoding.unpackMillisUtc; +import static io.trino.spi.type.TimestampWithTimeZoneType.TIMESTAMP_TZ_MILLIS; +import static java.util.Objects.requireNonNull; + +public class TimestampTzMillisValueWriter + extends PrimitiveValueWriter +{ + public TimestampTzMillisValueWriter(ValuesWriter valuesWriter, PrimitiveType parquetType) + { + super(parquetType, valuesWriter); + } + + @Override + public void write(Block block) + { + ValuesWriter valuesWriter = requireNonNull(getValuesWriter(), "valuesWriter is null"); + Statistics statistics = requireNonNull(getStatistics(), "statistics is null"); + boolean mayHaveNull = block.mayHaveNull(); + for (int i = 0; i < block.getPositionCount(); i++) { + if (!mayHaveNull || !block.isNull(i)) { + long millis = unpackMillisUtc(TIMESTAMP_TZ_MILLIS.getLong(block, i)); + valuesWriter.writeLong(millis); + statistics.updateStats(millis); + } + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/TrinoValuesWriterFactory.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/TrinoValuesWriterFactory.java new file mode 100644 index 000000000000..bbcc966d2763 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/TrinoValuesWriterFactory.java @@ -0,0 +1,140 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.parquet.writer.valuewriter; + +import org.apache.parquet.bytes.HeapByteBufferAllocator; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.Encoding; +import org.apache.parquet.column.values.ValuesWriter; +import org.apache.parquet.column.values.bloomfilter.BloomFilter; +import org.apache.parquet.column.values.dictionary.DictionaryValuesWriter; +import org.apache.parquet.column.values.plain.BooleanPlainValuesWriter; +import org.apache.parquet.column.values.plain.FixedLenByteArrayPlainValuesWriter; +import org.apache.parquet.column.values.plain.PlainValuesWriter; + +import java.util.Optional; + +import static io.trino.parquet.writer.valuewriter.BloomFilterValuesWriter.createBloomFilterValuesWriter; +import static org.apache.parquet.column.Encoding.PLAIN_DICTIONARY; + +/** + * Based on org.apache.parquet.column.values.factory.DefaultV1ValuesWriterFactory + */ +public class TrinoValuesWriterFactory +{ + private static final int INITIAL_SLAB_SIZE = 64; + + private final int maxPageSize; + private final int maxDictionaryPageSize; + + public TrinoValuesWriterFactory(int maxPageSize, int maxDictionaryPageSize) + { + this.maxPageSize = maxPageSize; + this.maxDictionaryPageSize = maxDictionaryPageSize; + } + + public ValuesWriter newValuesWriter(ColumnDescriptor descriptor, Optional bloomFilter) + { + return switch (descriptor.getPrimitiveType().getPrimitiveTypeName()) { + case BOOLEAN -> new BooleanPlainValuesWriter(); // no dictionary encoding for boolean + case FIXED_LEN_BYTE_ARRAY -> getFixedLenByteArrayValuesWriter(descriptor, bloomFilter); + case BINARY -> getBinaryValuesWriter(descriptor, bloomFilter); + case INT32 -> getInt32ValuesWriter(descriptor, bloomFilter); + case INT64 -> getInt64ValuesWriter(descriptor, bloomFilter); + case INT96 -> getInt96ValuesWriter(descriptor, bloomFilter); + case DOUBLE -> getDoubleValuesWriter(descriptor, bloomFilter); + case FLOAT -> getFloatValuesWriter(descriptor, bloomFilter); + }; + } + + private ValuesWriter getFixedLenByteArrayValuesWriter(ColumnDescriptor path, Optional bloomFilter) + { + // dictionary encoding was not enabled in PARQUET 1.0 + return createBloomFilterValuesWriter(new FixedLenByteArrayPlainValuesWriter(path.getPrimitiveType().getTypeLength(), INITIAL_SLAB_SIZE, maxPageSize, new HeapByteBufferAllocator()), bloomFilter); + } + + private ValuesWriter getBinaryValuesWriter(ColumnDescriptor path, Optional bloomFilter) + { + ValuesWriter fallbackWriter = new PlainValuesWriter(INITIAL_SLAB_SIZE, maxPageSize, new HeapByteBufferAllocator()); + return createBloomFilterValuesWriter(dictWriterWithFallBack(path, getEncodingForDictionaryPage(), getEncodingForDataPage(), fallbackWriter), bloomFilter); + } + + private ValuesWriter getInt32ValuesWriter(ColumnDescriptor path, Optional bloomFilter) + { + ValuesWriter fallbackWriter = new PlainValuesWriter(INITIAL_SLAB_SIZE, maxPageSize, new HeapByteBufferAllocator()); + return createBloomFilterValuesWriter(dictWriterWithFallBack(path, getEncodingForDictionaryPage(), getEncodingForDataPage(), fallbackWriter), bloomFilter); + } + + private ValuesWriter getInt64ValuesWriter(ColumnDescriptor path, Optional bloomFilter) + { + ValuesWriter fallbackWriter = new PlainValuesWriter(INITIAL_SLAB_SIZE, maxPageSize, new HeapByteBufferAllocator()); + return createBloomFilterValuesWriter(dictWriterWithFallBack(path, getEncodingForDictionaryPage(), getEncodingForDataPage(), fallbackWriter), bloomFilter); + } + + private ValuesWriter 
getInt96ValuesWriter(ColumnDescriptor path, Optional bloomFilter) + { + ValuesWriter fallbackWriter = new FixedLenByteArrayPlainValuesWriter(12, INITIAL_SLAB_SIZE, maxPageSize, new HeapByteBufferAllocator()); + return createBloomFilterValuesWriter(dictWriterWithFallBack(path, getEncodingForDictionaryPage(), getEncodingForDataPage(), fallbackWriter), bloomFilter); + } + + private ValuesWriter getDoubleValuesWriter(ColumnDescriptor path, Optional bloomFilter) + { + ValuesWriter fallbackWriter = new PlainValuesWriter(INITIAL_SLAB_SIZE, maxPageSize, new HeapByteBufferAllocator()); + return createBloomFilterValuesWriter(dictWriterWithFallBack(path, getEncodingForDictionaryPage(), getEncodingForDataPage(), fallbackWriter), bloomFilter); + } + + private ValuesWriter getFloatValuesWriter(ColumnDescriptor path, Optional bloomFilter) + { + ValuesWriter fallbackWriter = new PlainValuesWriter(INITIAL_SLAB_SIZE, maxPageSize, new HeapByteBufferAllocator()); + return createBloomFilterValuesWriter(dictWriterWithFallBack(path, getEncodingForDictionaryPage(), getEncodingForDataPage(), fallbackWriter), bloomFilter); + } + + @SuppressWarnings("deprecation") + private static Encoding getEncodingForDataPage() + { + return PLAIN_DICTIONARY; + } + + @SuppressWarnings("deprecation") + private static Encoding getEncodingForDictionaryPage() + { + return PLAIN_DICTIONARY; + } + + private DictionaryValuesWriter dictionaryWriter(ColumnDescriptor path, Encoding dictPageEncoding, Encoding dataPageEncoding) + { + return switch (path.getPrimitiveType().getPrimitiveTypeName()) { + case BOOLEAN -> throw new IllegalArgumentException("no dictionary encoding for BOOLEAN"); + case BINARY -> + new DictionaryValuesWriter.PlainBinaryDictionaryValuesWriter(maxDictionaryPageSize, dataPageEncoding, dictPageEncoding, new HeapByteBufferAllocator()); + case INT32 -> + new DictionaryValuesWriter.PlainIntegerDictionaryValuesWriter(maxDictionaryPageSize, dataPageEncoding, dictPageEncoding, new HeapByteBufferAllocator()); + case INT64 -> + new DictionaryValuesWriter.PlainLongDictionaryValuesWriter(maxDictionaryPageSize, dataPageEncoding, dictPageEncoding, new HeapByteBufferAllocator()); + case INT96 -> + new DictionaryValuesWriter.PlainFixedLenArrayDictionaryValuesWriter(maxDictionaryPageSize, 12, dataPageEncoding, dictPageEncoding, new HeapByteBufferAllocator()); + case DOUBLE -> + new DictionaryValuesWriter.PlainDoubleDictionaryValuesWriter(maxDictionaryPageSize, dataPageEncoding, dictPageEncoding, new HeapByteBufferAllocator()); + case FLOAT -> + new DictionaryValuesWriter.PlainFloatDictionaryValuesWriter(maxDictionaryPageSize, dataPageEncoding, dictPageEncoding, new HeapByteBufferAllocator()); + case FIXED_LEN_BYTE_ARRAY -> + new DictionaryValuesWriter.PlainFixedLenArrayDictionaryValuesWriter(maxDictionaryPageSize, path.getPrimitiveType().getTypeLength(), dataPageEncoding, dictPageEncoding, new HeapByteBufferAllocator()); + }; + } + + private ValuesWriter dictWriterWithFallBack(ColumnDescriptor path, Encoding dictPageEncoding, Encoding dataPageEncoding, ValuesWriter writerToFallBackTo) + { + return new DictionaryFallbackValuesWriter(dictionaryWriter(path, dictPageEncoding, dataPageEncoding), writerToFallBackTo); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/UuidValueWriter.java b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/UuidValueWriter.java new file mode 100644 index 000000000000..dde410772bc5 --- /dev/null +++ 
b/plugin/trino-iceberg/src/main/java/io/trino/parquet/writer/valuewriter/UuidValueWriter.java @@ -0,0 +1,51 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.parquet.writer.valuewriter; + +import io.airlift.slice.Slice; +import io.trino.spi.block.Block; +import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.column.values.ValuesWriter; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.PrimitiveType; + +import static io.trino.spi.type.UuidType.UUID; +import static java.util.Objects.requireNonNull; + +public class UuidValueWriter + extends PrimitiveValueWriter +{ + public UuidValueWriter(ValuesWriter valuesWriter, PrimitiveType parquetType) + { + super(parquetType, valuesWriter); + } + + @Override + public void write(Block block) + { + ValuesWriter valuesWriter = requireNonNull(getValuesWriter(), "valuesWriter is null"); + Statistics statistics = requireNonNull(getStatistics(), "statistics is null"); + boolean mayHaveNull = block.mayHaveNull(); + for (int i = 0; i < block.getPositionCount(); i++) { + if (!mayHaveNull || !block.isNull(i)) { + Slice slice = UUID.getSlice(block, i); + // fromReusedByteArray must be used instead of fromConstantByteArray to avoid retaining entire + // base byte array of the Slice in DictionaryValuesWriter.PlainBinaryDictionaryValuesWriter + Binary binary = Binary.fromReusedByteArray(slice.byteArray(), slice.byteArrayOffset(), slice.length()); + valuesWriter.writeBytes(binary); + statistics.updateStats(binary); + } + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/base/util/ExecutorUtil.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/base/util/ExecutorUtil.java new file mode 100644 index 000000000000..b605c585b37d --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/base/util/ExecutorUtil.java @@ -0,0 +1,144 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.base.util; + +import com.google.errorprone.annotations.ThreadSafe; +import com.google.errorprone.annotations.concurrent.GuardedBy; +import io.opentelemetry.context.Context; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.concurrent.Callable; +import java.util.concurrent.CompletionService; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Executor; +import java.util.concurrent.ExecutorCompletionService; +import java.util.concurrent.Future; + +import static com.google.common.collect.ImmutableList.toImmutableList; +import static java.util.Collections.nCopies; +import static java.util.Objects.requireNonNull; + +public final class ExecutorUtil +{ + private ExecutorUtil() {} + + /** + * Process tasks in executors and additionally in calling thread. + * Upon task execution failure, other tasks are canceled and interrupted, but not waited + * for. + *

+ * This method propagates {@link Context#current()} into tasks it starts within the executor. + *

+ * Note: using this method allows simple parallelization of tasks within executor, when sub-tasks + * are also scheduled in that executor, without risking starvation when pool is saturated. + * + * @throws ExecutionException if any task fails; exception cause is the first task failure + */ + public static List processWithAdditionalThreads(Collection> tasks, Executor executor) + throws ExecutionException + { + List> wrapped = tasks.stream() + .map(Task::new) + .collect(toImmutableList()); + CompletionService> completionService = new ExecutorCompletionService<>(executor); + List> futures = new ArrayList<>(wrapped.size()); + Context tracingContext = Context.current(); + + try { + // schedule in the executor + for (int i = 0; i < wrapped.size(); i++) { + int index = i; + Task task = wrapped.get(i); + futures.add(completionService.submit(() -> { + if (!task.take()) { + return null; // will be ignored + } + try (var ignore = tracingContext.makeCurrent()) { + return new TaskResult<>(index, task.callable.call()); + } + })); + } + + List results = new ArrayList<>(nCopies(wrapped.size(), null)); + int pending = wrapped.size(); + // process in the calling thread (in reverse order, as an optimization) + for (int i = wrapped.size() - 1; i >= 0; i--) { + // process ready results to fail fast on exceptions + for (Future> ready = completionService.poll(); ready != null; ready = completionService.poll()) { + TaskResult taskResult = ready.get(); + // Null result means task was processed by the calling thread + if (taskResult != null) { + results.set(taskResult.taskIndex(), taskResult.result()); + pending--; + } + } + Task task = wrapped.get(i); + if (!task.take()) { + continue; + } + try { + results.set(i, task.callable.call()); + pending--; + } + catch (Exception e) { + throw new ExecutionException(e); + } + } + + while (pending > 0) { + TaskResult taskResult = completionService.take().get(); + // Null result means task was processed by the calling thread + if (taskResult != null) { + results.set(taskResult.taskIndex(), taskResult.result()); + pending--; + } + } + + return results; + } + catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new RuntimeException("Interrupted", e); + } + finally { + futures.forEach(future -> future.cancel(true)); + } + } + + @ThreadSafe + private static final class Task + { + private final Callable callable; + @GuardedBy("this") + private boolean taken; + + public Task(Callable callable) + { + this.callable = requireNonNull(callable, "callable is null"); + } + + public synchronized boolean take() + { + if (taken) { + return false; + } + taken = true; + return true; + } + } + + private record TaskResult(int taskIndex, T result) {} +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/hive/Schema.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/hive/Schema.java new file mode 100644 index 000000000000..9ed591927af5 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/hive/Schema.java @@ -0,0 +1,43 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.hive; + +import io.airlift.slice.SizeOf; + +import java.util.Map; + +import static io.airlift.slice.SizeOf.estimatedSizeOf; +import static io.airlift.slice.SizeOf.instanceSize; +import static java.util.Objects.requireNonNull; + +public record Schema( + String serializationLibraryName, + boolean isFullAcidTable, + Map serdeProperties) +{ + private static final int INSTANCE_SIZE = instanceSize(Schema.class); + + public Schema + { + requireNonNull(serializationLibraryName, "serializationLibraryName is null"); + requireNonNull(serdeProperties, "serdeProperties is null"); + } + + public long getRetainedSizeInBytes() + { + return INSTANCE_SIZE + + estimatedSizeOf(serializationLibraryName) + + estimatedSizeOf(serdeProperties, SizeOf::estimatedSizeOf, SizeOf::estimatedSizeOf); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/hive/TransformConnectorPageSource.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/hive/TransformConnectorPageSource.java new file mode 100644 index 000000000000..2bb2e9f80272 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/hive/TransformConnectorPageSource.java @@ -0,0 +1,374 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
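For illustration only (not part of the patch): a hypothetical caller of ExecutorUtil.processWithAdditionalThreads introduced above. The task bodies, the Partition result type, and the pool size are placeholders, and the generic result type is written out explicitly here as an assumption about the method's type parameters.

```java
// Illustrative only: loadPartition(name) and Partition are stand-ins for real work and its result.
ExecutorService executor = Executors.newFixedThreadPool(4);
List<Callable<Partition>> tasks = new ArrayList<>();
for (String name : partitionNames) {
    tasks.add(() -> loadPartition(name));
}
try {
    // Results come back in task order; some tasks run on pool threads, the rest on the calling
    // thread, so progress is made even when the pool is saturated by other work.
    List<Partition> partitions = ExecutorUtil.processWithAdditionalThreads(tasks, executor);
}
catch (ExecutionException e) {
    // the cause is the first task failure; remaining tasks were cancelled and interrupted
    throw new RuntimeException(e.getCause());
}
```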
+ */ +package io.trino.plugin.hive; + +import com.google.common.collect.ImmutableList; +import com.google.errorprone.annotations.CanIgnoreReturnValue; +import com.google.errorprone.annotations.CheckReturnValue; +import io.trino.spi.Page; +import io.trino.spi.block.Block; +import io.trino.spi.block.RunLengthEncodedBlock; +import io.trino.spi.connector.ConnectorPageSource; +import io.trino.spi.metrics.Metrics; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; +import java.util.OptionalLong; +import java.util.concurrent.CompletableFuture; +import java.util.function.Function; +import java.util.function.ObjLongConsumer; + +import static com.google.common.base.Preconditions.checkArgument; +import static io.airlift.slice.SizeOf.instanceSize; +import static io.airlift.slice.SizeOf.sizeOf; +import static io.trino.plugin.base.util.Closables.closeAllSuppress; +import static io.trino.plugin.iceberg.IcebergUtil.getRowFieldsFromBlock; +import static java.util.Objects.requireNonNull; + +public final class TransformConnectorPageSource + implements ConnectorPageSource +{ + private final ConnectorPageSource connectorPageSource; + private final Function transform; + + @CheckReturnValue + public static TransformConnectorPageSource create(ConnectorPageSource connectorPageSource, Function transform) + { + return new TransformConnectorPageSource(connectorPageSource, transform); + } + + private TransformConnectorPageSource(ConnectorPageSource connectorPageSource, Function transform) + { + this.connectorPageSource = requireNonNull(connectorPageSource, "connectorPageSource is null"); + this.transform = requireNonNull(transform, "transform is null"); + } + + @Override + public long getCompletedBytes() + { + return connectorPageSource.getCompletedBytes(); + } + + @Override + public OptionalLong getCompletedPositions() + { + return connectorPageSource.getCompletedPositions(); + } + + @Override + public long getReadTimeNanos() + { + return connectorPageSource.getReadTimeNanos(); + } + + @Override + public boolean isFinished() + { + return connectorPageSource.isFinished(); + } + + @Override + public Page getNextPage() + { + try { + Page page = connectorPageSource.getNextPage(); + if (page == null) { + return null; + } + return transform.apply(page); + } + catch (Throwable e) { + closeAllSuppress(e, connectorPageSource); + throw e; + } + } + + @Override + public long getMemoryUsage() + { + return connectorPageSource.getMemoryUsage(); + } + + @Override + public void close() + throws IOException + { + connectorPageSource.close(); + } + + @Override + public CompletableFuture isBlocked() + { + return connectorPageSource.isBlocked(); + } + + @Override + public Metrics getMetrics() + { + return connectorPageSource.getMetrics(); + } + + @CheckReturnValue + public static Builder builder() + { + return new Builder(); + } + + public static final class Builder + { + private final List> transforms = new ArrayList<>(); + private boolean requiresTransform; + + private Builder() {} + + @CanIgnoreReturnValue + public Builder constantValue(Block constantValue) + { + requiresTransform = true; + transforms.add(new ConstantValue(constantValue)); + return this; + } + + @CanIgnoreReturnValue + public Builder column(int inputField) + { + return column(inputField, Optional.empty()); + } + + @CanIgnoreReturnValue + public Builder column(int inputField, Optional> transform) + { + if (transform.isPresent()) { + return transform(inputField, transform.get()); + } + + if (inputField != 
transforms.size()) { + requiresTransform = true; + } + transforms.add(new InputColumn(inputField)); + return this; + } + + @CanIgnoreReturnValue + public Builder dereferenceField(List path) + { + return dereferenceField(path, Optional.empty()); + } + + @CanIgnoreReturnValue + public Builder dereferenceField(List path, Optional> transform) + { + requireNonNull(path, "path is null"); + if (path.size() == 1) { + return column(path.get(0), transform); + } + + requiresTransform = true; + transforms.add(new DereferenceFieldTransform(path, transform)); + return this; + } + + @CanIgnoreReturnValue + public Builder transform(int inputColumn, Function transform) + { + requireNonNull(transform, "transform is null"); + requiresTransform = true; + transforms.add(new TransformBlock(transform, inputColumn)); + return this; + } + + @CanIgnoreReturnValue + public Builder transform(Function transform) + { + requiresTransform = true; + transforms.add(transform); + return this; + } + + @CheckReturnValue + public ConnectorPageSource build(ConnectorPageSource pageSource) + { + if (!requiresTransform) { + return pageSource; + } + + List> functions = List.copyOf(transforms); + return new TransformConnectorPageSource(pageSource, new TransformPages(functions)); + } + } + + private record ConstantValue(Block constantValue) + implements Function + { + @Override + public Block apply(Page page) + { + return RunLengthEncodedBlock.create(constantValue, page.getPositionCount()); + } + } + + private record InputColumn(int inputField) + implements Function + { + @Override + public Block apply(Page page) + { + return page.getBlock(inputField); + } + } + + private record DereferenceFieldTransform(List path, Optional> transform) + implements Function + { + private DereferenceFieldTransform + { + path = ImmutableList.copyOf(requireNonNull(path, "path is null")); + checkArgument(!path.isEmpty(), "path is empty"); + checkArgument(path.stream().allMatch(element -> element >= 0), "path element is negative"); + requireNonNull(transform, "transform is null"); + } + + @Override + public Block apply(Page sourcePage) + { + Block block = sourcePage.getBlock(path.get(0)); + for (int dereferenceIndex : path.subList(1, path.size())) { + block = getRowFieldsFromBlock(block).get(dereferenceIndex); + } + if (transform.isPresent()) { + block = transform.get().apply(block); + } + return block; + } + } + + private record TransformBlock(Function transform, int inputColumn) + implements Function + { + @Override + public Block apply(Page page) + { + return transform.apply(page.getBlock(inputColumn)); + } + } + + private record TransformPages(List> functions) + implements Function + { + private TransformPages + { + functions = List.copyOf(requireNonNull(functions, "functions is null")); + } + + @Override + public Page apply(Page page) + { + return new TransformSourcePage(page, functions).getPage(); + } + } + + private record TransformSourcePage( + Page sourcePage, + List> transforms, // not considered "retained" since the same list is shared between instances + Block[] blocks) + { + private static final long INSTANCE_SIZE = instanceSize(TransformSourcePage.class); + + private TransformSourcePage(Page sourcePage, List> transforms) + { + this(sourcePage, transforms, new Block[transforms.size()]); + } + + private TransformSourcePage + { + requireNonNull(sourcePage, "sourcePage is null"); + transforms = List.copyOf(requireNonNull(transforms, "transforms is null")); + requireNonNull(blocks, "blocks is null"); + checkArgument(transforms.size() == 
blocks.length, "transforms and blocks size mismatch"); + } + + //@Override + public int getPositionCount() + { + return sourcePage.getPositionCount(); + } + + //@Override + public long getSizeInBytes() + { + return sourcePage.getSizeInBytes(); + } + + //@Override + public long getRetainedSizeInBytes() + { + return INSTANCE_SIZE + + sizeOf(blocks) + + sourcePage.getRetainedSizeInBytes(); + } + + //@Override + public void retainedBytesForEachPart(ObjLongConsumer consumer) + { + consumer.accept(this, INSTANCE_SIZE); + consumer.accept(blocks, sizeOf(blocks)); + for (Block block : blocks) { + if (block != null) { + block.retainedBytesForEachPart(consumer); + } + } + //sourcePage.retainedBytesForEachPart(consumer); + } + + //@Override + public int getChannelCount() + { + return blocks.length; + } + + //@Override + public Block getBlock(int channel) + { + Block block = blocks[channel]; + if (block == null) { + block = transforms.get(channel).apply(sourcePage); + blocks[channel] = block; + } + return block; + } + + //@Override + public Page getPage() + { + for (int i = 0; i < blocks.length; i++) { + getBlock(i); + } + return new Page(getPositionCount(), blocks); + } + + //@Override + public void selectPositions(int[] positions, int offset, int size) + { + sourcePage.getPositions(positions, offset, size); + for (int i = 0; i < blocks.length; i++) { + Block block = blocks[i]; + if (block != null) { + blocks[i] = block.getPositions(positions, offset, size); + } + } + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/hive/parquet/MemoryParquetDataSource.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/hive/parquet/MemoryParquetDataSource.java new file mode 100644 index 000000000000..b5b848cb24b4 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/hive/parquet/MemoryParquetDataSource.java @@ -0,0 +1,151 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
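For illustration only (not part of the patch): a hypothetical projection assembled with TransformConnectorPageSource.builder() above; delegate, partitionValueBlock, and the column indexes are example stand-ins.

```java
// Illustrative only: pass one column through, add a constant column, and project a nested field.
ConnectorPageSource projected = TransformConnectorPageSource.builder()
        .column(0)                                 // pass source block 0 through unchanged
        .constantValue(partitionValueBlock)        // expanded to a run-length-encoded block per page
        .dereferenceField(ImmutableList.of(1, 2))  // source block 1, then nested row field 2
        .build(delegate);
// build() returns the delegate unchanged when every output is column(i) in input order,
// since no transform is needed in that case.
```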
+ */ +package io.trino.plugin.hive.parquet; + +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ListMultimap; +import io.airlift.slice.Slice; +import io.trino.filesystem.TrinoInput; +import io.trino.filesystem.TrinoInputFile; +import io.trino.memory.context.AggregatedMemoryContext; +import io.trino.memory.context.LocalMemoryContext; +import io.trino.parquet.ChunkReader; +import io.trino.parquet.DiskRange; +import io.trino.parquet.ParquetDataSource; +import io.trino.parquet.ParquetDataSourceId; +import io.trino.parquet.reader.ChunkedInputStream; +import io.trino.plugin.hive.FileFormatDataSourceStats; +import jakarta.annotation.Nullable; + +import java.io.IOException; +import java.util.Collection; +import java.util.List; +import java.util.Map; + +import static com.google.common.collect.ImmutableList.toImmutableList; +import static java.lang.Math.min; +import static java.lang.Math.toIntExact; +import static java.util.Objects.requireNonNull; + +public class MemoryParquetDataSource + implements ParquetDataSource +{ + private final ParquetDataSourceId id; + private final long readTimeNanos; + private final long readBytes; + private final LocalMemoryContext memoryUsage; + @Nullable + private Slice data; + + public MemoryParquetDataSource(TrinoInputFile inputFile, AggregatedMemoryContext memoryContext, FileFormatDataSourceStats stats) + throws IOException + { + try (TrinoInput input = inputFile.newInput()) { + long readStart = System.nanoTime(); + this.data = input.readTail(toIntExact(inputFile.length())); + this.readTimeNanos = System.nanoTime() - readStart; + stats.readDataBytesPerSecond(data.length(), readTimeNanos); + } + this.memoryUsage = memoryContext.newLocalMemoryContext(MemoryParquetDataSource.class.getSimpleName()); + this.memoryUsage.setBytes(data.length()); + this.readBytes = data.length(); + this.id = new ParquetDataSourceId(inputFile.location().toString()); + } + + @Override + public ParquetDataSourceId getId() + { + return id; + } + + @Override + public long getReadBytes() + { + return readBytes; + } + + @Override + public long getReadTimeNanos() + { + return readTimeNanos; + } + + @Override + public long getEstimatedSize() + { + return readBytes; + } + + @Override + public Slice readTail(int length) + { + int readSize = min(data.length(), length); + return readFully(data.length() - readSize, readSize); + } + + @Override + public final Slice readFully(long position, int length) + { + return data.slice(toIntExact(position), length); + } + + @Override + public Map planRead(ListMultimap diskRanges, AggregatedMemoryContext memoryContext) + { + requireNonNull(diskRanges, "diskRanges is null"); + + if (diskRanges.isEmpty()) { + return ImmutableMap.of(); + } + + ImmutableMap.Builder builder = ImmutableMap.builder(); + for (Map.Entry> entry : diskRanges.asMap().entrySet()) { + List chunkReaders = entry.getValue().stream() + .map(diskRange -> new ChunkReader() + { + @Override + public long getDiskOffset() + { + return diskRange.getOffset(); + } + + @Override + public Slice read() + { + return data.slice(toIntExact(diskRange.getOffset()), toIntExact(diskRange.getLength())); + } + + @Override + public void free() {} + }) + .collect(toImmutableList()); + builder.put(entry.getKey(), new ChunkedInputStream(chunkReaders)); + } + return builder.buildOrThrow(); + } + + @Override + public void close() + throws IOException + { + data = null; + memoryUsage.close(); + } + + @Override + public final String toString() + { + return id.toString(); + } +} diff --git 
a/plugin/trino-iceberg/src/main/java/io/trino/plugin/hive/parquet/ParquetPageSourceNew.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/hive/parquet/ParquetPageSourceNew.java new file mode 100644 index 000000000000..e2d490dc070b --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/hive/parquet/ParquetPageSourceNew.java @@ -0,0 +1,130 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.hive.parquet; + +import io.trino.parquet.ParquetCorruptionException; +import io.trino.parquet.ParquetDataSourceId; +import io.trino.parquet.reader.ParquetReader; +import io.trino.spi.Page; +import io.trino.spi.TrinoException; +import io.trino.spi.connector.ConnectorPageSource; +import io.trino.spi.metrics.Metrics; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.OptionalLong; + +import static io.trino.plugin.base.util.Closables.closeAllSuppress; +import static io.trino.plugin.hive.HiveErrorCode.HIVE_BAD_DATA; +import static io.trino.plugin.hive.HiveErrorCode.HIVE_CURSOR_ERROR; +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; + +public class ParquetPageSourceNew + implements ConnectorPageSource +{ + private final ParquetReader parquetReader; + + private boolean closed; + private long completedPositions; + + public ParquetPageSourceNew(ParquetReader parquetReader) + { + this.parquetReader = requireNonNull(parquetReader, "parquetReader is null"); + } + + @Override + public long getCompletedBytes() + { + return parquetReader.getDataSource().getReadBytes(); + } + + @Override + public OptionalLong getCompletedPositions() + { + return OptionalLong.of(completedPositions); + } + + @Override + public long getReadTimeNanos() + { + return parquetReader.getDataSource().getReadTimeNanos(); + } + + @Override + public boolean isFinished() + { + return closed; + } + + @Override + public long getMemoryUsage() + { + return parquetReader.getMemoryContext().getBytes(); + } + + @Override + public Page getNextPage() + { + Page page; + try { + page = parquetReader.nextPage(); + } + catch (IOException | RuntimeException e) { + closeAllSuppress(e, this); + throw handleException(parquetReader.getDataSource().getId(), e); + } + + if (closed || page == null) { + close(); + return null; + } + + completedPositions += page.getPositionCount(); + return page; + } + + @Override + public void close() + { + if (closed) { + return; + } + closed = true; + + try { + parquetReader.close(); + } + catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + @Override + public Metrics getMetrics() + { + return parquetReader.getMetrics(); + } + + static TrinoException handleException(ParquetDataSourceId dataSourceId, Exception exception) + { + if (exception instanceof TrinoException trinoException) { + return trinoException; + } + if (exception instanceof ParquetCorruptionException) { + return new TrinoException(HIVE_BAD_DATA, exception); + } + return new TrinoException(HIVE_CURSOR_ERROR, 
format("Failed to read Parquet file: %s", dataSourceId), exception); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/hive/parquet/ParquetTypeTranslator.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/hive/parquet/ParquetTypeTranslator.java new file mode 100644 index 000000000000..ac96b70c6eec --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/hive/parquet/ParquetTypeTranslator.java @@ -0,0 +1,85 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.hive.parquet; + +import io.trino.plugin.hive.coercions.IntegerNumberToDoubleCoercer; +import io.trino.plugin.hive.coercions.IntegerNumberToVarcharCoercer; +import io.trino.plugin.hive.coercions.TypeCoercer; +import io.trino.spi.type.DecimalType; +import io.trino.spi.type.DoubleType; +import io.trino.spi.type.Type; +import io.trino.spi.type.VarcharType; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.LogicalTypeAnnotation.DecimalLogicalTypeAnnotation; + +import java.util.Optional; + +import static io.trino.parquet.reader.ColumnReaderFactory.isIntegerAnnotationAndPrimitive; +import static io.trino.plugin.hive.coercions.DecimalCoercers.createDecimalToVarcharCoercer; +import static io.trino.plugin.hive.coercions.DoubleToVarcharCoercers.createDoubleToVarcharCoercer; +import static io.trino.plugin.hive.coercions.FloatToVarcharCoercers.createFloatToVarcharCoercer; +import static io.trino.plugin.hive.coercions.TimestampCoercer.LongTimestampToVarcharCoercer; +import static io.trino.spi.type.BigintType.BIGINT; +import static io.trino.spi.type.IntegerType.INTEGER; +import static io.trino.spi.type.TimestampType.TIMESTAMP_NANOS; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FLOAT; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT96; + +public final class ParquetTypeTranslator +{ + private ParquetTypeTranslator() {} + + public static Optional> createCoercer(PrimitiveTypeName fromParquetType, LogicalTypeAnnotation typeAnnotation, Type toTrinoType) + { + if (toTrinoType instanceof DoubleType) { + if (isIntegerAnnotationAndPrimitive(typeAnnotation, fromParquetType)) { + if (fromParquetType == INT32) { + return Optional.of(new IntegerNumberToDoubleCoercer<>(INTEGER)); + } + if (fromParquetType == INT64) { + return Optional.of(new IntegerNumberToDoubleCoercer<>(BIGINT)); + } + } + } + if (toTrinoType instanceof VarcharType varcharType) { + if (isIntegerAnnotationAndPrimitive(typeAnnotation, fromParquetType)) { + if (fromParquetType == INT32) { + return Optional.of(new IntegerNumberToVarcharCoercer<>(INTEGER, varcharType)); + } + if (fromParquetType == INT64) { + return 
Optional.of(new IntegerNumberToVarcharCoercer<>(BIGINT, varcharType)); + } + } + if (fromParquetType == FLOAT) { + return Optional.of(createFloatToVarcharCoercer(varcharType, false)); + } + if (fromParquetType == DOUBLE) { + return Optional.of(createDoubleToVarcharCoercer(varcharType, false)); + } + if (typeAnnotation instanceof DecimalLogicalTypeAnnotation decimalAnnotation) { + return Optional.of(createDecimalToVarcharCoercer( + DecimalType.createDecimalType(decimalAnnotation.getPrecision(), decimalAnnotation.getScale()), + varcharType)); + } + if (fromParquetType == INT96) { + return Optional.of(new LongTimestampToVarcharCoercer(TIMESTAMP_NANOS, varcharType)); + } + } + return Optional.empty(); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/hive/security/UsingSystemSecurity.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/hive/security/UsingSystemSecurity.java new file mode 100644 index 000000000000..7d7cbeaa8131 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/hive/security/UsingSystemSecurity.java @@ -0,0 +1,29 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.hive.security; + +import com.google.inject.BindingAnnotation; + +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +import static java.lang.annotation.ElementType.FIELD; +import static java.lang.annotation.ElementType.METHOD; +import static java.lang.annotation.ElementType.PARAMETER; +import static java.lang.annotation.RetentionPolicy.RUNTIME; + +@Retention(RUNTIME) +@Target({FIELD, PARAMETER, METHOD}) +@BindingAnnotation +public @interface UsingSystemSecurity {} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/hive/util/HiveTypeUtil.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/hive/util/HiveTypeUtil.java new file mode 100644 index 000000000000..4310893835e2 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/hive/util/HiveTypeUtil.java @@ -0,0 +1,182 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.hive.util; + +import com.google.common.collect.ImmutableList; +import io.trino.plugin.hive.HiveTimestampPrecision; +import io.trino.plugin.hive.HiveType; +import io.trino.plugin.hive.metastore.StorageFormat; +import io.trino.plugin.hive.type.ListTypeInfo; +import io.trino.plugin.hive.type.MapTypeInfo; +import io.trino.plugin.hive.type.PrimitiveCategory; +import io.trino.plugin.hive.type.PrimitiveTypeInfo; +import io.trino.plugin.hive.type.StructTypeInfo; +import io.trino.plugin.hive.type.TypeInfo; +import io.trino.plugin.hive.type.UnionTypeInfo; +import io.trino.spi.type.Type; +import io.trino.spi.type.TypeManager; +import io.trino.spi.type.TypeSignature; + +import java.util.List; +import java.util.Optional; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Strings.lenientFormat; +import static io.trino.hive.formats.UnionToRowCoercionUtils.UNION_FIELD_FIELD_PREFIX; +import static io.trino.hive.formats.UnionToRowCoercionUtils.UNION_FIELD_TAG_NAME; +import static io.trino.hive.formats.UnionToRowCoercionUtils.UNION_FIELD_TAG_TYPE; +import static io.trino.plugin.hive.HiveStorageFormat.AVRO; +import static io.trino.plugin.hive.HiveStorageFormat.ORC; +import static io.trino.plugin.hive.HiveTimestampPrecision.DEFAULT_PRECISION; +import static io.trino.plugin.hive.HiveType.toHiveType; +import static io.trino.plugin.hive.util.HiveTypeTranslator.toTypeSignature; + +public final class HiveTypeUtil +{ + private HiveTypeUtil() {} + + /** + * @deprecated Prefer {@link #getTypeSignature(HiveType, HiveTimestampPrecision)}. + */ + @Deprecated + public static TypeSignature getTypeSignature(HiveType type) + { + return getTypeSignature(type, DEFAULT_PRECISION); + } + + public static TypeSignature getTypeSignature(HiveType type, HiveTimestampPrecision timestampPrecision) + { + return toTypeSignature(type.getTypeInfo(), timestampPrecision); + } + + public static Type getType(HiveType type, TypeManager typeManager, HiveTimestampPrecision timestampPrecision) + { + return typeManager.getType(getTypeSignature(type, timestampPrecision)); + } + + public static boolean typeSupported(TypeInfo typeInfo, StorageFormat storageFormat) + { + return switch (typeInfo.getCategory()) { + case PRIMITIVE -> typeSupported(((PrimitiveTypeInfo) typeInfo).getPrimitiveCategory()); + case MAP -> typeSupported(((MapTypeInfo) typeInfo).getMapKeyTypeInfo(), storageFormat) && + typeSupported(((MapTypeInfo) typeInfo).getMapValueTypeInfo(), storageFormat); + case LIST -> typeSupported(((ListTypeInfo) typeInfo).getListElementTypeInfo(), storageFormat); + case STRUCT -> ((StructTypeInfo) typeInfo).getAllStructFieldTypeInfos().stream().allMatch(fieldTypeInfo -> typeSupported(fieldTypeInfo, storageFormat)); + case UNION -> + // This feature (reading union types as structs) has only been verified against Avro and ORC tables. Here's a discussion: + // 1. Avro tables are supported and verified. + // 2. ORC tables are supported and verified. + // 3. The Parquet format doesn't support union types itself so there's no need to add support for it in Trino. + // 4. TODO: RCFile tables are not supported yet. + // 5. TODO: The support for Avro is done in SerDeUtils so it's possible that formats other than Avro are also supported. But verification is needed. 
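// Illustrative mapping, for orientation only (derived from the UnionToRowCoercionUtils constants imported
// above; not part of this change): a Hive column declared as uniontype<int,string> is surfaced to Trino as
// row(tag tinyint, field0 integer, field1 varchar), where the tag value selects which fieldN carries the
// value for a given row.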
+ storageFormat.getSerde().equalsIgnoreCase(AVRO.getSerde()) || + storageFormat.getSerde().equalsIgnoreCase(ORC.getSerde()) || + ((UnionTypeInfo) typeInfo).getAllUnionObjectTypeInfos().stream().allMatch(fieldTypeInfo -> typeSupported(fieldTypeInfo, storageFormat)); + }; + } + + private static boolean typeSupported(PrimitiveCategory category) + { + return switch (category) { + case BOOLEAN, + BYTE, + SHORT, + INT, + LONG, + FLOAT, + DOUBLE, + STRING, + VARCHAR, + CHAR, + DATE, + TIMESTAMP, + TIMESTAMPLOCALTZ, + BINARY, + DECIMAL -> true; + case INTERVAL_YEAR_MONTH, + INTERVAL_DAY_TIME, + VOID, + UNKNOWN -> false; + }; + } + + public static Optional getHiveTypeForDereferences(HiveType hiveType, List dereferences) + { + TypeInfo typeInfo = hiveType.getTypeInfo(); + for (int fieldIndex : dereferences) { + if (typeInfo instanceof StructTypeInfo structTypeInfo) { + try { + typeInfo = structTypeInfo.getAllStructFieldTypeInfos().get(fieldIndex); + } + catch (RuntimeException e) { + // return empty when failed to dereference, this could happen when partition and table schema mismatch + return Optional.empty(); + } + } + else if (typeInfo instanceof UnionTypeInfo unionTypeInfo) { + try { + if (fieldIndex == 0) { + // union's tag field, defined in {@link io.trino.hive.formats.UnionToRowCoercionUtils} + return Optional.of(toHiveType(UNION_FIELD_TAG_TYPE)); + } + typeInfo = unionTypeInfo.getAllUnionObjectTypeInfos().get(fieldIndex - 1); + } + catch (RuntimeException e) { + // return empty when failed to dereference, this could happen when partition and table schema mismatch + return Optional.empty(); + } + } + else { + throw new IllegalArgumentException(lenientFormat("typeInfo: %s should be struct or union type", typeInfo)); + } + } + return Optional.of(HiveType.toHiveType(typeInfo)); + } + + public static List getHiveDereferenceNames(HiveType hiveType, List dereferences) + { + ImmutableList.Builder dereferenceNames = ImmutableList.builder(); + TypeInfo typeInfo = hiveType.getTypeInfo(); + for (int i = 0; i < dereferences.size(); i++) { + int fieldIndex = dereferences.get(i); + checkArgument(fieldIndex >= 0, "fieldIndex cannot be negative"); + + if (typeInfo instanceof StructTypeInfo structTypeInfo) { + checkArgument(fieldIndex < structTypeInfo.getAllStructFieldNames().size(), + "fieldIndex should be less than the number of fields in the struct"); + + String fieldName = structTypeInfo.getAllStructFieldNames().get(fieldIndex); + dereferenceNames.add(fieldName); + typeInfo = structTypeInfo.getAllStructFieldTypeInfos().get(fieldIndex); + } + else if (typeInfo instanceof UnionTypeInfo unionTypeInfo) { + checkArgument((fieldIndex - 1) < unionTypeInfo.getAllUnionObjectTypeInfos().size(), + "fieldIndex should be less than the number of fields in the union plus tag field"); + + if (fieldIndex == 0) { + checkArgument(i == (dereferences.size() - 1), "Union's tag field should not have more subfields"); + dereferenceNames.add(UNION_FIELD_TAG_NAME); + break; + } + typeInfo = unionTypeInfo.getAllUnionObjectTypeInfos().get(fieldIndex - 1); + dereferenceNames.add(UNION_FIELD_FIELD_PREFIX + (fieldIndex - 1)); + } + else { + throw new IllegalArgumentException(lenientFormat("typeInfo: %s should be struct or union type", typeInfo)); + } + } + + return dereferenceNames.build(); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/CommitTaskData.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/CommitTaskData.java index 9870f0b03502..73140ce60f0a 100644 --- 
a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/CommitTaskData.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/CommitTaskData.java @@ -13,91 +13,33 @@ */ package io.trino.plugin.iceberg; -import com.fasterxml.jackson.annotation.JsonCreator; -import com.fasterxml.jackson.annotation.JsonProperty; import org.apache.iceberg.FileContent; +import java.util.List; import java.util.Optional; import static java.util.Objects.requireNonNull; -public class CommitTaskData +public record CommitTaskData( + String path, + IcebergFileFormat fileFormat, + long fileSizeInBytes, + MetricsWrapper metrics, + String partitionSpecJson, + Optional partitionDataJson, + FileContent content, + Optional referencedDataFile, + Optional> fileSplitOffsets) { - private final String path; - private final IcebergFileFormat fileFormat; - private final long fileSizeInBytes; - private final MetricsWrapper metrics; - private final String partitionSpecJson; - private final Optional partitionDataJson; - private final FileContent content; - private final Optional referencedDataFile; - - @JsonCreator - public CommitTaskData( - @JsonProperty("path") String path, - @JsonProperty("fileFormat") IcebergFileFormat fileFormat, - @JsonProperty("fileSizeInBytes") long fileSizeInBytes, - @JsonProperty("metrics") MetricsWrapper metrics, - @JsonProperty("partitionSpecJson") String partitionSpecJson, - @JsonProperty("partitionDataJson") Optional partitionDataJson, - @JsonProperty("content") FileContent content, - @JsonProperty("referencedDataFile") Optional referencedDataFile) - { - this.path = requireNonNull(path, "path is null"); - this.fileFormat = requireNonNull(fileFormat, "fileFormat is null"); - this.fileSizeInBytes = fileSizeInBytes; - this.metrics = requireNonNull(metrics, "metrics is null"); - this.partitionSpecJson = requireNonNull(partitionSpecJson, "partitionSpecJson is null"); - this.partitionDataJson = requireNonNull(partitionDataJson, "partitionDataJson is null"); - this.content = requireNonNull(content, "content is null"); - this.referencedDataFile = requireNonNull(referencedDataFile, "referencedDataFile is null"); - } - - @JsonProperty - public String getPath() - { - return path; - } - - @JsonProperty - public IcebergFileFormat getFileFormat() - { - return fileFormat; - } - - @JsonProperty - public long getFileSizeInBytes() - { - return fileSizeInBytes; - } - - @JsonProperty - public MetricsWrapper getMetrics() - { - return metrics; - } - - @JsonProperty - public String getPartitionSpecJson() - { - return partitionSpecJson; - } - - @JsonProperty - public Optional getPartitionDataJson() - { - return partitionDataJson; - } - - @JsonProperty - public FileContent getContent() - { - return content; - } - - @JsonProperty - public Optional getReferencedDataFile() - { - return referencedDataFile; + public CommitTaskData + { + requireNonNull(path, "path is null"); + requireNonNull(fileFormat, "fileFormat is null"); + requireNonNull(metrics, "metrics is null"); + requireNonNull(partitionSpecJson, "partitionSpecJson is null"); + requireNonNull(partitionDataJson, "partitionDataJson is null"); + requireNonNull(content, "content is null"); + requireNonNull(referencedDataFile, "referencedDataFile is null"); + requireNonNull(fileSplitOffsets, "fileSplitOffsets is null"); } } diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/CreateTableException.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/CreateTableException.java new file mode 100644 index 000000000000..66dfccc6a424 
--- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/CreateTableException.java @@ -0,0 +1,31 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.iceberg; + +import io.trino.spi.TrinoException; +import io.trino.spi.connector.SchemaTableName; +import org.apache.iceberg.exceptions.CleanableFailure; + +import static io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_COMMIT_ERROR; +import static java.lang.String.format; + +public class CreateTableException + extends TrinoException + implements CleanableFailure +{ + public CreateTableException(Throwable throwable, SchemaTableName tableName) + { + super(ICEBERG_COMMIT_ERROR, format("Failed to create table %s: %s", tableName, throwable.getMessage()), throwable); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/ExpressionConverter.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/ExpressionConverter.java index ceeeeb7b8e03..f63fc00bb9c8 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/ExpressionConverter.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/ExpressionConverter.java @@ -32,8 +32,10 @@ import java.util.function.BiFunction; import static com.google.common.base.Preconditions.checkArgument; +import static io.trino.plugin.hive.util.HiveUtil.isStructuralType; import static io.trino.plugin.iceberg.IcebergMetadataColumn.isMetadataColumnId; import static io.trino.plugin.iceberg.IcebergTypes.convertTrinoValueToIceberg; +import static io.trino.spi.type.UuidType.UUID; import static java.lang.String.format; import static org.apache.iceberg.expressions.Expressions.alwaysFalse; import static org.apache.iceberg.expressions.Expressions.alwaysTrue; @@ -50,6 +52,21 @@ public final class ExpressionConverter { private ExpressionConverter() {} + public static boolean isConvertibleToIcebergExpression(Domain domain) + { + if (isStructuralType(domain.getType())) { + // structural types cannot be used to filter a table scan in Iceberg library. 
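// For example (illustrative only, not from this change): a constraint such as "array_col = ARRAY[1, 2]"
// produces a Domain over an array type; returning false here keeps that predicate on the Trino side
// instead of pushing it into the Iceberg table scan.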
+ return false; + } + + if (domain.getType() == UUID) { + // Iceberg orders UUID values differently than Trino (perhaps due to https://bugs.openjdk.org/browse/JDK-7025832), so allow only IS NULL / IS NOT NULL checks + return domain.isOnlyNull() || domain.getValues().isAll(); + } + + return true; + } + public static Expression toIcebergExpression(TupleDomain tupleDomain) { if (tupleDomain.isAll()) { @@ -64,6 +81,7 @@ public static Expression toIcebergExpression(TupleDomain tu IcebergColumnHandle columnHandle = entry.getKey(); checkArgument(!isMetadataColumnId(columnHandle.getId()), "Constraint on an unexpected column %s", columnHandle); Domain domain = entry.getValue(); + checkArgument(isConvertibleToIcebergExpression(domain), "Unexpected not convertible domain on column %s: %s", columnHandle, domain); conjuncts.add(toIcebergExpression(columnHandle.getQualifiedName(), columnHandle.getType(), domain)); } return and(conjuncts); diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/ForIcebergFileDelete.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/ForIcebergFileDelete.java new file mode 100644 index 000000000000..673d655d7c53 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/ForIcebergFileDelete.java @@ -0,0 +1,31 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.trino.plugin.iceberg; + +import com.google.inject.BindingAnnotation; + +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +import static java.lang.annotation.ElementType.FIELD; +import static java.lang.annotation.ElementType.METHOD; +import static java.lang.annotation.ElementType.PARAMETER; +import static java.lang.annotation.RetentionPolicy.RUNTIME; + +@Retention(RUNTIME) +@Target({FIELD, PARAMETER, METHOD}) +@BindingAnnotation +public @interface ForIcebergFileDelete +{} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/ForIcebergMetadata.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/ForIcebergMetadata.java new file mode 100644 index 000000000000..6a4e26046389 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/ForIcebergMetadata.java @@ -0,0 +1,30 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.trino.plugin.iceberg; + +import com.google.inject.BindingAnnotation; + +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +import static java.lang.annotation.ElementType.FIELD; +import static java.lang.annotation.ElementType.METHOD; +import static java.lang.annotation.ElementType.PARAMETER; +import static java.lang.annotation.RetentionPolicy.RUNTIME; + +@Retention(RUNTIME) +@Target({FIELD, PARAMETER, METHOD}) +@BindingAnnotation +public @interface ForIcebergMetadata {} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/ForIcebergPlanning.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/ForIcebergPlanning.java new file mode 100644 index 000000000000..ba290cb4844a --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/ForIcebergPlanning.java @@ -0,0 +1,29 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.iceberg; + +import com.google.inject.BindingAnnotation; + +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +import static java.lang.annotation.ElementType.FIELD; +import static java.lang.annotation.ElementType.METHOD; +import static java.lang.annotation.ElementType.PARAMETER; +import static java.lang.annotation.RetentionPolicy.RUNTIME; + +@Retention(RUNTIME) +@Target({FIELD, PARAMETER, METHOD}) +@BindingAnnotation +public @interface ForIcebergPlanning {} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/ForIcebergSplitManager.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/ForIcebergSplitManager.java new file mode 100644 index 000000000000..f11f112791c9 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/ForIcebergSplitManager.java @@ -0,0 +1,29 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.iceberg; + +import com.google.inject.BindingAnnotation; + +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +import static java.lang.annotation.ElementType.FIELD; +import static java.lang.annotation.ElementType.METHOD; +import static java.lang.annotation.ElementType.PARAMETER; +import static java.lang.annotation.RetentionPolicy.RUNTIME; + +@Retention(RUNTIME) +@Target({FIELD, PARAMETER, METHOD}) +@BindingAnnotation +public @interface ForIcebergSplitManager {} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/ForIcebergSplitSource.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/ForIcebergSplitSource.java new file mode 100644 index 000000000000..044cb043a78d --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/ForIcebergSplitSource.java @@ -0,0 +1,29 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.iceberg; + +import com.google.inject.BindingAnnotation; + +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +import static java.lang.annotation.ElementType.FIELD; +import static java.lang.annotation.ElementType.METHOD; +import static java.lang.annotation.ElementType.PARAMETER; +import static java.lang.annotation.RetentionPolicy.RUNTIME; + +@Retention(RUNTIME) +@Target({FIELD, PARAMETER, METHOD}) +@BindingAnnotation +public @interface ForIcebergSplitSource {} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergAvroPageSource.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergAvroPageSource.java index 65fbad86c794..09ca0280af47 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergAvroPageSource.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergAvroPageSource.java @@ -23,7 +23,7 @@ import org.apache.iceberg.avro.Avro; import org.apache.iceberg.avro.AvroIterable; import org.apache.iceberg.data.Record; -import org.apache.iceberg.data.avro.DataReader; +import org.apache.iceberg.data.avro.PlannedDataReader; import org.apache.iceberg.io.CloseableIterator; import org.apache.iceberg.io.InputFile; import org.apache.iceberg.mapping.NameMapping; @@ -49,11 +49,7 @@ public class IcebergAvroPageSource private final List columnNames; private final List columnTypes; private final Map icebergTypes; - /** - * Indicates whether the column at each index should be populated with the - * indices of its rows - */ - private final List rowIndexLocations; + private final boolean appendRowNumberColumn; private final PageBuilder pageBuilder; private final AggregatedMemoryContext memoryUsage; @@ -69,36 +65,31 @@ public IcebergAvroPageSource( Optional nameMapping, List columnNames, List columnTypes, - List rowIndexLocations, + boolean appendRowNumberColumn, AggregatedMemoryContext memoryUsage) { this.columnNames = ImmutableList.copyOf(requireNonNull(columnNames, "columnNames is null")); this.columnTypes = 
ImmutableList.copyOf(requireNonNull(columnTypes, "columnTypes is null")); - this.rowIndexLocations = ImmutableList.copyOf(requireNonNull(rowIndexLocations, "rowIndexLocations is null")); + this.appendRowNumberColumn = appendRowNumberColumn; this.memoryUsage = requireNonNull(memoryUsage, "memoryUsage is null"); checkArgument( - columnNames.size() == rowIndexLocations.size() && columnNames.size() == columnTypes.size(), - "names, rowIndexLocations, and types must correspond one-to-one-to-one"); + columnNames.size() == columnTypes.size(), + "names and types must correspond one-to-one-to-one"); // The column orders in the generated schema might be different from the original order Schema readSchema = fileSchema.select(columnNames); Avro.ReadBuilder builder = Avro.read(file) .project(readSchema) - .createReaderFunc(DataReader::create) + .createReaderFunc(ignore -> PlannedDataReader.create(readSchema)) .split(start, length); nameMapping.ifPresent(builder::withNameMapping); AvroIterable avroReader = builder.build(); icebergTypes = readSchema.columns().stream() .collect(toImmutableMap(Types.NestedField::name, Types.NestedField::type)); - pageBuilder = new PageBuilder(columnTypes); + pageBuilder = new PageBuilder(appendRowNumberColumn ? ImmutableList.builder().addAll(columnTypes).add(BIGINT).build() : columnTypes); recordIterator = avroReader.iterator(); } - private boolean isIndexColumn(int column) - { - return rowIndexLocations.get(column); - } - @Override public long getCompletedBytes() { @@ -131,13 +122,11 @@ public Page getNextPage() pageBuilder.declarePosition(); Record record = recordIterator.next(); for (int channel = 0; channel < columnTypes.size(); channel++) { - if (isIndexColumn(channel)) { - BIGINT.writeLong(pageBuilder.getBlockBuilder(channel), rowId); - } - else { - String name = columnNames.get(channel); - serializeToTrinoBlock(columnTypes.get(channel), icebergTypes.get(name), pageBuilder.getBlockBuilder(channel), record.getField(name)); - } + String name = columnNames.get(channel); + serializeToTrinoBlock(columnTypes.get(channel), icebergTypes.get(name), pageBuilder.getBlockBuilder(channel), record.getField(name)); + } + if (appendRowNumberColumn) { + BIGINT.writeLong(pageBuilder.getBlockBuilder(columnTypes.size()), rowId); } rowId++; } diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergBucketFunction.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergBucketFunction.java index 433466c966a0..11e7f76db716 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergBucketFunction.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergBucketFunction.java @@ -13,133 +13,141 @@ */ package io.trino.plugin.iceberg; -import io.trino.plugin.iceberg.PartitionTransforms.ColumnTransform; import io.trino.plugin.iceberg.PartitionTransforms.ValueTransform; import io.trino.spi.Page; import io.trino.spi.block.Block; +import io.trino.spi.block.RowBlock; import io.trino.spi.connector.BucketFunction; -import io.trino.spi.type.Type; +import io.trino.spi.connector.ConnectorSplit; import io.trino.spi.type.TypeOperators; -import org.apache.iceberg.PartitionSpec; import java.lang.invoke.MethodHandle; -import java.util.HashMap; import java.util.List; -import java.util.Map; +import java.util.function.ToIntFunction; import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.base.Preconditions.checkState; import static com.google.common.collect.ImmutableList.toImmutableList; -import 
static io.trino.plugin.iceberg.PartitionTransforms.getColumnTransform; +import static io.trino.plugin.iceberg.IcebergPartitionFunction.Transform.BUCKET; import static io.trino.spi.function.InvocationConvention.InvocationArgumentConvention.NEVER_NULL; import static io.trino.spi.function.InvocationConvention.InvocationReturnConvention.FAIL_ON_NULL; import static io.trino.spi.function.InvocationConvention.simpleConvention; import static io.trino.spi.type.TypeUtils.NULL_HASH_CODE; import static java.util.Objects.requireNonNull; +import static java.util.Objects.requireNonNullElse; public class IcebergBucketFunction - implements BucketFunction + implements BucketFunction, ToIntFunction { private final int bucketCount; + private final List functions; - private final List partitionColumns; - private final List hashCodeInvokers; + private final boolean singleBucketFunction; - public IcebergBucketFunction( - TypeOperators typeOperators, - PartitionSpec partitionSpec, - List partitioningColumns, - int bucketCount) + public IcebergBucketFunction(IcebergPartitioningHandle partitioningHandle, TypeOperators typeOperators, int bucketCount) { - requireNonNull(partitionSpec, "partitionSpec is null"); - checkArgument(!partitionSpec.isUnpartitioned(), "empty partitionSpec"); - requireNonNull(partitioningColumns, "partitioningColumns is null"); + requireNonNull(partitioningHandle, "partitioningHandle is null"); requireNonNull(typeOperators, "typeOperators is null"); checkArgument(bucketCount > 0, "Invalid bucketCount: %s", bucketCount); this.bucketCount = bucketCount; - - Map fieldIdToInputChannel = new HashMap<>(); - for (int i = 0; i < partitioningColumns.size(); i++) { - Integer previous = fieldIdToInputChannel.put(partitioningColumns.get(i).getId(), i); - checkState(previous == null, "Duplicate id %s in %s at %s and %s", partitioningColumns.get(i).getId(), partitioningColumns, i, previous); - } - partitionColumns = partitionSpec.fields().stream() - .map(field -> { - Integer channel = fieldIdToInputChannel.get(field.sourceId()); - checkArgument(channel != null, "partition field not found: %s", field); - Type inputType = partitioningColumns.get(channel).getType(); - ColumnTransform transform = getColumnTransform(field, inputType); - return new PartitionColumn(channel, transform.getValueTransform(), transform.getType()); - }) - .collect(toImmutableList()); - hashCodeInvokers = partitionColumns.stream() - .map(PartitionColumn::getResultType) - .map(type -> typeOperators.getHashCodeOperator(type, simpleConvention(FAIL_ON_NULL, NEVER_NULL))) + List partitionFunctions = partitioningHandle.partitionFunctions(); + this.functions = partitionFunctions.stream() + .map(partitionFunction -> HashFunction.create(partitionFunction, typeOperators)) .collect(toImmutableList()); + + this.singleBucketFunction = partitionFunctions.size() == 1 && + partitionFunctions.get(0).transform() == BUCKET && + partitionFunctions.get(0).size().orElseThrow() == bucketCount; } @Override public int getBucket(Page page, int position) { - long hash = 0; + if (singleBucketFunction) { + long bucket = (long) requireNonNullElse(functions.getFirst().getValue(page, position), 0L); + checkArgument(0 <= bucket && bucket < bucketCount, "Bucket value out of range: %s (bucketCount: %s)", bucket, bucketCount); + return (int) bucket; + } - for (int i = 0; i < partitionColumns.size(); i++) { - PartitionColumn partitionColumn = partitionColumns.get(i); - Block block = page.getBlock(partitionColumn.getSourceChannel()); - Object value = 
partitionColumn.getValueTransform().apply(block, position); - long valueHash = hashValue(hashCodeInvokers.get(i), value); + long hash = 0; + for (HashFunction function : functions) { + long valueHash = function.computeHash(page, position); hash = (31 * hash) + valueHash; } return (int) ((hash & Long.MAX_VALUE) % bucketCount); } - private static long hashValue(MethodHandle method, Object value) + @Override + public int applyAsInt(ConnectorSplit split) { - if (value == null) { - return NULL_HASH_CODE; - } - try { - return (long) method.invoke(value); + List partitionValues = ((IcebergSplit) split).getPartitionValues() + .orElseThrow(() -> new IllegalArgumentException("Split does not contain partition values")); + + if (singleBucketFunction) { + long bucket = (long) requireNonNullElse(partitionValues.getFirst(), 0L); + checkArgument(0 <= bucket && bucket < bucketCount, "Bucket value out of range: %s (bucketCount: %s)", bucket, bucketCount); + return (int) bucket; } - catch (Throwable throwable) { - if (throwable instanceof Error) { - throw (Error) throwable; - } - if (throwable instanceof RuntimeException) { - throw (RuntimeException) throwable; - } - throw new RuntimeException(throwable); + + long hash = 0; + for (int i = 0; i < functions.size(); i++) { + long valueHash = functions.get(i).computeHash(partitionValues.get(i)); + hash = (31 * hash) + valueHash; } + + return (int) ((hash & Long.MAX_VALUE) % bucketCount); } - private static class PartitionColumn + private record HashFunction(List dataPath, ValueTransform valueTransform, MethodHandle hashCodeOperator) { - private final int sourceChannel; - private final ValueTransform valueTransform; - private final Type resultType; + private static HashFunction create(IcebergPartitionFunction partitionFunction, TypeOperators typeOperators) + { + PartitionTransforms.ColumnTransform columnTransform = PartitionTransforms.getColumnTransform(partitionFunction); + return new HashFunction( + partitionFunction.dataPath(), + columnTransform.valueTransform(), + typeOperators.getHashCodeOperator(columnTransform.type(), simpleConvention(FAIL_ON_NULL, NEVER_NULL))); + } - public PartitionColumn(int sourceChannel, ValueTransform valueTransform, Type resultType) + private HashFunction { - this.sourceChannel = sourceChannel; - this.valueTransform = requireNonNull(valueTransform, "valueTransform is null"); - this.resultType = requireNonNull(resultType, "resultType is null"); + requireNonNull(valueTransform, "valueTransform is null"); + requireNonNull(hashCodeOperator, "hashCodeOperator is null"); } - public int getSourceChannel() + public Object getValue(Page page, int position) { - return sourceChannel; + Block block = page.getBlock(dataPath.getFirst()); + for (int i = 1; i < dataPath.size(); i++) { + position = block.getUnderlyingValuePosition(position); + block = ((RowBlock) block.getUnderlyingValueBlock()).getFieldBlock(dataPath.get(i)); + } + return valueTransform.apply(block, position); } - public Type getResultType() + public long computeHash(Page page, int position) { - return resultType; + return computeHash(getValue(page, position)); } - public ValueTransform getValueTransform() + private long computeHash(Object value) { - return valueTransform; + if (value == null) { + return NULL_HASH_CODE; + } + try { + return (long) hashCodeOperator.invoke(value); + } + catch (Throwable throwable) { + if (throwable instanceof Error error) { + throw error; + } + if (throwable instanceof RuntimeException runtimeException) { + throw runtimeException; + } + throw new 
RuntimeException(throwable); + } } } } diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergColumnHandle.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergColumnHandle.java index ebb49d12a00f..0f70cc51ea18 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergColumnHandle.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergColumnHandle.java @@ -18,16 +18,22 @@ import com.fasterxml.jackson.annotation.JsonProperty; import com.google.common.collect.ImmutableList; import com.google.common.collect.Iterables; +import io.airlift.slice.SizeOf; import io.trino.spi.connector.ColumnHandle; import io.trino.spi.connector.ColumnMetadata; import io.trino.spi.type.Type; +import java.util.Arrays; import java.util.List; import java.util.Objects; import java.util.Optional; +import static io.airlift.slice.SizeOf.estimatedSizeOf; +import static io.airlift.slice.SizeOf.instanceSize; +import static io.airlift.slice.SizeOf.sizeOf; import static io.trino.plugin.iceberg.IcebergMetadataColumn.FILE_MODIFIED_TIME; import static io.trino.plugin.iceberg.IcebergMetadataColumn.FILE_PATH; +import static io.trino.plugin.iceberg.IcebergMetadataColumn.PARTITION; import static java.util.Objects.requireNonNull; import static org.apache.iceberg.MetadataColumns.IS_DELETED; import static org.apache.iceberg.MetadataColumns.ROW_POSITION; @@ -35,35 +41,53 @@ public class IcebergColumnHandle implements ColumnHandle { + private static final int INSTANCE_SIZE = instanceSize(IcebergColumnHandle.class); + // Iceberg reserved row ids begin at INTEGER.MAX_VALUE and count down. Starting with MIN_VALUE here to avoid conflicts. - public static final int TRINO_UPDATE_ROW_ID = Integer.MIN_VALUE; - public static final int TRINO_MERGE_ROW_ID = Integer.MIN_VALUE + 1; + public static final int TRINO_MERGE_ROW_ID = Integer.MIN_VALUE; public static final String TRINO_ROW_ID_NAME = "$row_id"; - public static final int TRINO_MERGE_PARTITION_SPEC_ID = Integer.MIN_VALUE + 2; - public static final int TRINO_MERGE_PARTITION_DATA = Integer.MIN_VALUE + 3; + public static final int TRINO_MERGE_PARTITION_SPEC_ID = Integer.MIN_VALUE + 1; + public static final int TRINO_MERGE_PARTITION_DATA = Integer.MIN_VALUE + 2; + + public static final String DATA_CHANGE_TYPE_NAME = "_change_type"; + public static final int DATA_CHANGE_TYPE_ID = Integer.MIN_VALUE + 3; + public static final String DATA_CHANGE_VERSION_NAME = "_change_version_id"; + public static final int DATA_CHANGE_VERSION_ID = Integer.MIN_VALUE + 4; + public static final String DATA_CHANGE_TIMESTAMP_NAME = "_change_timestamp"; + public static final int DATA_CHANGE_TIMESTAMP_ID = Integer.MIN_VALUE + 5; + public static final String DATA_CHANGE_ORDINAL_NAME = "_change_ordinal"; + public static final int DATA_CHANGE_ORDINAL_ID = Integer.MIN_VALUE + 6; private final ColumnIdentity baseColumnIdentity; private final Type baseType; // The list of field ids to indicate the projected part of the top-level column represented by baseColumnIdentity private final List path; private final Type type; + private final boolean nullable; private final Optional comment; // Cache of ColumnIdentity#getId to ensure quick access, even with dereferences private final int id; + /** + * @deprecated This constructor is intended to be used by JSON deserialization only. + * Use {@link #builder(ColumnIdentity)}, {@link #required(ColumnIdentity)} or {@link #optional(ColumnIdentity)} instead. 
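 * <p>Illustrative builder usage (example only, not from this change; assumes an existing
 * {@code ColumnIdentity columnIdentity} for a varchar column and placeholder {@code rowType},
 * {@code fieldType} and {@code fieldId} values):
 * {@code IcebergColumnHandle.optional(columnIdentity).columnType(VarcharType.VARCHAR).build()} creates a
 * nullable top-level handle, while {@code required(columnIdentity).fieldType(rowType, fieldType).path(fieldId)}
 * targets a nested field identified by its field id.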
+ */ + @Deprecated @JsonCreator public IcebergColumnHandle( @JsonProperty("baseColumnIdentity") ColumnIdentity baseColumnIdentity, @JsonProperty("baseType") Type baseType, @JsonProperty("path") List path, @JsonProperty("type") Type type, + @JsonProperty("nullable") boolean nullable, @JsonProperty("comment") Optional comment) { this.baseColumnIdentity = requireNonNull(baseColumnIdentity, "baseColumnIdentity is null"); this.baseType = requireNonNull(baseType, "baseType is null"); this.path = ImmutableList.copyOf(requireNonNull(path, "path is null")); this.type = requireNonNull(type, "type is null"); + this.nullable = nullable; this.comment = requireNonNull(comment, "comment is null"); this.id = path.isEmpty() ? baseColumnIdentity.getId() : Iterables.getLast(path); } @@ -99,7 +123,13 @@ public Type getBaseType() @JsonIgnore public IcebergColumnHandle getBaseColumn() { - return new IcebergColumnHandle(getBaseColumnIdentity(), getBaseType(), ImmutableList.of(), getBaseType(), Optional.empty()); + return new IcebergColumnHandle(getBaseColumnIdentity(), getBaseType(), ImmutableList.of(), getBaseType(), isNullable(), Optional.empty()); + } + + @JsonProperty + public boolean isNullable() + { + return nullable; } @JsonProperty @@ -158,12 +188,6 @@ public boolean isRowPositionColumn() return id == ROW_POSITION.fieldId(); } - @JsonIgnore - public boolean isUpdateRowIdColumn() - { - return id == TRINO_UPDATE_ROW_ID; - } - @JsonIgnore public boolean isMergeRowIdColumn() { @@ -179,6 +203,12 @@ public boolean isIsDeletedColumn() return id == IS_DELETED.fieldId(); } + @JsonIgnore + public boolean isPartitionColumn() + { + return id == PARTITION.getId(); + } + @JsonIgnore public boolean isFileModifiedTimeColumn() { @@ -188,7 +218,7 @@ public boolean isFileModifiedTimeColumn() @Override public int hashCode() { - return Objects.hash(baseColumnIdentity, baseType, path, type, comment); + return Objects.hash(baseColumnIdentity, baseType, path, type, nullable, comment); } @Override @@ -205,6 +235,7 @@ public boolean equals(Object obj) Objects.equals(this.baseType, other.baseType) && Objects.equals(this.path, other.path) && Objects.equals(this.type, other.type) && + this.nullable == other.nullable && Objects.equals(this.comment, other.comment); } @@ -214,14 +245,38 @@ public String toString() return getId() + ":" + getName() + ":" + type.getDisplayName(); } + public long getRetainedSizeInBytes() + { + // type is not accounted for as the instances are cached (by TypeRegistry) and shared + return INSTANCE_SIZE + + baseColumnIdentity.getRetainedSizeInBytes() + + estimatedSizeOf(path, SizeOf::sizeOf) + + sizeOf(nullable) + + sizeOf(comment, SizeOf::estimatedSizeOf) + + sizeOf(id); + } + + public static IcebergColumnHandle partitionColumnHandle() + { + return IcebergColumnHandle.required(columnIdentity(PARTITION)) + .columnType(PARTITION.getType()) + .build(); + } + + public static ColumnMetadata partitionColumnMetadata() + { + return ColumnMetadata.builder() + .setName(PARTITION.getColumnName()) + .setType(PARTITION.getType()) + .setHidden(true) + .build(); + } + public static IcebergColumnHandle pathColumnHandle() { - return new IcebergColumnHandle( - columnIdentity(FILE_PATH), - FILE_PATH.getType(), - ImmutableList.of(), - FILE_PATH.getType(), - Optional.empty()); + return IcebergColumnHandle.required(columnIdentity(FILE_PATH)) + .columnType(FILE_PATH.getType()) + .build(); } public static ColumnMetadata pathColumnMetadata() @@ -235,12 +290,9 @@ public static ColumnMetadata pathColumnMetadata() public static 
IcebergColumnHandle fileModifiedTimeColumnHandle() { - return new IcebergColumnHandle( - columnIdentity(FILE_MODIFIED_TIME), - FILE_MODIFIED_TIME.getType(), - ImmutableList.of(), - FILE_MODIFIED_TIME.getType(), - Optional.empty()); + return IcebergColumnHandle.required(columnIdentity(FILE_MODIFIED_TIME)) + .columnType(FILE_MODIFIED_TIME.getType()) + .build(); } public static ColumnMetadata fileModifiedTimeColumnMetadata() @@ -261,4 +313,96 @@ public boolean isPathColumn() { return getColumnIdentity().getId() == FILE_PATH.getId(); } + + public static Builder builder(ColumnIdentity columnIdentity) + { + return new Builder(columnIdentity); + } + + public static Builder optional(ColumnIdentity columnIdentity) + { + return new Builder(columnIdentity) + .nullable(true); + } + + public static Builder required(ColumnIdentity columnIdentity) + { + return new Builder(columnIdentity) + .nullable(false); + } + + public static final class Builder + { + private ColumnIdentity baseColumnIdentity; + private Type baseType; + private List path = ImmutableList.of(); + private Type type; + private boolean nullable = true; + private Optional comment = Optional.empty(); + + public Builder(ColumnIdentity columnIdentity) + { + this.baseColumnIdentity = requireNonNull(columnIdentity, "columnIdentity is null"); + } + + public Builder(IcebergColumnHandle handle) + { + requireNonNull(handle, "handle is null"); + this.baseColumnIdentity = handle.getBaseColumnIdentity(); + this.baseType = handle.getBaseType(); + this.path = handle.getPath(); + this.type = handle.getType(); + this.nullable = handle.isNullable(); + this.comment = handle.getComment(); + } + + public Builder baseColumnIdentity(ColumnIdentity baseColumnIdentity) + { + this.baseColumnIdentity = baseColumnIdentity; + return this; + } + + public Builder path(List path) + { + this.path = path; + return this; + } + + public Builder path(Integer... 
path) + { + this.path = Arrays.asList(path); + return this; + } + + public Builder columnType(Type type) + { + this.baseType = type; + this.type = type; + return this; + } + + public Builder fieldType(Type baseType, Type type) + { + this.baseType = baseType; + this.type = type; + return this; + } + + public Builder nullable(boolean nullable) + { + this.nullable = nullable; + return this; + } + + public Builder comment(String comment) + { + this.comment = Optional.ofNullable(comment); + return this; + } + + public IcebergColumnHandle build() + { + return new IcebergColumnHandle(baseColumnIdentity, baseType, path, type, nullable, comment); + } + } } diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergConfig.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergConfig.java index f59284388194..926059a7c6c5 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergConfig.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergConfig.java @@ -13,25 +13,34 @@ */ package io.trino.plugin.iceberg; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSet; import io.airlift.configuration.Config; import io.airlift.configuration.ConfigDescription; import io.airlift.configuration.DefunctConfig; import io.airlift.configuration.LegacyConfig; import io.airlift.units.DataSize; import io.airlift.units.Duration; -import io.trino.plugin.hive.HiveCompressionCodec; +import io.airlift.units.ThreadCount; +import io.trino.plugin.hive.HiveCompressionOption; +import jakarta.validation.constraints.AssertFalse; import jakarta.validation.constraints.DecimalMax; import jakarta.validation.constraints.DecimalMin; import jakarta.validation.constraints.Max; import jakarta.validation.constraints.Min; import jakarta.validation.constraints.NotNull; +import java.util.List; import java.util.Optional; +import java.util.Set; +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.collect.ImmutableSet.toImmutableSet; import static io.airlift.units.DataSize.Unit.GIGABYTE; -import static io.trino.plugin.hive.HiveCompressionCodec.ZSTD; +import static io.airlift.units.DataSize.Unit.MEGABYTE; import static io.trino.plugin.iceberg.CatalogType.HIVE_METASTORE; import static io.trino.plugin.iceberg.IcebergFileFormat.PARQUET; +import static java.util.Locale.ENGLISH; import static java.util.concurrent.TimeUnit.DAYS; import static java.util.concurrent.TimeUnit.SECONDS; @@ -46,33 +55,49 @@ public class IcebergConfig public static final String EXTENDED_STATISTICS_CONFIG = "iceberg.extended-statistics.enabled"; public static final String EXTENDED_STATISTICS_DESCRIPTION = "Enable collection (ANALYZE) and use of extended statistics."; public static final String COLLECT_EXTENDED_STATISTICS_ON_WRITE_DESCRIPTION = "Collect extended statistics during writes"; - public static final String EXPIRE_SNAPSHOTS_MIN_RETENTION = "iceberg.expire_snapshots.min-retention"; - public static final String REMOVE_ORPHAN_FILES_MIN_RETENTION = "iceberg.remove_orphan_files.min-retention"; + public static final String EXPIRE_SNAPSHOTS_MIN_RETENTION = "iceberg.expire-snapshots.min-retention"; + public static final String REMOVE_ORPHAN_FILES_MIN_RETENTION = "iceberg.remove-orphan-files.min-retention"; private IcebergFileFormat fileFormat = PARQUET; - private HiveCompressionCodec compressionCodec = ZSTD; + private HiveCompressionOption compressionCodec = HiveCompressionOption.ZSTD; + private Optional 
maxCommitRetry = Optional.empty(); private boolean useFileSizeFromMetadata = true; private int maxPartitionsPerWriter = 100; private boolean uniqueTableLocation = true; private CatalogType catalogType = HIVE_METASTORE; - private Duration dynamicFilteringWaitTimeout = new Duration(0, SECONDS); + private Duration dynamicFilteringWaitTimeout = new Duration(1, SECONDS); private boolean tableStatisticsEnabled = true; private boolean extendedStatisticsEnabled = true; private boolean collectExtendedStatisticsOnWrite = true; private boolean projectionPushdownEnabled = true; private boolean registerTableProcedureEnabled; + private boolean addFilesProcedureEnabled; private Optional hiveCatalogName = Optional.empty(); private int formatVersion = FORMAT_VERSION_SUPPORT_MAX; private Duration expireSnapshotsMinRetention = new Duration(7, DAYS); private Duration removeOrphanFilesMinRetention = new Duration(7, DAYS); private DataSize targetMaxFileSize = DataSize.of(1, GIGABYTE); + private DataSize idleWriterMinFileSize = DataSize.of(16, MEGABYTE); // This is meant to protect users who are misusing schema locations (by // putting schemas in locations with extraneous files), so default to false // to avoid deleting those files if Trino is unable to check. private boolean deleteSchemaLocationsFallback; private double minimumAssignedSplitWeight = 0.05; + private boolean hideMaterializedViewStorageTable = true; private Optional materializedViewsStorageSchema = Optional.empty(); private boolean sortedWritingEnabled = true; + private boolean queryPartitionFilterRequired; + private Set queryPartitionFilterRequiredSchemas = ImmutableSet.of(); + private int splitManagerThreads = Math.min(Runtime.getRuntime().availableProcessors() * 2, 32); + private int planningThreads = Math.min(Runtime.getRuntime().availableProcessors(), 16); + private int fileDeleteThreads = Runtime.getRuntime().availableProcessors() * 2; + private List allowedExtraProperties = ImmutableList.of(); + private boolean incrementalRefreshEnabled = true; + private boolean metadataCacheEnabled = true; + private boolean objectStoreLayoutEnabled; + private int metadataParallelism = 8; + private boolean bucketExecutionEnabled = true; + private boolean fileBasedConflictDetectionEnabled = true; public CatalogType getCatalogType() { @@ -100,18 +125,31 @@ public IcebergConfig setFileFormat(IcebergFileFormat fileFormat) } @NotNull - public HiveCompressionCodec getCompressionCodec() + public HiveCompressionOption getCompressionCodec() { return compressionCodec; } @Config("iceberg.compression-codec") - public IcebergConfig setCompressionCodec(HiveCompressionCodec compressionCodec) + public IcebergConfig setCompressionCodec(HiveCompressionOption compressionCodec) { this.compressionCodec = compressionCodec; return this; } + public Optional<@Min(0) Integer> getMaxCommitRetry() + { + return maxCommitRetry; + } + + @Config("iceberg.max-commit-retry") + @ConfigDescription("Number of times to retry a commit before failing") + public IcebergConfig setMaxCommitRetry(Integer maxCommitRetry) + { + this.maxCommitRetry = Optional.ofNullable(maxCommitRetry); + return this; + } + @Deprecated public boolean isUseFileSizeFromMetadata() { @@ -242,6 +280,20 @@ public IcebergConfig setRegisterTableProcedureEnabled(boolean registerTableProce return this; } + public boolean isAddFilesProcedureEnabled() + { + return addFilesProcedureEnabled; + } + + @Config("iceberg.add-files-procedure.enabled") + @LegacyConfig("iceberg.add_files-procedure.enabled") + @ConfigDescription("Allow users 
to call the add_files procedure") + public IcebergConfig setAddFilesProcedureEnabled(boolean addFilesProcedureEnabled) + { + this.addFilesProcedureEnabled = addFilesProcedureEnabled; + return this; + } + public Optional getHiveCatalogName() { return hiveCatalogName; @@ -277,6 +329,7 @@ public Duration getExpireSnapshotsMinRetention() } @Config(EXPIRE_SNAPSHOTS_MIN_RETENTION) + @LegacyConfig("iceberg.expire_snapshots.min-retention") @ConfigDescription("Minimal retention period for expire_snapshot procedure") public IcebergConfig setExpireSnapshotsMinRetention(Duration expireSnapshotsMinRetention) { @@ -291,6 +344,7 @@ public Duration getRemoveOrphanFilesMinRetention() } @Config(REMOVE_ORPHAN_FILES_MIN_RETENTION) + @LegacyConfig("iceberg.remove_orphan_files.min-retention") @ConfigDescription("Minimal retention period for remove_orphan_files procedure") public IcebergConfig setRemoveOrphanFilesMinRetention(Duration removeOrphanFilesMinRetention) { @@ -312,6 +366,20 @@ public IcebergConfig setTargetMaxFileSize(DataSize targetMaxFileSize) return this; } + @NotNull + public DataSize getIdleWriterMinFileSize() + { + return idleWriterMinFileSize; + } + + @Config("iceberg.idle-writer-min-file-size") + @ConfigDescription("Minimum data written by a single partition writer before it can be consider as 'idle' and could be closed by the engine") + public IcebergConfig setIdleWriterMinFileSize(DataSize idleWriterMinFileSize) + { + this.idleWriterMinFileSize = idleWriterMinFileSize; + return this; + } + public boolean isDeleteSchemaLocationsFallback() { return this.deleteSchemaLocationsFallback; @@ -341,12 +409,29 @@ public double getMinimumAssignedSplitWeight() return minimumAssignedSplitWeight; } + @Deprecated + public boolean isHideMaterializedViewStorageTable() + { + return hideMaterializedViewStorageTable; + } + + @Deprecated + @Config("iceberg.materialized-views.hide-storage-table") + @ConfigDescription("Hide materialized view storage tables in metastore") + public IcebergConfig setHideMaterializedViewStorageTable(boolean hideMaterializedViewStorageTable) + { + this.hideMaterializedViewStorageTable = hideMaterializedViewStorageTable; + return this; + } + + @Deprecated @NotNull public Optional getMaterializedViewsStorageSchema() { return materializedViewsStorageSchema; } + @Deprecated @Config("iceberg.materialized-views.storage-schema") @ConfigDescription("Schema for creating materialized views storage tables") public IcebergConfig setMaterializedViewsStorageSchema(String materializedViewsStorageSchema) @@ -367,4 +452,174 @@ public IcebergConfig setSortedWritingEnabled(boolean sortedWritingEnabled) this.sortedWritingEnabled = sortedWritingEnabled; return this; } + + @Config("iceberg.query-partition-filter-required") + @ConfigDescription("Require a filter on at least one partition column") + public IcebergConfig setQueryPartitionFilterRequired(boolean queryPartitionFilterRequired) + { + this.queryPartitionFilterRequired = queryPartitionFilterRequired; + return this; + } + + public boolean isQueryPartitionFilterRequired() + { + return queryPartitionFilterRequired; + } + + public Set getQueryPartitionFilterRequiredSchemas() + { + return queryPartitionFilterRequiredSchemas; + } + + @Config("iceberg.query-partition-filter-required-schemas") + @ConfigDescription("List of schemas for which filter on partition column is enforced") + public IcebergConfig setQueryPartitionFilterRequiredSchemas(Set queryPartitionFilterRequiredSchemas) + { + this.queryPartitionFilterRequiredSchemas = 
queryPartitionFilterRequiredSchemas.stream() + .map(value -> value.toLowerCase(ENGLISH)) + .collect(toImmutableSet()); + return this; + } + + @Min(0) + public int getSplitManagerThreads() + { + return splitManagerThreads; + } + + @Config("iceberg.split-manager-threads") + @ConfigDescription("Number of threads to use for generating splits") + public IcebergConfig setSplitManagerThreads(String splitManagerThreads) + { + this.splitManagerThreads = ThreadCount.valueOf(splitManagerThreads).getThreadCount(); + return this; + } + + @Min(0) + public int getPlanningThreads() + { + return planningThreads; + } + + @Config("iceberg.planning-threads") + @ConfigDescription("Number of threads to use for metadata scans in planning") + public IcebergConfig setPlanningThreads(String planningThreads) + { + this.planningThreads = ThreadCount.valueOf(planningThreads).getThreadCount(); + return this; + } + + @Min(0) + public int getFileDeleteThreads() + { + return fileDeleteThreads; + } + + @Config("iceberg.file-delete-threads") + @ConfigDescription("Number of threads to use for deleting files when running expire_snapshots procedure") + public IcebergConfig setFileDeleteThreads(String fileDeleteThreads) + { + this.fileDeleteThreads = ThreadCount.valueOf(fileDeleteThreads).getThreadCount(); + return this; + } + + public List getAllowedExtraProperties() + { + return allowedExtraProperties; + } + + @Config("iceberg.allowed-extra-properties") + @ConfigDescription("List of extra properties that are allowed to be set on Iceberg tables") + public IcebergConfig setAllowedExtraProperties(List allowedExtraProperties) + { + this.allowedExtraProperties = ImmutableList.copyOf(allowedExtraProperties); + checkArgument(!allowedExtraProperties.contains("*") || allowedExtraProperties.size() == 1, + "Wildcard * should be the only element in the list"); + return this; + } + + public boolean isIncrementalRefreshEnabled() + { + return incrementalRefreshEnabled; + } + + @Config("iceberg.incremental-refresh-enabled") + @ConfigDescription("Enable Incremental refresh for MVs backed by Iceberg tables, when possible") + public IcebergConfig setIncrementalRefreshEnabled(boolean incrementalRefreshEnabled) + { + this.incrementalRefreshEnabled = incrementalRefreshEnabled; + return this; + } + + @AssertFalse(message = "iceberg.materialized-views.storage-schema may only be set when iceberg.materialized-views.hide-storage-table is set to false") + public boolean isStorageSchemaSetWhenHidingIsEnabled() + { + return hideMaterializedViewStorageTable && materializedViewsStorageSchema.isPresent(); + } + + public boolean isMetadataCacheEnabled() + { + return metadataCacheEnabled; + } + + @Config("iceberg.metadata-cache.enabled") + @ConfigDescription("Enables in-memory caching of metadata files on coordinator if fs.cache.enabled is not set to true") + public IcebergConfig setMetadataCacheEnabled(boolean metadataCacheEnabled) + { + this.metadataCacheEnabled = metadataCacheEnabled; + return this; + } + + public boolean isObjectStoreLayoutEnabled() + { + return objectStoreLayoutEnabled; + } + + @Config("iceberg.object-store-layout.enabled") + @ConfigDescription("Enable the Iceberg object store file layout") + public IcebergConfig setObjectStoreLayoutEnabled(boolean objectStoreLayoutEnabled) + { + this.objectStoreLayoutEnabled = objectStoreLayoutEnabled; + return this; + } + + @Min(1) + public int getMetadataParallelism() + { + return metadataParallelism; + } + + @ConfigDescription("Limits metadata enumeration calls parallelism") + 
@Config("iceberg.metadata.parallelism") + public IcebergConfig setMetadataParallelism(int metadataParallelism) + { + this.metadataParallelism = metadataParallelism; + return this; + } + + public boolean isBucketExecutionEnabled() + { + return bucketExecutionEnabled; + } + + @Config("iceberg.bucket-execution") + @ConfigDescription("Enable bucket-aware execution: use physical bucketing information to optimize queries") + public IcebergConfig setBucketExecutionEnabled(boolean bucketExecutionEnabled) + { + this.bucketExecutionEnabled = bucketExecutionEnabled; + return this; + } + + public boolean isFileBasedConflictDetectionEnabled() + { + return fileBasedConflictDetectionEnabled; + } + + @Config("iceberg.file-based-conflict-detection") + @ConfigDescription("Enable file-based conflict detection: take partition information from the actual written files as a source for the conflict detection system") + public IcebergConfig setFileBasedConflictDetectionEnabled(boolean fileBasedConflictDetectionEnabled) + { + this.fileBasedConflictDetectionEnabled = fileBasedConflictDetectionEnabled; + return this; + } } diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergErrorCode.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergErrorCode.java index 0ce831abb7e7..b6fa4ebcfd29 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergErrorCode.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergErrorCode.java @@ -40,6 +40,7 @@ public enum IcebergErrorCode ICEBERG_CATALOG_ERROR(13, EXTERNAL), ICEBERG_WRITER_CLOSE_ERROR(14, EXTERNAL), ICEBERG_MISSING_METADATA(15, EXTERNAL), + ICEBERG_UNSUPPORTED_VIEW_DIALECT(17, EXTERNAL) /**/; private final ErrorCode errorCode; diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergExceptions.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergExceptions.java new file mode 100644 index 000000000000..0fbcfe003346 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergExceptions.java @@ -0,0 +1,56 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.iceberg; + +import io.trino.spi.StandardErrorCode; +import io.trino.spi.TrinoException; +import org.apache.iceberg.exceptions.ValidationException; + +import java.io.FileNotFoundException; + +import static com.google.common.base.Throwables.getCausalChain; +import static io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_INVALID_METADATA; +import static io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_MISSING_METADATA; + +public final class IcebergExceptions +{ + private IcebergExceptions() {} + + private static boolean isNotFoundException(Throwable failure) + { + return getCausalChain(failure).stream().anyMatch(e -> + e instanceof org.apache.iceberg.exceptions.NotFoundException + || e instanceof FileNotFoundException); + } + + public static boolean isFatalException(Throwable failure) + { + return isNotFoundException(failure) || failure instanceof ValidationException; + } + + public static RuntimeException translateMetadataException(Throwable failure, String tableName) + { + if (failure instanceof TrinoException trinoException) { + return trinoException; + } + if (isNotFoundException(failure)) { + throw new TrinoException(ICEBERG_MISSING_METADATA, "Metadata not found in metadata location for table " + tableName, failure); + } + if (failure instanceof ValidationException) { + throw new TrinoException(ICEBERG_INVALID_METADATA, "Invalid metadata file for table " + tableName, failure); + } + + return new TrinoException(StandardErrorCode.GENERIC_INTERNAL_ERROR, "Error processing metadata for table " + tableName, failure); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergFileFormat.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergFileFormat.java index c3be4cc9ea71..14a42180a946 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergFileFormat.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergFileFormat.java @@ -13,12 +13,8 @@ */ package io.trino.plugin.iceberg; -import com.google.common.base.VerifyException; -import io.trino.spi.TrinoException; import org.apache.iceberg.FileFormat; -import static io.trino.spi.StandardErrorCode.NOT_SUPPORTED; - public enum IcebergFileFormat { ORC, @@ -28,28 +24,31 @@ public enum IcebergFileFormat public FileFormat toIceberg() { - switch (this) { - case ORC: - return FileFormat.ORC; - case PARQUET: - return FileFormat.PARQUET; - case AVRO: - return FileFormat.AVRO; - } - throw new VerifyException("Unhandled type: " + this); + return switch (this) { + case ORC -> FileFormat.ORC; + case PARQUET -> FileFormat.PARQUET; + case AVRO -> FileFormat.AVRO; + }; } public static IcebergFileFormat fromIceberg(FileFormat format) { - switch (format) { - case ORC: - return ORC; - case PARQUET: - return PARQUET; - case AVRO: - return AVRO; - default: - throw new TrinoException(NOT_SUPPORTED, "File format not supported for Iceberg: " + format); - } + return switch (format) { + case ORC -> ORC; + case PARQUET -> PARQUET; + case AVRO -> AVRO; + // Not used as a data file format + case METADATA -> throw new IllegalArgumentException("Unexpected METADATA file format"); + case PUFFIN -> throw new IllegalArgumentException("Unexpected PUFFIN file format"); + }; + } + + public String humanName() + { + return switch (this) { + case AVRO -> "Avro"; + case ORC -> "ORC"; + case PARQUET -> "Parquet"; + }; } } diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergFileSystemFactory.java 
b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergFileSystemFactory.java new file mode 100644 index 000000000000..a8bd36cce40c --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergFileSystemFactory.java @@ -0,0 +1,24 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.iceberg; + +import io.trino.filesystem.TrinoFileSystem; +import io.trino.spi.security.ConnectorIdentity; + +import java.util.Map; + +public interface IcebergFileSystemFactory +{ + TrinoFileSystem create(ConnectorIdentity identity, Map fileIoProperties); +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergFileWriter.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergFileWriter.java index 0d04338e413e..f6616bab1c9e 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergFileWriter.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergFileWriter.java @@ -16,8 +16,13 @@ import io.trino.plugin.hive.FileWriter; import org.apache.iceberg.Metrics; +import java.util.List; +import java.util.Optional; + public interface IcebergFileWriter extends FileWriter { - Metrics getMetrics(); + FileMetrics getFileMetrics(); + + record FileMetrics(Metrics metrics, Optional> splitOffsets) {} } diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergInputInfo.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergInputInfo.java index f6dac0b08dbb..359ddf2a6071 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergInputInfo.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergInputInfo.java @@ -13,66 +13,30 @@ */ package io.trino.plugin.iceberg; -import com.fasterxml.jackson.annotation.JsonCreator; -import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.collect.ImmutableList; -import java.util.Objects; +import java.util.List; import java.util.Optional; import static java.util.Objects.requireNonNull; -public class IcebergInputInfo +public record IcebergInputInfo( + Optional snapshotId, + List partitionFields, + String tableDefaultFileFormat, + Optional totalRecords, + Optional deletedRecords, + Optional totalDataFiles, + Optional totalDeleteFiles) { - private final Optional snapshotId; - private final Optional partitioned; - private final String tableDefaultFileFormat; - - @JsonCreator - public IcebergInputInfo( - @JsonProperty("snapshotId") Optional snapshotId, - @JsonProperty("partitioned") Optional partitioned, - @JsonProperty("fileFormat") String tableDefaultFileFormat) - { - this.snapshotId = requireNonNull(snapshotId, "snapshotId is null"); - this.partitioned = requireNonNull(partitioned, "partitioned is null"); - this.tableDefaultFileFormat = requireNonNull(tableDefaultFileFormat, "tableDefaultFileFormat is null"); - } - - @JsonProperty - public Optional getSnapshotId() - { - return snapshotId; - } - - @JsonProperty - public Optional getPartitioned() 
- { - return partitioned; - } - - @JsonProperty - public String getTableDefaultFileFormat() - { - return tableDefaultFileFormat; - } - - @Override - public boolean equals(Object o) - { - if (this == o) { - return true; - } - if (!(o instanceof IcebergInputInfo that)) { - return false; - } - return partitioned.equals(that.partitioned) - && snapshotId.equals(that.snapshotId) - && tableDefaultFileFormat.equals(that.tableDefaultFileFormat); - } - - @Override - public int hashCode() + public IcebergInputInfo { - return Objects.hash(snapshotId, partitioned, tableDefaultFileFormat); + requireNonNull(snapshotId, "snapshotId is null"); + partitionFields = ImmutableList.copyOf(partitionFields); + requireNonNull(tableDefaultFileFormat, "tableDefaultFileFormat is null"); + requireNonNull(totalRecords, "totalRecords is null"); + requireNonNull(deletedRecords, "deletedRecords is null"); + requireNonNull(totalDataFiles, "totalDataFiles is null"); + requireNonNull(totalDeleteFiles, "totalDeleteFiles is null"); } } diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergMaterializedViewDefinition.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergMaterializedViewDefinition.java index dc6cd42e317a..de2537d9a088 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergMaterializedViewDefinition.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergMaterializedViewDefinition.java @@ -13,11 +13,11 @@ */ package io.trino.plugin.iceberg; -import com.fasterxml.jackson.annotation.JsonCreator; -import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.collect.ImmutableList; import io.airlift.json.JsonCodec; import io.airlift.json.JsonCodecFactory; import io.airlift.json.ObjectMapperProvider; +import io.trino.spi.connector.CatalogSchemaName; import io.trino.spi.connector.ConnectorMaterializedViewDefinition; import io.trino.spi.type.TypeId; @@ -26,6 +26,7 @@ import java.util.List; import java.util.Optional; import java.util.StringJoiner; +import java.util.stream.Collectors; import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.collect.ImmutableList.toImmutableList; @@ -36,7 +37,14 @@ /* * Serializable version of ConnectorMaterializedViewDefinition stored by iceberg connector */ -public class IcebergMaterializedViewDefinition +public record IcebergMaterializedViewDefinition( + String originalSql, + Optional catalog, + Optional schema, + List columns, + Optional gracePeriod, + Optional comment, + List path) { private static final String MATERIALIZED_VIEW_PREFIX = "/* Presto Materialized View: "; private static final String MATERIALIZED_VIEW_SUFFIX = " */"; @@ -44,13 +52,6 @@ public class IcebergMaterializedViewDefinition private static final JsonCodec materializedViewCodec = new JsonCodecFactory(new ObjectMapperProvider()).jsonCodec(IcebergMaterializedViewDefinition.class); - private final String originalSql; - private final Optional catalog; - private final Optional schema; - private final List columns; - private final Optional gracePeriod; - private final Optional comment; - public static String encodeMaterializedViewData(IcebergMaterializedViewDefinition definition) { byte[] bytes = materializedViewCodec.toJsonBytes(definition); @@ -78,25 +79,19 @@ public static IcebergMaterializedViewDefinition fromConnectorMaterializedViewDef .map(column -> new Column(column.getName(), column.getType(), column.getComment())) .collect(toImmutableList()), 
definition.getGracePeriod(), - definition.getComment()); + definition.getComment(), + List.of()); } - @JsonCreator - public IcebergMaterializedViewDefinition( - @JsonProperty("originalSql") String originalSql, - @JsonProperty("catalog") Optional catalog, - @JsonProperty("schema") Optional schema, - @JsonProperty("columns") List columns, - @JsonProperty("gracePeriod") Optional gracePeriod, - @JsonProperty("comment") Optional comment) + public IcebergMaterializedViewDefinition { - this.originalSql = requireNonNull(originalSql, "originalSql is null"); - this.catalog = requireNonNull(catalog, "catalog is null"); - this.schema = requireNonNull(schema, "schema is null"); - this.columns = List.copyOf(requireNonNull(columns, "columns is null")); + requireNonNull(originalSql, "originalSql is null"); + requireNonNull(catalog, "catalog is null"); + requireNonNull(schema, "schema is null"); + columns = List.copyOf(requireNonNull(columns, "columns is null")); checkArgument(gracePeriod.isEmpty() || !gracePeriod.get().isNegative(), "gracePeriod cannot be negative: %s", gracePeriod); - this.gracePeriod = gracePeriod; - this.comment = requireNonNull(comment, "comment is null"); + requireNonNull(comment, "comment is null"); + path = path == null ? ImmutableList.of() : ImmutableList.copyOf(path); if (catalog.isEmpty() && schema.isPresent()) { throw new IllegalArgumentException("catalog must be present if schema is present"); @@ -106,42 +101,6 @@ public IcebergMaterializedViewDefinition( } } - @JsonProperty - public String getOriginalSql() - { - return originalSql; - } - - @JsonProperty - public Optional getCatalog() - { - return catalog; - } - - @JsonProperty - public Optional getSchema() - { - return schema; - } - - @JsonProperty - public List getColumns() - { - return columns; - } - - @JsonProperty - public Optional getGracePeriod() - { - return gracePeriod; - } - - @JsonProperty - public Optional getComment() - { - return comment; - } - @Override public String toString() { @@ -152,42 +111,17 @@ public String toString() joiner.add("columns=" + columns); gracePeriod.ifPresent(value -> joiner.add("gracePeriod≥=" + value)); comment.ifPresent(value -> joiner.add("comment=" + value)); + joiner.add(path.stream().map(CatalogSchemaName::toString).collect(Collectors.joining(", ", "path=(", ")"))); return getClass().getSimpleName() + joiner; } - public static final class Column + public record Column(String name, TypeId type, Optional comment) { - private final String name; - private final TypeId type; - private final Optional comment; - - @JsonCreator - public Column( - @JsonProperty("name") String name, - @JsonProperty("type") TypeId type, - @JsonProperty("comment") Optional comment) - { - this.name = requireNonNull(name, "name is null"); - this.type = requireNonNull(type, "type is null"); - this.comment = requireNonNull(comment, "comment is null"); - } - - @JsonProperty - public String getName() - { - return name; - } - - @JsonProperty - public TypeId getType() - { - return type; - } - - @JsonProperty - public Optional getComment() + public Column { - return comment; + requireNonNull(name, "name is null"); + requireNonNull(type, "type is null"); + requireNonNull(comment, "comment is null"); } @Override diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergMaterializedViewProperties.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergMaterializedViewProperties.java new file mode 100644 index 000000000000..036dd9d5a978 --- /dev/null +++ 
b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergMaterializedViewProperties.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.trino.plugin.iceberg;
+
+import com.google.common.collect.ImmutableList;
+import com.google.inject.Inject;
+import io.trino.spi.session.PropertyMetadata;
+
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+
+import static io.trino.spi.session.PropertyMetadata.stringProperty;
+
+public class IcebergMaterializedViewProperties
+{
+    public static final String STORAGE_SCHEMA = "storage_schema";
+
+    private final List<PropertyMetadata<?>> materializedViewProperties;
+
+    @Inject
+    public IcebergMaterializedViewProperties(IcebergConfig icebergConfig, IcebergTableProperties tableProperties)
+    {
+        materializedViewProperties = ImmutableList.<PropertyMetadata<?>>builder()
+                .add(stringProperty(
+                        STORAGE_SCHEMA,
+                        "Schema for creating materialized view storage table",
+                        icebergConfig.getMaterializedViewsStorageSchema().orElse(null),
+                        false))
+                // Materialized view should allow configuring all the supported iceberg table properties for the storage table
+                .addAll(tableProperties.getTableProperties())
+                .build();
+    }
+
+    public List<PropertyMetadata<?>> getMaterializedViewProperties()
+    {
+        return materializedViewProperties;
+    }
+
+    public static Optional<String> getStorageSchema(Map<String, Object> materializedViewProperties)
+    {
+        return Optional.ofNullable((String) materializedViewProperties.get(STORAGE_SCHEMA));
+    }
+}
diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergMergeSink.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergMergeSink.java
index e51f334e71d3..26eeaed1713c 100644
--- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergMergeSink.java
+++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergMergeSink.java
@@ -13,15 +13,13 @@
  */
 package io.trino.plugin.iceberg;
 
-import com.google.common.collect.ImmutableList;
 import com.google.common.collect.ImmutableMap;
 import io.airlift.json.JsonCodec;
 import io.airlift.slice.Slice;
 import io.trino.filesystem.TrinoFileSystem;
-import io.trino.plugin.iceberg.delete.IcebergPositionDeletePageSink;
+import io.trino.plugin.iceberg.delete.PositionDeleteWriter;
 import io.trino.spi.Page;
-import io.trino.spi.PageBuilder;
-import io.trino.spi.block.ColumnarRow;
+import io.trino.spi.block.Block;
 import io.trino.spi.connector.ConnectorMergeSink;
 import io.trino.spi.connector.ConnectorPageSink;
 import io.trino.spi.connector.ConnectorSession;
@@ -44,7 +42,7 @@ import java.util.concurrent.CompletableFuture;
 
 import static io.trino.plugin.base.util.Closables.closeAllSuppress;
-import static io.trino.spi.block.ColumnarRow.toColumnarRow;
+import static io.trino.plugin.iceberg.IcebergUtil.getRowFieldsFromBlock;
 import static io.trino.spi.connector.MergePage.createDeleteAndInsertPages;
 import static io.trino.spi.type.BigintType.BIGINT;
 import static io.trino.spi.type.IntegerType.INTEGER;
@@ -101,16 +99,19 @@ public void storeMergedRows(Page page)
mergePage.getInsertionsPage().ifPresent(insertPageSink::appendPage); mergePage.getDeletionsPage().ifPresent(deletions -> { - ColumnarRow rowIdRow = toColumnarRow(deletions.getBlock(deletions.getChannelCount() - 1)); - - for (int position = 0; position < rowIdRow.getPositionCount(); position++) { - Slice filePath = VarcharType.VARCHAR.getSlice(rowIdRow.getField(0), position); - long rowPosition = BIGINT.getLong(rowIdRow.getField(1), position); + List fields = getRowFieldsFromBlock(deletions.getBlock(deletions.getChannelCount() - 1)); + Block fieldPathBlock = fields.get(0); + Block rowPositionBlock = fields.get(1); + Block partitionSpecIdBlock = fields.get(2); + Block partitionDataBlock = fields.get(3); + for (int position = 0; position < fieldPathBlock.getPositionCount(); position++) { + Slice filePath = VarcharType.VARCHAR.getSlice(fieldPathBlock, position); + long rowPosition = BIGINT.getLong(rowPositionBlock, position); int index = position; - FileDeletion deletion = fileDeletions.computeIfAbsent(filePath, ignored -> { - int partitionSpecId = INTEGER.getInt(rowIdRow.getField(2), index); - String partitionData = VarcharType.VARCHAR.getSlice(rowIdRow.getField(3), index).toStringUtf8(); + FileDeletion deletion = fileDeletions.computeIfAbsent(filePath, ignore -> { + int partitionSpecId = INTEGER.getInt(partitionSpecIdBlock, index); + String partitionData = VarcharType.VARCHAR.getSlice(partitionDataBlock, index).toStringUtf8(); return new FileDeletion(partitionSpecId, partitionData); }); @@ -125,12 +126,12 @@ public CompletableFuture> finish() List fragments = new ArrayList<>(insertPageSink.finish().join()); fileDeletions.forEach((dataFilePath, deletion) -> { - ConnectorPageSink sink = createPositionDeletePageSink( + PositionDeleteWriter writer = createPositionDeleteWriter( dataFilePath.toStringUtf8(), partitionsSpecs.get(deletion.partitionSpecId()), deletion.partitionDataJson()); - fragments.addAll(writePositionDeletes(sink, deletion.rowsToDelete())); + fragments.addAll(writePositionDeletes(writer, deletion.rowsToDelete())); }); return completedFuture(fragments); @@ -142,7 +143,7 @@ public void abort() insertPageSink.abort(); } - private ConnectorPageSink createPositionDeletePageSink(String dataFilePath, PartitionSpec partitionSpec, String partitionDataJson) + private PositionDeleteWriter createPositionDeleteWriter(String dataFilePath, PartitionSpec partitionSpec, String partitionDataJson) { Optional partitionData = Optional.empty(); if (partitionSpec.isPartitioned()) { @@ -152,7 +153,7 @@ private ConnectorPageSink createPositionDeletePageSink(String dataFilePath, Part partitionData = Optional.of(PartitionData.fromJson(partitionDataJson, columnTypes)); } - return new IcebergPositionDeletePageSink( + return new PositionDeleteWriter( dataFilePath, partitionSpec, partitionData, @@ -165,37 +166,17 @@ private ConnectorPageSink createPositionDeletePageSink(String dataFilePath, Part storageProperties); } - private static Collection writePositionDeletes(ConnectorPageSink sink, ImmutableLongBitmapDataProvider rowsToDelete) + private static Collection writePositionDeletes(PositionDeleteWriter writer, ImmutableLongBitmapDataProvider rowsToDelete) { try { - return doWritePositionDeletes(sink, rowsToDelete); + return writer.write(rowsToDelete); } catch (Throwable t) { - closeAllSuppress(t, sink::abort); + closeAllSuppress(t, writer::abort); throw t; } } - private static Collection doWritePositionDeletes(ConnectorPageSink sink, ImmutableLongBitmapDataProvider rowsToDelete) - { - PageBuilder pageBuilder = 
new PageBuilder(ImmutableList.of(BIGINT)); - - rowsToDelete.forEach(rowPosition -> { - BIGINT.writeLong(pageBuilder.getBlockBuilder(0), rowPosition); - pageBuilder.declarePosition(); - if (pageBuilder.isFull()) { - sink.appendPage(pageBuilder.build()); - pageBuilder.reset(); - } - }); - - if (!pageBuilder.isEmpty()) { - sink.appendPage(pageBuilder.build()); - } - - return sink.finish().join(); - } - private static class FileDeletion { private final int partitionSpecId; diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergMetadata.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergMetadata.java index cf57078bdbc6..d2eda85aeb43 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergMetadata.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergMetadata.java @@ -13,7 +13,9 @@ */ package io.trino.plugin.iceberg; +import com.google.common.base.Joiner; import com.google.common.base.Splitter; +import com.google.common.base.Splitter.MapSplitter; import com.google.common.base.Suppliers; import com.google.common.base.VerifyException; import com.google.common.collect.ImmutableList; @@ -23,6 +25,7 @@ import com.google.common.collect.Lists; import com.google.common.collect.Sets; import com.google.common.collect.Streams; +import io.airlift.concurrent.MoreFutures; import io.airlift.json.JsonCodec; import io.airlift.log.Logger; import io.airlift.slice.Slice; @@ -32,27 +35,48 @@ import io.trino.filesystem.FileIterator; import io.trino.filesystem.Location; import io.trino.filesystem.TrinoFileSystem; -import io.trino.filesystem.TrinoFileSystemFactory; +import io.trino.metastore.TableInfo; import io.trino.plugin.base.classloader.ClassLoaderSafeSystemTable; import io.trino.plugin.base.projection.ApplyProjectionUtil; import io.trino.plugin.base.projection.ApplyProjectionUtil.ProjectedColumnRepresentation; +import io.trino.plugin.hive.HiveCompressionCodec; +import io.trino.plugin.hive.HiveStorageFormat; +import io.trino.plugin.hive.HiveTimestampPrecision; +import io.trino.plugin.hive.HiveType; import io.trino.plugin.hive.HiveWrittenPartitions; +import io.trino.plugin.hive.metastore.HiveMetastore; +import io.trino.plugin.hive.metastore.HiveMetastoreFactory; import io.trino.plugin.iceberg.aggregation.DataSketchStateSerializer; import io.trino.plugin.iceberg.aggregation.IcebergThetaSketchForStats; import io.trino.plugin.iceberg.catalog.TrinoCatalog; +import io.trino.plugin.iceberg.functions.IcebergFunctionProvider; +import io.trino.plugin.iceberg.procedure.IcebergAddFilesFromTableHandle; +import io.trino.plugin.iceberg.procedure.IcebergAddFilesHandle; import io.trino.plugin.iceberg.procedure.IcebergDropExtendedStatsHandle; import io.trino.plugin.iceberg.procedure.IcebergExpireSnapshotsHandle; import io.trino.plugin.iceberg.procedure.IcebergOptimizeHandle; +import io.trino.plugin.iceberg.procedure.IcebergOptimizeManifestsHandle; import io.trino.plugin.iceberg.procedure.IcebergRemoveOrphanFilesHandle; +import io.trino.plugin.iceberg.procedure.IcebergRollbackToSnapshotHandle; import io.trino.plugin.iceberg.procedure.IcebergTableExecuteHandle; import io.trino.plugin.iceberg.procedure.IcebergTableProcedureId; +import io.trino.plugin.iceberg.procedure.MigrationUtils.RecursiveDirectory; +import io.trino.plugin.iceberg.system.AllManifestsTable; +import io.trino.plugin.iceberg.system.EntriesTable; +import io.trino.plugin.iceberg.system.FilesTable; +import io.trino.plugin.iceberg.system.HistoryTable; +import 
io.trino.plugin.iceberg.system.ManifestsTable; +import io.trino.plugin.iceberg.system.MetadataLogEntriesTable; +import io.trino.plugin.iceberg.system.PartitionsTable; +import io.trino.plugin.iceberg.system.PropertiesTable; +import io.trino.plugin.iceberg.system.RefsTable; +import io.trino.plugin.iceberg.system.SnapshotsTable; import io.trino.plugin.iceberg.util.DataFileWithDeleteFiles; import io.trino.spi.ErrorCode; import io.trino.spi.TrinoException; import io.trino.spi.block.Block; import io.trino.spi.connector.Assignment; import io.trino.spi.connector.BeginTableExecuteResult; -import io.trino.spi.connector.CatalogHandle; import io.trino.spi.connector.CatalogSchemaTableName; import io.trino.spi.connector.ColumnHandle; import io.trino.spi.connector.ColumnMetadata; @@ -91,6 +115,11 @@ import io.trino.spi.expression.ConnectorExpression; import io.trino.spi.expression.FunctionName; import io.trino.spi.expression.Variable; +import io.trino.spi.function.BoundSignature; +import io.trino.spi.function.FunctionDependencyDeclaration; +import io.trino.spi.function.FunctionId; +import io.trino.spi.function.FunctionMetadata; +import io.trino.spi.function.SchemaFunctionName; import io.trino.spi.predicate.Domain; import io.trino.spi.predicate.NullableValue; import io.trino.spi.predicate.TupleDomain; @@ -99,9 +128,18 @@ import io.trino.spi.statistics.ComputedStatistics; import io.trino.spi.statistics.TableStatistics; import io.trino.spi.statistics.TableStatisticsMetadata; +import io.trino.spi.type.ArrayType; +import io.trino.spi.type.CharType; +import io.trino.spi.type.LongTimestamp; import io.trino.spi.type.LongTimestampWithTimeZone; +import io.trino.spi.type.MapType; +import io.trino.spi.type.RowType; +import io.trino.spi.type.TimeType; +import io.trino.spi.type.TimestampType; import io.trino.spi.type.TimestampWithTimeZoneType; import io.trino.spi.type.TypeManager; +import io.trino.spi.type.TypeSignature; +import io.trino.spi.type.VarcharType; import org.apache.datasketches.theta.CompactSketch; import org.apache.iceberg.AppendFiles; import org.apache.iceberg.BaseTable; @@ -110,11 +148,12 @@ import org.apache.iceberg.DataFiles; import org.apache.iceberg.DeleteFile; import org.apache.iceberg.DeleteFiles; +import org.apache.iceberg.FileFormat; import org.apache.iceberg.FileMetadata; import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.IcebergManifestUtils; import org.apache.iceberg.IsolationLevel; import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.ManifestFiles; import org.apache.iceberg.ManifestReader; import org.apache.iceberg.MetadataColumns; import org.apache.iceberg.PartitionField; @@ -122,13 +161,17 @@ import org.apache.iceberg.PartitionSpecParser; import org.apache.iceberg.ReplaceSortOrder; import org.apache.iceberg.RewriteFiles; +import org.apache.iceberg.RewriteManifests; import org.apache.iceberg.RowDelta; import org.apache.iceberg.Schema; import org.apache.iceberg.SchemaParser; import org.apache.iceberg.Snapshot; +import org.apache.iceberg.SnapshotRef; +import org.apache.iceberg.SnapshotUpdate; import org.apache.iceberg.SortField; import org.apache.iceberg.SortOrder; import org.apache.iceberg.StatisticsFile; +import org.apache.iceberg.StructLike; import org.apache.iceberg.Table; import org.apache.iceberg.TableProperties; import org.apache.iceberg.TableScan; @@ -137,11 +180,16 @@ import org.apache.iceberg.UpdateProperties; import org.apache.iceberg.UpdateSchema; import org.apache.iceberg.UpdateStatistics; +import org.apache.iceberg.exceptions.AlreadyExistsException; 
+import org.apache.iceberg.exceptions.CommitFailedException; +import org.apache.iceberg.exceptions.CommitStateUnknownException; +import org.apache.iceberg.exceptions.NotFoundException; import org.apache.iceberg.exceptions.ValidationException; import org.apache.iceberg.expressions.Expressions; import org.apache.iceberg.expressions.Term; import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.TypeUtil; import org.apache.iceberg.types.Types; import org.apache.iceberg.types.Types.IntegerType; import org.apache.iceberg.types.Types.NestedField; @@ -151,9 +199,13 @@ import java.io.IOException; import java.io.UncheckedIOException; import java.time.Instant; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.ZoneOffset; import java.util.ArrayDeque; import java.util.ArrayList; import java.util.Collection; +import java.util.Collections; import java.util.Comparator; import java.util.Deque; import java.util.HashMap; @@ -165,30 +217,50 @@ import java.util.Optional; import java.util.OptionalLong; import java.util.Set; +import java.util.concurrent.Callable; import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Executor; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Future; import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicReference; import java.util.function.Consumer; +import java.util.function.Function; import java.util.function.Predicate; import java.util.function.Supplier; import java.util.function.UnaryOperator; import java.util.regex.Matcher; import java.util.regex.Pattern; +import java.util.stream.IntStream; +import java.util.stream.Stream; import static com.google.common.base.MoreObjects.firstNonNull; import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkState; import static com.google.common.base.Verify.verify; import static com.google.common.base.Verify.verifyNotNull; import static com.google.common.collect.ImmutableList.toImmutableList; import static com.google.common.collect.ImmutableMap.toImmutableMap; import static com.google.common.collect.ImmutableSet.toImmutableSet; import static com.google.common.collect.Iterables.getLast; +import static com.google.common.collect.Iterables.getOnlyElement; import static com.google.common.collect.Maps.transformValues; import static com.google.common.collect.Sets.difference; +import static io.trino.filesystem.Locations.isS3Tables; import static io.trino.plugin.base.projection.ApplyProjectionUtil.extractSupportedProjectedColumns; import static io.trino.plugin.base.projection.ApplyProjectionUtil.replaceWithNewVariables; +import static io.trino.plugin.base.util.ExecutorUtil.processWithAdditionalThreads; import static io.trino.plugin.base.util.Procedures.checkProcedureArgument; -import static io.trino.plugin.hive.util.HiveUtil.isStructuralType; +import static io.trino.plugin.hive.HiveMetadata.TRANSACTIONAL; +import static io.trino.plugin.hive.HiveTimestampPrecision.DEFAULT_PRECISION; +import static io.trino.plugin.hive.util.HiveTypeTranslator.toTypeSignature; +import static io.trino.plugin.hive.util.HiveUtil.isDeltaLakeTable; +import static io.trino.plugin.hive.util.HiveUtil.isHudiTable; +import static io.trino.plugin.hive.util.HiveUtil.isIcebergTable; +import static io.trino.plugin.iceberg.ColumnIdentity.createColumnIdentity; import static 
io.trino.plugin.iceberg.ConstraintExtractor.extractTupleDomain; +import static io.trino.plugin.iceberg.ExpressionConverter.isConvertibleToIcebergExpression; import static io.trino.plugin.iceberg.ExpressionConverter.toIcebergExpression; import static io.trino.plugin.iceberg.IcebergAnalyzeProperties.getColumnNames; import static io.trino.plugin.iceberg.IcebergColumnHandle.TRINO_MERGE_PARTITION_DATA; @@ -196,85 +268,159 @@ import static io.trino.plugin.iceberg.IcebergColumnHandle.TRINO_MERGE_ROW_ID; import static io.trino.plugin.iceberg.IcebergColumnHandle.TRINO_ROW_ID_NAME; import static io.trino.plugin.iceberg.IcebergColumnHandle.fileModifiedTimeColumnHandle; +import static io.trino.plugin.iceberg.IcebergColumnHandle.partitionColumnHandle; import static io.trino.plugin.iceberg.IcebergColumnHandle.pathColumnHandle; +import static io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_CATALOG_ERROR; import static io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_COMMIT_ERROR; import static io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_FILESYSTEM_ERROR; import static io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_INVALID_METADATA; import static io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_MISSING_METADATA; +import static io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_UNSUPPORTED_VIEW_DIALECT; +import static io.trino.plugin.iceberg.IcebergFileFormat.ORC; +import static io.trino.plugin.iceberg.IcebergFileFormat.PARQUET; import static io.trino.plugin.iceberg.IcebergMetadataColumn.FILE_MODIFIED_TIME; import static io.trino.plugin.iceberg.IcebergMetadataColumn.FILE_PATH; +import static io.trino.plugin.iceberg.IcebergMetadataColumn.PARTITION; import static io.trino.plugin.iceberg.IcebergMetadataColumn.isMetadataColumnId; +import static io.trino.plugin.iceberg.IcebergPartitionFunction.Transform.BUCKET; import static io.trino.plugin.iceberg.IcebergSessionProperties.getExpireSnapshotMinRetention; import static io.trino.plugin.iceberg.IcebergSessionProperties.getHiveCatalogName; +import static io.trino.plugin.iceberg.IcebergSessionProperties.getQueryPartitionFilterRequiredSchemas; import static io.trino.plugin.iceberg.IcebergSessionProperties.getRemoveOrphanFilesMinRetention; +import static io.trino.plugin.iceberg.IcebergSessionProperties.isBucketExecutionEnabled; import static io.trino.plugin.iceberg.IcebergSessionProperties.isCollectExtendedStatisticsOnWrite; import static io.trino.plugin.iceberg.IcebergSessionProperties.isExtendedStatisticsEnabled; +import static io.trino.plugin.iceberg.IcebergSessionProperties.isFileBasedConflictDetectionEnabled; +import static io.trino.plugin.iceberg.IcebergSessionProperties.isIncrementalRefreshEnabled; import static io.trino.plugin.iceberg.IcebergSessionProperties.isMergeManifestsOnWrite; import static io.trino.plugin.iceberg.IcebergSessionProperties.isProjectionPushdownEnabled; +import static io.trino.plugin.iceberg.IcebergSessionProperties.isQueryPartitionFilterRequired; import static io.trino.plugin.iceberg.IcebergSessionProperties.isStatisticsEnabled; +import static io.trino.plugin.iceberg.IcebergTableName.isDataTable; +import static io.trino.plugin.iceberg.IcebergTableName.isIcebergTableName; +import static io.trino.plugin.iceberg.IcebergTableName.isMaterializedViewStorage; +import static io.trino.plugin.iceberg.IcebergTableName.tableNameFrom; +import static io.trino.plugin.iceberg.IcebergTableProperties.COMPRESSION_CODEC; +import static io.trino.plugin.iceberg.IcebergTableProperties.DATA_LOCATION_PROPERTY; +import static 
io.trino.plugin.iceberg.IcebergTableProperties.EXTRA_PROPERTIES_PROPERTY; import static io.trino.plugin.iceberg.IcebergTableProperties.FILE_FORMAT_PROPERTY; import static io.trino.plugin.iceberg.IcebergTableProperties.FORMAT_VERSION_PROPERTY; +import static io.trino.plugin.iceberg.IcebergTableProperties.MAX_COMMIT_RETRY; +import static io.trino.plugin.iceberg.IcebergTableProperties.OBJECT_STORE_LAYOUT_ENABLED_PROPERTY; +import static io.trino.plugin.iceberg.IcebergTableProperties.ORC_BLOOM_FILTER_COLUMNS_PROPERTY; +import static io.trino.plugin.iceberg.IcebergTableProperties.PARQUET_BLOOM_FILTER_COLUMNS_PROPERTY; import static io.trino.plugin.iceberg.IcebergTableProperties.PARTITIONING_PROPERTY; import static io.trino.plugin.iceberg.IcebergTableProperties.SORTED_BY_PROPERTY; import static io.trino.plugin.iceberg.IcebergTableProperties.getPartitioning; +import static io.trino.plugin.iceberg.IcebergTableProperties.getTableLocation; +import static io.trino.plugin.iceberg.IcebergTableProperties.validateCompression; +import static io.trino.plugin.iceberg.IcebergUtil.buildPath; import static io.trino.plugin.iceberg.IcebergUtil.canEnforceColumnConstraintInSpecs; +import static io.trino.plugin.iceberg.IcebergUtil.checkFormatForProperty; import static io.trino.plugin.iceberg.IcebergUtil.commit; +import static io.trino.plugin.iceberg.IcebergUtil.createColumnHandle; import static io.trino.plugin.iceberg.IcebergUtil.deserializePartitionValue; import static io.trino.plugin.iceberg.IcebergUtil.fileName; import static io.trino.plugin.iceberg.IcebergUtil.firstSnapshot; import static io.trino.plugin.iceberg.IcebergUtil.firstSnapshotAfter; import static io.trino.plugin.iceberg.IcebergUtil.getColumnHandle; import static io.trino.plugin.iceberg.IcebergUtil.getColumnMetadatas; -import static io.trino.plugin.iceberg.IcebergUtil.getColumns; +import static io.trino.plugin.iceberg.IcebergUtil.getCompressionPropertyName; import static io.trino.plugin.iceberg.IcebergUtil.getFileFormat; +import static io.trino.plugin.iceberg.IcebergUtil.getHiveCompressionCodec; import static io.trino.plugin.iceberg.IcebergUtil.getIcebergTableProperties; import static io.trino.plugin.iceberg.IcebergUtil.getPartitionKeys; +import static io.trino.plugin.iceberg.IcebergUtil.getPartitionValues; +import static io.trino.plugin.iceberg.IcebergUtil.getProjectedColumns; import static io.trino.plugin.iceberg.IcebergUtil.getSnapshotIdAsOfTime; import static io.trino.plugin.iceberg.IcebergUtil.getTableComment; +import static io.trino.plugin.iceberg.IcebergUtil.getTopLevelColumns; +import static io.trino.plugin.iceberg.IcebergUtil.isSomeKindOfAView; import static io.trino.plugin.iceberg.IcebergUtil.newCreateTableTransaction; +import static io.trino.plugin.iceberg.IcebergUtil.readerForManifest; import static io.trino.plugin.iceberg.IcebergUtil.schemaFromMetadata; +import static io.trino.plugin.iceberg.IcebergUtil.validateOrcBloomFilterColumns; +import static io.trino.plugin.iceberg.IcebergUtil.validateParquetBloomFilterColumns; +import static io.trino.plugin.iceberg.IcebergUtil.verifyExtraProperties; import static io.trino.plugin.iceberg.PartitionFields.parsePartitionFields; -import static io.trino.plugin.iceberg.PartitionFields.toPartitionFields; import static io.trino.plugin.iceberg.SortFieldUtils.parseSortFields; -import static io.trino.plugin.iceberg.TableStatisticsReader.TRINO_STATS_COLUMN_ID_PATTERN; -import static io.trino.plugin.iceberg.TableStatisticsReader.TRINO_STATS_PREFIX; +import static 
io.trino.plugin.iceberg.StructLikeWrapperWithFieldIdToIndex.createStructLikeWrapper; +import static io.trino.plugin.iceberg.TableStatisticsReader.readNdvs; import static io.trino.plugin.iceberg.TableStatisticsWriter.StatsUpdateMode.INCREMENTAL_UPDATE; import static io.trino.plugin.iceberg.TableStatisticsWriter.StatsUpdateMode.REPLACE; import static io.trino.plugin.iceberg.TableType.DATA; +import static io.trino.plugin.iceberg.TypeConverter.toIcebergType; import static io.trino.plugin.iceberg.TypeConverter.toIcebergTypeForNewColumn; -import static io.trino.plugin.iceberg.catalog.hms.TrinoHiveCatalog.DEPENDS_ON_TABLES; -import static io.trino.plugin.iceberg.catalog.hms.TrinoHiveCatalog.TRINO_QUERY_START_TIME; +import static io.trino.plugin.iceberg.procedure.IcebergTableProcedureId.ADD_FILES; +import static io.trino.plugin.iceberg.procedure.IcebergTableProcedureId.ADD_FILES_FROM_TABLE; import static io.trino.plugin.iceberg.procedure.IcebergTableProcedureId.DROP_EXTENDED_STATS; import static io.trino.plugin.iceberg.procedure.IcebergTableProcedureId.EXPIRE_SNAPSHOTS; import static io.trino.plugin.iceberg.procedure.IcebergTableProcedureId.OPTIMIZE; +import static io.trino.plugin.iceberg.procedure.IcebergTableProcedureId.OPTIMIZE_MANIFESTS; import static io.trino.plugin.iceberg.procedure.IcebergTableProcedureId.REMOVE_ORPHAN_FILES; +import static io.trino.plugin.iceberg.procedure.IcebergTableProcedureId.ROLLBACK_TO_SNAPSHOT; +import static io.trino.plugin.iceberg.procedure.MigrationUtils.addFiles; +import static io.trino.plugin.iceberg.procedure.MigrationUtils.addFilesFromTable; import static io.trino.spi.StandardErrorCode.COLUMN_ALREADY_EXISTS; +import static io.trino.spi.StandardErrorCode.COLUMN_NOT_FOUND; import static io.trino.spi.StandardErrorCode.INVALID_ANALYZE_PROPERTY; import static io.trino.spi.StandardErrorCode.INVALID_ARGUMENTS; +import static io.trino.spi.StandardErrorCode.INVALID_TABLE_PROPERTY; import static io.trino.spi.StandardErrorCode.NOT_SUPPORTED; +import static io.trino.spi.StandardErrorCode.PERMISSION_DENIED; +import static io.trino.spi.StandardErrorCode.QUERY_REJECTED; +import static io.trino.spi.StandardErrorCode.TABLE_ALREADY_EXISTS; +import static io.trino.spi.StandardErrorCode.TABLE_NOT_FOUND; +import static io.trino.spi.StandardErrorCode.TYPE_MISMATCH; import static io.trino.spi.connector.MaterializedViewFreshness.Freshness.FRESH; import static io.trino.spi.connector.MaterializedViewFreshness.Freshness.STALE; import static io.trino.spi.connector.MaterializedViewFreshness.Freshness.UNKNOWN; import static io.trino.spi.connector.RetryMode.NO_RETRIES; import static io.trino.spi.connector.RowChangeParadigm.DELETE_ROW_AND_INSERT_ROW; +import static io.trino.spi.predicate.TupleDomain.withColumnDomains; import static io.trino.spi.type.BigintType.BIGINT; import static io.trino.spi.type.DateTimeEncoding.unpackMillisUtc; -import static io.trino.spi.type.UuidType.UUID; +import static io.trino.spi.type.DateType.DATE; +import static io.trino.spi.type.IntegerType.INTEGER; +import static io.trino.spi.type.SmallintType.SMALLINT; +import static io.trino.spi.type.TimeType.TIME_MICROS; +import static io.trino.spi.type.TimestampType.TIMESTAMP_MICROS; +import static io.trino.spi.type.TimestampWithTimeZoneType.TIMESTAMP_TZ_MICROS; +import static io.trino.spi.type.Timestamps.MICROSECONDS_PER_MILLISECOND; +import static io.trino.spi.type.TinyintType.TINYINT; +import static io.trino.spi.type.VarcharType.VARCHAR; +import static java.lang.Boolean.parseBoolean; +import static 
java.lang.Math.floorDiv; import static java.lang.String.format; import static java.util.Locale.ENGLISH; import static java.util.Objects.requireNonNull; import static java.util.function.Function.identity; import static java.util.stream.Collectors.joining; +import static org.apache.iceberg.MetadataTableType.ALL_ENTRIES; +import static org.apache.iceberg.MetadataTableType.ENTRIES; import static org.apache.iceberg.ReachableFileUtil.metadataFileLocations; -import static org.apache.iceberg.ReachableFileUtil.versionHintLocation; +import static org.apache.iceberg.ReachableFileUtil.statisticsFilesLocations; import static org.apache.iceberg.SnapshotSummary.DELETED_RECORDS_PROP; import static org.apache.iceberg.SnapshotSummary.REMOVED_EQ_DELETES_PROP; import static org.apache.iceberg.SnapshotSummary.REMOVED_POS_DELETES_PROP; +import static org.apache.iceberg.SnapshotSummary.TOTAL_DATA_FILES_PROP; +import static org.apache.iceberg.SnapshotSummary.TOTAL_DELETE_FILES_PROP; +import static org.apache.iceberg.SnapshotSummary.TOTAL_RECORDS_PROP; +import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES; import static org.apache.iceberg.TableProperties.DELETE_ISOLATION_LEVEL; import static org.apache.iceberg.TableProperties.DELETE_ISOLATION_LEVEL_DEFAULT; import static org.apache.iceberg.TableProperties.FORMAT_VERSION; +import static org.apache.iceberg.TableProperties.MANIFEST_TARGET_SIZE_BYTES; +import static org.apache.iceberg.TableProperties.MANIFEST_TARGET_SIZE_BYTES_DEFAULT; +import static org.apache.iceberg.TableProperties.OBJECT_STORE_ENABLED; +import static org.apache.iceberg.TableProperties.ORC_BLOOM_FILTER_COLUMNS; +import static org.apache.iceberg.TableProperties.PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX; +import static org.apache.iceberg.TableProperties.WRITE_DATA_LOCATION; import static org.apache.iceberg.TableProperties.WRITE_LOCATION_PROVIDER_IMPL; +import static org.apache.iceberg.TableUtil.formatVersion; +import static org.apache.iceberg.expressions.Expressions.alwaysTrue; import static org.apache.iceberg.types.TypeUtil.indexParents; +import static org.apache.iceberg.util.LocationUtil.stripTrailingSlash; import static org.apache.iceberg.util.SnapshotUtil.schemaFor; public class IcebergMetadata @@ -286,42 +432,102 @@ public class IcebergMetadata private static final int CLEANING_UP_PROCEDURES_MAX_SUPPORTED_TABLE_VERSION = 2; private static final String RETENTION_THRESHOLD = "retention_threshold"; private static final String UNKNOWN_SNAPSHOT_TOKEN = "UNKNOWN"; - public static final Set UPDATABLE_TABLE_PROPERTIES = ImmutableSet.of(FILE_FORMAT_PROPERTY, FORMAT_VERSION_PROPERTY, PARTITIONING_PROPERTY, SORTED_BY_PROPERTY); - - public static final String ORC_BLOOM_FILTER_COLUMNS_KEY = "orc.bloom.filter.columns"; - public static final String ORC_BLOOM_FILTER_FPP_KEY = "orc.bloom.filter.fpp"; + public static final Set UPDATABLE_TABLE_PROPERTIES = ImmutableSet.builder() + .add(EXTRA_PROPERTIES_PROPERTY) + .add(FILE_FORMAT_PROPERTY) + .add(FORMAT_VERSION_PROPERTY) + .add(COMPRESSION_CODEC) + .add(MAX_COMMIT_RETRY) + .add(OBJECT_STORE_LAYOUT_ENABLED_PROPERTY) + .add(DATA_LOCATION_PROPERTY) + .add(ORC_BLOOM_FILTER_COLUMNS_PROPERTY) + .add(PARQUET_BLOOM_FILTER_COLUMNS_PROPERTY) + .add(PARTITIONING_PROPERTY) + .add(SORTED_BY_PROPERTY) + .build(); + private static final String SYSTEM_SCHEMA = "system"; public static final String NUMBER_OF_DISTINCT_VALUES_NAME = "NUMBER_OF_DISTINCT_VALUES"; private static final FunctionName NUMBER_OF_DISTINCT_VALUES_FUNCTION = new 
FunctionName(IcebergThetaSketchForStats.NAME); private static final Integer DELETE_BATCH_SIZE = 1000; public static final int GET_METADATA_BATCH_SIZE = 1000; + private static final MapSplitter MAP_SPLITTER = Splitter.on(",").trimResults().omitEmptyStrings().withKeyValueSeparator("="); + + private static final String DEPENDS_ON_TABLES = "dependsOnTables"; + private static final String DEPENDS_ON_TABLE_FUNCTIONS = "dependsOnTableFunctions"; + // Value should be ISO-8601 formatted time instant + private static final String TRINO_QUERY_START_TIME = "trino-query-start-time"; private final TypeManager typeManager; - private final CatalogHandle trinoCatalogHandle; private final JsonCodec commitTaskCodec; private final TrinoCatalog catalog; - private final TrinoFileSystemFactory fileSystemFactory; + private final IcebergFileSystemFactory fileSystemFactory; private final TableStatisticsWriter tableStatisticsWriter; - - private final Map tableStatisticsCache = new ConcurrentHashMap<>(); + private final Optional metastoreFactory; + private final boolean addFilesProcedureEnabled; + private final Predicate allowedExtraProperties; + private final ExecutorService icebergScanExecutor; + private final Executor metadataFetchingExecutor; + private final ExecutorService icebergPlanningExecutor; + private final ExecutorService icebergFileDeleteExecutor; + private final Map> tableStatisticsCache = new ConcurrentHashMap<>(); private Transaction transaction; + private Optional fromSnapshotForRefresh = Optional.empty(); public IcebergMetadata( TypeManager typeManager, - CatalogHandle trinoCatalogHandle, JsonCodec commitTaskCodec, TrinoCatalog catalog, - TrinoFileSystemFactory fileSystemFactory, - TableStatisticsWriter tableStatisticsWriter) + IcebergFileSystemFactory fileSystemFactory, + TableStatisticsWriter tableStatisticsWriter, + Optional metastoreFactory, + boolean addFilesProcedureEnabled, + Predicate allowedExtraProperties, + ExecutorService icebergScanExecutor, + Executor metadataFetchingExecutor, + ExecutorService icebergPlanningExecutor, + ExecutorService icebergFileDeleteExecutor) { this.typeManager = requireNonNull(typeManager, "typeManager is null"); - this.trinoCatalogHandle = requireNonNull(trinoCatalogHandle, "trinoCatalogHandle is null"); this.commitTaskCodec = requireNonNull(commitTaskCodec, "commitTaskCodec is null"); this.catalog = requireNonNull(catalog, "catalog is null"); this.fileSystemFactory = requireNonNull(fileSystemFactory, "fileSystemFactory is null"); this.tableStatisticsWriter = requireNonNull(tableStatisticsWriter, "tableStatisticsWriter is null"); + this.metastoreFactory = requireNonNull(metastoreFactory, "metastoreFactory is null"); + this.addFilesProcedureEnabled = addFilesProcedureEnabled; + this.allowedExtraProperties = requireNonNull(allowedExtraProperties, "allowedExtraProperties is null"); + this.icebergScanExecutor = requireNonNull(icebergScanExecutor, "icebergScanExecutor is null"); + this.metadataFetchingExecutor = requireNonNull(metadataFetchingExecutor, "metadataFetchingExecutor is null"); + this.icebergPlanningExecutor = requireNonNull(icebergPlanningExecutor, "icebergPlanningExecutor is null"); + this.icebergFileDeleteExecutor = requireNonNull(icebergFileDeleteExecutor, "icebergFileDeleteExecutor is null"); + } + + @Override + public Collection listFunctions(ConnectorSession session, String schemaName) + { + return schemaName.equals(SYSTEM_SCHEMA) ? 
IcebergFunctionProvider.FUNCTIONS : List.of(); + } + + @Override + public Collection getFunctions(ConnectorSession session, SchemaFunctionName name) + { + if (!name.getSchemaName().equals(SYSTEM_SCHEMA)) { + return List.of(); + } + return IcebergFunctionProvider.FUNCTIONS.stream() + .filter(function -> function.getCanonicalName().equals(name.getFunctionName())) + .toList(); + } + + @Override + public FunctionMetadata getFunctionMetadata(ConnectorSession session, FunctionId functionId) + { + return IcebergFunctionProvider.FUNCTIONS.stream() + .filter(function -> function.getFunctionId().equals(functionId)) + .findFirst() + .orElseThrow(); } @Override @@ -348,12 +554,6 @@ public Optional getSchemaOwner(ConnectorSession session, String return catalog.getNamespacePrincipal(session, schemaName); } - @Override - public IcebergTableHandle getTableHandle(ConnectorSession session, SchemaTableName tableName) - { - throw new UnsupportedOperationException("This method is not supported because getTableHandle with versions is implemented instead"); - } - @Override public ConnectorTableHandle getTableHandle( ConnectorSession session, @@ -365,14 +565,32 @@ public ConnectorTableHandle getTableHandle( throw new TrinoException(NOT_SUPPORTED, "Read table with start version is not supported"); } - if (!IcebergTableName.isDataTable(tableName.getTableName())) { + if (!isIcebergTableName(tableName.getTableName())) { + return null; + } + + if (isMaterializedViewStorage(tableName.getTableName())) { + verify(endVersion.isEmpty(), "Materialized views do not support versioned queries"); + + SchemaTableName materializedViewName = new SchemaTableName(tableName.getSchemaName(), tableNameFrom(tableName.getTableName())); + if (getMaterializedView(session, materializedViewName).isEmpty()) { + throw new TableNotFoundException(tableName); + } + + BaseTable storageTable = catalog.getMaterializedViewStorageTable(session, materializedViewName) + .orElseThrow(() -> new TrinoException(TABLE_NOT_FOUND, "Storage table metadata not found for materialized view " + tableName)); + + return tableHandleForCurrentSnapshot(session, tableName, storageTable); + } + + if (!isDataTable(tableName.getTableName())) { // Pretend the table does not exist to produce better error message in case of table redirects to Hive return null; } BaseTable table; try { - table = (BaseTable) catalog.loadTable(session, new SchemaTableName(tableName.getSchemaName(), tableName.getTableName())); + table = catalog.loadTable(session, new SchemaTableName(tableName.getSchemaName(), tableName.getTableName())); } catch (TableNotFoundException e) { return null; @@ -386,66 +604,148 @@ public ConnectorTableHandle getTableHandle( throw e; } - Optional tableSnapshotId; - Schema tableSchema; - Optional partitionSpec; if (endVersion.isPresent()) { - long snapshotId = getSnapshotIdFromVersion(table, endVersion.get()); - tableSnapshotId = Optional.of(snapshotId); - tableSchema = schemaFor(table, snapshotId); - partitionSpec = Optional.empty(); - } - else { - tableSnapshotId = Optional.ofNullable(table.currentSnapshot()).map(Snapshot::snapshotId); - tableSchema = table.schema(); - partitionSpec = Optional.of(table.spec()); + long snapshotId = getSnapshotIdFromVersion(session, table, endVersion.get()); + return tableHandleForSnapshot( + session, + tableName, + table, + Optional.of(snapshotId), + schemaFor(table, snapshotId), + Optional.empty()); } + return tableHandleForCurrentSnapshot(session, tableName, table); + } + + private IcebergTableHandle 
tableHandleForCurrentSnapshot(ConnectorSession session, SchemaTableName tableName, BaseTable table) + { + return tableHandleForSnapshot( + session, + tableName, + table, + Optional.ofNullable(table.currentSnapshot()).map(Snapshot::snapshotId), + table.schema(), + Optional.of(table.spec())); + } + private IcebergTableHandle tableHandleForSnapshot( + ConnectorSession session, + SchemaTableName tableName, + BaseTable table, + Optional tableSnapshotId, + Schema tableSchema, + Optional partitionSpec) + { Map tableProperties = table.properties(); - String nameMappingJson = tableProperties.get(TableProperties.DEFAULT_NAME_MAPPING); return new IcebergTableHandle( - trinoCatalogHandle, tableName.getSchemaName(), tableName.getTableName(), DATA, tableSnapshotId, SchemaParser.toJson(tableSchema), partitionSpec.map(PartitionSpecParser::toJson), - table.operations().current().formatVersion(), + formatVersion(table), TupleDomain.all(), TupleDomain.all(), OptionalLong.empty(), ImmutableSet.of(), - Optional.ofNullable(nameMappingJson), + Optional.ofNullable(tableProperties.get(TableProperties.DEFAULT_NAME_MAPPING)), table.location(), table.properties(), + getTablePartitioning(session, table), + false, + Optional.empty(), + ImmutableSet.of(), + Optional.of(false)); + } + + private Optional getTablePartitioning(ConnectorSession session, Table icebergTable) + { + if (!isBucketExecutionEnabled(session) || icebergTable.specs().size() != 1) { + return Optional.empty(); + } + PartitionSpec partitionSpec = icebergTable.spec(); + if (partitionSpec.fields().isEmpty()) { + return Optional.empty(); + } + + Schema schema = icebergTable.schema(); + + IcebergPartitioningHandle partitioningHandle = IcebergPartitioningHandle.create(partitionSpec, typeManager, List.of()); + + Map columnById = getProjectedColumns(schema, typeManager).stream() + .collect(toImmutableMap(IcebergColumnHandle::getId, identity())); + List partitionColumns = partitionSpec.fields().stream() + .map(PartitionField::sourceId) + .distinct() + .sorted() + .map(columnById::get) + .collect(toImmutableList()); + + // Partitioning is only activated if it is actually necessary for the query. 
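To illustrate how the new getTablePartitioning builds its column list, the sketch below shows the distinct/sorted mapping from partition-spec source field ids to columns in plain Java. The record types and values are hypothetical stand-ins, not the Trino or Iceberg classes.

import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

class PartitioningColumnsSketch
{
    // Hypothetical stand-ins for Iceberg's PartitionField and the connector's column handle
    record PartitionFieldInfo(int sourceId, String transform) {}
    record ColumnInfo(int id, String name) {}

    // A source column may feed several partition functions (e.g. bucket + identity),
    // so the ids are de-duplicated and ordered before being resolved to column handles.
    static List<ColumnInfo> partitioningColumns(List<PartitionFieldInfo> specFields, Map<Integer, ColumnInfo> columnById)
    {
        return specFields.stream()
                .map(PartitionFieldInfo::sourceId)
                .distinct()
                .sorted()
                .map(columnById::get)
                .collect(Collectors.toList());
    }

    public static void main(String[] args)
    {
        Map<Integer, ColumnInfo> columns = Map.of(
                1, new ColumnInfo(1, "order_date"),
                2, new ColumnInfo(2, "customer_id"));
        List<PartitionFieldInfo> spec = List.of(
                new PartitionFieldInfo(2, "bucket[16]"),
                new PartitionFieldInfo(1, "day"),
                new PartitionFieldInfo(2, "identity"));
        // Prints the columns for field ids 1 and 2 exactly once each, in id order
        System.out.println(partitioningColumns(spec, columns));
    }
}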
+ // This happens in applyPartitioning + return Optional.of(new IcebergTablePartitioning( false, - Optional.empty()); + partitioningHandle, + partitionColumns, + IntStream.range(0, partitioningHandle.partitionFunctions().size()).boxed().collect(toImmutableList()))); } - private static long getSnapshotIdFromVersion(Table table, ConnectorTableVersion version) + private static long getSnapshotIdFromVersion(ConnectorSession session, Table table, ConnectorTableVersion version) { io.trino.spi.type.Type versionType = version.getVersionType(); return switch (version.getPointerType()) { - case TEMPORAL -> getTemporalSnapshotIdFromVersion(table, version, versionType); + case TEMPORAL -> getTemporalSnapshotIdFromVersion(session, table, version, versionType); case TARGET_ID -> getTargetSnapshotIdFromVersion(table, version, versionType); }; } private static long getTargetSnapshotIdFromVersion(Table table, ConnectorTableVersion version, io.trino.spi.type.Type versionType) { - if (versionType != BIGINT) { + long snapshotId; + if (versionType == BIGINT) { + snapshotId = (long) version.getVersion(); + } + else if (versionType instanceof VarcharType) { + String refName = ((Slice) version.getVersion()).toStringUtf8(); + SnapshotRef ref = table.refs().get(refName); + if (ref == null) { + throw new TrinoException(INVALID_ARGUMENTS, "Cannot find snapshot with reference name: " + refName); + } + snapshotId = ref.snapshotId(); + } + else { throw new TrinoException(NOT_SUPPORTED, "Unsupported type for table version: " + versionType.getDisplayName()); } - long snapshotId = (long) version.getVersion(); + if (table.snapshot(snapshotId) == null) { throw new TrinoException(INVALID_ARGUMENTS, "Iceberg snapshot ID does not exists: " + snapshotId); } return snapshotId; } - private static long getTemporalSnapshotIdFromVersion(Table table, ConnectorTableVersion version, io.trino.spi.type.Type versionType) + private static long getTemporalSnapshotIdFromVersion(ConnectorSession session, Table table, ConnectorTableVersion version, io.trino.spi.type.Type versionType) { + if (versionType.equals(DATE)) { + // Retrieve the latest snapshot made before or at the beginning of the day of the specified date in the session's time zone + long epochMillis = LocalDate.ofEpochDay((Long) version.getVersion()) + .atStartOfDay() + .atZone(session.getTimeZoneKey().getZoneId()) + .toInstant() + .toEpochMilli(); + return getSnapshotIdAsOfTime(table, epochMillis); + } + if (versionType instanceof TimestampType timestampVersionType) { + long epochMicrosUtc = timestampVersionType.isShort() + ? (long) version.getVersion() + : ((LongTimestamp) version.getVersion()).getEpochMicros(); + long epochMillisUtc = floorDiv(epochMicrosUtc, MICROSECONDS_PER_MILLISECOND); + long epochMillis = LocalDateTime.ofInstant(Instant.ofEpochMilli(epochMillisUtc), ZoneOffset.UTC) + .atZone(session.getTimeZoneKey().getZoneId()) + .toInstant() + .toEpochMilli(); + return getSnapshotIdAsOfTime(table, epochMillis); + } if (versionType instanceof TimestampWithTimeZoneType timeZonedVersionType) { long epochMillis = timeZonedVersionType.isShort() ? 
unpackMillisUtc((long) version.getVersion()) @@ -464,13 +764,13 @@ public Optional getSystemTable(ConnectorSession session, SchemaTabl private Optional getRawSystemTable(ConnectorSession session, SchemaTableName tableName) { - if (IcebergTableName.isDataTable(tableName.getTableName())) { + if (!isIcebergTableName(tableName.getTableName()) || isDataTable(tableName.getTableName()) || isMaterializedViewStorage(tableName.getTableName())) { return Optional.empty(); } // Only when dealing with an actual system table proceed to retrieve the base table for the system table - String name = IcebergTableName.tableNameFrom(tableName.getTableName()); - Table table; + String name = tableNameFrom(tableName.getTableName()); + BaseTable table; try { table = catalog.loadTable(session, new SchemaTableName(tableName.getSchemaName(), name)); } @@ -482,20 +782,20 @@ private Optional getRawSystemTable(ConnectorSession session, Schema return Optional.empty(); } - Optional tableType = IcebergTableName.tableTypeFrom(tableName.getTableName()); - if (tableType.isEmpty()) { - return Optional.empty(); - } - SchemaTableName systemTableName = new SchemaTableName(tableName.getSchemaName(), IcebergTableName.tableNameWithType(name, tableType.get())); - return switch (tableType.get()) { - case DATA -> throw new VerifyException("Unexpected DATA table type"); // Handled above. - case HISTORY -> Optional.of(new HistoryTable(systemTableName, table)); - case SNAPSHOTS -> Optional.of(new SnapshotsTable(systemTableName, typeManager, table)); - case PARTITIONS -> Optional.of(new PartitionTable(systemTableName, typeManager, table, getCurrentSnapshotId(table))); - case MANIFESTS -> Optional.of(new ManifestsTable(systemTableName, table, getCurrentSnapshotId(table))); - case FILES -> Optional.of(new FilesTable(systemTableName, typeManager, table, getCurrentSnapshotId(table))); - case PROPERTIES -> Optional.of(new PropertiesTable(systemTableName, table)); - case REFS -> Optional.of(new RefsTable(systemTableName, table)); + TableType tableType = IcebergTableName.tableTypeFrom(tableName.getTableName()); + return switch (tableType) { + case DATA, MATERIALIZED_VIEW_STORAGE -> throw new VerifyException("Unexpected table type: " + tableType); // Handled above. 
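The switch above dispatches on the $-suffix of the requested name (for example "orders$snapshots"). A rough, hypothetical sketch of that naming convention follows; the real parsing lives in IcebergTableName, so this is only an illustration of the split, not the actual implementation.

import java.util.Locale;
import java.util.Optional;

class MetadataTableNameSketch
{
    // Everything before the first '$' is the base table name
    static String baseName(String name)
    {
        int index = name.indexOf('$');
        return index == -1 ? name : name.substring(0, index);
    }

    // Everything after the first '$' selects the metadata table type, if any
    static Optional<String> tableType(String name)
    {
        int index = name.indexOf('$');
        return index == -1 ? Optional.empty() : Optional.of(name.substring(index + 1).toUpperCase(Locale.ENGLISH));
    }

    public static void main(String[] args)
    {
        System.out.println(baseName("orders$snapshots"));  // orders
        System.out.println(tableType("orders$snapshots")); // Optional[SNAPSHOTS]
        System.out.println(tableType("orders"));           // Optional.empty -> plain data table
    }
}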
+ case HISTORY -> Optional.of(new HistoryTable(tableName, table)); + case METADATA_LOG_ENTRIES -> Optional.of(new MetadataLogEntriesTable(tableName, table, icebergScanExecutor)); + case SNAPSHOTS -> Optional.of(new SnapshotsTable(tableName, typeManager, table, icebergScanExecutor)); + case PARTITIONS -> Optional.of(new PartitionsTable(tableName, typeManager, table, getCurrentSnapshotId(table), icebergScanExecutor)); + case ALL_MANIFESTS -> Optional.of(new AllManifestsTable(tableName, table, icebergScanExecutor)); + case MANIFESTS -> Optional.of(new ManifestsTable(tableName, table, getCurrentSnapshotId(table))); + case FILES -> Optional.of(new FilesTable(tableName, typeManager, table, getCurrentSnapshotId(table))); + case ALL_ENTRIES -> Optional.of(new EntriesTable(typeManager, tableName, table, ALL_ENTRIES, icebergScanExecutor)); + case ENTRIES -> Optional.of(new EntriesTable(typeManager, tableName, table, ENTRIES, icebergScanExecutor)); + case PROPERTIES -> Optional.of(new PropertiesTable(tableName, table)); + case REFS -> Optional.of(new RefsTable(tableName, table, icebergScanExecutor)); }; } @@ -520,44 +820,49 @@ public ConnectorTableProperties getTableProperties(ConnectorSession session, Con DiscretePredicates discretePredicates = null; if (!partitionSourceIds.isEmpty()) { // Extract identity partition columns - Map columns = getColumns(icebergTable.schema(), typeManager).stream() - .filter(column -> partitionSourceIds.contains(column.getId())) + Map columns = getProjectedColumns(icebergTable.schema(), typeManager, partitionSourceIds).stream() .collect(toImmutableMap(IcebergColumnHandle::getId, identity())); - Supplier> lazyFiles = Suppliers.memoize(() -> { + Supplier> lazyUniquePartitions = Suppliers.memoize(() -> { TableScan tableScan = icebergTable.newScan() .useSnapshot(table.getSnapshotId().get()) - .filter(toIcebergExpression(enforcedPredicate)); - - try (CloseableIterable iterator = tableScan.planFiles()) { - return ImmutableList.copyOf(iterator); + .filter(toIcebergExpression(enforcedPredicate)) + .planWith(icebergPlanningExecutor); + + try (CloseableIterable fileScanTasks = tableScan.planFiles()) { + Map partitions = new HashMap<>(); + for (FileScanTask fileScanTask : fileScanTasks) { + StructLikeWrapperWithFieldIdToIndex structLikeWrapperWithFieldIdToIndex = createStructLikeWrapper(fileScanTask); + partitions.putIfAbsent(structLikeWrapperWithFieldIdToIndex, fileScanTask.spec()); + } + return partitions; } catch (IOException e) { throw new UncheckedIOException(e); } }); - Iterable files = () -> lazyFiles.get().iterator(); - - Iterable> discreteTupleDomain = Iterables.transform(files, fileScan -> { - // Extract partition values in the data file - Map> partitionColumnValueStrings = getPartitionKeys(fileScan); - Map partitionValues = partitionSourceIds.stream() - .filter(partitionColumnValueStrings::containsKey) - .collect(toImmutableMap( - columns::get, - columnId -> { - IcebergColumnHandle column = columns.get(columnId); - Object prestoValue = deserializePartitionValue( - column.getType(), - partitionColumnValueStrings.get(columnId).orElse(null), - column.getName()); - - return NullableValue.of(column.getType(), prestoValue); - })); - - return TupleDomain.fromFixedValues(partitionValues); - }); + Iterable> discreteTupleDomain = Iterables.transform( + () -> lazyUniquePartitions.get().entrySet().iterator(), + entry -> { + // Extract partition values + Map> partitionColumnValueStrings = getPartitionKeys(entry.getKey().getStructLikeWrapper().get(), entry.getValue()); + Map 
partitionValues = partitionSourceIds.stream() + .filter(partitionColumnValueStrings::containsKey) + .collect(toImmutableMap( + columns::get, + columnId -> { + IcebergColumnHandle column = columns.get(columnId); + Object prestoValue = deserializePartitionValue( + column.getType(), + partitionColumnValueStrings.get(columnId).orElse(null), + column.getName()); + + return new NullableValue(column.getType(), prestoValue); + })); + + return TupleDomain.fromFixedValues(partitionValues); + }); discretePredicates = new DiscretePredicates( columns.values().stream() @@ -572,12 +877,65 @@ public ConnectorTableProperties getTableProperties(ConnectorSession session, Con // can be further optimized by intersecting with partition values at the cost of iterating // over all tableScan.planFiles() and caching partition values in table handle. enforcedPredicate.transformKeys(ColumnHandle.class::cast), - // TODO: implement table partitioning - Optional.empty(), + table.getTablePartitioning().flatMap(IcebergTablePartitioning::toConnectorTablePartitioning), Optional.ofNullable(discretePredicates), + // todo support sorting properties ImmutableList.of()); } + //@Override + public Optional applyPartitioning(ConnectorSession session, ConnectorTableHandle tableHandle, Optional partitioningHandle, List partitioningColumns) + { + IcebergTableHandle icebergTableHandle = checkValidTableHandle(tableHandle); + if (icebergTableHandle.getPartitionSpecJson().isEmpty()) { + return Optional.empty(); + } + + Optional connectorTablePartitioning = icebergTableHandle.getTablePartitioning(); + if (connectorTablePartitioning.isEmpty()) { + return Optional.empty(); + } + IcebergTablePartitioning tablePartitioning = connectorTablePartitioning.get(); + + // Check if the table can be partitioned on the requested columns + if (!new HashSet<>(tablePartitioning.partitioningColumns()).containsAll(partitioningColumns)) { + return Optional.empty(); + } + + Map newPartitioningColumnIndex = IntStream.range(0, partitioningColumns.size()).boxed() + .collect(toImmutableMap(partitioningColumns::get, identity())); + ImmutableList.Builder newPartitionFunctions = ImmutableList.builder(); + ImmutableList.Builder newPartitionStructFields = ImmutableList.builder(); + for (int functionIndex = 0; functionIndex < tablePartitioning.partitioningHandle().partitionFunctions().size(); functionIndex++) { + IcebergPartitionFunction function = tablePartitioning.partitioningHandle().partitionFunctions().get(functionIndex); + int oldColumnIndex = function.dataPath().get(0); + Integer newColumnIndex = newPartitioningColumnIndex.get(tablePartitioning.partitioningColumns().get(oldColumnIndex)); + if (newColumnIndex != null) { + // Change the index of the top level column to the location in the new partitioning columns + newPartitionFunctions.add(function.withTopLevelColumnIndex(newColumnIndex)); + // Some partition functions may be dropped so update the struct fields used in split partitioning must be updated + newPartitionStructFields.add(tablePartitioning.partitionStructFields().get(functionIndex)); + } + } + + IcebergPartitioningHandle newPartitioningHandle = new IcebergPartitioningHandle(false, newPartitionFunctions.build()); + if (partitioningHandle.isPresent() && !partitioningHandle.get().equals(newPartitioningHandle)) { + // todo if bucketing is a power of two, we can adapt the bucketing + return Optional.empty(); + } + if (newPartitioningHandle.partitionFunctions().stream().map(IcebergPartitionFunction::transform).noneMatch(BUCKET::equals)) { + // The table is 
only using value-based partitioning, and this can hurt performance if there is a filter + // on the partitioning columns. This is something we may be able to support with statistics in the future. + return Optional.empty(); + } + + return Optional.of(icebergTableHandle.withTablePartitioning(Optional.of(new IcebergTablePartitioning( + true, + newPartitioningHandle, + partitioningColumns.stream().map(IcebergColumnHandle.class::cast).collect(toImmutableList()), + newPartitionStructFields.build())))); + } + @Override public SchemaTableName getTableName(ConnectorSession session, ConnectorTableHandle table) { @@ -593,7 +951,7 @@ public ConnectorTableMetadata getTableMetadata(ConnectorSession session, Connect IcebergTableHandle tableHandle = checkValidTableHandle(table); // This method does not calculate column metadata for the projected columns checkArgument(tableHandle.getProjectedColumns().isEmpty(), "Unexpected projected columns"); - Table icebergTable = catalog.loadTable(session, tableHandle.getSchemaTableName()); + BaseTable icebergTable = catalog.loadTable(session, tableHandle.getSchemaTableName()); List columns = getColumnMetadatas(SchemaParser.fromJson(tableHandle.getTableSchemaJson()), typeManager); return new ConnectorTableMetadata(tableHandle.getSchemaTableName(), columns, getIcebergTableProperties(icebergTable), getTableComment(icebergTable)); } @@ -601,17 +959,32 @@ public ConnectorTableMetadata getTableMetadata(ConnectorSession session, Connect @Override public List listTables(ConnectorSession session, Optional schemaName) { - return catalog.listTables(session, schemaName); + return catalog.listTables(session, schemaName).stream() + .map(TableInfo::tableName) + .toList(); + } + + /* + @Override + public Map getRelationTypes(ConnectorSession session, Optional schemaName) + { + ImmutableMap.Builder result = ImmutableMap.builder(); + for (TableInfo info : catalog.listTables(session, schemaName)) { + result.put(info.tableName(), info.extendedRelationType().toRelationType()); + } + return result.buildKeepingLast(); } + */ @Override public Map getColumnHandles(ConnectorSession session, ConnectorTableHandle tableHandle) { IcebergTableHandle table = checkValidTableHandle(tableHandle); ImmutableMap.Builder columnHandles = ImmutableMap.builder(); - for (IcebergColumnHandle columnHandle : getColumns(SchemaParser.fromJson(table.getTableSchemaJson()), typeManager)) { + for (IcebergColumnHandle columnHandle : getTopLevelColumns(SchemaParser.fromJson(table.getTableSchemaJson()), typeManager)) { columnHandles.put(columnHandle.getName(), columnHandle); } + columnHandles.put(PARTITION.getColumnName(), partitionColumnHandle()); columnHandles.put(FILE_PATH.getColumnName(), pathColumnHandle()); columnHandles.put(FILE_MODIFIED_TIME.getColumnName(), fileModifiedTimeColumnHandle()); return columnHandles.buildOrThrow(); @@ -624,10 +997,55 @@ public ColumnMetadata getColumnMetadata(ConnectorSession session, ConnectorTable return ColumnMetadata.builder() .setName(column.getName()) .setType(column.getType()) + .setNullable(column.isNullable()) .setComment(column.getComment()) + .setHidden(isMetadataColumnId(column.getId())) .build(); } + @Override + public void validateScan(ConnectorSession session, ConnectorTableHandle handle) + { + IcebergTableHandle table = (IcebergTableHandle) handle; + if (isQueryPartitionFilterRequiredForTable(session, table) && table.getEnforcedPredicate().isAll() && !table.getForAnalyze().orElseThrow()) { + Schema schema = SchemaParser.fromJson(table.getTableSchemaJson()); + Optional 
partitionSpec = table.getPartitionSpecJson() + .map(partitionSpecJson -> PartitionSpecParser.fromJson(schema, partitionSpecJson)); + if (partitionSpec.isEmpty() || partitionSpec.get().isUnpartitioned()) { + return; + } + Set columnsWithPredicates = new HashSet<>(); + table.getConstraintColumns().stream() + .map(IcebergColumnHandle::getId) + .forEach(columnsWithPredicates::add); + table.getUnenforcedPredicate().getDomains().ifPresent(domain -> domain.keySet().stream() + .map(IcebergColumnHandle::getId) + .forEach(columnsWithPredicates::add)); + Set partitionColumns = partitionSpec.get().fields().stream() + .filter(field -> !field.transform().isVoid()) + .map(PartitionField::sourceId) + .collect(toImmutableSet()); + if (Collections.disjoint(columnsWithPredicates, partitionColumns)) { + String partitionColumnNames = partitionSpec.get().fields().stream() + .filter(field -> !field.transform().isVoid()) + .map(PartitionField::sourceId) + .map(id -> schema.idToName().get(id)) + .collect(joining(", ")); + throw new TrinoException( + QUERY_REJECTED, + format("Filter required for %s on at least one of the partition columns: %s", table.getSchemaTableName(), partitionColumnNames)); + } + } + } + + private static boolean isQueryPartitionFilterRequiredForTable(ConnectorSession session, IcebergTableHandle table) + { + Set requiredSchemas = getQueryPartitionFilterRequiredSchemas(session); + // If query_partition_filter_required_schemas is empty then we would apply partition filter for all tables. + return isQueryPartitionFilterRequired(session) && + (requiredSchemas.isEmpty() || requiredSchemas.contains(table.getSchemaName())); + } + @Override public Map> listTableColumns(ConnectorSession session, SchemaTablePrefix prefix) { @@ -640,7 +1058,9 @@ public Iterator streamTableColumns(ConnectorSession sessio requireNonNull(prefix, "prefix is null"); List schemaTableNames; if (prefix.getTable().isEmpty()) { - schemaTableNames = catalog.listTables(session, prefix.getSchema()); + schemaTableNames = catalog.listTables(session, prefix.getSchema()).stream() + .map(TableInfo::tableName) + .collect(toImmutableList()); } else { schemaTableNames = ImmutableList.of(prefix.toSchemaTableName()); @@ -665,26 +1085,40 @@ public Iterator streamTableColumns(ConnectorSession sessio tableMetadatas.add(TableColumnsMetadata.forTable(tableName, columns)); }); - for (SchemaTableName tableName : remainingTables) { - try { - Table icebergTable = catalog.loadTable(session, tableName); - List columns = getColumnMetadatas(icebergTable.schema(), typeManager); - tableMetadatas.add(TableColumnsMetadata.forTable(tableName, columns)); - } - catch (TableNotFoundException e) { - // Table disappeared during listing operation - continue; - } - catch (UnknownTableTypeException e) { - // Skip unsupported table type in case that the table redirects are not enabled - continue; - } - catch (RuntimeException e) { - // Table can be being removed and this may cause all sorts of exceptions. Log, because we're catching broadly. 
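The validateScan check above rejects a query when none of its predicate columns intersects the partition source columns. A minimal standalone sketch of that rejection rule, with hypothetical names and ids:

import java.util.Collections;
import java.util.Set;

class PartitionFilterCheckSketch
{
    static void checkPartitionFilter(Set<Integer> predicateColumnIds, Set<Integer> partitionSourceIds, String tableName)
    {
        // Reject only when the table is partitioned and no predicate touches a partition source column
        if (!partitionSourceIds.isEmpty() && Collections.disjoint(predicateColumnIds, partitionSourceIds)) {
            throw new IllegalArgumentException("Filter required for " + tableName + " on at least one of the partition columns");
        }
    }

    public static void main(String[] args)
    {
        checkPartitionFilter(Set.of(1, 3), Set.of(1), "orders"); // passes: column 1 is both filtered and a partition source
        checkPartitionFilter(Set.of(3), Set.of(1, 2), "orders"); // throws: no filter on any partition column
    }
}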
- log.warn(e, "Failed to access metadata of table %s during streaming table columns for %s", tableName, prefix); - continue; - } + List>> tasks = remainingTables.stream() + .map(tableName -> (Callable>) () -> { + try { + Table icebergTable = catalog.loadTable(session, tableName); + List columns = getColumnMetadatas(icebergTable.schema(), typeManager); + return Optional.of(TableColumnsMetadata.forTable(tableName, columns)); + } + catch (TableNotFoundException e) { + // Table disappeared during listing operation + return Optional.empty(); + } + catch (UnknownTableTypeException e) { + // Skip unsupported table type in case that the table redirects are not enabled + return Optional.empty(); + } + catch (RuntimeException e) { + // Table can be being removed and this may cause all sorts of exceptions. Log, because we're catching broadly. + log.warn(e, "Failed to access metadata of table %s during streaming table columns for %s", tableName, prefix); + return Optional.empty(); + } + }) + .collect(toImmutableList()); + + try { + List taskResults = processWithAdditionalThreads(tasks, metadataFetchingExecutor).stream() + .flatMap(Optional::stream) // Flatten the Optionals into a stream + .collect(toImmutableList()); + + tableMetadatas.addAll(taskResults); + } + catch (ExecutionException e) { + throw new RuntimeException(e.getCause()); } + return tableMetadatas.build(); }) .flatMap(List::stream) @@ -718,8 +1152,26 @@ public void createSchema(ConnectorSession session, String schemaName, Map nestedNamespaces = getChildNamespaces(session, schemaName); + if (!nestedNamespaces.isEmpty()) { + throw new TrinoException( + ICEBERG_CATALOG_ERROR, + format("Cannot drop non-empty schema: %s, contains %s nested schema(s)", schemaName, Joiner.on(", ").join(nestedNamespaces))); + } + + for (SchemaTableName materializedView : listMaterializedViews(session, Optional.of(schemaName))) { + dropMaterializedView(session, materializedView); + } + for (SchemaTableName viewName : listViews(session, Optional.of(schemaName))) { + dropView(session, viewName); + } + for (SchemaTableName tableName : listTables(session, Optional.of(schemaName))) { + dropTable(session, getTableHandle(session, tableName, Optional.empty(), Optional.empty())); + } + } catalog.dropNamespace(session, schemaName); } @@ -739,7 +1191,7 @@ public void setSchemaAuthorization(ConnectorSession session, String schemaName, public void createTable(ConnectorSession session, ConnectorTableMetadata tableMetadata, boolean ignoreExisting) { Optional layout = getNewTableLayout(session, tableMetadata); - finishCreateTable(session, beginCreateTable(session, tableMetadata, layout, NO_RETRIES), ImmutableList.of(), ImmutableList.of()); + finishCreateTable(session, beginCreateTable(session, tableMetadata, layout, NO_RETRIES, ignoreExisting), ImmutableList.of(), ImmutableList.of()); } @Override @@ -775,26 +1227,97 @@ public Optional getNewTableLayout(ConnectorSession session return getWriteLayout(schema, partitionSpec, false); } + //@Override + public Optional getSupportedType(ConnectorSession session, Map tableProperties, io.trino.spi.type.Type type) + { + io.trino.spi.type.Type newType = coerceType(type); + if (type.getTypeSignature().equals(newType.getTypeSignature())) { + return Optional.empty(); + } + return Optional.of(newType); + } + + private io.trino.spi.type.Type coerceType(io.trino.spi.type.Type type) + { + if (type == TINYINT || type == SMALLINT) { + return INTEGER; + } + if (type instanceof TimestampWithTimeZoneType) { + return TIMESTAMP_TZ_MICROS; + } + if (type 
instanceof TimestampType) {
+ return TIMESTAMP_MICROS;
+ }
+ if (type instanceof TimeType) {
+ return TIME_MICROS;
+ }
+ if (type instanceof CharType) {
+ return VARCHAR;
+ }
+ if (type instanceof ArrayType arrayType) {
+ return new ArrayType(coerceType(arrayType.getElementType()));
+ }
+ if (type instanceof MapType mapType) {
+ return new MapType(coerceType(mapType.getKeyType()), coerceType(mapType.getValueType()), typeManager.getTypeOperators());
+ }
+ if (type instanceof RowType rowType) {
+ return RowType.from(rowType.getFields().stream()
+ .map(field -> new RowType.Field(field.getName(), coerceType(field.getType())))
+ .collect(toImmutableList()));
+ }
+ return type;
+ }
+
@Override
- public ConnectorOutputTableHandle beginCreateTable(ConnectorSession session, ConnectorTableMetadata tableMetadata, Optional layout, RetryMode retryMode)
+ public ConnectorOutputTableHandle beginCreateTable(ConnectorSession session, ConnectorTableMetadata tableMetadata, Optional layout, RetryMode retryMode)
+ {
+ return beginCreateTable(session, tableMetadata, layout, retryMode, false);
+ }
+
+ public ConnectorOutputTableHandle beginCreateTable(ConnectorSession session, ConnectorTableMetadata tableMetadata, Optional layout, RetryMode retryMode, boolean replace)
{
verify(transaction == null, "transaction already set");
String schemaName = tableMetadata.getTable().getSchemaName();
if (!schemaExists(session, schemaName)) {
throw new SchemaNotFoundException(schemaName);
}
- transaction = newCreateTableTransaction(catalog, tableMetadata, session);
+
+ String tableLocation = null;
+ if (replace) {
+ ConnectorTableHandle tableHandle = getTableHandle(session, tableMetadata.getTableSchema().getTable(), Optional.empty(), Optional.empty());
+ if (tableHandle != null) {
+ checkValidTableHandle(tableHandle);
+ IcebergTableHandle table = (IcebergTableHandle) tableHandle;
+ verifyTableVersionForUpdate(table);
+ Table icebergTable = catalog.loadTable(session, table.getSchemaTableName());
+ Optional providedTableLocation = getTableLocation(tableMetadata.getProperties());
+ if (providedTableLocation.isPresent() && !stripTrailingSlash(providedTableLocation.get()).equals(icebergTable.location())) {
+ throw new TrinoException(INVALID_TABLE_PROPERTY, format("The provided location '%s' does not match the existing table location '%s'", providedTableLocation.get(), icebergTable.location()));
+ }
+ validateNotModifyingOldSnapshot(table, icebergTable);
+ tableLocation = icebergTable.location();
+ }
+ }
+
+ if (tableLocation == null) {
+ tableLocation = getTableLocation(tableMetadata.getProperties())
+ .orElseGet(() -> catalog.defaultTableLocation(session, tableMetadata.getTable()));
+ }
+ transaction = newCreateTableTransaction(catalog, tableMetadata, session, replace, tableLocation, allowedExtraProperties);
Location location = Location.of(transaction.table().location());
- TrinoFileSystem fileSystem = fileSystemFactory.create(session);
try {
- if (fileSystem.listFiles(location).hasNext()) {
- throw new TrinoException(ICEBERG_FILESYSTEM_ERROR, format("" +
- "Cannot create a table on a non-empty location: %s, set 'iceberg.unique-table-location=true' in your Iceberg catalog properties " +
- "to use unique table locations for every table.", location));
+ // S3 Tables internally assigns a unique location for each table
+ if (!isS3Tables(location.toString())) {
+ TrinoFileSystem fileSystem = fileSystemFactory.create(session.getIdentity(), transaction.table().io().properties());
+ if (!replace &&
fileSystem.listFiles(location).hasNext()) { + throw new TrinoException(ICEBERG_FILESYSTEM_ERROR, format("" + + "Cannot create a table on a non-empty location: %s, set 'iceberg.unique-table-location=true' in your Iceberg catalog properties " + + "to use unique table locations for every table.", location)); + } } return newWritableTableHandle(tableMetadata.getTable(), transaction.table(), retryMode); } - catch (IOException e) { + catch (IOException | UncheckedIOException e) { throw new TrinoException(ICEBERG_FILESYSTEM_ERROR, "Failed checking new table's location: " + location, e); } } @@ -802,16 +1325,22 @@ public ConnectorOutputTableHandle beginCreateTable(ConnectorSession session, Con @Override public Optional finishCreateTable(ConnectorSession session, ConnectorOutputTableHandle tableHandle, Collection fragments, Collection computedStatistics) { - if (fragments.isEmpty()) { - // Commit the transaction if the table is being created without data - AppendFiles appendFiles = transaction.newFastAppend(); - commit(appendFiles, session); - transaction.commitTransaction(); - transaction = null; - return Optional.empty(); - } + IcebergWritableTableHandle icebergTableHandle = (IcebergWritableTableHandle) tableHandle; + try { + if (fragments.isEmpty()) { + // Commit the transaction if the table is being created without data + AppendFiles appendFiles = transaction.newFastAppend(); + commitUpdateAndTransaction(appendFiles, session, transaction, "create table"); + transaction = null; + return Optional.empty(); + } - return finishInsert(session, (IcebergWritableTableHandle) tableHandle, fragments, computedStatistics); + return finishInsert(session, icebergTableHandle, fragments, computedStatistics); + } + catch (AlreadyExistsException e) { + // May happen when table has been already created concurrently. + throw new TrinoException(TABLE_ALREADY_EXISTS, format("Table %s already exists", icebergTableHandle.name()), e); + } } @Override @@ -831,13 +1360,35 @@ private Optional getWriteLayout(Schema tableSchema, Partit return Optional.empty(); } - validateNotPartitionedByNestedField(tableSchema, partitionSpec); - Map columnById = getColumns(tableSchema, typeManager).stream() - .collect(toImmutableMap(IcebergColumnHandle::getId, identity())); + StructType schemaAsStruct = tableSchema.asStruct(); + Map indexById = TypeUtil.indexById(schemaAsStruct); + Map indexParents = indexParents(schemaAsStruct); + Map> indexPaths = indexById.entrySet().stream() + .collect(toImmutableMap(Map.Entry::getKey, entry -> ImmutableList.copyOf(buildPath(indexParents, entry.getKey())))); List partitioningColumns = partitionSpec.fields().stream() .sorted(Comparator.comparing(PartitionField::sourceId)) - .map(field -> requireNonNull(columnById.get(field.sourceId()), () -> "Cannot find source column for partitioning field " + field)) + .map(field -> { + boolean isBaseColumn = !indexParents.containsKey(field.sourceId()); + int sourceId; + if (isBaseColumn) { + sourceId = field.sourceId(); + } + else { + sourceId = getRootFieldId(indexParents, field.sourceId()); + } + Type sourceType = tableSchema.findType(sourceId); + // The source column, must be a primitive type and cannot be contained in a map or list, but may be nested in a struct. 
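The nested-partitioning support added here resolves a nested source field to its top-level ancestor by walking the child-to-parent index (see getRootFieldId below). A self-contained sketch of that walk, using made-up field ids rather than a real Iceberg schema:

import java.util.Map;

class RootFieldSketch
{
    // Walk a child->parent field-id map until reaching a field with no parent (a top-level column)
    static int rootFieldId(Map<Integer, Integer> indexParents, int fieldId)
    {
        int current = fieldId;
        while (indexParents.containsKey(current)) {
            current = indexParents.get(current);
        }
        return current;
    }

    public static void main(String[] args)
    {
        // address (id 5) is a struct column; address.city (id 6) and address.zip (id 7) are nested in it
        Map<Integer, Integer> parents = Map.of(6, 5, 7, 5);
        // Prints 5: a partition field sourced from address.zip is keyed on the top-level struct column
        System.out.println(rootFieldId(parents, 7));
    }
}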
+ // https://iceberg.apache.org/spec/#partitioning + if (sourceType.isMapType()) { + throw new TrinoException(NOT_SUPPORTED, "Partitioning field [" + field.name() + "] cannot be contained in a map"); + } + if (sourceType.isListType()) { + throw new TrinoException(NOT_SUPPORTED, "Partitioning field [" + field.name() + "] cannot be contained in a array"); + } + verify(indexById.containsKey(sourceId), "Cannot find source column for partition field " + field); + return createColumnHandle(typeManager, sourceId, indexById, indexPaths); + }) .distinct() .collect(toImmutableList()); List partitioningColumnNames = partitioningColumns.stream() @@ -848,10 +1399,19 @@ private Optional getWriteLayout(Schema tableSchema, Partit // Do not set partitioningHandle, to let engine determine whether to repartition data or not, on stat-based basis. return Optional.of(new ConnectorTableLayout(partitioningColumnNames)); } - IcebergPartitioningHandle partitioningHandle = new IcebergPartitioningHandle(toPartitionFields(partitionSpec), partitioningColumns); + IcebergPartitioningHandle partitioningHandle = IcebergPartitioningHandle.create(partitionSpec, typeManager, List.of()); return Optional.of(new ConnectorTableLayout(partitioningHandle, partitioningColumnNames, true)); } + private static int getRootFieldId(Map indexParents, int fieldId) + { + int rootFieldId = fieldId; + while (indexParents.containsKey(rootFieldId)) { + rootFieldId = indexParents.get(rootFieldId); + } + return rootFieldId; + } + @Override public ConnectorInsertTableHandle beginInsert(ConnectorSession session, ConnectorTableHandle tableHandle, List columns, RetryMode retryMode) { @@ -859,13 +1419,25 @@ public ConnectorInsertTableHandle beginInsert(ConnectorSession session, Connecto Table icebergTable = catalog.loadTable(session, table.getSchemaTableName()); validateNotModifyingOldSnapshot(table, icebergTable); - validateNotPartitionedByNestedField(icebergTable.schema(), icebergTable.spec()); beginTransaction(icebergTable); return newWritableTableHandle(table.getSchemaTableName(), icebergTable, retryMode); } + private List getChildNamespaces(ConnectorSession session, String parentNamespace) + { + Optional namespaceSeparator = catalog.getNamespaceSeparator(); + + if (namespaceSeparator.isEmpty()) { + return ImmutableList.of(); + } + + return catalog.listNamespaces(session).stream() + .filter(namespace -> namespace.startsWith(parentNamespace + namespaceSeparator.get())) + .collect(toImmutableList()); + } + private IcebergWritableTableHandle newWritableTableHandle(SchemaTableName name, Table table, RetryMode retryMode) { return new IcebergWritableTableHandle( @@ -874,11 +1446,12 @@ private IcebergWritableTableHandle newWritableTableHandle(SchemaTableName name, transformValues(table.specs(), PartitionSpecParser::toJson), table.spec().specId(), getSupportedSortFields(table.schema(), table.sortOrder()), - getColumns(table.schema(), typeManager), + getProjectedColumns(table.schema(), typeManager), table.location(), getFileFormat(table), table.properties(), - retryMode); + retryMode, + table.io().properties()); } private static List getSupportedSortFields(Schema schema, SortOrder sortOrder) @@ -890,7 +1463,7 @@ private static List getSupportedSortFields(Schema schema, SortOr .map(Types.NestedField::fieldId) .collect(toImmutableSet()); - ImmutableList.Builder sortFields = ImmutableList.builder(); + ImmutableList.Builder sortFields = ImmutableList.builder(); for (SortField sortField : sortOrder.fields()) { if (!sortField.transform().isIdentity()) { continue; 
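getChildNamespaces above treats a namespace as a child when it starts with the parent plus the catalog's separator; the reworked dropSchema uses this to refuse dropping a schema with nested schemas. A standalone sketch of the same prefix check, with hypothetical namespace names:

import java.util.List;
import java.util.stream.Collectors;

class ChildNamespacesSketch
{
    // A namespace "a.b.c" counts as a child of "a.b" when the separator is "."
    static List<String> childNamespaces(List<String> allNamespaces, String parent, String separator)
    {
        return allNamespaces.stream()
                .filter(namespace -> namespace.startsWith(parent + separator))
                .collect(Collectors.toList());
    }

    public static void main(String[] args)
    {
        // Prints [sales.emea, sales.emea.uk]: dropping "sales" would leave these orphaned, so it is rejected
        System.out.println(childNamespaces(List.of("sales", "sales.emea", "sales.emea.uk", "hr"), "sales", "."));
    }
}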
@@ -906,10 +1479,15 @@ private static List getSupportedSortFields(Schema schema, SortOr } @Override - public Optional finishInsert(ConnectorSession session, ConnectorInsertTableHandle insertHandle, Collection fragments, Collection computedStatistics) + public Optional finishInsert( + ConnectorSession session, + ConnectorInsertTableHandle insertHandle, + Collection fragments, + Collection computedStatistics) { List commitTasks = fragments.stream() - .map(slice -> commitTaskCodec.fromJson(slice.getBytes())) + .map(Slice::getBytes) + .map(commitTaskCodec::fromJson) .collect(toImmutableList()); if (commitTasks.isEmpty()) { @@ -930,39 +1508,43 @@ public Optional finishInsert(ConnectorSession session, ImmutableSet.Builder writtenFiles = ImmutableSet.builder(); for (CommitTaskData task : commitTasks) { DataFiles.Builder builder = DataFiles.builder(icebergTable.spec()) - .withPath(task.getPath()) - .withFileSizeInBytes(task.getFileSizeInBytes()) - .withFormat(table.getFileFormat().toIceberg()) - .withMetrics(task.getMetrics().metrics()); + .withPath(task.path()) + .withFileSizeInBytes(task.fileSizeInBytes()) + .withFormat(table.fileFormat().toIceberg()) + .withMetrics(task.metrics().metrics()); + task.fileSplitOffsets().ifPresent(builder::withSplitOffsets); if (!icebergTable.spec().fields().isEmpty()) { - String partitionDataJson = task.getPartitionDataJson() + String partitionDataJson = task.partitionDataJson() .orElseThrow(() -> new VerifyException("No partition data for partitioned table")); builder.withPartition(PartitionData.fromJson(partitionDataJson, partitionColumnTypes)); } appendFiles.appendFile(builder.build()); - writtenFiles.add(task.getPath()); + writtenFiles.add(task.path()); } // try to leave as little garbage as possible behind - if (table.getRetryMode() != NO_RETRIES) { + if (table.retryMode() != NO_RETRIES) { cleanExtraOutputFiles(session, writtenFiles.build()); } - commit(appendFiles, session); - transaction.commitTransaction(); + appendFiles.scanManifestsWith(icebergScanExecutor); + commitUpdateAndTransaction(appendFiles, session, transaction, "insert"); // TODO (https://github.com/trinodb/trino/issues/15439) this may not exactly be the snapshot we committed, if there is another writer long newSnapshotId = transaction.table().currentSnapshot().snapshotId(); transaction = null; // TODO (https://github.com/trinodb/trino/issues/15439): it would be good to publish data and stats atomically beforeWriteSnapshotId.ifPresent(previous -> - verify(previous != newSnapshotId, "Failed to get new snapshot ID ")); + verify(previous != newSnapshotId, "Failed to get new snapshot ID")); - if (!computedStatistics.isEmpty()) { + if (isS3Tables(icebergTable.location())) { + log.debug("S3 Tables does not support statistics: %s", table.name()); + } + else if (!computedStatistics.isEmpty()) { try { - beginTransaction(catalog.loadTable(session, table.getName())); + beginTransaction(catalog.loadTable(session, table.name())); Table reloadedTable = transaction.table(); CollectedStatistics collectedStatistics = processComputedTableStatistics(reloadedTable, computedStatistics); StatisticsFile statisticsFile = tableStatisticsWriter.writeStatisticsFile( @@ -972,10 +1554,10 @@ public Optional finishInsert(ConnectorSession session, INCREMENTAL_UPDATE, collectedStatistics); transaction.updateStatistics() - .setStatistics(newSnapshotId, statisticsFile) + .setStatistics(statisticsFile) .commit(); - transaction.commitTransaction(); + commitTransaction(transaction, "update statistics on insert"); } catch (Exception 
e) { // Write was committed, so at this point we cannot fail the query @@ -986,13 +1568,13 @@ public Optional finishInsert(ConnectorSession session, } return Optional.of(new HiveWrittenPartitions(commitTasks.stream() - .map(CommitTaskData::getPath) + .map(CommitTaskData::path) .collect(toImmutableList()))); } private void cleanExtraOutputFiles(ConnectorSession session, Set writtenFiles) { - TrinoFileSystem fileSystem = fileSystemFactory.create(session); + TrinoFileSystem fileSystem = fileSystemFactory.create(session.getIdentity(), transaction.table().io().properties()); Set locations = getOutputFilesLocations(writtenFiles); Set fileNames = getOutputFilesFileNames(writtenFiles); for (String location : locations) { @@ -1045,7 +1627,7 @@ private static void cleanExtraOutputFiles(TrinoFileSystem fileSystem, String que log.info("Deleted failed attempt files %s from %s for query %s", deletedFiles, location, queryId); } } - catch (IOException e) { + catch (IOException | UncheckedIOException e) { throw new TrinoException(ICEBERG_FILESYSTEM_ERROR, format("Could not clean up extraneous output files; remaining files: %s", filesToDelete), e); } @@ -1097,9 +1679,13 @@ public Optional getTableHandleForExecute( return switch (procedureId) { case OPTIMIZE -> getTableHandleForOptimize(tableHandle, icebergTable, executeProperties, retryMode); + case OPTIMIZE_MANIFESTS -> getTableHandleForOptimizeManifests(session, tableHandle); case DROP_EXTENDED_STATS -> getTableHandleForDropExtendedStats(session, tableHandle); + case ROLLBACK_TO_SNAPSHOT -> getTableHandleForRollbackToSnapshot(session, tableHandle, executeProperties); case EXPIRE_SNAPSHOTS -> getTableHandleForExpireSnapshots(session, tableHandle, executeProperties); case REMOVE_ORPHAN_FILES -> getTableHandleForRemoveOrphanFiles(session, tableHandle, executeProperties); + case ADD_FILES -> getTableHandleForAddFiles(session, tableHandle, executeProperties); + case ADD_FILES_FROM_TABLE -> getTableHandleForAddFilesFromTable(session, tableHandle, executeProperties); }; } @@ -1118,7 +1704,7 @@ private Optional getTableHandleForOptimize( tableHandle.getSnapshotId(), tableHandle.getTableSchemaJson(), tableHandle.getPartitionSpecJson().orElseThrow(() -> new VerifyException("Partition spec missing in the table handle")), - getColumns(SchemaParser.fromJson(tableHandle.getTableSchemaJson()), typeManager), + getProjectedColumns(SchemaParser.fromJson(tableHandle.getTableSchemaJson()), typeManager), icebergTable.sortOrder().fields().stream() .map(TrinoSortField::fromIceberg) .collect(toImmutableList()), @@ -1126,7 +1712,20 @@ private Optional getTableHandleForOptimize( tableHandle.getStorageProperties(), maxScannedFileSize, retryMode != NO_RETRIES), - tableHandle.getTableLocation())); + tableHandle.getTableLocation(), + icebergTable.io().properties())); + } + + private Optional getTableHandleForOptimizeManifests(ConnectorSession session, IcebergTableHandle tableHandle) + { + Table icebergTable = catalog.loadTable(session, tableHandle.getSchemaTableName()); + + return Optional.of(new IcebergTableExecuteHandle( + tableHandle.getSchemaTableName(), + OPTIMIZE_MANIFESTS, + new IcebergOptimizeManifestsHandle(), + icebergTable.location(), + icebergTable.io().properties())); } private Optional getTableHandleForDropExtendedStats(ConnectorSession session, IcebergTableHandle tableHandle) @@ -1137,7 +1736,8 @@ private Optional getTableHandleForDropExtendedStats tableHandle.getSchemaTableName(), DROP_EXTENDED_STATS, new IcebergDropExtendedStatsHandle(), - icebergTable.location())); + 
icebergTable.location(), + icebergTable.io().properties())); } private Optional getTableHandleForExpireSnapshots(ConnectorSession session, IcebergTableHandle tableHandle, Map executeProperties) @@ -1149,7 +1749,8 @@ private Optional getTableHandleForExpireSnapshots(C tableHandle.getSchemaTableName(), EXPIRE_SNAPSHOTS, new IcebergExpireSnapshotsHandle(retentionThreshold), - icebergTable.location())); + icebergTable.location(), + icebergTable.io().properties())); } private Optional getTableHandleForRemoveOrphanFiles(ConnectorSession session, IcebergTableHandle tableHandle, Map executeProperties) @@ -1161,27 +1762,155 @@ private Optional getTableHandleForRemoveOrphanFiles tableHandle.getSchemaTableName(), REMOVE_ORPHAN_FILES, new IcebergRemoveOrphanFilesHandle(retentionThreshold), - icebergTable.location())); + icebergTable.location(), + icebergTable.io().properties())); + } + + private Optional getTableHandleForAddFiles(ConnectorSession session, IcebergTableHandle tableHandle, Map executeProperties) + { + if (!addFilesProcedureEnabled) { + throw new TrinoException(PERMISSION_DENIED, "add_files procedure is disabled"); + } + + //accessControl.checkCanInsertIntoTable(null, tableHandle.getSchemaTableName()); + + String location = (String) requireProcedureArgument(executeProperties, "location"); + HiveStorageFormat format = (HiveStorageFormat) requireProcedureArgument(executeProperties, "format"); + RecursiveDirectory recursiveDirectory = (RecursiveDirectory) executeProperties.getOrDefault("recursive_directory", "fail"); + + Table icebergTable = catalog.loadTable(session, tableHandle.getSchemaTableName()); + + return Optional.of(new IcebergTableExecuteHandle( + tableHandle.getSchemaTableName(), + ADD_FILES, + new IcebergAddFilesHandle(location, format, recursiveDirectory), + icebergTable.location(), + icebergTable.io().properties())); + } + + public static TypeSignature getTypeSignature(HiveType type, HiveTimestampPrecision timestampPrecision) + { + return toTypeSignature(type.getTypeInfo(), timestampPrecision); + } + + private Optional getTableHandleForAddFilesFromTable(ConnectorSession session, IcebergTableHandle tableHandle, Map executeProperties) + { + //accessControl.checkCanInsertIntoTable(null, tableHandle.getSchemaTableName()); + + String schemaName = (String) requireProcedureArgument(executeProperties, "schema_name"); + String tableName = (String) requireProcedureArgument(executeProperties, "table_name"); + @SuppressWarnings("unchecked") + Map partitionFilter = (Map) executeProperties.get("partition_filter"); + RecursiveDirectory recursiveDirectory = (RecursiveDirectory) executeProperties.getOrDefault("recursive_directory", "fail"); + + HiveMetastore metastore = metastoreFactory.orElseThrow(() -> new TrinoException(NOT_SUPPORTED, "This catalog does not support add_files_from_table procedure")) + .createMetastore(Optional.of(session.getIdentity())); + SchemaTableName sourceName = new SchemaTableName(schemaName, tableName); + io.trino.plugin.hive.metastore.Table sourceTable = metastore.getTable(schemaName, tableName).orElseThrow(() -> new TableNotFoundException(sourceName)); + //accessControl.checkCanSelectFromColumns(null, sourceName, Stream.concat(sourceTable.getDataColumns().stream(), sourceTable.getPartitionColumns().stream()) + // .map(Column::getName) + // .collect(toImmutableSet())); + + Table icebergTable = catalog.loadTable(session, tableHandle.getSchemaTableName()); + + checkProcedureArgument( + icebergTable.schemas().size() >= sourceTable.getDataColumns().size(), + "Target 
table should have at least %d columns but got %d", sourceTable.getDataColumns().size(), icebergTable.schemas().size()); + checkProcedureArgument( + icebergTable.spec().fields().size() == sourceTable.getPartitionColumns().size(), + "Numbers of partition columns should be equivalent. target: %d, source: %d", icebergTable.spec().fields().size(), sourceTable.getPartitionColumns().size()); + + // TODO Add files from all partitions when partition filter is not provided + checkProcedureArgument( + sourceTable.getPartitionColumns().isEmpty() || partitionFilter != null, + "partition_filter argument must be provided for partitioned tables"); + + String transactionalProperty = sourceTable.getParameters().get(TRANSACTIONAL); + if (parseBoolean(transactionalProperty)) { + throw new TrinoException(NOT_SUPPORTED, "Adding files from transactional tables is unsupported"); + } + if (!"MANAGED_TABLE".equalsIgnoreCase(sourceTable.getTableType()) && !"EXTERNAL_TABLE".equalsIgnoreCase(sourceTable.getTableType())) { + throw new TrinoException(NOT_SUPPORTED, "The procedure doesn't support adding files from %s table type".formatted(sourceTable.getTableType())); + } + if (isSomeKindOfAView(sourceTable) || isIcebergTable(sourceTable) || isDeltaLakeTable(sourceTable) || isHudiTable(sourceTable)) { + throw new TrinoException(NOT_SUPPORTED, "Adding files from non-Hive tables is unsupported"); + } + if (sourceTable.getPartitionColumns().isEmpty() && partitionFilter != null && !partitionFilter.isEmpty()) { + throw new TrinoException(NOT_SUPPORTED, "Partition filter is not supported for non-partitioned tables"); + } + + Set missingDataColumns = new HashSet<>(); + Stream.of(sourceTable.getDataColumns(), sourceTable.getPartitionColumns()) + .flatMap(List::stream) + .forEach(sourceColumn -> { + Types.NestedField targetColumn = icebergTable.schema().caseInsensitiveFindField(sourceColumn.getName()); + if (targetColumn == null) { + if (sourceTable.getPartitionColumns().contains(sourceColumn)) { + throw new TrinoException(COLUMN_NOT_FOUND, "Partition column '%s' does not exist".formatted(sourceColumn.getName())); + } + missingDataColumns.add(sourceColumn.getName()); + return; + } + ColumnIdentity columnIdentity = createColumnIdentity(targetColumn); + org.apache.iceberg.types.Type sourceColumnType = toIcebergType(typeManager.getType(getTypeSignature(sourceColumn.getType(), DEFAULT_PRECISION)), columnIdentity); + if (!targetColumn.type().equals(sourceColumnType)) { + throw new TrinoException(TYPE_MISMATCH, "Target '%s' column is '%s' type, but got source '%s' type".formatted(targetColumn.name(), targetColumn.type(), sourceColumnType)); + } + }); + if (missingDataColumns.size() == sourceTable.getDataColumns().size()) { + throw new TrinoException(COLUMN_NOT_FOUND, "All columns in the source table do not exist in the target table"); + } + + return Optional.of(new IcebergTableExecuteHandle( + tableHandle.getSchemaTableName(), + ADD_FILES_FROM_TABLE, + new IcebergAddFilesFromTableHandle(sourceTable, partitionFilter, recursiveDirectory), + icebergTable.location(), + icebergTable.io().properties())); + } + + private Optional getTableHandleForRollbackToSnapshot(ConnectorSession session, IcebergTableHandle tableHandle, Map executeProperties) + { + long snapshotId = (long) executeProperties.get("snapshot_id"); + Table icebergTable = catalog.loadTable(session, tableHandle.getSchemaTableName()); + + return Optional.of(new IcebergTableExecuteHandle( + tableHandle.getSchemaTableName(), + ROLLBACK_TO_SNAPSHOT, + new 
IcebergRollbackToSnapshotHandle(snapshotId), + icebergTable.location(), + icebergTable.io().properties())); + } + + private static Object requireProcedureArgument(Map properties, String name) + { + Object value = properties.get(name); + checkProcedureArgument(value != null, "Required procedure argument '%s' is missing", name); + return value; } @Override public Optional getLayoutForTableExecute(ConnectorSession session, ConnectorTableExecuteHandle tableExecuteHandle) { IcebergTableExecuteHandle executeHandle = (IcebergTableExecuteHandle) tableExecuteHandle; - switch (executeHandle.getProcedureId()) { + switch (executeHandle.procedureId()) { case OPTIMIZE: return getLayoutForOptimize(session, executeHandle); + case OPTIMIZE_MANIFESTS: case DROP_EXTENDED_STATS: + case ROLLBACK_TO_SNAPSHOT: case EXPIRE_SNAPSHOTS: case REMOVE_ORPHAN_FILES: + case ADD_FILES: + case ADD_FILES_FROM_TABLE: // handled via executeTableExecute } - throw new IllegalArgumentException("Unknown procedure '" + executeHandle.getProcedureId() + "'"); + throw new IllegalArgumentException("Unknown procedure '" + executeHandle.procedureId() + "'"); } private Optional getLayoutForOptimize(ConnectorSession session, IcebergTableExecuteHandle executeHandle) { - Table icebergTable = catalog.loadTable(session, executeHandle.getSchemaTableName()); + Table icebergTable = catalog.loadTable(session, executeHandle.schemaTableName()); // from performance perspective it is better to have lower number of bigger files than other way around // thus we force repartitioning for optimize to achieve this return getWriteLayout(icebergTable.schema(), icebergTable.spec(), true); @@ -1195,15 +1924,19 @@ public BeginTableExecuteResult beginOptimize( @@ -1211,13 +1944,12 @@ private BeginTableExecuteResult OPTIMIZE_MAX_SUPPORTED_TABLE_VERSION) { throw new TrinoException(NOT_SUPPORTED, format( "%s is not supported for Iceberg table format version > %d. 
Table %s format version is %s.", @@ -1231,43 +1963,49 @@ private BeginTableExecuteResult( executeHandle, - table.forOptimize(true, optimizeHandle.getMaxScannedFileSize())); + table.forOptimize(true, optimizeHandle.maxScannedFileSize())); } @Override public void finishTableExecute(ConnectorSession session, ConnectorTableExecuteHandle tableExecuteHandle, Collection fragments, List splitSourceInfo) { IcebergTableExecuteHandle executeHandle = (IcebergTableExecuteHandle) tableExecuteHandle; - switch (executeHandle.getProcedureId()) { + switch (executeHandle.procedureId()) { case OPTIMIZE: finishOptimize(session, executeHandle, fragments, splitSourceInfo); return; + case OPTIMIZE_MANIFESTS: case DROP_EXTENDED_STATS: + case ROLLBACK_TO_SNAPSHOT: case EXPIRE_SNAPSHOTS: case REMOVE_ORPHAN_FILES: + case ADD_FILES: + case ADD_FILES_FROM_TABLE: // handled via executeTableExecute } - throw new IllegalArgumentException("Unknown procedure '" + executeHandle.getProcedureId() + "'"); + throw new IllegalArgumentException("Unknown procedure '" + executeHandle.procedureId() + "'"); } private void finishOptimize(ConnectorSession session, IcebergTableExecuteHandle executeHandle, Collection fragments, List splitSourceInfo) { - IcebergOptimizeHandle optimizeHandle = (IcebergOptimizeHandle) executeHandle.getProcedureHandle(); + IcebergOptimizeHandle optimizeHandle = (IcebergOptimizeHandle) executeHandle.procedureHandle(); Table icebergTable = transaction.table(); + Optional beforeWriteSnapshotId = getCurrentSnapshotId(icebergTable); // files to be deleted ImmutableSet.Builder scannedDataFilesBuilder = ImmutableSet.builder(); ImmutableSet.Builder scannedDeleteFilesBuilder = ImmutableSet.builder(); splitSourceInfo.stream().map(DataFileWithDeleteFiles.class::cast).forEach(dataFileWithDeleteFiles -> { - scannedDataFilesBuilder.add(dataFileWithDeleteFiles.getDataFile()); - scannedDeleteFilesBuilder.addAll(dataFileWithDeleteFiles.getDeleteFiles()); + scannedDataFilesBuilder.add(dataFileWithDeleteFiles.dataFile()); + scannedDeleteFilesBuilder.addAll(dataFileWithDeleteFiles.deleteFiles()); }); Set scannedDataFiles = scannedDataFilesBuilder.build(); Set fullyAppliedDeleteFiles = scannedDeleteFilesBuilder.build(); List commitTasks = fragments.stream() - .map(slice -> commitTaskCodec.fromJson(slice.getBytes())) + .map(Slice::getBytes) + .map(commitTaskCodec::fromJson) .collect(toImmutableList()); Type[] partitionColumnTypes = icebergTable.spec().fields().stream() @@ -1278,13 +2016,14 @@ private void finishOptimize(ConnectorSession session, IcebergTableExecuteHandle Set newFiles = new HashSet<>(); for (CommitTaskData task : commitTasks) { DataFiles.Builder builder = DataFiles.builder(icebergTable.spec()) - .withPath(task.getPath()) - .withFileSizeInBytes(task.getFileSizeInBytes()) - .withFormat(optimizeHandle.getFileFormat().toIceberg()) - .withMetrics(task.getMetrics().metrics()); + .withPath(task.path()) + .withFileSizeInBytes(task.fileSizeInBytes()) + .withFormat(optimizeHandle.fileFormat().toIceberg()) + .withMetrics(task.metrics().metrics()); + task.fileSplitOffsets().ifPresent(builder::withSplitOffsets); if (!icebergTable.spec().fields().isEmpty()) { - String partitionDataJson = task.getPartitionDataJson() + String partitionDataJson = task.partitionDataJson() .orElseThrow(() -> new VerifyException("No partition data for partitioned table")); builder.withPartition(PartitionData.fromJson(partitionDataJson, partitionColumnTypes)); } @@ -1292,118 +2031,192 @@ private void finishOptimize(ConnectorSession session, 
IcebergTableExecuteHandle newFiles.add(builder.build()); } - if (optimizeHandle.getSnapshotId().isEmpty() || scannedDataFiles.isEmpty() && fullyAppliedDeleteFiles.isEmpty() && newFiles.isEmpty()) { + if (optimizeHandle.snapshotId().isEmpty() || scannedDataFiles.isEmpty() && fullyAppliedDeleteFiles.isEmpty() && newFiles.isEmpty()) { // Either the table is empty, or the table scan turned out to be empty, nothing to commit transaction = null; return; } // try to leave as little garbage as possible behind - if (optimizeHandle.isRetriesEnabled()) { + if (optimizeHandle.retriesEnabled()) { cleanExtraOutputFiles( session, newFiles.stream() - .map(dataFile -> dataFile.path().toString()) + .map(ContentFile::location) .collect(toImmutableSet())); } RewriteFiles rewriteFiles = transaction.newRewrite(); - rewriteFiles.rewriteFiles(scannedDataFiles, fullyAppliedDeleteFiles, newFiles, ImmutableSet.of()); + scannedDataFiles.forEach(rewriteFiles::deleteFile); + fullyAppliedDeleteFiles.forEach(rewriteFiles::deleteFile); + newFiles.forEach(rewriteFiles::addFile); + // Table.snapshot method returns null if there is no matching snapshot - Snapshot snapshot = requireNonNull(icebergTable.snapshot(optimizeHandle.getSnapshotId().get()), "snapshot is null"); + Snapshot snapshot = requireNonNull(icebergTable.snapshot(optimizeHandle.snapshotId().get()), "snapshot is null"); + // Set dataSequenceNumber to avoid contention between OPTIMIZE and concurrent writing of equality deletes + rewriteFiles.dataSequenceNumber(snapshot.sequenceNumber()); rewriteFiles.validateFromSnapshot(snapshot.snapshotId()); - commit(rewriteFiles, session); - transaction.commitTransaction(); + rewriteFiles.scanManifestsWith(icebergScanExecutor); + commitUpdateAndTransaction(rewriteFiles, session, transaction, "optimize"); + + // TODO (https://github.com/trinodb/trino/issues/15439) this may not exactly be the snapshot we committed, if there is another writer + long newSnapshotId = transaction.table().currentSnapshot().snapshotId(); transaction = null; + + // TODO (https://github.com/trinodb/trino/issues/15439): it would be good to publish data and stats atomically + beforeWriteSnapshotId.ifPresent(previous -> + verify(previous != newSnapshotId, "Failed to get new snapshot ID")); + + try { + beginTransaction(catalog.loadTable(session, executeHandle.schemaTableName())); + Table reloadedTable = transaction.table(); + StatisticsFile newStatsFile = tableStatisticsWriter.rewriteStatisticsFile(session, reloadedTable, newSnapshotId); + + transaction.updateStatistics() + .setStatistics(newStatsFile) + .commit(); + commitTransaction(transaction, "update statistics after optimize"); + } + catch (Exception e) { + // Write was committed, so at this point we cannot fail the query + // TODO (https://github.com/trinodb/trino/issues/15439): it would be good to publish data and stats atomically + log.error(e, "Failed to save table statistics"); + } + transaction = null; + } + + private static void commitUpdateAndTransaction(SnapshotUpdate update, ConnectorSession session, Transaction transaction, String operation) + { + commitUpdate(update, session, operation); + commitTransaction(transaction, operation); + } + + private static void commitUpdate(SnapshotUpdate update, ConnectorSession session, String operation) + { + try { + commit(update, session); + } + catch (UncheckedIOException | ValidationException | CommitFailedException | CommitStateUnknownException e) { + throw new TrinoException(ICEBERG_COMMIT_ERROR, format("Failed to commit during %s: %s", operation, 
firstNonNull(e.getMessage(), e)), e); + } + } + + private static void commitTransaction(Transaction transaction, String operation) + { + try { + transaction.commitTransaction(); + } + catch (UncheckedIOException | ValidationException | CommitFailedException | CommitStateUnknownException e) { + throw new TrinoException(ICEBERG_COMMIT_ERROR, format("Failed to commit the transaction during %s: %s", operation, firstNonNull(e.getMessage(), e)), e); + } } @Override public void executeTableExecute(ConnectorSession session, ConnectorTableExecuteHandle tableExecuteHandle) { IcebergTableExecuteHandle executeHandle = (IcebergTableExecuteHandle) tableExecuteHandle; - switch (executeHandle.getProcedureId()) { + switch (executeHandle.procedureId()) { + case OPTIMIZE_MANIFESTS: + executeOptimizeManifests(session, executeHandle); + return; case DROP_EXTENDED_STATS: executeDropExtendedStats(session, executeHandle); return; + case ROLLBACK_TO_SNAPSHOT: + executeRollbackToSnapshot(session, executeHandle); + return; case EXPIRE_SNAPSHOTS: executeExpireSnapshots(session, executeHandle); return; case REMOVE_ORPHAN_FILES: executeRemoveOrphanFiles(session, executeHandle); return; + case ADD_FILES: + executeAddFiles(session, executeHandle); + return; + case ADD_FILES_FROM_TABLE: + executeAddFilesFromTable(session, executeHandle); + return; default: - throw new IllegalArgumentException("Unknown procedure '" + executeHandle.getProcedureId() + "'"); + throw new IllegalArgumentException("Unknown procedure '" + executeHandle.procedureId() + "'"); + } + } + + private void executeOptimizeManifests(ConnectorSession session, IcebergTableExecuteHandle executeHandle) + { + checkArgument(executeHandle.procedureHandle() instanceof IcebergOptimizeManifestsHandle, "Unexpected procedure handle %s", executeHandle.procedureHandle()); + + BaseTable icebergTable = catalog.loadTable(session, executeHandle.schemaTableName()); + List manifests = loadAllManifestsFromSnapshot(icebergTable, icebergTable.currentSnapshot()); + if (manifests.isEmpty()) { + return; + } + if (manifests.size() == 1 && manifests.get(0).length() < icebergTable.operations().current().propertyAsLong(MANIFEST_TARGET_SIZE_BYTES, MANIFEST_TARGET_SIZE_BYTES_DEFAULT)) { + return; } + + beginTransaction(icebergTable); + RewriteManifests rewriteManifests = transaction.rewriteManifests(); + rewriteManifests + .clusterBy(file -> { + // Use the first partition field as the clustering key + StructLike partition = file.partition(); + return partition.size() > 1 ? 
Optional.ofNullable(partition.get(0, Object.class)) : partition; + }) + .scanManifestsWith(icebergScanExecutor) + .commit(); + commitTransaction(transaction, "optimize manifests"); + transaction = null; } private void executeDropExtendedStats(ConnectorSession session, IcebergTableExecuteHandle executeHandle) { - checkArgument(executeHandle.getProcedureHandle() instanceof IcebergDropExtendedStatsHandle, "Unexpected procedure handle %s", executeHandle.getProcedureHandle()); + checkArgument(executeHandle.procedureHandle() instanceof IcebergDropExtendedStatsHandle, "Unexpected procedure handle %s", executeHandle.procedureHandle()); - Table icebergTable = catalog.loadTable(session, executeHandle.getSchemaTableName()); + Table icebergTable = catalog.loadTable(session, executeHandle.schemaTableName()); beginTransaction(icebergTable); UpdateStatistics updateStatistics = transaction.updateStatistics(); for (StatisticsFile statisticsFile : icebergTable.statisticsFiles()) { updateStatistics.removeStatistics(statisticsFile.snapshotId()); } updateStatistics.commit(); - UpdateProperties updateProperties = transaction.updateProperties(); - for (String key : transaction.table().properties().keySet()) { - if (key.startsWith(TRINO_STATS_PREFIX)) { - updateProperties.remove(key); - } - } - updateProperties.commit(); - transaction.commitTransaction(); + commitTransaction(transaction, "drop extended stats"); transaction = null; } + private void executeRollbackToSnapshot(ConnectorSession session, IcebergTableExecuteHandle executeHandle) + { + checkArgument(executeHandle.procedureHandle() instanceof IcebergRollbackToSnapshotHandle, "Unexpected procedure handle %s", executeHandle.procedureHandle()); + long snapshotId = ((IcebergRollbackToSnapshotHandle) executeHandle.procedureHandle()).snapshotId(); + + Table icebergTable = catalog.loadTable(session, executeHandle.schemaTableName()); + icebergTable.manageSnapshots().setCurrentSnapshot(snapshotId).commit(); + } + private void executeExpireSnapshots(ConnectorSession session, IcebergTableExecuteHandle executeHandle) { - IcebergExpireSnapshotsHandle expireSnapshotsHandle = (IcebergExpireSnapshotsHandle) executeHandle.getProcedureHandle(); + IcebergExpireSnapshotsHandle expireSnapshotsHandle = (IcebergExpireSnapshotsHandle) executeHandle.procedureHandle(); - Table table = catalog.loadTable(session, executeHandle.getSchemaTableName()); - Duration retention = requireNonNull(expireSnapshotsHandle.getRetentionThreshold(), "retention is null"); + BaseTable table = catalog.loadTable(session, executeHandle.schemaTableName()); + Duration retention = requireNonNull(expireSnapshotsHandle.retentionThreshold(), "retention is null"); validateTableExecuteParameters( table, - executeHandle.getSchemaTableName(), + executeHandle.schemaTableName(), EXPIRE_SNAPSHOTS.name(), retention, getExpireSnapshotMinRetention(session), IcebergConfig.EXPIRE_SNAPSHOTS_MIN_RETENTION, IcebergSessionProperties.EXPIRE_SNAPSHOTS_MIN_RETENTION); - long expireTimestampMillis = session.getStart().toEpochMilli() - retention.toMillis(); - TrinoFileSystem fileSystem = fileSystemFactory.create(session); - List pathsToDelete = new ArrayList<>(); - // deleteFunction is not accessed from multiple threads unless .executeDeleteWith() is used - Consumer deleteFunction = path -> { - pathsToDelete.add(Location.of(path)); - if (pathsToDelete.size() == DELETE_BATCH_SIZE) { - try { - fileSystem.deleteFiles(pathsToDelete); - pathsToDelete.clear(); - } - catch (IOException e) { - throw new 
TrinoException(ICEBERG_FILESYSTEM_ERROR, "Failed to delete files during snapshot expiration", e); - } - } - }; - + // ForwardingFileIo handles bulk operations so no separate function implementation is needed table.expireSnapshots() - .expireOlderThan(expireTimestampMillis) - .deleteWith(deleteFunction) + .expireOlderThan(session.getStart().toEpochMilli() - retention.toMillis()) + .planWith(icebergScanExecutor) .commit(); - try { - fileSystem.deleteFiles(pathsToDelete); - } - catch (IOException e) { - throw new TrinoException(ICEBERG_FILESYSTEM_ERROR, "Failed to delete files during snapshot expiration", e); - } } private static void validateTableExecuteParameters( - Table table, + BaseTable table, SchemaTableName schemaTableName, String procedureName, Duration retentionThreshold, @@ -1411,7 +2224,7 @@ private static void validateTableExecuteParameters( String minRetentionParameterName, String sessionMinRetentionParameterName) { - int tableFormatVersion = ((BaseTable) table).operations().current().formatVersion(); + int tableFormatVersion = formatVersion(table); if (tableFormatVersion > CLEANING_UP_PROCEDURES_MAX_SUPPORTED_TABLE_VERSION) { // It is not known if future version won't bring any new kind of metadata or data files // because of the way procedures are implemented it is safer to fail here than to potentially remove @@ -1441,13 +2254,13 @@ private static void validateTableExecuteParameters( public void executeRemoveOrphanFiles(ConnectorSession session, IcebergTableExecuteHandle executeHandle) { - IcebergRemoveOrphanFilesHandle removeOrphanFilesHandle = (IcebergRemoveOrphanFilesHandle) executeHandle.getProcedureHandle(); + IcebergRemoveOrphanFilesHandle removeOrphanFilesHandle = (IcebergRemoveOrphanFilesHandle) executeHandle.procedureHandle(); - Table table = catalog.loadTable(session, executeHandle.getSchemaTableName()); - Duration retention = requireNonNull(removeOrphanFilesHandle.getRetentionThreshold(), "retention is null"); + BaseTable table = catalog.loadTable(session, executeHandle.schemaTableName()); + Duration retention = requireNonNull(removeOrphanFilesHandle.retentionThreshold(), "retention is null"); validateTableExecuteParameters( table, - executeHandle.getSchemaTableName(), + executeHandle.schemaTableName(), REMOVE_ORPHAN_FILES.name(), retention, getRemoveOrphanFilesMinRetention(session), @@ -1460,72 +2273,122 @@ public void executeRemoveOrphanFiles(ConnectorSession session, IcebergTableExecu } Instant expiration = session.getStart().minusMillis(retention.toMillis()); - removeOrphanFiles(table, session, executeHandle.getSchemaTableName(), expiration); + removeOrphanFiles(table, session, executeHandle.schemaTableName(), expiration, executeHandle.fileIoProperties()); } - private void removeOrphanFiles(Table table, ConnectorSession session, SchemaTableName schemaTableName, Instant expiration) + private void removeOrphanFiles(Table table, ConnectorSession session, SchemaTableName schemaTableName, Instant expiration, Map fileIoProperties) { Set processedManifestFilePaths = new HashSet<>(); // Similarly to issues like https://github.com/trinodb/trino/issues/13759, equivalent paths may have different String // representations due to things like double slashes. Using file names may result in retaining files which could be removed. // However, in practice Iceberg metadata and data files have UUIDs in their names which makes this unlikely. 
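The removeOrphanFiles / scanAndDeleteInvalidFiles changes in this hunk key the comparison on file names rather than full paths (for the path-representation reasons the comment above spells out) and delete unreferenced files in fixed-size batches; the code that follows additionally fans those batches out to icebergFileDeleteExecutor. Below is a minimal, synchronous sketch of that matching-and-batching loop, built only from the TrinoFileSystem calls visible in this change; the class name, helper signature, and the batch-size value are illustrative assumptions, not the connector's actual code.

    import io.trino.filesystem.FileEntry;
    import io.trino.filesystem.FileIterator;
    import io.trino.filesystem.Location;
    import io.trino.filesystem.TrinoFileSystem;

    import java.io.IOException;
    import java.time.Instant;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.Set;

    // Sketch only: delete files under the table location that are older than `expiration`
    // and whose file name is not referenced by any snapshot, manifest, metadata, or statistics file.
    final class OrphanFileSweepSketch
    {
        private static final int DELETE_BATCH_SIZE = 1000; // illustrative value, not the connector's constant

        static void sweep(TrinoFileSystem fileSystem, String tableLocation, Instant expiration, Set<String> validFileNames)
                throws IOException
        {
            List<Location> batch = new ArrayList<>(DELETE_BATCH_SIZE);
            FileIterator files = fileSystem.listFiles(Location.of(tableLocation));
            while (files.hasNext()) {
                FileEntry entry = files.next();
                // Compare file names only; equivalent paths can have different string representations
                if (entry.lastModified().isBefore(expiration) && !validFileNames.contains(entry.location().fileName())) {
                    batch.add(entry.location());
                    if (batch.size() >= DELETE_BATCH_SIZE) {
                        fileSystem.deleteFiles(batch);
                        batch.clear();
                    }
                }
            }
            if (!batch.isEmpty()) {
                fileSystem.deleteFiles(batch);
            }
        }

        private OrphanFileSweepSketch() {}
    }

Because only files older than the retention threshold are candidates, files written concurrently but not yet referenced by a snapshot are left untouched, which is the same safety property the min-retention validation above is protecting.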
- ImmutableSet.Builder validMetadataFileNames = ImmutableSet.builder(); - ImmutableSet.Builder validDataFileNames = ImmutableSet.builder(); + Set validFileNames = Sets.newConcurrentHashSet(); + List> manifestScanFutures = new ArrayList<>(); for (Snapshot snapshot : table.snapshots()) { - if (snapshot.manifestListLocation() != null) { - validMetadataFileNames.add(fileName(snapshot.manifestListLocation())); + String manifestListLocation = snapshot.manifestListLocation(); + List allManifests; + if (manifestListLocation != null) { + validFileNames.add(fileName(manifestListLocation)); + allManifests = loadAllManifestsFromManifestList(table, manifestListLocation); + } + else { + // This is to maintain support for V1 tables which have embedded manifest lists + allManifests = loadAllManifestsFromSnapshot(table, snapshot); } - for (ManifestFile manifest : snapshot.allManifests(table.io())) { + for (ManifestFile manifest : allManifests) { if (!processedManifestFilePaths.add(manifest.path())) { // Already read this manifest continue; } - validMetadataFileNames.add(fileName(manifest.path())); - try (ManifestReader> manifestReader = readerForManifest(table, manifest)) { - for (ContentFile contentFile : manifestReader) { - validDataFileNames.add(fileName(contentFile.path().toString())); + validFileNames.add(fileName(manifest.path())); + manifestScanFutures.add(icebergScanExecutor.submit(() -> { + try (ManifestReader> manifestReader = readerForManifest(manifest, table)) { + for (ContentFile contentFile : manifestReader.select(ImmutableList.of("file_path"))) { + validFileNames.add(fileName(contentFile.location())); + } } - } - catch (IOException e) { - throw new TrinoException(ICEBERG_FILESYSTEM_ERROR, "Unable to list manifest file content from " + manifest.path(), e); - } + catch (IOException | UncheckedIOException e) { + throw new TrinoException(ICEBERG_FILESYSTEM_ERROR, "Unable to list manifest file content from " + manifest.path(), e); + } + catch (NotFoundException e) { + throw new TrinoException(ICEBERG_INVALID_METADATA, "Manifest file does not exist: " + manifest.path()); + } + })); } } metadataFileLocations(table, false).stream() .map(IcebergUtil::fileName) - .forEach(validMetadataFileNames::add); - validMetadataFileNames.add(fileName(versionHintLocation(table))); + .forEach(validFileNames::add); - scanAndDeleteInvalidFiles(table, session, schemaTableName, expiration, validDataFileNames.build(), "data"); - scanAndDeleteInvalidFiles(table, session, schemaTableName, expiration, validMetadataFileNames.build(), "metadata"); + statisticsFilesLocations(table).stream() + .map(IcebergUtil::fileName) + .forEach(validFileNames::add); + + validFileNames.add("version-hint.text"); + + try { + manifestScanFutures.forEach(MoreFutures::getFutureValue); + // All futures completed normally + manifestScanFutures.clear(); + } + finally { + // Ensure any futures still running are canceled in case of failure + manifestScanFutures.forEach(future -> future.cancel(true)); + } + scanAndDeleteInvalidFiles(table, session, schemaTableName, expiration, validFileNames, fileIoProperties); } - private static ManifestReader> readerForManifest(Table table, ManifestFile manifest) + public void executeAddFiles(ConnectorSession session, IcebergTableExecuteHandle executeHandle) { - return switch (manifest.content()) { - case DATA -> ManifestFiles.read(manifest, table.io()); - case DELETES -> ManifestFiles.readDeleteManifest(manifest, table.io(), table.specs()); - }; + IcebergAddFilesHandle addFilesHandle = (IcebergAddFilesHandle) 
executeHandle.procedureHandle(); + Table table = catalog.loadTable(session, executeHandle.schemaTableName()); + TrinoFileSystem fileSystem = fileSystemFactory.create(session.getIdentity(), table.io().properties()); + addFiles( + session, + fileSystem, + catalog, + executeHandle.schemaTableName(), + addFilesHandle.location(), + addFilesHandle.format(), + addFilesHandle.recursiveDirectory(), + icebergScanExecutor); } - private void scanAndDeleteInvalidFiles(Table table, ConnectorSession session, SchemaTableName schemaTableName, Instant expiration, Set validFiles, String subfolder) + public void executeAddFilesFromTable(ConnectorSession session, IcebergTableExecuteHandle executeHandle) { + IcebergAddFilesFromTableHandle addFilesHandle = (IcebergAddFilesFromTableHandle) executeHandle.procedureHandle(); + Table table = catalog.loadTable(session, executeHandle.schemaTableName()); + TrinoFileSystem fileSystem = fileSystemFactory.create(session.getIdentity(), table.io().properties()); + addFilesFromTable( + session, + fileSystem, + metastoreFactory.orElseThrow(), + table, + addFilesHandle.table(), + addFilesHandle.partitionFilter(), + addFilesHandle.recursiveDirectory(), + icebergScanExecutor); + } + + private void scanAndDeleteInvalidFiles(Table table, ConnectorSession session, SchemaTableName schemaTableName, Instant expiration, Set validFiles, Map fileIoProperties) + { + List> deleteFutures = new ArrayList<>(); try { - List filesToDelete = new ArrayList<>(); - TrinoFileSystem fileSystem = fileSystemFactory.create(session); - FileIterator allFiles = fileSystem.listFiles(Location.of(table.location()).appendPath(subfolder)); + List filesToDelete = new ArrayList<>(DELETE_BATCH_SIZE); + TrinoFileSystem fileSystem = fileSystemFactory.create(session.getIdentity(), fileIoProperties); + FileIterator allFiles = fileSystem.listFiles(Location.of(table.location())); while (allFiles.hasNext()) { FileEntry entry = allFiles.next(); if (entry.lastModified().isBefore(expiration) && !validFiles.contains(entry.location().fileName())) { filesToDelete.add(entry.location()); if (filesToDelete.size() >= DELETE_BATCH_SIZE) { - log.debug("Deleting files while removing orphan files for table %s [%s]", schemaTableName, filesToDelete); - fileSystem.deleteFiles(filesToDelete); - filesToDelete.clear(); + List finalFilesToDelete = filesToDelete; + deleteFutures.add(icebergFileDeleteExecutor.submit(() -> deleteFiles(finalFilesToDelete, schemaTableName, fileSystem))); + filesToDelete = new ArrayList<>(DELETE_BATCH_SIZE); } } else { @@ -1536,23 +2399,66 @@ private void scanAndDeleteInvalidFiles(Table table, ConnectorSession session, Sc log.debug("Deleting files while removing orphan files for table %s %s", schemaTableName, filesToDelete); fileSystem.deleteFiles(filesToDelete); } + + deleteFutures.forEach(MoreFutures::getFutureValue); + // All futures completed normally + deleteFutures.clear(); + } + catch (IOException | UncheckedIOException e) { + throw new TrinoException(ICEBERG_FILESYSTEM_ERROR, "Failed removing orphan files for table: " + schemaTableName, e); + } + finally { + // Ensure any futures still running are canceled in case of failure + deleteFutures.forEach(future -> future.cancel(true)); } - catch (IOException e) { - throw new TrinoException(ICEBERG_FILESYSTEM_ERROR, "Failed accessing data for table: " + schemaTableName, e); + } + + private void deleteFiles(List files, SchemaTableName schemaTableName, TrinoFileSystem fileSystem) + { + log.debug("Deleting files while removing orphan files for table %s [%s]", 
schemaTableName, files); + try { + fileSystem.deleteFiles(files); } + catch (IOException | UncheckedIOException e) { + throw new TrinoException(ICEBERG_FILESYSTEM_ERROR, "Failed removing orphan files for table: " + schemaTableName, e); + } + } + + @Override + public FunctionDependencyDeclaration getFunctionDependencies(ConnectorSession session, FunctionId functionId, BoundSignature boundSignature) + { + return FunctionDependencyDeclaration.NO_DEPENDENCIES; } @Override public Optional getInfo(ConnectorTableHandle tableHandle) { IcebergTableHandle icebergTableHandle = (IcebergTableHandle) tableHandle; - Optional partitioned = icebergTableHandle.getPartitionSpecJson() - .map(partitionSpecJson -> PartitionSpecParser.fromJson(SchemaParser.fromJson(icebergTableHandle.getTableSchemaJson()), partitionSpecJson).isPartitioned()); + List partitionFields = icebergTableHandle.getPartitionSpecJson() + .map(partitionSpecJson -> PartitionSpecParser.fromJson(SchemaParser.fromJson(icebergTableHandle.getTableSchemaJson()), partitionSpecJson) + .fields().stream() + .map(field -> field.name() + ": " + field.transform()) + .collect(toImmutableList())) + .orElse(ImmutableList.of()); + + Map summary = ImmutableMap.of(); + if (icebergTableHandle.getSnapshotId().isPresent()) { + //Table table = catalog.loadTable(session, icebergTableHandle.getSchemaTableName()); + //summary = table.snapshot(icebergTableHandle.getSnapshotId().get()).summary(); + } + Optional totalRecords = Optional.ofNullable(summary.get(TOTAL_RECORDS_PROP)); + Optional deletedRecords = Optional.ofNullable(summary.get(DELETED_RECORDS_PROP)); + Optional totalDataFiles = Optional.ofNullable(summary.get(TOTAL_DATA_FILES_PROP)); + Optional totalDeleteFiles = Optional.ofNullable(summary.get(TOTAL_DELETE_FILES_PROP)); return Optional.of(new IcebergInputInfo( icebergTableHandle.getSnapshotId(), - partitioned, - getFileFormat(icebergTableHandle.getStorageProperties()).name())); + partitionFields, + getFileFormat(icebergTableHandle.getStorageProperties()).name(), + totalRecords, + deletedRecords, + totalDataFiles, + totalDeleteFiles)); } @Override @@ -1587,10 +2493,51 @@ public void setTableProperties(ConnectorSession session, ConnectorTableHandle ta beginTransaction(icebergTable); UpdateProperties updateProperties = transaction.updateProperties(); + if (properties.containsKey(EXTRA_PROPERTIES_PROPERTY)) { + //noinspection unchecked + Map extraProperties = (Map) properties.get(EXTRA_PROPERTIES_PROPERTY) + .orElseThrow(() -> new IllegalArgumentException("The extra_properties property cannot be empty")); + verifyExtraProperties(properties.keySet(), extraProperties, allowedExtraProperties); + extraProperties.forEach(updateProperties::set); + } + + if (properties.containsKey(PARQUET_BLOOM_FILTER_COLUMNS_PROPERTY)) { + checkFormatForProperty(getFileFormat(icebergTable).toIceberg(), FileFormat.PARQUET, PARQUET_BLOOM_FILTER_COLUMNS_PROPERTY); + //noinspection unchecked + List parquetBloomFilterColumns = (List) properties.get(PARQUET_BLOOM_FILTER_COLUMNS_PROPERTY) + .orElseThrow(() -> new IllegalArgumentException("The parquet_bloom_filter_columns property cannot be empty")); + validateParquetBloomFilterColumns(getColumnMetadatas(SchemaParser.fromJson(table.getTableSchemaJson()), typeManager), parquetBloomFilterColumns); + + Set existingParquetBloomFilterColumns = icebergTable.properties().keySet().stream() + .filter(key -> key.startsWith(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX)) + .map(key -> key.substring(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX.length())) + 
.collect(toImmutableSet()); + Set removeParquetBloomFilterColumns = Sets.difference(existingParquetBloomFilterColumns, Set.copyOf(parquetBloomFilterColumns)); + removeParquetBloomFilterColumns.forEach(column -> updateProperties.remove(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + column)); + parquetBloomFilterColumns.forEach(column -> updateProperties.set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + column, "true")); + } + + if (properties.containsKey(ORC_BLOOM_FILTER_COLUMNS_PROPERTY)) { + checkFormatForProperty(getFileFormat(icebergTable).toIceberg(), FileFormat.ORC, ORC_BLOOM_FILTER_COLUMNS_PROPERTY); + //noinspection unchecked + List orcBloomFilterColumns = (List) properties.get(ORC_BLOOM_FILTER_COLUMNS_PROPERTY) + .orElseThrow(() -> new IllegalArgumentException("The orc_bloom_filter_columns property cannot be empty")); + if (orcBloomFilterColumns.isEmpty()) { + updateProperties.remove(ORC_BLOOM_FILTER_COLUMNS); + } + else { + validateOrcBloomFilterColumns(getColumnMetadatas(SchemaParser.fromJson(table.getTableSchemaJson()), typeManager), orcBloomFilterColumns); + updateProperties.set(ORC_BLOOM_FILTER_COLUMNS, Joiner.on(",").join(orcBloomFilterColumns)); + } + } + + IcebergFileFormat oldFileFormat = getFileFormat(icebergTable.properties()); + IcebergFileFormat newFileFormat = oldFileFormat; + if (properties.containsKey(FILE_FORMAT_PROPERTY)) { - IcebergFileFormat fileFormat = (IcebergFileFormat) properties.get(FILE_FORMAT_PROPERTY) + newFileFormat = (IcebergFileFormat) properties.get(FILE_FORMAT_PROPERTY) .orElseThrow(() -> new IllegalArgumentException("The format property cannot be empty")); - updateProperties.defaultFormat(fileFormat.toIceberg()); + updateProperties.defaultFormat(newFileFormat.toIceberg()); } if (properties.containsKey(FORMAT_VERSION_PROPERTY)) { @@ -1600,6 +2547,38 @@ public void setTableProperties(ConnectorSession session, ConnectorTableHandle ta updateProperties.set(FORMAT_VERSION, Integer.toString(formatVersion)); } + Map propertiesForCompression = calculateTableCompressionProperties(oldFileFormat, newFileFormat, icebergTable.properties(), properties.entrySet().stream() + .filter(e -> e.getValue().isPresent()) + .collect(toImmutableMap( + Map.Entry::getKey, + e -> e.getValue().get()))); + + propertiesForCompression.forEach(updateProperties::set); + + if (properties.containsKey(MAX_COMMIT_RETRY)) { + int maxCommitRetry = (int) properties.get(MAX_COMMIT_RETRY) + .orElseThrow(() -> new IllegalArgumentException("The max_commit_retry property cannot be empty")); + updateProperties.set(COMMIT_NUM_RETRIES, Integer.toString(maxCommitRetry)); + } + + if (properties.containsKey(OBJECT_STORE_LAYOUT_ENABLED_PROPERTY)) { + boolean objectStoreEnabled = (boolean) properties.get(OBJECT_STORE_LAYOUT_ENABLED_PROPERTY) + .orElseThrow(() -> new IllegalArgumentException("The object_store_enabled property cannot be empty")); + updateProperties.set(OBJECT_STORE_ENABLED, Boolean.toString(objectStoreEnabled)); + } + + if (properties.containsKey(DATA_LOCATION_PROPERTY)) { + String dataLocation = (String) properties.get(DATA_LOCATION_PROPERTY) + .orElseThrow(() -> new IllegalArgumentException("The data_location property cannot be empty")); + boolean objectStoreEnabled = (boolean) properties.getOrDefault( + OBJECT_STORE_LAYOUT_ENABLED_PROPERTY, + Optional.of(Boolean.parseBoolean(icebergTable.properties().get(OBJECT_STORE_ENABLED)))).orElseThrow(); + if (!objectStoreEnabled) { + throw new TrinoException(INVALID_TABLE_PROPERTY, "Data location can only be set when object store layout is 
enabled"); + } + updateProperties.set(WRITE_DATA_LOCATION, dataLocation); + } + try { updateProperties.commit(); } @@ -1628,12 +2607,23 @@ public void setTableProperties(ConnectorSession session, ConnectorTableHandle ta } } - try { - transaction.commitTransaction(); - } - catch (RuntimeException e) { - throw new TrinoException(ICEBERG_COMMIT_ERROR, "Failed to commit new table properties", e); - } + commitTransaction(transaction, "set table properties"); + } + + public static Map calculateTableCompressionProperties(IcebergFileFormat oldFileFormat, IcebergFileFormat newFileFormat, Map existingProperties, Map inputProperties) + { + ImmutableMap.Builder newCompressionProperties = ImmutableMap.builder(); + + Optional oldCompressionCodec = getHiveCompressionCodec(oldFileFormat, existingProperties); + Optional newCompressionCodec = IcebergTableProperties.getCompressionCodec(inputProperties); + + Optional compressionCodec = newCompressionCodec.or(() -> oldCompressionCodec); + + validateCompression(newFileFormat, compressionCodec); + + compressionCodec.ifPresent(hiveCompressionCodec -> newCompressionProperties.put(getCompressionPropertyName(newFileFormat), hiveCompressionCodec.name())); + + return newCompressionProperties.buildOrThrow(); } private static void updatePartitioning(Table icebergTable, Transaction transaction, List partitionColumns) @@ -1648,14 +2638,12 @@ private static void updatePartitioning(Table icebergTable, Transaction transacti } else { PartitionSpec partitionSpec = parsePartitionFields(schema, partitionColumns); - validateNotPartitionedByNestedField(schema, partitionSpec); Set partitionFields = ImmutableSet.copyOf(partitionSpec.fields()); difference(existingPartitionFields, partitionFields).stream() .map(PartitionField::name) .forEach(updatePartitionSpec::removeField); - difference(partitionFields, existingPartitionFields).stream() - .map(partitionField -> toIcebergTerm(schema, partitionField)) - .forEach(updatePartitionSpec::addField); + difference(partitionFields, existingPartitionFields) + .forEach(partitionField -> updatePartitionSpec.addField(partitionField.name(), toIcebergTerm(schema, partitionField))); } try { @@ -1686,9 +2674,17 @@ public void addColumn(ConnectorSession session, ConnectorTableHandle tableHandle // added - instead of relying on addColumn in iceberg library to assign Ids AtomicInteger nextFieldId = new AtomicInteger(icebergTable.schema().highestFieldId() + 2); try { - icebergTable.updateSchema() - .addColumn(column.getName(), toIcebergTypeForNewColumn(column.getType(), nextFieldId), column.getComment()) - .commit(); + UpdateSchema updateSchema = icebergTable.updateSchema(); + updateSchema.addColumn(null, column.getName(), toIcebergTypeForNewColumn(column.getType(), nextFieldId), column.getComment()); + /* + if (position == ColumnPosition.First) { + updateSchema.moveFirst(column.getName()); + } + else if (position == ColumnPosition.After) { + updateSchema.moveAfter(column.getName(), position.columnName()); + } + */ + updateSchema.commit(); } catch (RuntimeException e) { throw new TrinoException(ICEBERG_COMMIT_ERROR, "Failed to add column: " + firstNonNull(e.getMessage(), e), e); @@ -1705,7 +2701,18 @@ public void addField(ConnectorSession session, ConnectorTableHandle tableHandle, NestedField parent = icebergTable.schema().caseInsensitiveFindField(parentName); String caseSensitiveParentName = icebergTable.schema().findColumnName(parent.fieldId()); - NestedField field = parent.type().asStructType().caseInsensitiveField(fieldName); + + Types.StructType 
structType; + if (parent.type().isListType()) { + // list(struct...) + structType = parent.type().asListType().elementType().asStructType(); + } + else { + // just struct + structType = parent.type().asStructType(); + } + + NestedField field = structType.caseInsensitiveField(fieldName); if (field != null) { if (ignoreExisting) { return; @@ -1887,7 +2894,19 @@ public void setFieldType(ConnectorSession session, ConnectorTableHandle tableHan NestedField parent = icebergTable.schema().caseInsensitiveFindField(parentPath); String caseSensitiveParentName = icebergTable.schema().findColumnName(parent.fieldId()); - NestedField field = parent.type().asStructType().caseInsensitiveField(getLast(fieldPath)); + + Types.StructType structType; + if (parent.type().isListType()) { + // list(struct...) + structType = parent.type().asListType().elementType().asStructType(); + caseSensitiveParentName += ".element"; + } + else { + // just struct + structType = parent.type().asStructType(); + } + NestedField field = structType.caseInsensitiveField(getLast(fieldPath)); + // TODO: Add support for changing non-primitive field type if (!field.type().isPrimitiveType()) { throw new TrinoException(NOT_SUPPORTED, "Iceberg doesn't support changing field type from non-primitive types"); @@ -1909,6 +2928,25 @@ public void setFieldType(ConnectorSession session, ConnectorTableHandle tableHan } } + //@Override + public void dropNotNullConstraint(ConnectorSession session, ConnectorTableHandle tableHandle, ColumnHandle columnHandle) + { + IcebergTableHandle table = (IcebergTableHandle) tableHandle; + IcebergColumnHandle column = (IcebergColumnHandle) columnHandle; + + Table icebergTable = catalog.loadTable(session, table.getSchemaTableName()); + verify(column.isBaseColumn(), "Cannot drop a not null constraint on nested fields"); + + try { + icebergTable.updateSchema() + .makeColumnOptional(column.getName()) + .commit(); + } + catch (RuntimeException e) { + throw new TrinoException(ICEBERG_COMMIT_ERROR, "Failed to drop a not null constraint: " + firstNonNull(e.getMessage(), e), e); + } + } + @Override public TableStatisticsMetadata getStatisticsCollectionMetadataForWrite(ConnectorSession session, ConnectorTableMetadata tableMetadata) { @@ -1918,17 +2956,34 @@ public TableStatisticsMetadata getStatisticsCollectionMetadataForWrite(Connector ConnectorTableHandle tableHandle = getTableHandle(session, tableMetadata.getTable(), Optional.empty(), Optional.empty()); if (tableHandle == null) { - // Assume new table (CTAS), collect all stats possible + // Assume new table (CTAS), collect NDV stats on all columns + return getStatisticsCollectionMetadata(tableMetadata, Optional.empty(), availableColumnNames -> {}); + } + IcebergTableHandle table = checkValidTableHandle(tableHandle); + if (table.getSnapshotId().isEmpty()) { + // Table has no data (empty, or wiped out). Collect NDV stats on all columns return getStatisticsCollectionMetadata(tableMetadata, Optional.empty(), availableColumnNames -> {}); } - TableStatistics tableStatistics = getTableStatistics(session, checkValidTableHandle(tableHandle)); - if (tableStatistics.getRowCount().getValue() == 0.0) { - // Table has no data (empty, or wiped out). 
Collect all stats possible + + Table icebergTable = catalog.loadTable(session, table.getSchemaTableName()); + long snapshotId = table.getSnapshotId().orElseThrow(); + Snapshot snapshot = icebergTable.snapshot(snapshotId); + String totalRecords = snapshot.summary().get(TOTAL_RECORDS_PROP); + if (totalRecords != null && Long.parseLong(totalRecords) == 0) { + // Table has no data (empty, or wiped out). Collect NDV stats on all columns return getStatisticsCollectionMetadata(tableMetadata, Optional.empty(), availableColumnNames -> {}); } - Set columnsWithExtendedStatistics = tableStatistics.getColumnStatistics().entrySet().stream() - .filter(entry -> !entry.getValue().getDistinctValuesCount().isUnknown()) - .map(entry -> ((IcebergColumnHandle) entry.getKey()).getName()) + + Schema schema = SchemaParser.fromJson(table.getTableSchemaJson()); + List columns = getTopLevelColumns(schema, typeManager); + Set columnIds = columns.stream() + .map(IcebergColumnHandle::getId) + .collect(toImmutableSet()); + Map ndvs = readNdvs(icebergTable, snapshotId, columnIds, true); + // Avoid collecting NDV stats on columns where we don't know the existing NDV count + Set columnsWithExtendedStatistics = columns.stream() + .filter(column -> ndvs.containsKey(column.getId())) + .map(IcebergColumnHandle::getName) .collect(toImmutableSet()); return getStatisticsCollectionMetadata(tableMetadata, Optional.of(columnsWithExtendedStatistics), availableColumnNames -> {}); } @@ -1961,7 +3016,7 @@ public ConnectorAnalyzeMetadata getStatisticsCollectionMetadata(ConnectorSession }); return new ConnectorAnalyzeMetadata( - tableHandle, + handle.forAnalyze(), getStatisticsCollectionMetadata( tableMetadata, analyzeColumnNames, @@ -2005,6 +3060,9 @@ public ConnectorTableHandle beginStatisticsCollection(ConnectorSession session, { IcebergTableHandle handle = (IcebergTableHandle) tableHandle; Table icebergTable = catalog.loadTable(session, handle.getSchemaTableName()); + if (isS3Tables(icebergTable.location())) { + throw new TrinoException(NOT_SUPPORTED, "S3 Tables does not support analyze"); + } beginTransaction(icebergTable); return handle; } @@ -2017,45 +3075,23 @@ public void finishStatisticsCollection(ConnectorSession session, ConnectorTableH if (handle.getSnapshotId().isEmpty()) { // No snapshot, table is empty verify( - computedStatistics.isEmpty(), - "Unexpected computed statistics that cannot be attached to a snapshot because none exists: %s", + computedStatistics.size() == 1, + "The computedStatistics size must be 1: %s", + computedStatistics); + ComputedStatistics statistics = getOnlyElement(computedStatistics); + verify(statistics.getGroupingColumns().isEmpty() && + statistics.getGroupingValues().isEmpty() && + statistics.getColumnStatistics().isEmpty() && + statistics.getTableStatistics().isEmpty(), + "Unexpected non-empty statistics that cannot be attached to a snapshot because none exists: %s", computedStatistics); - // TODO (https://github.com/trinodb/trino/issues/15397): remove support for Trino-specific statistics properties - // Drop all stats. 
Empty table needs none - UpdateProperties updateProperties = transaction.updateProperties(); - table.properties().keySet().stream() - .filter(key -> key.startsWith(TRINO_STATS_PREFIX)) - .forEach(updateProperties::remove); - updateProperties.commit(); - - transaction.commitTransaction(); + commitTransaction(transaction, "statistics collection"); transaction = null; return; } long snapshotId = handle.getSnapshotId().orElseThrow(); - Set columnIds = table.schema().columns().stream() - .map(Types.NestedField::fieldId) - .collect(toImmutableSet()); - - // TODO (https://github.com/trinodb/trino/issues/15397): remove support for Trino-specific statistics properties - // Drop stats for obsolete columns - UpdateProperties updateProperties = transaction.updateProperties(); - table.properties().keySet().stream() - .filter(key -> { - if (!key.startsWith(TRINO_STATS_PREFIX)) { - return false; - } - Matcher matcher = TRINO_STATS_COLUMN_ID_PATTERN.matcher(key); - if (!matcher.matches()) { - return false; - } - return !columnIds.contains(Integer.parseInt(matcher.group("columnId"))); - }) - .forEach(updateProperties::remove); - updateProperties.commit(); - CollectedStatistics collectedStatistics = processComputedTableStatistics(table, computedStatistics); StatisticsFile statisticsFile = tableStatisticsWriter.writeStatisticsFile( session, @@ -2064,10 +3100,10 @@ public void finishStatisticsCollection(ConnectorSession session, ConnectorTableH REPLACE, collectedStatistics); transaction.updateStatistics() - .setStatistics(snapshotId, statisticsFile) + .setStatistics(statisticsFile) .commit(); - transaction.commitTransaction(); + commitTransaction(transaction, "statistics collection"); transaction = null; } @@ -2105,7 +3141,10 @@ public ColumnHandle getMergeRowIdColumnHandle(ConnectorSession session, Connecto @Override public Optional getUpdateLayout(ConnectorSession session, ConnectorTableHandle tableHandle) { - return Optional.of(IcebergUpdateHandle.INSTANCE); + return getInsertLayout(session, tableHandle) + .flatMap(ConnectorTableLayout::getPartitioning) + .map(IcebergPartitioningHandle.class::cast) + .map(IcebergPartitioningHandle::forUpdate); } @Override @@ -2116,7 +3155,6 @@ public ConnectorMergeTableHandle beginMerge(ConnectorSession session, ConnectorT Table icebergTable = catalog.loadTable(session, table.getSchemaTableName()); validateNotModifyingOldSnapshot(table, icebergTable); - validateNotPartitionedByNestedField(icebergTable.schema(), icebergTable.spec()); beginTransaction(icebergTable); @@ -2129,7 +3167,7 @@ public void finishMerge(ConnectorSession session, ConnectorMergeTableHandle merg { IcebergMergeTableHandle mergeHandle = (IcebergMergeTableHandle) mergeTableHandle; IcebergTableHandle handle = mergeHandle.getTableHandle(); - RetryMode retryMode = mergeHandle.getInsertTableHandle().getRetryMode(); + RetryMode retryMode = mergeHandle.getInsertTableHandle().retryMode(); finishWrite(session, handle, fragments, retryMode); } @@ -2147,22 +3185,13 @@ private static void validateNotModifyingOldSnapshot(IcebergTableHandle table, Ta } } - public static void validateNotPartitionedByNestedField(Schema schema, PartitionSpec partitionSpec) - { - Map indexParents = indexParents(schema.asStruct()); - for (PartitionField field : partitionSpec.fields()) { - if (indexParents.containsKey(field.sourceId())) { - throw new TrinoException(NOT_SUPPORTED, "Partitioning by nested field is unsupported: " + field.name()); - } - } - } - private void finishWrite(ConnectorSession session, IcebergTableHandle table, 
Collection fragments, RetryMode retryMode) { Table icebergTable = transaction.table(); List commitTasks = fragments.stream() - .map(slice -> commitTaskCodec.fromJson(slice.getBytes())) + .map(Slice::getBytes) + .map(commitTaskCodec::fromJson) .collect(toImmutableList()); if (commitTasks.isEmpty()) { @@ -2176,8 +3205,15 @@ private void finishWrite(ConnectorSession session, IcebergTableHandle table, Col RowDelta rowDelta = transaction.newRowDelta(); table.getSnapshotId().map(icebergTable::snapshot).ifPresent(s -> rowDelta.validateFromSnapshot(s.snapshotId())); TupleDomain dataColumnPredicate = table.getEnforcedPredicate().filter((column, domain) -> !isMetadataColumnId(column.getId())); - if (!dataColumnPredicate.isAll()) { - rowDelta.conflictDetectionFilter(toIcebergExpression(dataColumnPredicate)); + TupleDomain effectivePredicate = dataColumnPredicate.intersect(table.getUnenforcedPredicate()); + if (isFileBasedConflictDetectionEnabled(session)) { + effectivePredicate = effectivePredicate.intersect(extractTupleDomainsFromCommitTasks(table, icebergTable, commitTasks, typeManager)); + } + + effectivePredicate = effectivePredicate.filter((ignore, domain) -> isConvertibleToIcebergExpression(domain)); + + if (!effectivePredicate.isAll()) { + rowDelta.conflictDetectionFilter(toIcebergExpression(effectivePredicate)); } IsolationLevel isolationLevel = IsolationLevel.fromName(icebergTable.properties().getOrDefault(DELETE_ISOLATION_LEVEL, DELETE_ISOLATION_LEVEL_DEFAULT)); if (isolationLevel == IsolationLevel.SERIALIZABLE) { @@ -2187,46 +3223,48 @@ private void finishWrite(ConnectorSession session, IcebergTableHandle table, Col // Ensure a row that is updated by this commit was not deleted by a separate commit rowDelta.validateDeletedFiles(); rowDelta.validateNoConflictingDeleteFiles(); + rowDelta.scanManifestsWith(icebergScanExecutor); ImmutableSet.Builder writtenFiles = ImmutableSet.builder(); ImmutableSet.Builder referencedDataFiles = ImmutableSet.builder(); for (CommitTaskData task : commitTasks) { - PartitionSpec partitionSpec = PartitionSpecParser.fromJson(schema, task.getPartitionSpecJson()); + PartitionSpec partitionSpec = PartitionSpecParser.fromJson(schema, task.partitionSpecJson()); Type[] partitionColumnTypes = partitionSpec.fields().stream() .map(field -> field.transform().getResultType(schema.findType(field.sourceId()))) .toArray(Type[]::new); - switch (task.getContent()) { + switch (task.content()) { case POSITION_DELETES -> { FileMetadata.Builder deleteBuilder = FileMetadata.deleteFileBuilder(partitionSpec) - .withPath(task.getPath()) - .withFormat(task.getFileFormat().toIceberg()) + .withPath(task.path()) + .withFormat(task.fileFormat().toIceberg()) .ofPositionDeletes() - .withFileSizeInBytes(task.getFileSizeInBytes()) - .withMetrics(task.getMetrics().metrics()); + .withFileSizeInBytes(task.fileSizeInBytes()) + .withMetrics(task.metrics().metrics()); + task.fileSplitOffsets().ifPresent(deleteBuilder::withSplitOffsets); if (!partitionSpec.fields().isEmpty()) { - String partitionDataJson = task.getPartitionDataJson() + String partitionDataJson = task.partitionDataJson() .orElseThrow(() -> new VerifyException("No partition data for partitioned table")); deleteBuilder.withPartition(PartitionData.fromJson(partitionDataJson, partitionColumnTypes)); } rowDelta.addDeletes(deleteBuilder.build()); - writtenFiles.add(task.getPath()); - task.getReferencedDataFile().ifPresent(referencedDataFiles::add); + writtenFiles.add(task.path()); + 
task.referencedDataFile().ifPresent(referencedDataFiles::add); } case DATA -> { DataFiles.Builder builder = DataFiles.builder(partitionSpec) - .withPath(task.getPath()) - .withFormat(task.getFileFormat().toIceberg()) - .withFileSizeInBytes(task.getFileSizeInBytes()) - .withMetrics(task.getMetrics().metrics()); + .withPath(task.path()) + .withFormat(task.fileFormat().toIceberg()) + .withFileSizeInBytes(task.fileSizeInBytes()) + .withMetrics(task.metrics().metrics()); if (!icebergTable.spec().fields().isEmpty()) { - String partitionDataJson = task.getPartitionDataJson() + String partitionDataJson = task.partitionDataJson() .orElseThrow(() -> new VerifyException("No partition data for partitioned table")); builder.withPartition(PartitionData.fromJson(partitionDataJson, partitionColumnTypes)); } rowDelta.addRows(builder.build()); - writtenFiles.add(task.getPath()); + writtenFiles.add(task.path()); } - default -> throw new UnsupportedOperationException("Unsupported task content: " + task.getContent()); + default -> throw new UnsupportedOperationException("Unsupported task content: " + task.content()); } } @@ -2236,13 +3274,41 @@ private void finishWrite(ConnectorSession session, IcebergTableHandle table, Col } rowDelta.validateDataFilesExist(referencedDataFiles.build()); - try { - commit(rowDelta, session); - transaction.commitTransaction(); - } - catch (ValidationException e) { - throw new TrinoException(ICEBERG_COMMIT_ERROR, "Failed to commit Iceberg update to table: " + table.getSchemaTableName(), e); + commitUpdateAndTransaction(rowDelta, session, transaction, "write"); + } + + static TupleDomain extractTupleDomainsFromCommitTasks(IcebergTableHandle table, Table icebergTable, List commitTasks, TypeManager typeManager) + { + Set partitionColumns = new HashSet<>(getProjectedColumns(icebergTable.schema(), typeManager, identityPartitionColumnsInAllSpecs(icebergTable))); + PartitionSpec partitionSpec = icebergTable.spec(); + Type[] partitionColumnTypes = partitionSpec.fields().stream() + .map(field -> field.transform().getResultType(icebergTable.schema().findType(field.sourceId()))) + .toArray(Type[]::new); + Schema schema = SchemaParser.fromJson(table.getTableSchemaJson()); + Map> domainsFromTasks = new HashMap<>(); + for (CommitTaskData commitTask : commitTasks) { + PartitionSpec taskPartitionSpec = PartitionSpecParser.fromJson(schema, commitTask.partitionSpecJson()); + if (commitTask.partitionDataJson().isEmpty() || taskPartitionSpec.isUnpartitioned() || !taskPartitionSpec.equals(partitionSpec)) { + // We should not produce any specific domains if there are no partitions or current partitions does not match task partitions for any of tasks + // As each partition value narrows down conflict scope we should produce values from all commit tasks or not at all, to avoid partial information + return TupleDomain.all(); + } + + PartitionData partitionData = PartitionData.fromJson(commitTask.partitionDataJson().get(), partitionColumnTypes); + Map> partitionKeys = getPartitionKeys(partitionData, partitionSpec); + Map partitionValues = getPartitionValues(partitionColumns, partitionKeys); + + for (Map.Entry entry : partitionValues.entrySet()) { + IcebergColumnHandle columnHandle = (IcebergColumnHandle) entry.getKey(); + NullableValue value = entry.getValue(); + Domain newDomain = value.isNull() ? 
Domain.onlyNull(columnHandle.getType()) : Domain.singleValue(columnHandle.getType(), value.getValue()); + domainsFromTasks.computeIfAbsent(columnHandle, ignore -> new ArrayList<>()).add(newDomain); + } } + return withColumnDomains(domainsFromTasks.entrySet().stream() + .collect(toImmutableMap( + Map.Entry::getKey, + entry -> Domain.union(entry.getValue())))); } @Override @@ -2281,6 +3347,20 @@ public Map getViews(ConnectorSession s return catalog.getViews(session, schemaName); } + //@Override + public boolean isView(ConnectorSession session, SchemaTableName viewName) + { + try { + return catalog.getView(session, viewName).isPresent(); + } + catch (TrinoException e) { + if (e.getErrorCode() == ICEBERG_UNSUPPORTED_VIEW_DIALECT.toErrorCode()) { + return true; + } + throw e; + } + } + @Override public Optional getView(ConnectorSession session, SchemaTableName viewName) { @@ -2296,7 +3376,7 @@ public OptionalLong executeDelete(ConnectorSession session, ConnectorTableHandle DeleteFiles deleteFiles = icebergTable.newDelete() .deleteFromRowFilter(toIcebergExpression(handle.getEnforcedPredicate())); - commit(deleteFiles, session); + commitUpdate(deleteFiles, session, "delete"); Map summary = icebergTable.currentSnapshot().summary(); String deletedRowsStr = summary.get(DELETED_RECORDS_PROP); @@ -2310,6 +3390,16 @@ public OptionalLong executeDelete(ConnectorSession session, ConnectorTableHandle return OptionalLong.of(deletedRecords - removedPositionDeletes - removedEqualityDeletes); } + @Override + public void truncateTable(ConnectorSession session, ConnectorTableHandle tableHandle) + { + IcebergTableHandle table = checkValidTableHandle(tableHandle); + Table icebergTable = catalog.loadTable(session, table.getSchemaTableName()); + DeleteFiles deleteFiles = icebergTable.newDelete() + .deleteFromRowFilter(alwaysTrue()); + commitUpdate(deleteFiles, session, "truncate"); + } + public void rollback() { // TODO: cleanup open transaction @@ -2328,7 +3418,6 @@ public Optional> applyLimit(Connect } table = new IcebergTableHandle( - table.getCatalog(), table.getSchemaName(), table.getTableName(), table.getTableType(), @@ -2343,8 +3432,11 @@ public Optional> applyLimit(Connect table.getNameMappingJson(), table.getTableLocation(), table.getStorageProperties(), + table.getTablePartitioning(), table.isRecordScannedFiles(), - table.getMaxScannedFileSize()); + table.getMaxScannedFileSize(), + table.getConstraintColumns(), + table.getForAnalyze()); return Optional.of(new LimitApplicationResult<>(table, false, false)); } @@ -2354,8 +3446,9 @@ public Optional> applyFilter(C { IcebergTableHandle table = (IcebergTableHandle) handle; ConstraintExtractor.ExtractionResult extractionResult = extractTupleDomain(constraint); - TupleDomain predicate = extractionResult.tupleDomain(); - if (predicate.isAll()) { + TupleDomain predicate = extractionResult.tupleDomain() + .transformKeys(IcebergColumnHandle.class::cast); + if (predicate.isAll() && constraint.getPredicateColumns().isEmpty()) { return Optional.empty(); } if (table.getLimit().isPresent()) { @@ -2377,7 +3470,7 @@ public Optional> applyFilter(C Table icebergTable = catalog.loadTable(session, table.getSchemaTableName()); Set partitionSpecIds = table.getSnapshotId().map( - snapshot -> icebergTable.snapshot(snapshot).allManifests(icebergTable.io()).stream() + snapshot -> loadAllManifestsFromSnapshot(icebergTable, icebergTable.snapshot(snapshot)).stream() .map(ManifestFile::partitionSpecId) .collect(toImmutableSet())) // No snapshot, so no data. This case doesn't matter. 
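The extractTupleDomainsFromCommitTasks helper added above narrows the RowDelta conflict-detection filter to the partitions actually touched by the commit tasks: each task contributes a single-value (or only-null) domain per identity partition column, the per-column domains are unioned, and the result degrades to TupleDomain.all() as soon as any task lacks partition data or was written with a different partition spec. The following is a minimal sketch of that union step using Trino's predicate types; the String column key and the bigint partition values are made-up stand-ins for the IcebergColumnHandle keys the connector actually uses.

    import com.google.common.collect.ImmutableMap;
    import io.trino.spi.predicate.Domain;
    import io.trino.spi.predicate.TupleDomain;

    import java.util.List;

    import static io.trino.spi.type.BigintType.BIGINT;

    // Sketch only: two commit tasks wrote to partitions ds_id = 10 and ds_id = 20,
    // so the resulting conflict-detection filter covers exactly those two values.
    final class ConflictFilterSketch
    {
        static TupleDomain<String> partitionScopedFilter()
        {
            Domain fromTask1 = Domain.singleValue(BIGINT, 10L);
            Domain fromTask2 = Domain.singleValue(BIGINT, 20L);
            // Union the per-task domains for the same partition column, as the connector does per column handle
            Domain merged = Domain.union(List.of(fromTask1, fromTask2));
            return TupleDomain.withColumnDomains(ImmutableMap.of("ds_id", merged));
        }

        private ConflictFilterSketch() {}
    }

Intersecting this with the enforced and unenforced predicates, as finishWrite now does when file-based conflict detection is enabled, should keep concurrent writes to untouched partitions from tripping the conflict-detection filter.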
@@ -2388,17 +3481,14 @@ public Optional> applyFilter(C Map newUnenforced = new LinkedHashMap<>(); Map domains = predicate.getDomains().orElseThrow(() -> new VerifyException("No domains")); domains.forEach((columnHandle, domain) -> { - // structural types cannot be used to filter a table scan in Iceberg library. - if (isStructuralType(columnHandle.getType()) || - // Iceberg orders UUID values differently than Trino (perhaps due to https://bugs.openjdk.org/browse/JDK-7025832), so allow only IS NULL / IS NOT NULL checks - (columnHandle.getType() == UUID && !(domain.isOnlyNull() || domain.getValues().isAll()))) { + if (!isConvertibleToIcebergExpression(domain)) { unsupported.put(columnHandle, domain); } else if (canEnforceColumnConstraintInSpecs(typeManager.getTypeOperators(), icebergTable, partitionSpecIds, columnHandle, domain)) { newEnforced.put(columnHandle, domain); } else if (isMetadataColumnId(columnHandle.getId())) { - if (columnHandle.isPathColumn() || columnHandle.isFileModifiedTimeColumn()) { + if (columnHandle.isPartitionColumn() || columnHandle.isPathColumn() || columnHandle.isFileModifiedTimeColumn()) { newEnforced.put(columnHandle, domain); } else { @@ -2415,14 +3505,20 @@ else if (isMetadataColumnId(columnHandle.getId())) { remainingConstraint = TupleDomain.withColumnDomains(newUnenforced).intersect(TupleDomain.withColumnDomains(unsupported)); } + Set newConstraintColumns = Streams.concat( + table.getConstraintColumns().stream(), + constraint.getPredicateColumns().orElseGet(ImmutableSet::of).stream() + .map(columnHandle -> (IcebergColumnHandle) columnHandle)) + .collect(toImmutableSet()); + if (newEnforcedConstraint.equals(table.getEnforcedPredicate()) - && newUnenforcedConstraint.equals(table.getUnenforcedPredicate())) { + && newUnenforcedConstraint.equals(table.getUnenforcedPredicate()) + && newConstraintColumns.equals(table.getConstraintColumns())) { return Optional.empty(); } return Optional.of(new ConstraintApplicationResult<>( new IcebergTableHandle( - table.getCatalog(), table.getSchemaName(), table.getTableName(), table.getTableType(), @@ -2437,13 +3533,41 @@ else if (isMetadataColumnId(columnHandle.getId())) { table.getNameMappingJson(), table.getTableLocation(), table.getStorageProperties(), + table.getTablePartitioning(), table.isRecordScannedFiles(), - table.getMaxScannedFileSize()), + table.getMaxScannedFileSize(), + newConstraintColumns, + table.getForAnalyze()), remainingConstraint.transformKeys(ColumnHandle.class::cast), extractionResult.remainingExpression(), false)); } + private static List loadAllManifestsFromSnapshot(Table icebergTable, Snapshot snapshot) + { + try { + return snapshot.allManifests(icebergTable.io()); + } + catch (NotFoundException | UncheckedIOException e) { + throw new TrinoException(ICEBERG_INVALID_METADATA, "Error accessing manifest file for table %s".formatted(icebergTable.name()), e); + } + } + + /** + * Use instead of loadAllManifestsFromSnapshot when loading manifests from multiple distinct snapshots + * Each BaseSnapshot object caches manifest files separately, so loading manifests from multiple distinct snapshots + * results in O(num_snapshots^2) copies of the same manifest file metadata in memory + */ + private static List loadAllManifestsFromManifestList(Table icebergTable, String manifestListLocation) + { + try { + return IcebergManifestUtils.read(icebergTable.io(), manifestListLocation); + } + catch (NotFoundException | UncheckedIOException e) { + throw new TrinoException(ICEBERG_INVALID_METADATA, "Error accessing manifest file 
for table %s".formatted(icebergTable.name()), e); + } + } + private static Set identityPartitionColumnsInAllSpecs(Table table) { // Extract identity partition column source ids common to ALL specs @@ -2547,12 +3671,10 @@ private static IcebergColumnHandle createProjectedColumnHandle(IcebergColumnHand fullPath.add(projectedColumnIdentity.getId()); } - return new IcebergColumnHandle( - column.getBaseColumnIdentity(), - column.getBaseType(), - fullPath.build(), - projectedColumnType, - Optional.empty()); + return IcebergColumnHandle.optional(column.getBaseColumnIdentity()) + .fieldType(column.getBaseType(), projectedColumnType) + .path(fullPath.build()) + .build(); } @Override @@ -2568,29 +3690,42 @@ public TableStatistics getTableStatistics(ConnectorSession session, ConnectorTab checkArgument(!originalHandle.isRecordScannedFiles(), "Unexpected scanned files recording set"); checkArgument(originalHandle.getMaxScannedFileSize().isEmpty(), "Unexpected max scanned file size set"); - return tableStatisticsCache.computeIfAbsent( - new IcebergTableHandle( - originalHandle.getCatalog(), - originalHandle.getSchemaName(), - originalHandle.getTableName(), - originalHandle.getTableType(), - originalHandle.getSnapshotId(), - originalHandle.getTableSchemaJson(), - originalHandle.getPartitionSpecJson(), - originalHandle.getFormatVersion(), - originalHandle.getUnenforcedPredicate(), - originalHandle.getEnforcedPredicate(), - OptionalLong.empty(), // limit is currently not included in stats and is not enforced by the connector - ImmutableSet.of(), // projectedColumns don't affect stats - originalHandle.getNameMappingJson(), - originalHandle.getTableLocation(), - originalHandle.getStorageProperties(), - originalHandle.isRecordScannedFiles(), - originalHandle.getMaxScannedFileSize()), - handle -> { - Table icebergTable = catalog.loadTable(session, handle.getSchemaTableName()); - return TableStatisticsReader.getTableStatistics(typeManager, session, handle, icebergTable); - }); + IcebergTableHandle cacheKey = new IcebergTableHandle( + originalHandle.getSchemaName(), + originalHandle.getTableName(), + originalHandle.getTableType(), + originalHandle.getSnapshotId(), + originalHandle.getTableSchemaJson(), + originalHandle.getPartitionSpecJson(), + originalHandle.getFormatVersion(), + originalHandle.getUnenforcedPredicate(), + originalHandle.getEnforcedPredicate(), + OptionalLong.empty(), // limit is currently not included in stats and is not enforced by the connector + ImmutableSet.of(), // projectedColumns are used to request statistics only for the required columns, but are not part of cache key + originalHandle.getNameMappingJson(), + originalHandle.getTableLocation(), + originalHandle.getStorageProperties(), + Optional.empty(), // requiredTablePartitioning does not affect stats + false, // recordScannedFiles does not affect stats + originalHandle.getMaxScannedFileSize(), + ImmutableSet.of(), // constraintColumns do not affect stats + Optional.empty()); // forAnalyze does not affect stats + return getIncrementally( + tableStatisticsCache, + cacheKey, + currentStatistics -> currentStatistics.getColumnStatistics().keySet().containsAll(originalHandle.getProjectedColumns()), + projectedColumns -> { + Table icebergTable = catalog.loadTable(session, originalHandle.getSchemaTableName()); + return TableStatisticsReader.getTableStatistics( + typeManager, + session, + originalHandle, + projectedColumns, + icebergTable, + icebergPlanningExecutor, + fileSystemFactory.create(session.getIdentity(), 
icebergTable.io().properties())); + }, + originalHandle.getProjectedColumns()); } @Override @@ -2610,9 +3745,14 @@ Table getIcebergTable(ConnectorSession session, SchemaTableName schemaTableName) } @Override - public void createMaterializedView(ConnectorSession session, SchemaTableName viewName, ConnectorMaterializedViewDefinition definition, boolean replace, boolean ignoreExisting) + public void createMaterializedView( + ConnectorSession session, + SchemaTableName viewName, + ConnectorMaterializedViewDefinition definition, + boolean replace, + boolean ignoreExisting) { - catalog.createMaterializedView(session, viewName, definition, replace, ignoreExisting); + catalog.createMaterializedView(session, viewName, definition, Map.of(), replace, ignoreExisting); } @Override @@ -2628,12 +3768,41 @@ public boolean delegateMaterializedViewRefreshToConnector(ConnectorSession sessi } @Override - public ConnectorInsertTableHandle beginRefreshMaterializedView(ConnectorSession session, ConnectorTableHandle tableHandle, List sourceTableHandles, RetryMode retryMode) + public ConnectorInsertTableHandle beginRefreshMaterializedView( + ConnectorSession session, + ConnectorTableHandle tableHandle, + List sourceTableHandles, + RetryMode retryMode) { + checkState(fromSnapshotForRefresh.isEmpty(), "From Snapshot must be empty at the start of MV refresh operation."); IcebergTableHandle table = (IcebergTableHandle) tableHandle; Table icebergTable = catalog.loadTable(session, table.getSchemaTableName()); beginTransaction(icebergTable); + Optional dependencies = Optional.ofNullable(icebergTable.currentSnapshot()) + .map(Snapshot::summary) + .map(summary -> summary.get(DEPENDS_ON_TABLES)); + + boolean shouldUseIncremental = isIncrementalRefreshEnabled(session) + //&& refreshType == RefreshType.INCREMENTAL + // there is a single source table + && sourceTableHandles.size() == 1 + // and there are no other foreign sources + //&& !hasForeignSourceTables + // and the source table's fromSnapshot is available in the MV snapshot summary + && dependencies.isPresent() && !dependencies.get().equals(UNKNOWN_SNAPSHOT_TOKEN); + + if (shouldUseIncremental) { + Map sourceTableToSnapshot = MAP_SPLITTER.split(dependencies.get()); + checkState(sourceTableToSnapshot.size() == 1, "Expected %s to contain only single source table in snapshot summary", sourceTableToSnapshot); + Map.Entry sourceTable = getOnlyElement(sourceTableToSnapshot.entrySet()); + String[] schemaTable = sourceTable.getKey().split("\\."); + IcebergTableHandle handle = (IcebergTableHandle) getOnlyElement(sourceTableHandles); + SchemaTableName sourceSchemaTable = new SchemaTableName(schemaTable[0], schemaTable[1]); + checkState(sourceSchemaTable.equals(handle.getSchemaTableName()), "Source table name %s doesn't match handle table name %s", sourceSchemaTable, handle.getSchemaTableName()); + fromSnapshotForRefresh = Optional.of(Long.parseLong(sourceTable.getValue())); + } + return newWritableTableHandle(table.getSchemaTableName(), icebergTable, retryMode); } @@ -2649,13 +3818,21 @@ public Optional finishRefreshMaterializedView( IcebergWritableTableHandle table = (IcebergWritableTableHandle) insertHandle; Table icebergTable = transaction.table(); - // delete before insert .. simulating overwrite - transaction.newDelete() - .deleteFromRowFilter(Expressions.alwaysTrue()) - .commit(); + boolean isFullRefresh = fromSnapshotForRefresh.isEmpty(); + if (isFullRefresh) { + // delete before insert .. 
simulating overwrite + log.info("Performing full MV refresh for storage table: %s", table.name()); + transaction.newDelete() + .deleteFromRowFilter(Expressions.alwaysTrue()) + .commit(); + } + else { + log.info("Performing incremental MV refresh for storage table: %s", table.name()); + } List commitTasks = fragments.stream() - .map(slice -> commitTaskCodec.fromJson(slice.getBytes())) + .map(Slice::getBytes) + .map(commitTaskCodec::fromJson) .collect(toImmutableList()); Type[] partitionColumnTypes = icebergTable.spec().fields().stream() @@ -2663,61 +3840,61 @@ public Optional finishRefreshMaterializedView( icebergTable.schema().findType(field.sourceId()))) .toArray(Type[]::new); - AppendFiles appendFiles = transaction.newFastAppend(); + AppendFiles appendFiles = isMergeManifestsOnWrite(session) ? transaction.newAppend() : transaction.newFastAppend(); ImmutableSet.Builder writtenFiles = ImmutableSet.builder(); for (CommitTaskData task : commitTasks) { DataFiles.Builder builder = DataFiles.builder(icebergTable.spec()) - .withPath(task.getPath()) - .withFileSizeInBytes(task.getFileSizeInBytes()) - .withFormat(table.getFileFormat().toIceberg()) - .withMetrics(task.getMetrics().metrics()); + .withPath(task.path()) + .withFileSizeInBytes(task.fileSizeInBytes()) + .withFormat(table.fileFormat().toIceberg()) + .withMetrics(task.metrics().metrics()); if (!icebergTable.spec().fields().isEmpty()) { - String partitionDataJson = task.getPartitionDataJson() + String partitionDataJson = task.partitionDataJson() .orElseThrow(() -> new VerifyException("No partition data for partitioned table")); builder.withPartition(PartitionData.fromJson(partitionDataJson, partitionColumnTypes)); } appendFiles.appendFile(builder.build()); - writtenFiles.add(task.getPath()); + writtenFiles.add(task.path()); } - String dependencies = sourceTableHandles.stream() - .map(handle -> { - if (!(handle instanceof IcebergTableHandle icebergHandle)) { - return UNKNOWN_SNAPSHOT_TOKEN; - } - // Currently the catalogs are isolated in separate classloaders, and the above instanceof check is sufficient to know "our" handles. - // This isolation will be removed after we remove Hadoop dependencies, so check that this is "our" handle explicitly. 
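The beginRefreshMaterializedView and finishRefreshMaterializedView hunks above round-trip the materialized view's source-table state through the DEPENDS_ON_TABLES snapshot-summary entry as a comma-separated list of schema.table=snapshotId tokens (an empty value when the snapshot id is not known, and UNKNOWN_SNAPSHOT_TOKEN for non-Iceberg sources). What follows is a minimal, self-contained sketch of that token format, not code from the patch; the class and method names, and the "*" token value, are illustrative assumptions.

// Sketch only: illustrates the "schema.table=snapshotId" token format written to the
// DEPENDS_ON_TABLES snapshot summary entry. Names below are illustrative, not from the patch.
import com.google.common.base.Splitter;

import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;

final class DependsOnTablesExample
{
    // assumption: the actual UNKNOWN_SNAPSHOT_TOKEN constant value may differ
    private static final String UNKNOWN_SNAPSHOT_TOKEN = "*";

    // serialize: one token per source table, empty value when the snapshot id is unknown
    static String serialize(Map<String, Optional<Long>> sourceSnapshots)
    {
        return sourceSnapshots.entrySet().stream()
                .map(entry -> "%s=%s".formatted(entry.getKey(), entry.getValue().map(Object::toString).orElse("")))
                .collect(Collectors.joining(","));
    }

    // parse: mirrors what MAP_SPLITTER.split(...) does in beginRefreshMaterializedView
    static Map<String, Optional<Long>> parse(String dependsOnTables)
    {
        Map<String, Optional<Long>> result = new LinkedHashMap<>();
        for (String token : Splitter.on(',').omitEmptyStrings().split(dependsOnTables)) {
            if (token.equals(UNKNOWN_SNAPSHOT_TOKEN)) {
                continue; // non-Iceberg source; an incremental refresh is not possible
            }
            int separator = token.indexOf('=');
            String table = token.substring(0, separator);
            String value = token.substring(separator + 1);
            result.put(table, value.isEmpty() ? Optional.empty() : Optional.of(Long.parseLong(value)));
        }
        return result;
    }
}

With this format, the refresh can run incrementally only when the parsed map contains exactly one Iceberg entry and no unknown token, which is what the shouldUseIncremental checks in the hunk above enforce.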
- if (!trinoCatalogHandle.equals(icebergHandle.getCatalog())) { - return UNKNOWN_SNAPSHOT_TOKEN; - } - return icebergHandle.getSchemaTableName() + "=" + icebergHandle.getSnapshotId().map(Object.class::cast).orElse(""); - }) - .distinct() - .collect(joining(",")); + List tableDependencies = new ArrayList<>(); + sourceTableHandles.stream() + .map(IcebergTableHandle.class::cast) + .map(handle -> "%s=%s".formatted( + handle.getSchemaTableName(), + handle.getSnapshotId().map(Object::toString).orElse(""))) + .forEach(tableDependencies::add); + //if (hasForeignSourceTables) { + // tableDependencies.add(UNKNOWN_SNAPSHOT_TOKEN); + //} // try to leave as little garbage as possible behind - if (table.getRetryMode() != NO_RETRIES) { + if (table.retryMode() != NO_RETRIES) { cleanExtraOutputFiles(session, writtenFiles.build()); } // Update the 'dependsOnTables' property that tracks tables on which the materialized view depends and the corresponding snapshot ids of the tables - appendFiles.set(DEPENDS_ON_TABLES, dependencies); + appendFiles.set(DEPENDS_ON_TABLES, String.join(",", tableDependencies)); + //appendFiles.set(DEPENDS_ON_TABLE_FUNCTIONS, String.valueOf(hasSourceTableFunctions)); appendFiles.set(TRINO_QUERY_START_TIME, session.getStart().toString()); - commit(appendFiles, session); - - transaction.commitTransaction(); + appendFiles.scanManifestsWith(icebergScanExecutor); + commitUpdateAndTransaction(appendFiles, session, transaction, "refresh materialized view"); transaction = null; + fromSnapshotForRefresh = Optional.empty(); return Optional.of(new HiveWrittenPartitions(commitTasks.stream() - .map(CommitTaskData::getPath) + .map(CommitTaskData::path) .collect(toImmutableList()))); } @Override public List listMaterializedViews(ConnectorSession session, Optional schemaName) { - return catalog.listMaterializedViews(session, schemaName); + return catalog.listTables(session, schemaName).stream() + .filter(info -> info.extendedRelationType() == TableInfo.ExtendedRelationType.TRINO_MATERIALIZED_VIEW) + .map(TableInfo::tableName) + .toList(); } @Override @@ -2742,6 +3919,12 @@ public Optional getMaterializedView(Connect return catalog.getMaterializedView(session, viewName); } + //@Override + public Map getMaterializedViewProperties(ConnectorSession session, SchemaTableName viewName, ConnectorMaterializedViewDefinition definition) + { + return catalog.getMaterializedViewProperties(session, viewName, definition); + } + @Override public void renameMaterializedView(ConnectorSession session, SchemaTableName source, SchemaTableName target) { @@ -2766,85 +3949,118 @@ public MaterializedViewFreshness getMaterializedViewFreshness(ConnectorSession s .orElseThrow(() -> new IllegalStateException("Storage table missing in definition of materialized view " + materializedViewName)); Table icebergTable = catalog.loadTable(session, storageTableName); - String dependsOnTables = icebergTable.currentSnapshot().summary().getOrDefault(DEPENDS_ON_TABLES, ""); + Optional currentSnapshot = Optional.ofNullable(icebergTable.currentSnapshot()); + String dependsOnTables = currentSnapshot + .map(snapshot -> snapshot.summary().getOrDefault(DEPENDS_ON_TABLES, "")) + .orElse(""); + boolean dependsOnTableFunctions = currentSnapshot + .map(snapshot -> Boolean.valueOf(snapshot.summary().getOrDefault(DEPENDS_ON_TABLE_FUNCTIONS, "false"))) + .orElse(false); + + Optional refreshTime = currentSnapshot.map(snapshot -> snapshot.summary().get(TRINO_QUERY_START_TIME)) + .map(Instant::parse) + .or(() -> currentSnapshot.map(snapshot -> 
Instant.ofEpochMilli(snapshot.timestampMillis()))); + + if (dependsOnTableFunctions) { + // It can't be determined whether a value returned by table function is STALE or not + return new MaterializedViewFreshness(UNKNOWN, refreshTime); + } + if (dependsOnTables.isEmpty()) { - // Information missing. While it's "unknown" whether storage is stale, we return "stale": under no normal circumstances dependsOnTables should be missing. + // Information missing. While it's "unknown" whether storage is stale, we return "stale". + // Normally dependsOnTables may be missing only when there was no refresh yet. return new MaterializedViewFreshness(STALE, Optional.empty()); } - Instant refreshTime = Optional.ofNullable(icebergTable.currentSnapshot().summary().get(TRINO_QUERY_START_TIME)) - .map(Instant::parse) - .orElseGet(() -> Instant.ofEpochMilli(icebergTable.currentSnapshot().timestampMillis())); + boolean hasUnknownTables = false; - boolean hasStaleIcebergTables = false; Optional firstTableChange = Optional.of(Long.MAX_VALUE); - - Iterable tableToSnapshotIds = Splitter.on(',').split(dependsOnTables); - for (String entry : tableToSnapshotIds) { - if (entry.equals(UNKNOWN_SNAPSHOT_TOKEN)) { - // This is a "federated" materialized view (spanning across connectors). Trust user's choice and assume "fresh or fresh enough". + ImmutableList.Builder> tableChangeInfoTasks = ImmutableList.builder(); + for (String tableToSnapShot : Splitter.on(',').split(dependsOnTables)) { + if (tableToSnapShot.equals(UNKNOWN_SNAPSHOT_TOKEN)) { hasUnknownTables = true; firstTableChange = Optional.empty(); continue; } - List keyValue = Splitter.on("=").splitToList(entry); - if (keyValue.size() != 2) { - throw new TrinoException(ICEBERG_INVALID_METADATA, format("Invalid entry in '%s' property: %s'", DEPENDS_ON_TABLES, entry)); - } - String tableName = keyValue.get(0); - String value = keyValue.get(1); - List strings = Splitter.on(".").splitToList(tableName); - if (strings.size() == 3) { - strings = strings.subList(1, 3); - } - else if (strings.size() != 2) { - throw new TrinoException(ICEBERG_INVALID_METADATA, format("Invalid table name in '%s' property: %s'", DEPENDS_ON_TABLES, strings)); - } - String schema = strings.get(0); - String name = strings.get(1); - SchemaTableName schemaTableName = new SchemaTableName(schema, name); - ConnectorTableHandle tableHandle = getTableHandle(session, schemaTableName, Optional.empty(), Optional.empty()); - if (tableHandle == null || tableHandle instanceof CorruptedIcebergTableHandle) { - // Base table is gone or table is corrupted - return new MaterializedViewFreshness(STALE, Optional.empty()); - } - Optional snapshotAtRefresh; - if (value.isEmpty()) { - snapshotAtRefresh = Optional.empty(); - } - else { - snapshotAtRefresh = Optional.of(Long.parseLong(value)); - } - TableChangeInfo tableChangeInfo = getTableChangeInfo(session, (IcebergTableHandle) tableHandle, snapshotAtRefresh); + tableChangeInfoTasks.add(() -> getTableChangeInfo(session, tableToSnapShot)); + } + + boolean hasStaleIcebergTables = false; + List tableChangeInfos; + + try { + tableChangeInfos = processWithAdditionalThreads(tableChangeInfoTasks.build(), metadataFetchingExecutor); + } + catch (ExecutionException e) { + throw new RuntimeException(e.getCause()); + } + + verifyNotNull(tableChangeInfos); + + for (TableChangeInfo tableChangeInfo : tableChangeInfos) { if (tableChangeInfo instanceof NoTableChange) { // Fresh } - else if (tableChangeInfo instanceof FirstChangeSnapshot firstChange) { + else if (tableChangeInfo instanceof 
FirstChangeSnapshot firstChangeSnapshot) { hasStaleIcebergTables = true; firstTableChange = firstTableChange - .map(epochMilli -> Math.min(epochMilli, firstChange.snapshot().timestampMillis())); + .map(epochMilli -> Math.min(epochMilli, firstChangeSnapshot.snapshot().timestampMillis())); } else if (tableChangeInfo instanceof UnknownTableChange) { hasStaleIcebergTables = true; firstTableChange = Optional.empty(); } - else { - throw new IllegalStateException("Unhandled table change info " + tableChangeInfo); + else if (tableChangeInfo instanceof CorruptedTableChange) { + return new MaterializedViewFreshness(STALE, Optional.empty()); } } - Instant lastFreshTime = firstTableChange + Optional lastFreshTime = firstTableChange .map(Instant::ofEpochMilli) - .orElse(refreshTime); + .or(() -> refreshTime); if (hasStaleIcebergTables) { - return new MaterializedViewFreshness(STALE, Optional.of(lastFreshTime)); + return new MaterializedViewFreshness(STALE, lastFreshTime); } if (hasUnknownTables) { - return new MaterializedViewFreshness(UNKNOWN, Optional.of(lastFreshTime)); + return new MaterializedViewFreshness(UNKNOWN, lastFreshTime); } return new MaterializedViewFreshness(FRESH, Optional.empty()); } + private TableChangeInfo getTableChangeInfo(ConnectorSession session, String entry) + { + List keyValue = Splitter.on("=").splitToList(entry); + if (keyValue.size() != 2) { + throw new TrinoException(ICEBERG_INVALID_METADATA, format("Invalid entry in '%s' property: %s'", DEPENDS_ON_TABLES, entry)); + } + String tableName = keyValue.get(0); + String value = keyValue.get(1); + List strings = Splitter.on(".").splitToList(tableName); + if (strings.size() == 3) { + strings = strings.subList(1, 3); + } + else if (strings.size() != 2) { + throw new TrinoException(ICEBERG_INVALID_METADATA, format("Invalid table name in '%s' property: %s'", DEPENDS_ON_TABLES, strings)); + } + String schema = strings.get(0); + String name = strings.get(1); + SchemaTableName schemaTableName = new SchemaTableName(schema, name); + ConnectorTableHandle tableHandle = getTableHandle(session, schemaTableName, Optional.empty(), Optional.empty()); + + if (tableHandle == null || tableHandle instanceof CorruptedIcebergTableHandle) { + // Base table is gone or table is corrupted + return new CorruptedTableChange(); + } + Optional snapshotAtRefresh; + if (value.isEmpty()) { + snapshotAtRefresh = Optional.empty(); + } + else { + snapshotAtRefresh = Optional.of(Long.parseLong(value)); + } + return getTableChangeInfo(session, (IcebergTableHandle) tableHandle, snapshotAtRefresh); + } + private TableChangeInfo getTableChangeInfo(ConnectorSession session, IcebergTableHandle table, Optional snapshotAtRefresh) { Table icebergTable = catalog.loadTable(session, table.getSchemaTableName()); @@ -2884,6 +4100,39 @@ public Optional redirectTable(ConnectorSession session, return catalog.redirectTable(session, tableName, targetCatalogName.get()); } + //@Override + public boolean allowSplittingReadIntoMultipleSubQueries(ConnectorSession session, ConnectorTableHandle connectorTableHandle) + { + IcebergTableHandle tableHandle = (IcebergTableHandle) connectorTableHandle; + IcebergFileFormat storageFormat = getFileFormat(tableHandle.getStorageProperties()); + + return storageFormat == ORC || storageFormat == PARQUET; + } + + /* + @Override + public WriterScalingOptions getNewTableWriterScalingOptions(ConnectorSession session, SchemaTableName tableName, Map tableProperties) + { + return WriterScalingOptions.ENABLED; + } + + @Override + public WriterScalingOptions 
getInsertWriterScalingOptions(ConnectorSession session, ConnectorTableHandle tableHandle) + { + return WriterScalingOptions.ENABLED; + } + */ + + public Optional getIncrementalRefreshFromSnapshot() + { + return fromSnapshotForRefresh; + } + + public void disableIncrementalRefresh() + { + fromSnapshotForRefresh = Optional.empty(); + } + private static CollectedStatistics processComputedTableStatistics(Table table, Collection computedStatistics) { Map columnNameToId = table.schema().columns().stream() @@ -2915,7 +4164,7 @@ private static CollectedStatistics processComputedTableStatistics(Table table, C private void beginTransaction(Table icebergTable) { verify(transaction == null, "transaction already set"); - transaction = icebergTable.newTransaction(); + transaction = catalog.newTransaction(icebergTable); } private static IcebergTableHandle checkValidTableHandle(ConnectorTableHandle tableHandle) @@ -2928,9 +4177,9 @@ private static IcebergTableHandle checkValidTableHandle(ConnectorTableHandle tab } private sealed interface TableChangeInfo - permits NoTableChange, FirstChangeSnapshot, UnknownTableChange {} + permits NoTableChange, FirstChangeSnapshot, UnknownTableChange, CorruptedTableChange {} - private static final class NoTableChange + private record NoTableChange() implements TableChangeInfo {} private record FirstChangeSnapshot(Snapshot snapshot) @@ -2942,6 +4191,53 @@ private record FirstChangeSnapshot(Snapshot snapshot) } } - private static final class UnknownTableChange + private record UnknownTableChange() + implements TableChangeInfo {} + + private record CorruptedTableChange() implements TableChangeInfo {} + + private static TableStatistics getIncrementally( + Map> cache, + IcebergTableHandle key, + Predicate isSufficient, + Function, TableStatistics> columnStatisticsLoader, + Set projectedColumns) + { + AtomicReference valueHolder = cache.computeIfAbsent(key, ignore -> new AtomicReference<>()); + TableStatistics oldValue = valueHolder.get(); + if (oldValue != null && isSufficient.test(oldValue)) { + return oldValue; + } + + TableStatistics newValue; + if (oldValue == null) { + newValue = columnStatisticsLoader.apply(projectedColumns); + } + else { + Sets.SetView missingColumns = difference(projectedColumns, oldValue.getColumnStatistics().keySet()); + newValue = columnStatisticsLoader.apply(missingColumns); + } + + verifyNotNull(newValue, "loader returned null for %s", key); + + TableStatistics merged = mergeColumnStatistics(oldValue, newValue); + if (!valueHolder.compareAndSet(oldValue, merged)) { + // if the value changed in the valueHolder, we only add newly loaded value to be sure we have up-to-date value + valueHolder.accumulateAndGet(newValue, IcebergMetadata::mergeColumnStatistics); + } + return merged; + } + + private static TableStatistics mergeColumnStatistics(TableStatistics currentStats, TableStatistics newStats) + { + requireNonNull(newStats, "newStats is null"); + TableStatistics.Builder statisticsBuilder = TableStatistics.builder(); + if (currentStats != null) { + currentStats.getColumnStatistics().forEach(statisticsBuilder::setColumnStatistics); + } + statisticsBuilder.setRowCount(newStats.getRowCount()); + newStats.getColumnStatistics().forEach(statisticsBuilder::setColumnStatistics); + return statisticsBuilder.build(); + } } diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergMetadataColumn.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergMetadataColumn.java index 57543c76ed42..049414faf5cf 100644 --- 
a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergMetadataColumn.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergMetadataColumn.java @@ -27,6 +27,7 @@ public enum IcebergMetadataColumn { + PARTITION(MetadataColumns.PARTITION_COLUMN_ID, "$partition", VARCHAR, PRIMITIVE), // Avoid row type considering partition evolutions FILE_PATH(MetadataColumns.FILE_PATH.fieldId(), "$path", VARCHAR, PRIMITIVE), FILE_MODIFIED_TIME(Integer.MAX_VALUE - 1001, "$file_modified_time", TIMESTAMP_TZ_MILLIS, PRIMITIVE), // https://github.com/apache/iceberg/issues/5240 /**/; diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergNodePartitioningProvider.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergNodePartitioningProvider.java index 2ad39c2f426c..68e856789db5 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergNodePartitioningProvider.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergNodePartitioningProvider.java @@ -15,19 +15,22 @@ import com.google.inject.Inject; import io.trino.spi.connector.BucketFunction; +import io.trino.spi.connector.ConnectorBucketNodeMap; import io.trino.spi.connector.ConnectorNodePartitioningProvider; import io.trino.spi.connector.ConnectorPartitioningHandle; import io.trino.spi.connector.ConnectorSession; +import io.trino.spi.connector.ConnectorSplit; import io.trino.spi.connector.ConnectorTransactionHandle; import io.trino.spi.type.Type; import io.trino.spi.type.TypeManager; import io.trino.spi.type.TypeOperators; -import org.apache.iceberg.Schema; import java.util.List; +import java.util.Optional; +import java.util.function.ToIntFunction; -import static io.trino.plugin.iceberg.IcebergUtil.schemaFromHandles; -import static io.trino.plugin.iceberg.PartitionFields.parsePartitionFields; +import static io.trino.plugin.iceberg.IcebergPartitionFunction.Transform.BUCKET; +import static io.trino.spi.connector.ConnectorBucketNodeMap.createBucketNodeMap; public class IcebergNodePartitioningProvider implements ConnectorNodePartitioningProvider @@ -40,6 +43,23 @@ public IcebergNodePartitioningProvider(TypeManager typeManager) this.typeOperators = typeManager.getTypeOperators(); } + @Override + public Optional getBucketNodeMapping( + ConnectorTransactionHandle transactionHandle, + ConnectorSession session, + ConnectorPartitioningHandle partitioningHandle) + { + IcebergPartitioningHandle handle = (IcebergPartitioningHandle) partitioningHandle; + + List partitionFunctions = handle.partitionFunctions(); + // when there is a single bucket partition function, inform the engine there is a limit on the number of buckets + // TODO: when there are multiple bucket partition functions, we could compute the product of bucket counts, but this causes the engine to create too many writers + if (partitionFunctions.size() == 1 && partitionFunctions.get(0).transform() == BUCKET) { + return Optional.of(createBucketNodeMap(partitionFunctions.get(0).size().orElseThrow())); //.withCacheKeyHint(handle.getCacheKeyHint())); + } + return Optional.empty(); + } + @Override public BucketFunction getBucketFunction( ConnectorTransactionHandle transactionHandle, @@ -48,16 +68,20 @@ public BucketFunction getBucketFunction( List partitionChannelTypes, int bucketCount) { - if (partitioningHandle instanceof IcebergUpdateHandle) { + IcebergPartitioningHandle handle = (IcebergPartitioningHandle) partitioningHandle; + if (handle.update()) { return new 
IcebergUpdateBucketFunction(bucketCount); } - IcebergPartitioningHandle handle = (IcebergPartitioningHandle) partitioningHandle; - Schema schema = schemaFromHandles(handle.getPartitioningColumns()); - return new IcebergBucketFunction( - typeOperators, - parsePartitionFields(schema, handle.getPartitioning()), - handle.getPartitioningColumns(), - bucketCount); + return new IcebergBucketFunction(handle, typeOperators, bucketCount); + } + + @Override + public ToIntFunction getSplitBucketFunction( + ConnectorTransactionHandle transactionHandle, + ConnectorSession session, + ConnectorPartitioningHandle partitioningHandle) + { + return new IcebergBucketFunction((IcebergPartitioningHandle) partitioningHandle, typeOperators, 0 /* FIXME: bucketCount */); } } diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergPageSinkProvider.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergPageSinkProvider.java index 1242c9dd4c19..be6b8cb9dfa7 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergPageSinkProvider.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergPageSinkProvider.java @@ -16,7 +16,6 @@ import com.google.inject.Inject; import io.airlift.json.JsonCodec; import io.airlift.units.DataSize; -import io.trino.filesystem.TrinoFileSystemFactory; import io.trino.plugin.hive.SortingFileWriterConfig; import io.trino.plugin.iceberg.procedure.IcebergOptimizeHandle; import io.trino.plugin.iceberg.procedure.IcebergTableExecuteHandle; @@ -42,17 +41,17 @@ import java.util.Map; import static com.google.common.collect.Maps.transformValues; +import static io.trino.plugin.iceberg.IcebergSessionProperties.maxPartitionsPerWriter; import static io.trino.plugin.iceberg.IcebergUtil.getLocationProvider; import static java.util.Objects.requireNonNull; public class IcebergPageSinkProvider implements ConnectorPageSinkProvider { - private final TrinoFileSystemFactory fileSystemFactory; + private final IcebergFileSystemFactory fileSystemFactory; private final JsonCodec jsonCodec; private final IcebergFileWriterFactory fileWriterFactory; private final PageIndexerFactory pageIndexerFactory; - private final int maxOpenPartitions; private final DataSize sortingFileWriterBufferSize; private final int sortingFileWriterMaxOpenFiles; private final TypeManager typeManager; @@ -60,11 +59,10 @@ public class IcebergPageSinkProvider @Inject public IcebergPageSinkProvider( - TrinoFileSystemFactory fileSystemFactory, + IcebergFileSystemFactory fileSystemFactory, JsonCodec jsonCodec, IcebergFileWriterFactory fileWriterFactory, PageIndexerFactory pageIndexerFactory, - IcebergConfig config, SortingFileWriterConfig sortingFileWriterConfig, TypeManager typeManager, PageSorter pageSorter) @@ -73,7 +71,6 @@ public IcebergPageSinkProvider( this.jsonCodec = requireNonNull(jsonCodec, "jsonCodec is null"); this.fileWriterFactory = requireNonNull(fileWriterFactory, "fileWriterFactory is null"); this.pageIndexerFactory = requireNonNull(pageIndexerFactory, "pageIndexerFactory is null"); - this.maxOpenPartitions = config.getMaxPartitionsPerWriter(); this.sortingFileWriterBufferSize = sortingFileWriterConfig.getWriterSortBufferSize(); this.sortingFileWriterMaxOpenFiles = sortingFileWriterConfig.getMaxOpenSortFiles(); this.typeManager = requireNonNull(typeManager, "typeManager is null"); @@ -94,24 +91,24 @@ public ConnectorPageSink createPageSink(ConnectorTransactionHandle transactionHa private ConnectorPageSink createPageSink(ConnectorSession session, 
IcebergWritableTableHandle tableHandle) { - Schema schema = SchemaParser.fromJson(tableHandle.getSchemaAsJson()); - String partitionSpecJson = tableHandle.getPartitionsSpecsAsJson().get(tableHandle.getPartitionSpecId()); + Schema schema = SchemaParser.fromJson(tableHandle.schemaAsJson()); + String partitionSpecJson = tableHandle.partitionsSpecsAsJson().get(tableHandle.partitionSpecId()); PartitionSpec partitionSpec = PartitionSpecParser.fromJson(schema, partitionSpecJson); - LocationProvider locationProvider = getLocationProvider(tableHandle.getName(), tableHandle.getOutputPath(), tableHandle.getStorageProperties()); + LocationProvider locationProvider = getLocationProvider(tableHandle.name(), tableHandle.outputPath(), tableHandle.storageProperties()); return new IcebergPageSink( schema, partitionSpec, locationProvider, fileWriterFactory, pageIndexerFactory, - fileSystemFactory.create(session), - tableHandle.getInputColumns(), + fileSystemFactory.create(session.getIdentity(), tableHandle.fileIoProperties()), + tableHandle.inputColumns(), jsonCodec, session, - tableHandle.getFileFormat(), - tableHandle.getStorageProperties(), - maxOpenPartitions, - tableHandle.getSortOrder(), + tableHandle.fileFormat(), + tableHandle.storageProperties(), + maxPartitionsPerWriter(session), + tableHandle.sortOrder(), sortingFileWriterBufferSize, sortingFileWriterMaxOpenFiles, typeManager, @@ -122,37 +119,41 @@ private ConnectorPageSink createPageSink(ConnectorSession session, IcebergWritab public ConnectorPageSink createPageSink(ConnectorTransactionHandle transactionHandle, ConnectorSession session, ConnectorTableExecuteHandle tableExecuteHandle, ConnectorPageSinkId pageSinkId) { IcebergTableExecuteHandle executeHandle = (IcebergTableExecuteHandle) tableExecuteHandle; - switch (executeHandle.getProcedureId()) { + switch (executeHandle.procedureId()) { case OPTIMIZE: - IcebergOptimizeHandle optimizeHandle = (IcebergOptimizeHandle) executeHandle.getProcedureHandle(); - Schema schema = SchemaParser.fromJson(optimizeHandle.getSchemaAsJson()); - PartitionSpec partitionSpec = PartitionSpecParser.fromJson(schema, optimizeHandle.getPartitionSpecAsJson()); - LocationProvider locationProvider = getLocationProvider(executeHandle.getSchemaTableName(), - executeHandle.getTableLocation(), optimizeHandle.getTableStorageProperties()); + IcebergOptimizeHandle optimizeHandle = (IcebergOptimizeHandle) executeHandle.procedureHandle(); + Schema schema = SchemaParser.fromJson(optimizeHandle.schemaAsJson()); + PartitionSpec partitionSpec = PartitionSpecParser.fromJson(schema, optimizeHandle.partitionSpecAsJson()); + LocationProvider locationProvider = getLocationProvider(executeHandle.schemaTableName(), + executeHandle.tableLocation(), optimizeHandle.tableStorageProperties()); return new IcebergPageSink( schema, partitionSpec, locationProvider, fileWriterFactory, pageIndexerFactory, - fileSystemFactory.create(session), - optimizeHandle.getTableColumns(), + fileSystemFactory.create(session.getIdentity(), executeHandle.fileIoProperties()), + optimizeHandle.tableColumns(), jsonCodec, session, - optimizeHandle.getFileFormat(), - optimizeHandle.getTableStorageProperties(), - maxOpenPartitions, - optimizeHandle.getSortOrder(), + optimizeHandle.fileFormat(), + optimizeHandle.tableStorageProperties(), + maxPartitionsPerWriter(session), + optimizeHandle.sortOrder(), sortingFileWriterBufferSize, sortingFileWriterMaxOpenFiles, typeManager, pageSorter); + case OPTIMIZE_MANIFESTS: case DROP_EXTENDED_STATS: + case ROLLBACK_TO_SNAPSHOT: case 
EXPIRE_SNAPSHOTS: case REMOVE_ORPHAN_FILES: + case ADD_FILES: + case ADD_FILES_FROM_TABLE: // handled via ConnectorMetadata.executeTableExecute } - throw new IllegalArgumentException("Unknown procedure: " + executeHandle.getProcedureId()); + throw new IllegalArgumentException("Unknown procedure: " + executeHandle.procedureId()); } @Override @@ -160,22 +161,22 @@ public ConnectorMergeSink createMergeSink(ConnectorTransactionHandle transaction { IcebergMergeTableHandle merge = (IcebergMergeTableHandle) mergeHandle; IcebergWritableTableHandle tableHandle = merge.getInsertTableHandle(); - LocationProvider locationProvider = getLocationProvider(tableHandle.getName(), tableHandle.getOutputPath(), tableHandle.getStorageProperties()); - Schema schema = SchemaParser.fromJson(tableHandle.getSchemaAsJson()); - Map partitionsSpecs = transformValues(tableHandle.getPartitionsSpecsAsJson(), json -> PartitionSpecParser.fromJson(schema, json)); + LocationProvider locationProvider = getLocationProvider(tableHandle.name(), tableHandle.outputPath(), tableHandle.storageProperties()); + Schema schema = SchemaParser.fromJson(tableHandle.schemaAsJson()); + Map partitionsSpecs = transformValues(tableHandle.partitionsSpecsAsJson(), json -> PartitionSpecParser.fromJson(schema, json)); ConnectorPageSink pageSink = createPageSink(session, tableHandle); return new IcebergMergeSink( locationProvider, fileWriterFactory, - fileSystemFactory.create(session), + fileSystemFactory.create(session.getIdentity(), tableHandle.fileIoProperties()), jsonCodec, session, - tableHandle.getFileFormat(), - tableHandle.getStorageProperties(), + tableHandle.fileFormat(), + tableHandle.storageProperties(), schema, partitionsSpecs, pageSink, - tableHandle.getInputColumns().size()); + schema.columns().size()); } } diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergPageSourceProvider.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergPageSourceProvider.java index 259ea8d5db6d..baacd6d9c641 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergPageSourceProvider.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergPageSourceProvider.java @@ -13,20 +13,17 @@ */ package io.trino.plugin.iceberg; -import com.google.common.base.Suppliers; -import com.google.common.base.VerifyException; +import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.AbstractIterator; import com.google.common.collect.BiMap; import com.google.common.collect.ImmutableBiMap; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; -import com.google.common.graph.Traverser; -import com.google.inject.Inject; import io.airlift.slice.Slice; +import io.trino.connector.system.SystemColumnHandle; import io.trino.filesystem.Location; import io.trino.filesystem.TrinoFileSystem; -import io.trino.filesystem.TrinoFileSystemFactory; import io.trino.filesystem.TrinoInputFile; import io.trino.memory.context.AggregatedMemoryContext; import io.trino.orc.OrcColumn; @@ -34,37 +31,43 @@ import io.trino.orc.OrcDataSource; import io.trino.orc.OrcDataSourceId; import io.trino.orc.OrcReader; +import io.trino.orc.OrcReader.ProjectedLayout; import io.trino.orc.OrcReaderOptions; import io.trino.orc.OrcRecordReader; import io.trino.orc.TupleDomainOrcPredicate; import io.trino.orc.TupleDomainOrcPredicate.TupleDomainOrcPredicateBuilder; -import io.trino.orc.metadata.OrcType; -import 
io.trino.parquet.BloomFilterStore; +import io.trino.parquet.Column; import io.trino.parquet.Field; import io.trino.parquet.ParquetCorruptionException; import io.trino.parquet.ParquetDataSource; import io.trino.parquet.ParquetDataSourceId; import io.trino.parquet.ParquetReaderOptions; +import io.trino.parquet.metadata.FileMetadata; +import io.trino.parquet.metadata.ParquetMetadata; import io.trino.parquet.predicate.TupleDomainParquetPredicate; import io.trino.parquet.reader.MetadataReader; -import io.trino.parquet.reader.ParquetReader; +import io.trino.parquet.reader.ParquetReaderNew; +import io.trino.parquet.reader.RowGroupInfo; import io.trino.plugin.hive.FileFormatDataSourceStats; -import io.trino.plugin.hive.ReaderColumns; -import io.trino.plugin.hive.ReaderPageSource; -import io.trino.plugin.hive.ReaderProjectionsAdapter; +import io.trino.plugin.hive.TransformConnectorPageSource; import io.trino.plugin.hive.orc.OrcPageSource; -import io.trino.plugin.hive.orc.OrcPageSource.ColumnAdaptation; -import io.trino.plugin.hive.orc.OrcReaderConfig; -import io.trino.plugin.hive.parquet.ParquetPageSource; -import io.trino.plugin.hive.parquet.ParquetReaderConfig; +import io.trino.plugin.hive.parquet.MemoryParquetDataSource; +import io.trino.plugin.hive.parquet.ParquetPageSourceNew; import io.trino.plugin.hive.parquet.TrinoParquetDataSource; import io.trino.plugin.iceberg.IcebergParquetColumnIOConverter.FieldContext; import io.trino.plugin.iceberg.delete.DeleteFile; -import io.trino.plugin.iceberg.delete.DeleteFilter; -import io.trino.plugin.iceberg.delete.PositionDeleteFilter; +import io.trino.plugin.iceberg.delete.DeleteManager; import io.trino.plugin.iceberg.delete.RowPredicate; +import io.trino.plugin.iceberg.fileio.ForwardingFileIoFactory; import io.trino.plugin.iceberg.fileio.ForwardingInputFile; +import io.trino.plugin.iceberg.system.files.FilesTablePageSource; +import io.trino.plugin.iceberg.system.files.FilesTableSplit; +import io.trino.spi.Page; import io.trino.spi.TrinoException; +import io.trino.spi.block.Block; +import io.trino.spi.block.IntArrayBlock; +import io.trino.spi.block.RunLengthEncodedBlock; +import io.trino.spi.block.VariableWidthBlock; import io.trino.spi.connector.ColumnHandle; import io.trino.spi.connector.ConnectorPageSource; import io.trino.spi.connector.ConnectorPageSourceProvider; @@ -74,20 +77,17 @@ import io.trino.spi.connector.ConnectorTransactionHandle; import io.trino.spi.connector.DynamicFilter; import io.trino.spi.connector.EmptyPageSource; +import io.trino.spi.connector.FixedPageSource; import io.trino.spi.predicate.Domain; import io.trino.spi.predicate.NullableValue; -import io.trino.spi.predicate.Range; import io.trino.spi.predicate.TupleDomain; -import io.trino.spi.predicate.ValueSet; import io.trino.spi.type.ArrayType; import io.trino.spi.type.MapType; import io.trino.spi.type.RowType; -import io.trino.spi.type.StandardTypes; import io.trino.spi.type.Type; import io.trino.spi.type.TypeManager; import org.apache.avro.file.DataFileStream; import org.apache.avro.generic.GenericDatumReader; -import org.apache.iceberg.MetadataColumns; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.PartitionSpecParser; import org.apache.iceberg.Schema; @@ -99,22 +99,16 @@ import org.apache.iceberg.mapping.NameMapping; import org.apache.iceberg.mapping.NameMappingParser; import org.apache.iceberg.parquet.ParquetSchemaUtil; -import org.apache.iceberg.types.Conversions; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.StructLikeWrapper; 
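The org.apache.iceberg.util.StructLikeWrapper import added just above is what later allows IcebergPageSourceProvider (further down in this file) to keep one DeleteManager per partition: a wrapper template is created once per partition spec and reused through its thread-safe copyFor method, and the pair of spec id and wrapped partition data becomes the cache key. Below is a minimal sketch of that caching pattern under those assumptions; PartitionKeyedCacheExample and PartitionDeletes are illustrative stand-ins, not names from the patch.

// Sketch of the per-partition cache pattern used further down in IcebergPageSourceProvider.
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.StructLike;
import org.apache.iceberg.util.StructLikeWrapper;

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Function;

final class PartitionKeyedCacheExample
{
    private record PartitionKey(int specId, StructLikeWrapper partitionData) {}

    private final Map<Integer, Function<StructLike, PartitionKey>> keyFactories = new ConcurrentHashMap<>();
    private final Map<PartitionKey, PartitionDeletes> deletesByPartition = new ConcurrentHashMap<>();

    PartitionDeletes forPartition(PartitionSpec spec, StructLike partitionData)
    {
        Function<StructLike, PartitionKey> factory = keyFactories.computeIfAbsent(spec.specId(), specId -> {
            // building the template wrapper is relatively expensive; reuse it for all partitions
            // of the same spec, which is safe because only the thread-safe copyFor() is called on it
            StructLikeWrapper template = StructLikeWrapper.forType(spec.partitionType());
            return data -> new PartitionKey(specId, template.copyFor(data));
        });
        return deletesByPartition.computeIfAbsent(factory.apply(partitionData), ignored -> new PartitionDeletes());
    }

    // placeholder for the real DeleteManager state
    static final class PartitionDeletes {}
}

Keying on the wrapped partition data rather than the raw StructLike matters because StructLikeWrapper supplies value-based equals and hashCode for the partition tuple, so rows from the same partition reuse the same DeleteManager.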
import org.apache.parquet.column.ColumnDescriptor; -import org.apache.parquet.hadoop.metadata.BlockMetaData; -import org.apache.parquet.hadoop.metadata.FileMetaData; -import org.apache.parquet.hadoop.metadata.ParquetMetadata; -import org.apache.parquet.io.ColumnIO; import org.apache.parquet.io.MessageColumnIO; import org.apache.parquet.schema.GroupType; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.PrimitiveType; -import org.roaringbitmap.longlong.LongBitmapDataProvider; -import org.roaringbitmap.longlong.Roaring64Bitmap; import java.io.IOException; import java.io.UncheckedIOException; -import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -123,31 +117,39 @@ import java.util.Optional; import java.util.OptionalLong; import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; +import java.util.function.Function; +import java.util.function.ObjLongConsumer; import java.util.function.Supplier; +import java.util.stream.IntStream; +import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Preconditions.checkState; +import static com.google.common.base.Suppliers.memoize; +import static com.google.common.base.Throwables.throwIfInstanceOf; import static com.google.common.base.Verify.verify; import static com.google.common.collect.ImmutableList.toImmutableList; import static com.google.common.collect.ImmutableMap.toImmutableMap; import static com.google.common.collect.ImmutableSet.toImmutableSet; import static com.google.common.collect.Maps.uniqueIndex; +import static io.airlift.slice.SizeOf.SIZE_OF_LONG; +import static io.airlift.slice.SizeOf.instanceSize; +import static io.airlift.slice.SizeOf.sizeOf; import static io.airlift.slice.Slices.utf8Slice; import static io.trino.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext; import static io.trino.orc.OrcReader.INITIAL_BATCH_SIZE; -import static io.trino.orc.OrcReader.ProjectedLayout; import static io.trino.orc.OrcReader.fullyProjectedLayout; -import static io.trino.parquet.BloomFilterStore.getBloomFilterStore; import static io.trino.parquet.ParquetTypeUtils.getColumnIO; import static io.trino.parquet.ParquetTypeUtils.getDescriptors; import static io.trino.parquet.predicate.PredicateUtils.buildPredicate; -import static io.trino.parquet.predicate.PredicateUtils.predicateMatches; -import static io.trino.plugin.iceberg.IcebergColumnHandle.TRINO_MERGE_PARTITION_DATA; -import static io.trino.plugin.iceberg.IcebergColumnHandle.TRINO_MERGE_PARTITION_SPEC_ID; +import static io.trino.parquet.predicate.PredicateUtils.getFilteredRowGroups; +import static io.trino.plugin.iceberg.ColumnIdentity.TypeCategory.PRIMITIVE; import static io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_BAD_DATA; import static io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_CANNOT_OPEN_SPLIT; import static io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_CURSOR_ERROR; import static io.trino.plugin.iceberg.IcebergMetadataColumn.FILE_MODIFIED_TIME; import static io.trino.plugin.iceberg.IcebergMetadataColumn.FILE_PATH; +import static io.trino.plugin.iceberg.IcebergMetadataColumn.PARTITION; import static io.trino.plugin.iceberg.IcebergSessionProperties.getOrcLazyReadSmallRanges; import static io.trino.plugin.iceberg.IcebergSessionProperties.getOrcMaxBufferSize; import static io.trino.plugin.iceberg.IcebergSessionProperties.getOrcMaxMergeDistance; @@ -156,42 +158,43 @@ import static 
io.trino.plugin.iceberg.IcebergSessionProperties.getOrcTinyStripeThreshold; import static io.trino.plugin.iceberg.IcebergSessionProperties.getParquetMaxReadBlockRowCount; import static io.trino.plugin.iceberg.IcebergSessionProperties.getParquetMaxReadBlockSize; +import static io.trino.plugin.iceberg.IcebergSessionProperties.getParquetSmallFileThreshold; import static io.trino.plugin.iceberg.IcebergSessionProperties.isOrcBloomFiltersEnabled; import static io.trino.plugin.iceberg.IcebergSessionProperties.isOrcNestedLazy; -import static io.trino.plugin.iceberg.IcebergSessionProperties.isParquetOptimizedNestedReaderEnabled; -import static io.trino.plugin.iceberg.IcebergSessionProperties.isParquetOptimizedReaderEnabled; +import static io.trino.plugin.iceberg.IcebergSessionProperties.isParquetIgnoreStatistics; +import static io.trino.plugin.iceberg.IcebergSessionProperties.isParquetVectorizedDecodingEnabled; import static io.trino.plugin.iceberg.IcebergSessionProperties.isUseFileSizeFromMetadata; import static io.trino.plugin.iceberg.IcebergSessionProperties.useParquetBloomFilter; import static io.trino.plugin.iceberg.IcebergSplitManager.ICEBERG_DOMAIN_COMPACTION_THRESHOLD; +import static io.trino.plugin.iceberg.IcebergSplitSource.partitionMatchesPredicate; +import static io.trino.plugin.iceberg.IcebergUtil.createNullBlock; import static io.trino.plugin.iceberg.IcebergUtil.deserializePartitionValue; +import static io.trino.plugin.iceberg.IcebergUtil.fromFieldBlocks; import static io.trino.plugin.iceberg.IcebergUtil.getColumnHandle; +import static io.trino.plugin.iceberg.IcebergUtil.getDomain; import static io.trino.plugin.iceberg.IcebergUtil.getPartitionKeys; +import static io.trino.plugin.iceberg.IcebergUtil.getPartitionValues; import static io.trino.plugin.iceberg.IcebergUtil.schemaFromHandles; -import static io.trino.plugin.iceberg.TypeConverter.ICEBERG_BINARY_TYPE; -import static io.trino.plugin.iceberg.TypeConverter.ORC_ICEBERG_ID_KEY; -import static io.trino.plugin.iceberg.delete.EqualityDeleteFilter.readEqualityDeletes; -import static io.trino.plugin.iceberg.delete.PositionDeleteFilter.readPositionDeletes; -import static io.trino.spi.StandardErrorCode.NOT_SUPPORTED; +import static io.trino.plugin.iceberg.util.OrcIcebergIds.fileColumnsByIcebergId; +import static io.trino.plugin.iceberg.util.OrcTypeConverter.ORC_ICEBERG_ID_KEY; +import static io.trino.spi.block.PageBuilderStatus.DEFAULT_MAX_PAGE_SIZE_IN_BYTES; import static io.trino.spi.predicate.Utils.nativeValueToBlock; -import static io.trino.spi.type.BigintType.BIGINT; import static io.trino.spi.type.BooleanType.BOOLEAN; import static io.trino.spi.type.DateTimeEncoding.packDateTimeWithZone; -import static io.trino.spi.type.IntegerType.INTEGER; import static io.trino.spi.type.TimeZoneKey.UTC_KEY; -import static io.trino.spi.type.UuidType.UUID; -import static io.trino.spi.type.VarcharType.VARCHAR; +import static java.lang.Math.min; +import static java.lang.Math.toIntExact; import static java.lang.String.format; import static java.util.Locale.ENGLISH; +import static java.util.Objects.checkIndex; import static java.util.Objects.requireNonNull; +import static java.util.function.Function.identity; import static java.util.function.Predicate.not; import static java.util.stream.Collectors.groupingBy; import static java.util.stream.Collectors.mapping; -import static java.util.stream.Collectors.toList; import static java.util.stream.Collectors.toUnmodifiableList; import static org.apache.iceberg.FileContent.EQUALITY_DELETES; import static 
org.apache.iceberg.FileContent.POSITION_DELETES; -import static org.apache.iceberg.MetadataColumns.DELETE_FILE_PATH; -import static org.apache.iceberg.MetadataColumns.DELETE_FILE_POS; import static org.apache.iceberg.MetadataColumns.ROW_POSITION; import static org.joda.time.DateTimeZone.UTC; @@ -200,25 +203,37 @@ public class IcebergPageSourceProvider { private static final String AVRO_FIELD_ID = "field-id"; - private final TrinoFileSystemFactory fileSystemFactory; + // This is used whenever a query doesn't reference any data columns. + // We need to limit the number of rows per page in case there are projections + // in the query that can cause page sizes to explode. For example: SELECT rand() FROM some_table + // TODO (https://github.com/trinodb/trino/issues/16824) allow connector to return pages of arbitrary row count and handle this gracefully in engine + private static final int MAX_RLE_PAGE_SIZE = DEFAULT_MAX_PAGE_SIZE_IN_BYTES / SIZE_OF_LONG; + + private final IcebergFileSystemFactory fileSystemFactory; + private final ForwardingFileIoFactory fileIoFactory; private final FileFormatDataSourceStats fileFormatDataSourceStats; private final OrcReaderOptions orcReaderOptions; private final ParquetReaderOptions parquetReaderOptions; private final TypeManager typeManager; + private final DeleteManager unpartitionedTableDeleteManager; + private final Map> partitionKeyFactories = new ConcurrentHashMap<>(); + private final Map partitionedDeleteManagers = new ConcurrentHashMap<>(); - @Inject public IcebergPageSourceProvider( - TrinoFileSystemFactory fileSystemFactory, + IcebergFileSystemFactory fileSystemFactory, + ForwardingFileIoFactory fileIoFactory, FileFormatDataSourceStats fileFormatDataSourceStats, - OrcReaderConfig orcReaderConfig, - ParquetReaderConfig parquetReaderConfig, + OrcReaderOptions orcReaderOptions, + ParquetReaderOptions parquetReaderOptions, TypeManager typeManager) { this.fileSystemFactory = requireNonNull(fileSystemFactory, "fileSystemFactory is null"); + this.fileIoFactory = requireNonNull(fileIoFactory, "fileIoFactory is null"); this.fileFormatDataSourceStats = requireNonNull(fileFormatDataSourceStats, "fileFormatDataSourceStats is null"); - this.orcReaderOptions = orcReaderConfig.toOrcReaderOptions(); - this.parquetReaderOptions = parquetReaderConfig.toParquetReaderOptions(); + this.orcReaderOptions = requireNonNull(orcReaderOptions, "orcReaderOptions is null"); + this.parquetReaderOptions = requireNonNull(parquetReaderOptions, "parquetReaderOptions is null"); this.typeManager = requireNonNull(typeManager, "typeManager is null"); + this.unpartitionedTableDeleteManager = new DeleteManager(typeManager); } @Override @@ -230,115 +245,222 @@ public ConnectorPageSource createPageSource( List columns, DynamicFilter dynamicFilter) { - IcebergSplit split = (IcebergSplit) connectorSplit; - IcebergTableHandle table = (IcebergTableHandle) connectorTable; + if (connectorSplit instanceof FilesTableSplit filesTableSplit) { + return new FilesTablePageSource( + typeManager, + fileSystemFactory.create(session.getIdentity(), filesTableSplit.fileIoProperties()), + fileIoFactory, + columns.stream().map(SystemColumnHandle.class::cast).map(SystemColumnHandle::getColumnName).collect(toImmutableList()), + filesTableSplit); + } + IcebergSplit split = (IcebergSplit) connectorSplit; List icebergColumns = columns.stream() .map(IcebergColumnHandle.class::cast) .collect(toImmutableList()); - - Schema tableSchema = SchemaParser.fromJson(table.getTableSchemaJson()); - - Set 
deleteFilterRequiredColumns = requiredColumnsForDeletes(tableSchema, split.getDeletes()); - - PartitionSpec partitionSpec = PartitionSpecParser.fromJson(tableSchema, split.getPartitionSpecJson()); + IcebergTableHandle tableHandle = (IcebergTableHandle) connectorTable; + Schema schema = SchemaParser.fromJson(tableHandle.getTableSchemaJson()); + PartitionSpec partitionSpec = PartitionSpecParser.fromJson(schema, split.getPartitionSpecJson()); org.apache.iceberg.types.Type[] partitionColumnTypes = partitionSpec.fields().stream() - .map(field -> field.transform().getResultType(tableSchema.findType(field.sourceId()))) + .map(field -> field.transform().getResultType(schema.findType(field.sourceId()))) .toArray(org.apache.iceberg.types.Type[]::new); - PartitionData partitionData = PartitionData.fromJson(split.getPartitionDataJson(), partitionColumnTypes); + + return createPageSource( + session, + icebergColumns, + schema, + partitionSpec, + PartitionData.fromJson(split.getPartitionDataJson(), partitionColumnTypes), + split.getDeletes(), + dynamicFilter, + tableHandle.getUnenforcedPredicate(), + split.getFileStatisticsDomain(), + split.getPath(), + split.getStart(), + split.getLength(), + split.getFileSize(), + split.getFileRecordCount(), + split.getPartitionDataJson(), + split.getFileFormat(), + split.getFileIoProperties(), + split.getDataSequenceNumber(), + tableHandle.getNameMappingJson().map(NameMappingParser::fromJson)); + } + + public ConnectorPageSource createPageSource( + ConnectorSession session, + List icebergColumns, + Schema tableSchema, + PartitionSpec partitionSpec, + PartitionData partitionData, + List deletes, + DynamicFilter dynamicFilter, + TupleDomain unenforcedPredicate, + TupleDomain fileStatisticsDomain, + String path, + long start, + long length, + long fileSize, + long fileRecordCount, + String partitionDataJson, + IcebergFileFormat fileFormat, + Map fileIoProperties, + long dataSequenceNumber, + Optional nameMapping) + { Map> partitionKeys = getPartitionKeys(partitionData, partitionSpec); + TupleDomain effectivePredicate = getUnenforcedPredicate( + tableSchema, + partitionKeys, + dynamicFilter, + unenforcedPredicate, + fileStatisticsDomain); + if (effectivePredicate.isNone()) { + return new EmptyPageSource(); + } + + // exit early when only reading partition keys from a simple split + String partition = partitionSpec.partitionToPath(partitionData); + TrinoFileSystem fileSystem = fileSystemFactory.create(session.getIdentity(), fileIoProperties); + TrinoInputFile inputFile = isUseFileSizeFromMetadata(session) + ? 
fileSystem.newInputFile(Location.of(path), fileSize) + : fileSystem.newInputFile(Location.of(path)); + try { + if (effectivePredicate.isAll() && + start == 0 && length == inputFile.length() && + deletes.isEmpty() && + icebergColumns.stream().allMatch(column -> partitionKeys.containsKey(column.getId()))) { + return generatePages( + fileRecordCount, + icebergColumns, + partitionKeys); + } + } + catch (IOException e) { + throw new UncheckedIOException(e); + } List requiredColumns = new ArrayList<>(icebergColumns); + Set deleteFilterRequiredColumns = requiredColumnsForDeletes(tableSchema, deletes); deleteFilterRequiredColumns.stream() .filter(not(icebergColumns::contains)) .forEach(requiredColumns::add); - icebergColumns.stream() - .filter(column -> column.isUpdateRowIdColumn() || column.isMergeRowIdColumn()) - .findFirst().ifPresent(rowIdColumn -> { - Set alreadyRequiredColumnIds = requiredColumns.stream() - .map(IcebergColumnHandle::getId) - .collect(toImmutableSet()); - for (ColumnIdentity identity : rowIdColumn.getColumnIdentity().getChildren()) { - if (alreadyRequiredColumnIds.contains(identity.getId())) { - // ignore - } - else if (identity.getId() == MetadataColumns.FILE_PATH.fieldId()) { - requiredColumns.add(new IcebergColumnHandle(identity, VARCHAR, ImmutableList.of(), VARCHAR, Optional.empty())); - } - else if (identity.getId() == ROW_POSITION.fieldId()) { - requiredColumns.add(new IcebergColumnHandle(identity, BIGINT, ImmutableList.of(), BIGINT, Optional.empty())); - } - else if (identity.getId() == TRINO_MERGE_PARTITION_SPEC_ID) { - requiredColumns.add(new IcebergColumnHandle(identity, INTEGER, ImmutableList.of(), INTEGER, Optional.empty())); - } - else if (identity.getId() == TRINO_MERGE_PARTITION_DATA) { - requiredColumns.add(new IcebergColumnHandle(identity, VARCHAR, ImmutableList.of(), VARCHAR, Optional.empty())); - } - else { - requiredColumns.add(getColumnHandle(tableSchema.findField(identity.getId()), typeManager)); - } - } - }); - - TupleDomain effectivePredicate = table.getUnenforcedPredicate() - .intersect(dynamicFilter.getCurrentPredicate().transformKeys(IcebergColumnHandle.class::cast)) - .simplify(ICEBERG_DOMAIN_COMPACTION_THRESHOLD); - if (effectivePredicate.isNone()) { - return new EmptyPageSource(); - } - - TrinoFileSystem fileSystem = fileSystemFactory.create(session); - TrinoInputFile inputfile = isUseFileSizeFromMetadata(session) - ? 
fileSystem.newInputFile(Location.of(split.getPath()), split.getFileSize()) - : fileSystem.newInputFile(Location.of(split.getPath())); - ReaderPageSourceWithRowPositions readerPageSourceWithRowPositions = createDataPageSource( session, - inputfile, - split.getStart(), - split.getLength(), + inputFile, + start, + length, + fileSize, partitionSpec.specId(), - split.getPartitionDataJson(), - split.getFileFormat(), - SchemaParser.fromJson(table.getTableSchemaJson()), + partitionDataJson, + fileFormat, + tableSchema, requiredColumns, effectivePredicate, - table.getNameMappingJson().map(NameMappingParser::fromJson), + nameMapping, + partition, partitionKeys); - ReaderPageSource dataPageSource = readerPageSourceWithRowPositions.getReaderPageSource(); - - Optional projectionsAdapter = dataPageSource.getReaderColumns().map(readerColumns -> - new ReaderProjectionsAdapter( - requiredColumns, - readerColumns, - column -> ((IcebergColumnHandle) column).getType(), - IcebergPageSourceProvider::applyProjection)); - - List readColumns = dataPageSource.getReaderColumns() - .map(readerColumns -> readerColumns.get().stream().map(IcebergColumnHandle.class::cast).collect(toList())) - .orElse(requiredColumns); - - Supplier> deletePredicate = Suppliers.memoize(() -> { - List deleteFilters = readDeletes( - session, - tableSchema, - split.getPath(), - split.getDeletes(), - readerPageSourceWithRowPositions.getStartRowPosition(), - readerPageSourceWithRowPositions.getEndRowPosition()); - return deleteFilters.stream() - .map(filter -> filter.createPredicate(readColumns)) - .reduce(RowPredicate::and); - }); - return new IcebergPageSource( - icebergColumns, - requiredColumns, - dataPageSource.get(), - projectionsAdapter, - deletePredicate); + ConnectorPageSource pageSource = readerPageSourceWithRowPositions.pageSource(); + + // filter out deleted rows + if (!deletes.isEmpty()) { + Supplier> deletePredicate = memoize(() -> getDeleteManager(partitionSpec, partitionData) + .getDeletePredicate( + path, + dataSequenceNumber, + deletes, + requiredColumns, + tableSchema, + readerPageSourceWithRowPositions, + (deleteFile, deleteColumns, tupleDomain) -> openDeletes(session, fileSystem, deleteFile, deleteColumns, tupleDomain))); + pageSource = TransformConnectorPageSource.create(pageSource, page -> { + try { + Optional rowPredicate = deletePredicate.get(); + rowPredicate.ifPresent(predicate -> predicate.applyFilter(page)); + if (icebergColumns.size() == page.getChannelCount()) { + return page; + } + return new PrefixColumnsSourcePage(page, icebergColumns.size()).getPage(); + } + catch (RuntimeException e) { + throwIfInstanceOf(e, TrinoException.class); + throw new TrinoException(ICEBERG_BAD_DATA, e); + } + }); + } + return pageSource; + } + + private DeleteManager getDeleteManager(PartitionSpec partitionSpec, PartitionData partitionData) + { + if (partitionSpec.isUnpartitioned()) { + return unpartitionedTableDeleteManager; + } + + Types.StructType structType = partitionSpec.partitionType(); + PartitionKey partitionKey = partitionKeyFactories.computeIfAbsent( + partitionSpec.specId(), + key -> { + // creating the template wrapper is expensive, reuse it for all partitions of the same spec + // reuse is only safe because we only use the copyFor method which is thread safe + StructLikeWrapper templateWrapper = StructLikeWrapper.forType(structType); + return data -> new PartitionKey(key, templateWrapper.copyFor(data)); + }) + .apply(partitionData); + + return partitionedDeleteManagers.computeIfAbsent(partitionKey, ignored -> new 
DeleteManager(typeManager)); + } + + private record PartitionKey(int specId, StructLikeWrapper partitionData) {} + + private TupleDomain getUnenforcedPredicate( + Schema tableSchema, + Map> partitionKeys, + DynamicFilter dynamicFilter, + TupleDomain unenforcedPredicate, + TupleDomain fileStatisticsDomain) + { + return prunePredicate( + tableSchema, + partitionKeys, + // We reach here when we could not prune the split using file level stats, table predicate + // and the dynamic filter in the coordinator during split generation. The file level stats + // in IcebergSplit#fileStatisticsDomain could help to prune this split when a more selective dynamic filter + // is available now, without having to access parquet/orc file footer for row-group/stripe stats. + TupleDomain.intersect(ImmutableList.of( + unenforcedPredicate, + fileStatisticsDomain, + dynamicFilter.getCurrentPredicate().transformKeys(IcebergColumnHandle.class::cast))), + fileStatisticsDomain) + .simplify(ICEBERG_DOMAIN_COMPACTION_THRESHOLD); + } + + private TupleDomain prunePredicate( + Schema tableSchema, + Map> partitionKeys, + TupleDomain unenforcedPredicate, + TupleDomain fileStatisticsDomain) + { + if (unenforcedPredicate.isAll() || unenforcedPredicate.isNone()) { + return unenforcedPredicate; + } + + Set partitionColumns = partitionKeys.keySet().stream() + .map(fieldId -> getColumnHandle(tableSchema.findField(fieldId), typeManager)) + .collect(toImmutableSet()); + Supplier> partitionValues = memoize(() -> getPartitionValues(partitionColumns, partitionKeys)); + if (!partitionMatchesPredicate(partitionColumns, partitionValues, unenforcedPredicate)) { + return TupleDomain.none(); + } + + return unenforcedPredicate + // Filter out partition columns domains from the dynamic filter because they should be irrelevant at data file level + .filter((columnHandle, ignore) -> !partitionKeys.containsKey(columnHandle.getId())) + // remove domains from predicate that fully contain split data because they are irrelevant for filtering + .filter((handle, domain) -> !domain.contains(getDomain(fileStatisticsDomain, handle, domain.getType()))); } private Set requiredColumnsForDeletes(Schema schema, List deletes) @@ -358,92 +480,19 @@ else if (deleteFile.content() == EQUALITY_DELETES) { return requiredColumns.build(); } - private List readDeletes( - ConnectorSession session, - Schema schema, - String dataFilePath, - List deleteFiles, - Optional startRowPosition, - Optional endRowPosition) - { - verify(startRowPosition.isPresent() == endRowPosition.isPresent(), "startRowPosition and endRowPosition must be specified together"); - - Slice targetPath = utf8Slice(dataFilePath); - List filters = new ArrayList<>(); - LongBitmapDataProvider deletedRows = new Roaring64Bitmap(); - - IcebergColumnHandle deleteFilePath = getColumnHandle(DELETE_FILE_PATH, typeManager); - IcebergColumnHandle deleteFilePos = getColumnHandle(DELETE_FILE_POS, typeManager); - List deleteColumns = ImmutableList.of(deleteFilePath, deleteFilePos); - TupleDomain deleteDomain = TupleDomain.fromFixedValues(ImmutableMap.of(deleteFilePath, NullableValue.of(VARCHAR, targetPath))); - if (startRowPosition.isPresent()) { - Range positionRange = Range.range(deleteFilePos.getType(), startRowPosition.get(), true, endRowPosition.get(), true); - TupleDomain positionDomain = TupleDomain.withColumnDomains(ImmutableMap.of(deleteFilePos, Domain.create(ValueSet.ofRanges(positionRange), false))); - deleteDomain = deleteDomain.intersect(positionDomain); - } - - for (DeleteFile delete : deleteFiles) { - if 
(delete.content() == POSITION_DELETES) { - if (startRowPosition.isPresent()) { - byte[] lowerBoundBytes = delete.getLowerBounds().get(DELETE_FILE_POS.fieldId()); - Optional positionLowerBound = Optional.ofNullable(lowerBoundBytes) - .map(bytes -> Conversions.fromByteBuffer(DELETE_FILE_POS.type(), ByteBuffer.wrap(bytes))); - - byte[] upperBoundBytes = delete.getUpperBounds().get(DELETE_FILE_POS.fieldId()); - Optional positionUpperBound = Optional.ofNullable(upperBoundBytes) - .map(bytes -> Conversions.fromByteBuffer(DELETE_FILE_POS.type(), ByteBuffer.wrap(bytes))); - - if ((positionLowerBound.isPresent() && positionLowerBound.get() > endRowPosition.get()) || - (positionUpperBound.isPresent() && positionUpperBound.get() < startRowPosition.get())) { - continue; - } - } - - try (ConnectorPageSource pageSource = openDeletes(session, delete, deleteColumns, deleteDomain)) { - readPositionDeletes(pageSource, targetPath, deletedRows); - } - catch (IOException e) { - throw new UncheckedIOException(e); - } - } - else if (delete.content() == EQUALITY_DELETES) { - List fieldIds = delete.equalityFieldIds(); - verify(!fieldIds.isEmpty(), "equality field IDs are missing"); - List columns = fieldIds.stream() - .map(id -> getColumnHandle(schema.findField(id), typeManager)) - .collect(toImmutableList()); - - try (ConnectorPageSource pageSource = openDeletes(session, delete, columns, TupleDomain.all())) { - filters.add(readEqualityDeletes(pageSource, columns, schema)); - } - catch (IOException e) { - throw new UncheckedIOException(e); - } - } - else { - throw new VerifyException("Unknown delete content: " + delete.content()); - } - } - - if (!deletedRows.isEmpty()) { - filters.add(new PositionDeleteFilter(deletedRows)); - } - - return filters; - } - private ConnectorPageSource openDeletes( ConnectorSession session, + TrinoFileSystem fileSystem, DeleteFile delete, List columns, TupleDomain tupleDomain) { - TrinoFileSystem fileSystem = fileSystemFactory.create(session); return createDataPageSource( session, fileSystem.newInputFile(Location.of(delete.path()), delete.fileSizeInBytes()), 0, delete.fileSizeInBytes(), + delete.fileSizeInBytes(), 0, "", IcebergFileFormat.fromIceberg(delete.format()), @@ -451,16 +500,17 @@ private ConnectorPageSource openDeletes( columns, tupleDomain, Optional.empty(), + "", ImmutableMap.of()) - .getReaderPageSource() - .get(); + .pageSource(); } - public ReaderPageSourceWithRowPositions createDataPageSource( + private ReaderPageSourceWithRowPositions createDataPageSource( ConnectorSession session, TrinoInputFile inputFile, long start, long length, + long fileSize, int partitionSpecId, String partitionData, IcebergFileFormat fileFormat, @@ -468,62 +518,101 @@ public ReaderPageSourceWithRowPositions createDataPageSource( List dataColumns, TupleDomain predicate, Optional nameMapping, + String partition, Map> partitionKeys) { - switch (fileFormat) { - case ORC: - return createOrcPageSource( - inputFile, - start, - length, - partitionSpecId, - partitionData, - dataColumns, - predicate, - orcReaderOptions - .withMaxMergeDistance(getOrcMaxMergeDistance(session)) - .withMaxBufferSize(getOrcMaxBufferSize(session)) - .withStreamBufferSize(getOrcStreamBufferSize(session)) - .withTinyStripeThreshold(getOrcTinyStripeThreshold(session)) - .withMaxReadBlockSize(getOrcMaxReadBlockSize(session)) - .withLazyReadSmallRanges(getOrcLazyReadSmallRanges(session)) - .withNestedLazy(isOrcNestedLazy(session)) - .withBloomFiltersEnabled(isOrcBloomFiltersEnabled(session)), - fileFormatDataSourceStats, - 
typeManager, - nameMapping, - partitionKeys); - case PARQUET: - return createParquetPageSource( - inputFile, - start, - length, - partitionSpecId, - partitionData, - dataColumns, - parquetReaderOptions - .withMaxReadBlockSize(getParquetMaxReadBlockSize(session)) - .withMaxReadBlockRowCount(getParquetMaxReadBlockRowCount(session)) - .withBatchColumnReaders(isParquetOptimizedReaderEnabled(session)) - .withBloomFilter(useParquetBloomFilter(session)) - .withBatchNestedColumnReaders(isParquetOptimizedNestedReaderEnabled(session)), - predicate, - fileFormatDataSourceStats, - nameMapping, - partitionKeys); - case AVRO: - return createAvroPageSource( - inputFile, - start, - length, - partitionSpecId, - partitionData, - fileSchema, - nameMapping, - dataColumns); - default: - throw new TrinoException(NOT_SUPPORTED, "File format not supported for Iceberg: " + fileFormat); + return switch (fileFormat) { + case ORC -> createOrcPageSource( + inputFile, + start, + length, + partitionSpecId, + partitionData, + dataColumns, + predicate, + orcReaderOptions + .withMaxMergeDistance(getOrcMaxMergeDistance(session)) + .withMaxBufferSize(getOrcMaxBufferSize(session)) + .withStreamBufferSize(getOrcStreamBufferSize(session)) + .withTinyStripeThreshold(getOrcTinyStripeThreshold(session)) + .withMaxReadBlockSize(getOrcMaxReadBlockSize(session)) + .withLazyReadSmallRanges(getOrcLazyReadSmallRanges(session)) + .withNestedLazy(isOrcNestedLazy(session)) + .withBloomFiltersEnabled(isOrcBloomFiltersEnabled(session)), + fileFormatDataSourceStats, + typeManager, + nameMapping, + partition, + partitionKeys); + case PARQUET -> createParquetPageSource( + inputFile, + start, + length, + fileSize, + partitionSpecId, + partitionData, + dataColumns, + ParquetReaderOptions.builder(parquetReaderOptions) + .withMaxReadBlockSize(getParquetMaxReadBlockSize(session)) + .withMaxReadBlockRowCount(getParquetMaxReadBlockRowCount(session)) + .withSmallFileThreshold(getParquetSmallFileThreshold(session)) + .withIgnoreStatistics(isParquetIgnoreStatistics(session)) + .withBloomFilter(useParquetBloomFilter(session)) + // TODO https://github.com/trinodb/trino/issues/11000 + .withUseColumnIndex(false) + .withVectorizedDecodingEnabled(isParquetVectorizedDecodingEnabled(session)) + .build(), + predicate, + fileFormatDataSourceStats, + nameMapping, + partition, + partitionKeys); + case AVRO -> createAvroPageSource( + inputFile, + start, + length, + partitionSpecId, + partitionData, + fileSchema, + nameMapping, + partition, + dataColumns); + }; + } + + private static ConnectorPageSource generatePages( + long totalRowCount, + List icebergColumns, + Map> partitionKeys) + { + int maxPageSize = MAX_RLE_PAGE_SIZE; + Block[] pageBlocks = new Block[icebergColumns.size()]; + for (int i = 0; i < icebergColumns.size(); i++) { + IcebergColumnHandle column = icebergColumns.get(i); + Type trinoType = column.getType(); + Object partitionValue = deserializePartitionValue(trinoType, partitionKeys.get(column.getId()).orElse(null), column.getName()); + pageBlocks[i] = RunLengthEncodedBlock.create(nativeValueToBlock(trinoType, partitionValue), maxPageSize); } + Page maxPage = new Page(maxPageSize, pageBlocks); + + return new FixedPageSource( + new AbstractIterator<>() + { + private long rowIndex; + + @Override + protected Page computeNext() + { + if (rowIndex == totalRowCount) { + return endOfData(); + } + int pageSize = toIntExact(min(maxPageSize, totalRowCount - rowIndex)); + Page page = maxPage.getRegion(0, pageSize); + rowIndex += pageSize; + return page; + } + }, + 
maxPage.getRetainedSizeInBytes()); } private static ReaderPageSourceWithRowPositions createOrcPageSource( @@ -538,6 +627,7 @@ private static ReaderPageSourceWithRowPositions createOrcPageSource( FileFormatDataSourceStats stats, TypeManager typeManager, Optional nameMapping, + String partition, Map> partitionKeys) { OrcDataSource orcDataSource = null; @@ -547,92 +637,90 @@ private static ReaderPageSourceWithRowPositions createOrcPageSource( OrcReader reader = OrcReader.createOrcReader(orcDataSource, options) .orElseThrow(() -> new TrinoException(ICEBERG_BAD_DATA, "ORC file is zero length")); - List fileColumns = reader.getRootColumn().getNestedColumns(); - if (nameMapping.isPresent() && !hasIds(reader.getRootColumn())) { - fileColumns = fileColumns.stream() - .map(orcColumn -> setMissingFieldIds(orcColumn, nameMapping.get(), ImmutableList.of(orcColumn.getColumnName()))) - .collect(toImmutableList()); - } - - Map fileColumnsByIcebergId = mapIdsToOrcFileColumns(fileColumns); + Map fileColumnsByIcebergId = fileColumnsByIcebergId(reader, nameMapping); TupleDomainOrcPredicateBuilder predicateBuilder = TupleDomainOrcPredicate.builder() .setBloomFiltersEnabled(options.isBloomFiltersEnabled()); Map effectivePredicateDomains = effectivePredicate.getDomains() .orElseThrow(() -> new IllegalArgumentException("Effective predicate is none")); + for (IcebergColumnHandle column : columns) { + for (Map.Entry domainEntry : effectivePredicateDomains.entrySet()) { + IcebergColumnHandle predicateColumn = domainEntry.getKey(); + OrcColumn predicateOrcColumn = fileColumnsByIcebergId.get(predicateColumn.getId()); + if (predicateOrcColumn != null && column.getBaseColumnIdentity().equals(predicateColumn.getBaseColumnIdentity())) { + predicateBuilder.addColumn(predicateOrcColumn.getColumnId(), domainEntry.getValue()); + } + } + } - Optional baseColumnProjections = projectBaseColumns(columns); Map>> projectionsByFieldId = columns.stream() .collect(groupingBy( column -> column.getBaseColumnIdentity().getId(), mapping(IcebergColumnHandle::getPath, toUnmodifiableList()))); - List readBaseColumns = baseColumnProjections - .map(readerColumns -> (List) readerColumns.get().stream().map(IcebergColumnHandle.class::cast).collect(toImmutableList())) - .orElse(columns); - List fileReadColumns = new ArrayList<>(readBaseColumns.size()); - List fileReadTypes = new ArrayList<>(readBaseColumns.size()); - List projectedLayouts = new ArrayList<>(readBaseColumns.size()); - List columnAdaptations = new ArrayList<>(readBaseColumns.size()); - - for (IcebergColumnHandle column : readBaseColumns) { - verify(column.isBaseColumn(), "Column projections must be based from a root column"); - OrcColumn orcColumn = fileColumnsByIcebergId.get(column.getId()); + List baseColumns = new ArrayList<>(columns.size()); + Map baseColumnIdToOrdinal = new HashMap<>(); + List fileReadColumns = new ArrayList<>(columns.size()); + List fileReadTypes = new ArrayList<>(columns.size()); + List projectedLayouts = new ArrayList<>(columns.size()); + TransformConnectorPageSource.Builder transforms = TransformConnectorPageSource.builder(); + boolean appendRowNumberColumn = false; + for (IcebergColumnHandle column : columns) { if (column.isIsDeletedColumn()) { - columnAdaptations.add(ColumnAdaptation.constantColumn(nativeValueToBlock(BOOLEAN, false))); + transforms.constantValue(nativeValueToBlock(BOOLEAN, false)); } else if (partitionKeys.containsKey(column.getId())) { Type trinoType = column.getType(); - 
columnAdaptations.add(ColumnAdaptation.constantColumn(nativeValueToBlock( + transforms.constantValue(nativeValueToBlock( trinoType, - deserializePartitionValue(trinoType, partitionKeys.get(column.getId()).orElse(null), column.getName())))); + deserializePartitionValue(trinoType, partitionKeys.get(column.getId()).orElse(null), column.getName()))); + } + else if (column.isPartitionColumn()) { + transforms.constantValue(nativeValueToBlock(PARTITION.getType(), utf8Slice(partition))); } else if (column.isPathColumn()) { - columnAdaptations.add(ColumnAdaptation.constantColumn(nativeValueToBlock(FILE_PATH.getType(), utf8Slice(inputFile.location().toString())))); + transforms.constantValue(nativeValueToBlock(FILE_PATH.getType(), utf8Slice(inputFile.location().toString()))); } else if (column.isFileModifiedTimeColumn()) { - columnAdaptations.add(ColumnAdaptation.constantColumn(nativeValueToBlock(FILE_MODIFIED_TIME.getType(), packDateTimeWithZone(inputFile.lastModified().toEpochMilli(), UTC_KEY)))); + transforms.constantValue(nativeValueToBlock(FILE_MODIFIED_TIME.getType(), packDateTimeWithZone(inputFile.lastModified().toEpochMilli(), UTC_KEY))); } - else if (column.isUpdateRowIdColumn() || column.isMergeRowIdColumn()) { - // $row_id is a composite of multiple physical columns. It is assembled by the IcebergPageSource - columnAdaptations.add(ColumnAdaptation.nullColumn(column.getType())); + else if (column.isMergeRowIdColumn()) { + appendRowNumberColumn = true; + transforms.transform(MergeRowIdTransform.create(utf8Slice(inputFile.location().toString()), partitionSpecId, utf8Slice(partitionData))); } else if (column.isRowPositionColumn()) { - columnAdaptations.add(ColumnAdaptation.positionColumn()); - } - else if (column.getId() == TRINO_MERGE_PARTITION_SPEC_ID) { - columnAdaptations.add(ColumnAdaptation.constantColumn(nativeValueToBlock(column.getType(), (long) partitionSpecId))); + appendRowNumberColumn = true; + transforms.transform(new GetRowPositionFromSource()); } - else if (column.getId() == TRINO_MERGE_PARTITION_DATA) { - columnAdaptations.add(ColumnAdaptation.constantColumn(nativeValueToBlock(column.getType(), utf8Slice(partitionData)))); + else if (!fileColumnsByIcebergId.containsKey(column.getBaseColumnIdentity().getId())) { + transforms.constantValue(createNullBlock(column.getType())); } - else if (orcColumn != null) { - Type readType = getOrcReadType(column.getType(), typeManager); - - if (column.getType() == UUID && !"UUID".equals(orcColumn.getAttributes().get(ICEBERG_BINARY_TYPE))) { - throw new TrinoException(ICEBERG_BAD_DATA, format("Expected ORC column for UUID data to be annotated with %s=UUID: %s", ICEBERG_BINARY_TYPE, orcColumn)); + else { + IcebergColumnHandle baseColumn = column.getBaseColumn(); + Integer ordinal = baseColumnIdToOrdinal.get(baseColumn.getId()); + if (ordinal == null) { + ordinal = baseColumns.size(); + baseColumns.add(baseColumn); + baseColumnIdToOrdinal.put(baseColumn.getId(), ordinal); + + OrcColumn orcBaseColumn = requireNonNull(fileColumnsByIcebergId.get(baseColumn.getId())); + fileReadColumns.add(orcBaseColumn); + fileReadTypes.add(getOrcReadType(baseColumn.getType(), typeManager)); + projectedLayouts.add(IcebergOrcProjectedLayout.createProjectedLayout( + orcBaseColumn, + projectionsByFieldId.get(baseColumn.getId()))); } - List> fieldIdProjections = projectionsByFieldId.get(column.getId()); - ProjectedLayout projectedLayout = IcebergOrcProjectedLayout.createProjectedLayout(orcColumn, fieldIdProjections); - - int sourceIndex = fileReadColumns.size(); - 
columnAdaptations.add(ColumnAdaptation.sourceColumn(sourceIndex)); - fileReadColumns.add(orcColumn); - fileReadTypes.add(readType); - projectedLayouts.add(projectedLayout); - - for (Map.Entry domainEntry : effectivePredicateDomains.entrySet()) { - IcebergColumnHandle predicateColumn = domainEntry.getKey(); - OrcColumn predicateOrcColumn = fileColumnsByIcebergId.get(predicateColumn.getId()); - if (predicateOrcColumn != null && column.getColumnIdentity().equals(predicateColumn.getBaseColumnIdentity())) { - predicateBuilder.addColumn(predicateOrcColumn.getColumnId(), domainEntry.getValue()); - } + if (column.isBaseColumn()) { + transforms.column(ordinal); + } + else { + transforms.dereferenceField(ImmutableList.builder() + .add(ordinal) + .addAll(applyProjection(column, baseColumn)) + .build()); } - } - else { - columnAdaptations.add(ColumnAdaptation.nullColumn(column.getType())); } } @@ -649,20 +737,22 @@ else if (orcColumn != null) { memoryUsage, INITIAL_BATCH_SIZE, exception -> handleException(orcDataSourceId, exception), - new IdBasedFieldMapperFactory(readBaseColumns)); + new IdBasedFieldMapperFactory(baseColumns)); + + ConnectorPageSource pageSource = new OrcPageSource( + recordReader, + List.of(), // FIXME: If we use ORC + orcDataSource, + Optional.empty(), + Optional.empty(), + memoryUsage, + stats, + reader.getCompressionKind()); + + pageSource = transforms.build(pageSource); return new ReaderPageSourceWithRowPositions( - new ReaderPageSource( - new OrcPageSource( - recordReader, - columnAdaptations, - orcDataSource, - Optional.empty(), - Optional.empty(), - memoryUsage, - stats, - reader.getCompressionKind()), - baseColumnProjections), + pageSource, recordReader.getStartRowPosition(), recordReader.getEndRowPosition()); } @@ -677,8 +767,8 @@ else if (orcColumn != null) { } } } - if (e instanceof TrinoException) { - throw (TrinoException) e; + if (e instanceof TrinoException trinoException) { + throw trinoException; } if (e instanceof OrcCorruptionException) { throw new TrinoException(ICEBERG_BAD_DATA, e); @@ -688,48 +778,6 @@ else if (orcColumn != null) { } } - private static boolean hasIds(OrcColumn column) - { - if (column.getAttributes().containsKey(ORC_ICEBERG_ID_KEY)) { - return true; - } - - return column.getNestedColumns().stream().anyMatch(IcebergPageSourceProvider::hasIds); - } - - private static OrcColumn setMissingFieldIds(OrcColumn column, NameMapping nameMapping, List qualifiedPath) - { - MappedField mappedField = nameMapping.find(qualifiedPath); - - ImmutableMap.Builder attributes = ImmutableMap.builder() - .putAll(column.getAttributes()); - if (mappedField != null && mappedField.id() != null) { - attributes.put(ORC_ICEBERG_ID_KEY, String.valueOf(mappedField.id())); - } - - return new OrcColumn( - column.getPath(), - column.getColumnId(), - column.getColumnName(), - column.getColumnType(), - column.getOrcDataSourceId(), - column.getNestedColumns().stream() - .map(nestedColumn -> { - ImmutableList.Builder nextQualifiedPath = ImmutableList.builder() - .addAll(qualifiedPath); - if (column.getColumnType() == OrcType.OrcTypeKind.LIST) { - // The Trino ORC reader uses "item" for list element names, but the NameMapper expects "element" - nextQualifiedPath.add("element"); - } - else { - nextQualifiedPath.add(nestedColumn.getColumnName()); - } - return setMissingFieldIds(nestedColumn, nameMapping, nextQualifiedPath.build()); - }) - .collect(toImmutableList()), - attributes.buildOrThrow()); - } - /** * Gets the index based dereference chain to get from the readColumnHandle to the 
expectedColumnHandle */ @@ -750,39 +798,25 @@ private static List applyProjection(ColumnHandle expectedColumnHandle, return dereferenceChain.build(); } - private static Map mapIdsToOrcFileColumns(List columns) - { - ImmutableMap.Builder columnsById = ImmutableMap.builder(); - Traverser.forTree(OrcColumn::getNestedColumns) - .depthFirstPreOrder(columns) - .forEach(column -> { - String fieldId = column.getAttributes().get(ORC_ICEBERG_ID_KEY); - if (fieldId != null) { - columnsById.put(Integer.parseInt(fieldId), column); - } - }); - return columnsById.buildOrThrow(); - } - private static Integer getIcebergFieldId(OrcColumn column) { String icebergId = column.getAttributes().get(ORC_ICEBERG_ID_KEY); - verify(icebergId != null, format("column %s does not have %s property", column, ORC_ICEBERG_ID_KEY)); + verify(icebergId != null, "column %s does not have %s property", column, ORC_ICEBERG_ID_KEY); return Integer.valueOf(icebergId); } private static Type getOrcReadType(Type columnType, TypeManager typeManager) { - if (columnType instanceof ArrayType) { - return new ArrayType(getOrcReadType(((ArrayType) columnType).getElementType(), typeManager)); + if (columnType instanceof ArrayType arrayType) { + return new ArrayType(getOrcReadType(arrayType.getElementType(), typeManager)); } if (columnType instanceof MapType mapType) { Type keyType = getOrcReadType(mapType.getKeyType(), typeManager); Type valueType = getOrcReadType(mapType.getValueType(), typeManager); return new MapType(keyType, valueType, typeManager.getTypeOperators()); } - if (columnType instanceof RowType) { - return RowType.from(((RowType) columnType).getFields().stream() + if (columnType instanceof RowType rowType) { + return RowType.from(rowType.getFields().stream() .map(field -> new RowType.Field(field.getName(), getOrcReadType(field.getType(), typeManager))) .collect(toImmutableList())); } @@ -802,8 +836,8 @@ public IdBasedFieldMapperFactory(List columns) ImmutableMap.Builder> mapping = ImmutableMap.builder(); for (IcebergColumnHandle column : columns) { - if (column.isUpdateRowIdColumn() || column.isMergeRowIdColumn()) { - // The update $row_id column contains fields which should not be accounted for in the mapping. + if (column.isMergeRowIdColumn()) { + // The merge $row_id column contains fields which should not be accounted for in the mapping. 
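Partition values, $path, $file_modified_time and other per-split constants are materialized above as run-length-encoded blocks, and the generatePages short circuit earlier emits whole pages of such constants when every projected column is one. A minimal sketch of that pattern, not part of the patch: the 8192-row page cap and the use of BIGINT values are assumptions made only to keep the example self-contained.

import io.trino.spi.Page;
import io.trino.spi.block.Block;
import io.trino.spi.block.RunLengthEncodedBlock;

import java.util.ArrayList;
import java.util.List;

import static io.trino.spi.predicate.Utils.nativeValueToBlock;
import static io.trino.spi.type.BigintType.BIGINT;
import static java.lang.Math.min;
import static java.lang.Math.toIntExact;

// Sketch: when every output column of a split is a single constant, pages are just
// run-length-encoded blocks sliced down to the number of rows still to be emitted.
public final class ConstantPageSketch
{
    private static final int MAX_PAGE_SIZE = 8192; // assumed cap; the provider defines its own limit

    private ConstantPageSketch() {}

    public static List<Page> pagesOf(long totalRowCount, List<Long> bigintValues)
    {
        Block[] blocks = new Block[bigintValues.size()];
        for (int i = 0; i < bigintValues.size(); i++) {
            // one single-value block per column, repeated MAX_PAGE_SIZE times without copying
            blocks[i] = RunLengthEncodedBlock.create(nativeValueToBlock(BIGINT, bigintValues.get(i)), MAX_PAGE_SIZE);
        }
        Page maxPage = new Page(MAX_PAGE_SIZE, blocks);

        List<Page> pages = new ArrayList<>();
        long emitted = 0;
        while (emitted < totalRowCount) {
            int pageSize = toIntExact(min(MAX_PAGE_SIZE, totalRowCount - emitted));
            pages.add(maxPage.getRegion(0, pageSize));
            emitted += pageSize;
        }
        return pages;
    }
}

The real generatePages additionally derives each constant with deserializePartitionValue and the column's own Trino type instead of the fixed BIGINT used here.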
continue; } @@ -868,22 +902,24 @@ private static ReaderPageSourceWithRowPositions createParquetPageSource( TrinoInputFile inputFile, long start, long length, + long fileSize, int partitionSpecId, String partitionData, - List regularColumns, + List columns, ParquetReaderOptions options, TupleDomain effectivePredicate, FileFormatDataSourceStats fileFormatDataSourceStats, Optional nameMapping, + String partition, Map> partitionKeys) { AggregatedMemoryContext memoryContext = newSimpleAggregatedMemoryContext(); ParquetDataSource dataSource = null; try { - dataSource = new TrinoParquetDataSource(inputFile, options, fileFormatDataSourceStats); - ParquetMetadata parquetMetadata = MetadataReader.readFooter(dataSource, Optional.empty()); - FileMetaData fileMetaData = parquetMetadata.getFileMetaData(); + dataSource = createDataSource(inputFile, OptionalLong.of(fileSize), options, memoryContext, fileFormatDataSourceStats); + ParquetMetadata parquetMetadata = MetadataReader.readFooter(dataSource, options.getMaxFooterReadSize()); + FileMetadata fileMetaData = parquetMetadata.getFileMetaData(); MessageType fileSchema = fileMetaData.getSchema(); if (nameMapping.isPresent() && !ParquetSchemaUtil.hasIds(fileSchema)) { // NameMapping conversion is necessary because MetadataReader converts all column names to lowercase and NameMapping is case sensitive @@ -891,114 +927,123 @@ private static ReaderPageSourceWithRowPositions createParquetPageSource( } // Mapping from Iceberg field ID to Parquet fields. - Map parquetIdToField = createParquetIdToFieldMapping(fileSchema); - - Optional baseColumnProjections = projectBaseColumns(regularColumns); - List readBaseColumns = baseColumnProjections - .map(readerColumns -> (List) readerColumns.get().stream().map(IcebergColumnHandle.class::cast).collect(toImmutableList())) - .orElse(regularColumns); - - List parquetFields = readBaseColumns.stream() - .map(column -> parquetIdToField.get(column.getId())) - .collect(toList()); + Map parquetIdToFieldName = createParquetIdToFieldMapping(fileSchema); - MessageType requestedSchema = getMessageType(regularColumns, fileSchema.getName(), parquetIdToField); + MessageType requestedSchema = getMessageType(columns, fileSchema.getName(), parquetIdToFieldName); Map, ColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, requestedSchema); - TupleDomain parquetTupleDomain = getParquetTupleDomain(descriptorsByPath, effectivePredicate); + TupleDomain parquetTupleDomain = options.isIgnoreStatistics() ? 
TupleDomain.all() : getParquetTupleDomain(descriptorsByPath, effectivePredicate); TupleDomainParquetPredicate parquetPredicate = buildPredicate(requestedSchema, parquetTupleDomain, descriptorsByPath, UTC); - long nextStart = 0; - Optional startRowPosition = Optional.empty(); - Optional endRowPosition = Optional.empty(); - ImmutableList.Builder blockStarts = ImmutableList.builder(); - List blocks = new ArrayList<>(); - for (BlockMetaData block : parquetMetadata.getBlocks()) { - long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset(); - Optional bloomFilterStore = getBloomFilterStore(dataSource, block, parquetTupleDomain, options); - - if (start <= firstDataPage && firstDataPage < start + length && - predicateMatches(parquetPredicate, block, dataSource, descriptorsByPath, parquetTupleDomain, Optional.empty(), bloomFilterStore, UTC, ICEBERG_DOMAIN_COMPACTION_THRESHOLD)) { - blocks.add(block); - blockStarts.add(nextStart); - if (startRowPosition.isEmpty()) { - startRowPosition = Optional.of(nextStart); - } - endRowPosition = Optional.of(nextStart + block.getRowCount()); - } - nextStart += block.getRowCount(); - } - MessageColumnIO messageColumnIO = getColumnIO(fileSchema, requestedSchema); - ParquetPageSource.Builder pageSourceBuilder = ParquetPageSource.builder(); - int parquetSourceChannel = 0; - - ImmutableList.Builder parquetColumnFieldsBuilder = ImmutableList.builder(); - for (int columnIndex = 0; columnIndex < readBaseColumns.size(); columnIndex++) { - IcebergColumnHandle column = readBaseColumns.get(columnIndex); + Map baseColumnIdToOrdinal = new HashMap<>(); + TransformConnectorPageSource.Builder transforms = TransformConnectorPageSource.builder(); + boolean appendRowNumberColumn = false; + int nextOrdinal = 0; + ImmutableList.Builder parquetColumnFieldsBuilder = ImmutableList.builder(); + for (IcebergColumnHandle column : columns) { if (column.isIsDeletedColumn()) { - pageSourceBuilder.addConstantColumn(nativeValueToBlock(BOOLEAN, false)); + transforms.constantValue(nativeValueToBlock(BOOLEAN, false)); } else if (partitionKeys.containsKey(column.getId())) { Type trinoType = column.getType(); - pageSourceBuilder.addConstantColumn(nativeValueToBlock( + transforms.constantValue(nativeValueToBlock( trinoType, deserializePartitionValue(trinoType, partitionKeys.get(column.getId()).orElse(null), column.getName()))); } + else if (column.isPartitionColumn()) { + transforms.constantValue(nativeValueToBlock(PARTITION.getType(), utf8Slice(partition))); + } else if (column.isPathColumn()) { - pageSourceBuilder.addConstantColumn(nativeValueToBlock(FILE_PATH.getType(), utf8Slice(inputFile.location().toString()))); + transforms.constantValue(nativeValueToBlock(FILE_PATH.getType(), utf8Slice(inputFile.location().toString()))); } else if (column.isFileModifiedTimeColumn()) { - pageSourceBuilder.addConstantColumn(nativeValueToBlock(FILE_MODIFIED_TIME.getType(), packDateTimeWithZone(inputFile.lastModified().toEpochMilli(), UTC_KEY))); + transforms.constantValue(nativeValueToBlock(FILE_MODIFIED_TIME.getType(), packDateTimeWithZone(inputFile.lastModified().toEpochMilli(), UTC_KEY))); } - else if (column.isUpdateRowIdColumn() || column.isMergeRowIdColumn()) { - // $row_id is a composite of multiple physical columns, it is assembled by the IcebergPageSource - pageSourceBuilder.addNullColumn(column.getType()); + else if (column.isMergeRowIdColumn()) { + appendRowNumberColumn = true; + transforms.transform(MergeRowIdTransform.create(utf8Slice(inputFile.location().toString()), partitionSpecId, 
utf8Slice(partitionData))); } else if (column.isRowPositionColumn()) { - pageSourceBuilder.addRowIndexColumn(); + appendRowNumberColumn = true; + transforms.transform(new GetRowPositionFromSource()); } - else if (column.getId() == TRINO_MERGE_PARTITION_SPEC_ID) { - pageSourceBuilder.addConstantColumn(nativeValueToBlock(column.getType(), (long) partitionSpecId)); - } - else if (column.getId() == TRINO_MERGE_PARTITION_DATA) { - pageSourceBuilder.addConstantColumn(nativeValueToBlock(column.getType(), utf8Slice(partitionData))); + else if (!parquetIdToFieldName.containsKey(column.getBaseColumn().getId())) { + transforms.constantValue(createNullBlock(column.getType())); } else { - org.apache.parquet.schema.Type parquetField = parquetFields.get(columnIndex); - Type trinoType = column.getBaseType(); - if (parquetField == null) { - pageSourceBuilder.addNullColumn(trinoType); - continue; + IcebergColumnHandle baseColumn = column.getBaseColumn(); + Integer ordinal = baseColumnIdToOrdinal.get(baseColumn.getId()); + if (ordinal == null) { + String parquetFieldName = requireNonNull(parquetIdToFieldName.get(baseColumn.getId())).getName(); + + // The top level columns are already mapped by name/id appropriately. + Optional field = IcebergParquetColumnIOConverter.constructField( + new FieldContext(baseColumn.getType(), baseColumn.getColumnIdentity()), + messageColumnIO.getChild(parquetFieldName)); + if (field.isEmpty()) { + // base column is missing so return a null + transforms.constantValue(createNullBlock(column.getType())); + continue; + } + + ordinal = nextOrdinal; + nextOrdinal++; + baseColumnIdToOrdinal.put(baseColumn.getId(), ordinal); + + parquetColumnFieldsBuilder.add(new Column(parquetFieldName, field.get())); + } + if (column.isBaseColumn()) { + transforms.column(ordinal); } - // The top level columns are already mapped by name/id appropriately. 
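Both the ORC and Parquet loops above read each base column from the file at most once and serve nested projections through an index path that starts at the base column's reader channel. A standalone sketch of that bookkeeping, not part of the patch; ColumnRequest, PlannedColumn and BaseColumnPlanner are hypothetical names, not Trino APIs.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Sketch: assign one reader ordinal per distinct base column; every requested column is then
// described by a dereference path, i.e. the ordinal followed by field indexes inside the base column.
public final class BaseColumnPlanner
{
    // base column id plus the nested field indexes below it (empty for a base column itself)
    public record ColumnRequest(int baseColumnId, List<Integer> pathInBaseColumn) {}

    public record PlannedColumn(List<Integer> dereferencePath) {}

    private BaseColumnPlanner() {}

    public static List<PlannedColumn> plan(List<ColumnRequest> requests, List<Integer> readerBaseColumnIds)
    {
        Map<Integer, Integer> baseColumnIdToOrdinal = new HashMap<>();
        List<PlannedColumn> planned = new ArrayList<>();
        for (ColumnRequest request : requests) {
            // read each base column only once, no matter how many projections use it
            Integer ordinal = baseColumnIdToOrdinal.get(request.baseColumnId());
            if (ordinal == null) {
                ordinal = readerBaseColumnIds.size();
                readerBaseColumnIds.add(request.baseColumnId()); // appended in first-use order
                baseColumnIdToOrdinal.put(request.baseColumnId(), ordinal);
            }
            List<Integer> path = new ArrayList<>();
            path.add(ordinal);
            path.addAll(request.pathInBaseColumn());
            planned.add(new PlannedColumn(List.copyOf(path)));
        }
        return planned;
    }
}

For example, requests (1, []), (1, [2]) and (5, []) plan to paths [0], [0, 2] and [1], while the reader is asked for base columns 1 and 5 only.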
- ColumnIO columnIO = messageColumnIO.getChild(parquetField.getName()); - Optional field = IcebergParquetColumnIOConverter.constructField(new FieldContext(trinoType, column.getColumnIdentity()), columnIO); - if (field.isEmpty()) { - pageSourceBuilder.addNullColumn(trinoType); - continue; + else { + transforms.dereferenceField(ImmutableList.builder() + .add(ordinal) + .addAll(applyProjection(column, baseColumn)) + .build()); } - parquetColumnFieldsBuilder.add(field.get()); - pageSourceBuilder.addSourceColumn(parquetSourceChannel); - parquetSourceChannel++; } } + List rowGroups = getFilteredRowGroups( + start, + length, + dataSource, + parquetMetadata, + ImmutableList.of(parquetTupleDomain), + ImmutableList.of(parquetPredicate), + descriptorsByPath, + UTC, + ICEBERG_DOMAIN_COMPACTION_THRESHOLD, + options); + ParquetDataSourceId dataSourceId = dataSource.getId(); - ParquetReader parquetReader = new ParquetReader( + ParquetReaderNew parquetReader = new ParquetReaderNew( Optional.ofNullable(fileMetaData.getCreatedBy()), parquetColumnFieldsBuilder.build(), - blocks, - blockStarts.build(), + appendRowNumberColumn, + rowGroups, dataSource, UTC, memoryContext, options, - exception -> handleException(dataSourceId, exception)); + exception -> handleException(dataSourceId, exception), + Optional.empty(), + Optional.empty()); + + ConnectorPageSource pageSource = new ParquetPageSourceNew(parquetReader); + pageSource = transforms.build(pageSource); + + Optional startRowPosition = Optional.empty(); + Optional endRowPosition = Optional.empty(); + if (!rowGroups.isEmpty()) { + startRowPosition = Optional.of(rowGroups.get(0).fileRowOffset()); + RowGroupInfo lastRowGroup = rowGroups.get(rowGroups.size() - 1); + endRowPosition = Optional.of(lastRowGroup.fileRowOffset() + lastRowGroup.prunedBlockMetadata().getRowCount()); + } + return new ReaderPageSourceWithRowPositions( - new ReaderPageSource( - pageSourceBuilder.build(parquetReader), - baseColumnProjections), + pageSource, startRowPosition, endRowPosition); } @@ -1013,8 +1058,8 @@ else if (column.getId() == TRINO_MERGE_PARTITION_DATA) { e.addSuppressed(ex); } } - if (e instanceof TrinoException) { - throw (TrinoException) e; + if (e instanceof TrinoException trinoException) { + throw trinoException; } if (e instanceof ParquetCorruptionException) { throw new TrinoException(ICEBERG_BAD_DATA, e); @@ -1024,6 +1069,20 @@ else if (column.getId() == TRINO_MERGE_PARTITION_DATA) { } } + public static ParquetDataSource createDataSource( + TrinoInputFile inputFile, + OptionalLong estimatedFileSize, + ParquetReaderOptions options, + AggregatedMemoryContext memoryContext, + FileFormatDataSourceStats stats) + throws IOException + { + if (estimatedFileSize.isEmpty() || estimatedFileSize.getAsLong() > options.getSmallFileThreshold().toBytes()) { + return new TrinoParquetDataSource(inputFile, options, stats); + } + return new MemoryParquetDataSource(inputFile, memoryContext, stats); + } + private static Map createParquetIdToFieldMapping(MessageType fileSchema) { ImmutableMap.Builder builder = ImmutableMap.builder(); @@ -1051,10 +1110,7 @@ else if (type instanceof GroupType groupType) { private static MessageType getMessageType(List regularColumns, String fileSchemaName, Map parquetIdToField) { - return projectSufficientColumns(regularColumns) - .map(readerColumns -> readerColumns.get().stream().map(IcebergColumnHandle.class::cast).collect(toUnmodifiableList())) - .orElse(regularColumns) - .stream() + return projectSufficientColumns(regularColumns).stream() .map(column -> 
getColumnType(column, parquetIdToField)) .filter(Optional::isPresent) .map(Optional::get) @@ -1071,25 +1127,17 @@ private static ReaderPageSourceWithRowPositions createAvroPageSource( String partitionData, Schema fileSchema, Optional nameMapping, + String partition, List columns) { - ConstantPopulatingPageSource.Builder constantPopulatingPageSourceBuilder = ConstantPopulatingPageSource.builder(); - int avroSourceChannel = 0; - - Optional baseColumnProjections = projectBaseColumns(columns); - - List readBaseColumns = baseColumnProjections - .map(readerColumns -> (List) readerColumns.get().stream().map(IcebergColumnHandle.class::cast).collect(toImmutableList())) - .orElse(columns); - InputFile file = new ForwardingInputFile(inputFile); OptionalLong fileModifiedTime = OptionalLong.empty(); try { - if (readBaseColumns.stream().anyMatch(IcebergColumnHandle::isFileModifiedTimeColumn)) { + if (columns.stream().anyMatch(IcebergColumnHandle::isFileModifiedTimeColumn)) { fileModifiedTime = OptionalLong.of(inputFile.lastModified().toEpochMilli()); } } - catch (IOException e) { + catch (IOException | UncheckedIOException e) { throw new TrinoException(ICEBERG_CANNOT_OPEN_SPLIT, e); } @@ -1107,61 +1155,74 @@ private static ReaderPageSourceWithRowPositions createAvroPageSource( ImmutableList.Builder columnNames = ImmutableList.builder(); ImmutableList.Builder columnTypes = ImmutableList.builder(); - ImmutableList.Builder rowIndexChannels = ImmutableList.builder(); - - for (IcebergColumnHandle column : readBaseColumns) { - verify(column.isBaseColumn(), "Column projections must be based from a root column"); - org.apache.avro.Schema.Field field = fileColumnsByIcebergId.get(column.getId()); + TransformConnectorPageSource.Builder transforms = TransformConnectorPageSource.builder(); + boolean appendRowNumberColumn = false; + Map baseColumnIdToOrdinal = new HashMap<>(); - if (column.isPathColumn()) { - constantPopulatingPageSourceBuilder.addConstantColumn(nativeValueToBlock(FILE_PATH.getType(), utf8Slice(file.location()))); + int nextOrdinal = 0; + for (IcebergColumnHandle column : columns) { + if (column.isPartitionColumn()) { + transforms.constantValue(nativeValueToBlock(PARTITION.getType(), utf8Slice(partition))); } - else if (column.isFileModifiedTimeColumn()) { - constantPopulatingPageSourceBuilder.addConstantColumn(nativeValueToBlock(FILE_MODIFIED_TIME.getType(), packDateTimeWithZone(fileModifiedTime.orElseThrow(), UTC_KEY))); + else if (column.isPathColumn()) { + transforms.constantValue(nativeValueToBlock(FILE_PATH.getType(), utf8Slice(file.location()))); } - // For delete - else if (column.isRowPositionColumn()) { - rowIndexChannels.add(true); - columnNames.add(ROW_POSITION.name()); - columnTypes.add(BIGINT); - constantPopulatingPageSourceBuilder.addDelegateColumn(avroSourceChannel); - avroSourceChannel++; + else if (column.isFileModifiedTimeColumn()) { + transforms.constantValue(nativeValueToBlock(FILE_MODIFIED_TIME.getType(), packDateTimeWithZone(fileModifiedTime.orElseThrow(), UTC_KEY))); } - else if (column.getId() == TRINO_MERGE_PARTITION_SPEC_ID) { - constantPopulatingPageSourceBuilder.addConstantColumn(nativeValueToBlock(column.getType(), (long) partitionSpecId)); + else if (column.isMergeRowIdColumn()) { + appendRowNumberColumn = true; + transforms.transform(MergeRowIdTransform.create(utf8Slice(file.location()), partitionSpecId, utf8Slice(partitionData))); } - else if (column.getId() == TRINO_MERGE_PARTITION_DATA) { - 
constantPopulatingPageSourceBuilder.addConstantColumn(nativeValueToBlock(column.getType(), utf8Slice(partitionData))); + else if (column.isRowPositionColumn()) { + appendRowNumberColumn = true; + transforms.transform(new GetRowPositionFromSource()); } - else if (field == null) { - constantPopulatingPageSourceBuilder.addConstantColumn(nativeValueToBlock(column.getType(), null)); + else if (!fileColumnsByIcebergId.containsKey(column.getBaseColumn().getId())) { + transforms.constantValue(nativeValueToBlock(column.getType(), null)); } else { - rowIndexChannels.add(false); - columnNames.add(column.getName()); - columnTypes.add(column.getType()); - constantPopulatingPageSourceBuilder.addDelegateColumn(avroSourceChannel); - avroSourceChannel++; + IcebergColumnHandle baseColumn = column.getBaseColumn(); + Integer ordinal = baseColumnIdToOrdinal.get(baseColumn.getId()); + if (ordinal == null) { + ordinal = nextOrdinal; + nextOrdinal++; + baseColumnIdToOrdinal.put(baseColumn.getId(), ordinal); + + columnNames.add(baseColumn.getName()); + columnTypes.add(baseColumn.getType()); + } + + if (column.isBaseColumn()) { + transforms.column(ordinal); + } + else { + transforms.dereferenceField(ImmutableList.builder() + .add(ordinal) + .addAll(applyProjection(column, baseColumn)) + .build()); + } } } + ConnectorPageSource pageSource = new IcebergAvroPageSource( + file, + start, + length, + fileSchema, + nameMapping, + columnNames.build(), + columnTypes.build(), + appendRowNumberColumn, + newSimpleAggregatedMemoryContext()); + pageSource = transforms.build(pageSource); + return new ReaderPageSourceWithRowPositions( - new ReaderPageSource( - constantPopulatingPageSourceBuilder.build(new IcebergAvroPageSource( - file, - start, - length, - fileSchema, - nameMapping, - columnNames.build(), - columnTypes.build(), - rowIndexChannels.build(), - newSimpleAggregatedMemoryContext())), - baseColumnProjections), + pageSource, Optional.empty(), Optional.empty()); } - catch (IOException e) { + catch (IOException | UncheckedIOException e) { throw new TrinoException(ICEBERG_CANNOT_OPEN_SPLIT, e); } } @@ -1239,9 +1300,7 @@ public static ProjectedLayout createProjectedLayout(OrcColumn root, List>> dereferencesByField = fieldIdDereferences.stream() - .collect(groupingBy( - sequence -> sequence.get(0), - mapping(sequence -> sequence.subList(1, sequence.size()), toUnmodifiableList()))); + .collect(groupingBy(items -> items.get(0), mapping(sequence -> sequence.subList(1, sequence.size()), toUnmodifiableList()))); ImmutableMap.Builder fieldLayouts = ImmutableMap.builder(); for (OrcColumn nestedColumn : root.getNestedColumns()) { @@ -1262,52 +1321,17 @@ public ProjectedLayout getFieldLayout(OrcColumn orcColumn) } } - /** - * Creates a mapping between the input {@code columns} and base columns if required. 
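After row-group pruning, the Parquet path above derives the split's first and last row positions from the surviving row groups; those bounds are later used to skip position-delete files whose position ranges cannot overlap the split. A small sketch of that computation, not part of the patch; RowGroupSketch is a hypothetical stand-in for the pruned row-group metadata.

import java.util.List;
import java.util.Optional;

public final class RowPositionBounds
{
    // hypothetical mirror of the per-row-group information the reader keeps
    public record RowGroupSketch(long fileRowOffset, long rowCount) {}

    public record Bounds(Optional<Long> startRowPosition, Optional<Long> endRowPosition) {}

    private RowPositionBounds() {}

    public static Bounds of(List<RowGroupSketch> rowGroups)
    {
        if (rowGroups.isEmpty()) {
            // nothing survived pruning, so there are no rows to delete-filter either
            return new Bounds(Optional.empty(), Optional.empty());
        }
        RowGroupSketch first = rowGroups.get(0);
        RowGroupSketch last = rowGroups.get(rowGroups.size() - 1);
        return new Bounds(
                Optional.of(first.fileRowOffset()),
                Optional.of(last.fileRowOffset() + last.rowCount()));
    }
}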
- */ - public static Optional projectBaseColumns(List columns) - { - requireNonNull(columns, "columns is null"); - - // No projection is required if all columns are base columns - if (columns.stream().allMatch(IcebergColumnHandle::isBaseColumn)) { - return Optional.empty(); - } - - ImmutableList.Builder projectedColumns = ImmutableList.builder(); - ImmutableList.Builder outputColumnMapping = ImmutableList.builder(); - Map mappedFieldIds = new HashMap<>(); - int projectedColumnCount = 0; - - for (IcebergColumnHandle column : columns) { - int baseColumnId = column.getBaseColumnIdentity().getId(); - Integer mapped = mappedFieldIds.get(baseColumnId); - - if (mapped == null) { - projectedColumns.add(column.getBaseColumn()); - mappedFieldIds.put(baseColumnId, projectedColumnCount); - outputColumnMapping.add(projectedColumnCount); - projectedColumnCount++; - } - else { - outputColumnMapping.add(mapped); - } - } - - return Optional.of(new ReaderColumns(projectedColumns.build(), outputColumnMapping.build())); - } - /** * Creates a set of sufficient columns for the input projected columns and prepares a mapping between the two. - * For example, if input {@param columns} include columns "a.b" and "a.b.c", then they will be projected + * For example, if input columns include columns "a.b" and "a.b.c", then they will be projected * from a single column "a.b". */ - private static Optional projectSufficientColumns(List columns) + private static List projectSufficientColumns(List columns) { requireNonNull(columns, "columns is null"); if (columns.stream().allMatch(IcebergColumnHandle::isBaseColumn)) { - return Optional.empty(); + return columns; } ImmutableBiMap.Builder dereferenceChainsBuilder = ImmutableBiMap.builder(); @@ -1319,14 +1343,13 @@ private static Optional projectSufficientColumns(List dereferenceChains = dereferenceChainsBuilder.build(); - List sufficientColumns = new ArrayList<>(); - ImmutableList.Builder outputColumnMapping = ImmutableList.builder(); + List sufficientColumns = new ArrayList<>(); Map pickedColumns = new HashMap<>(); // Pick a covering column for every column for (IcebergColumnHandle columnHandle : columns) { - DereferenceChain dereferenceChain = dereferenceChains.inverse().get(columnHandle); + DereferenceChain dereferenceChain = requireNonNull(dereferenceChains.inverse().get(columnHandle)); DereferenceChain chosenColumn = null; // Shortest existing prefix is chosen as the input. 
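projectSufficientColumns picks, for each projected column, the shortest requested prefix as its covering column, so a projection such as a.b can also serve a.b.c without reading both. A standalone sketch of that choice, not part of the patch; Chain is a hypothetical model of a dereference chain.

import java.util.List;
import java.util.Optional;
import java.util.Set;

public final class CoveringColumns
{
    // hypothetical dereference chain: base field id plus the nested path below it
    public record Chain(int baseFieldId, List<String> path)
    {
        public Optional<Chain> prefix(int length)
        {
            if (length > path.size()) {
                return Optional.empty();
            }
            return Optional.of(new Chain(baseFieldId, List.copyOf(path.subList(0, length))));
        }
    }

    private CoveringColumns() {}

    public static Chain chooseCovering(Chain requested, Set<Chain> allRequested)
    {
        // shortest existing prefix wins; the chain itself is always a valid fallback
        for (int length = 0; length <= requested.path().size(); length++) {
            Chain candidate = requested.prefix(length).orElseThrow();
            if (allRequested.contains(candidate)) {
                return candidate;
            }
        }
        return requested;
    }
}

With requested chains a.b and a.b.c, both resolve to a.b, matching the example given in the javadoc above.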
@@ -1338,23 +1361,15 @@ private static Optional projectSufficientColumns(List getColumnType(IcebergColumnHandle column, Map parquetIdToField) @@ -1384,18 +1399,22 @@ private static Optional getColumnType(IcebergCol return Optional.of(new GroupType(baseType.getRepetition(), baseType.getName(), ImmutableList.of(type))); } - private static TupleDomain getParquetTupleDomain(Map, ColumnDescriptor> descriptorsByPath, TupleDomain effectivePredicate) + @VisibleForTesting + static TupleDomain getParquetTupleDomain(Map, ColumnDescriptor> descriptorsByPath, TupleDomain effectivePredicate) { if (effectivePredicate.isNone()) { return TupleDomain.none(); } + Map descriptorsById = descriptorsByPath.values().stream() + .filter(descriptor -> descriptor.getPrimitiveType().getId() != null) + .collect(toImmutableMap(descriptor -> descriptor.getPrimitiveType().getId().intValue(), identity())); ImmutableMap.Builder predicate = ImmutableMap.builder(); effectivePredicate.getDomains().orElseThrow().forEach((columnHandle, domain) -> { - String baseType = columnHandle.getType().getTypeSignature().getBase(); + ColumnIdentity columnIdentity = columnHandle.getColumnIdentity(); // skip looking up predicates for complex types as Parquet only stores stats for primitives - if (columnHandle.isBaseColumn() && (!baseType.equals(StandardTypes.MAP) && !baseType.equals(StandardTypes.ARRAY) && !baseType.equals(StandardTypes.ROW))) { - ColumnDescriptor descriptor = descriptorsByPath.get(ImmutableList.of(columnHandle.getName())); + if (PRIMITIVE == columnIdentity.getTypeCategory()) { + ColumnDescriptor descriptor = descriptorsById.get(columnHandle.getId()); if (descriptor != null) { predicate.put(descriptor, domain); } @@ -1406,8 +1425,8 @@ private static TupleDomain getParquetTupleDomain(Map startRowPosition, + Optional endRowPosition) { - private final ReaderPageSource readerPageSource; - private final Optional startRowPosition; - private final Optional endRowPosition; - - public ReaderPageSourceWithRowPositions( - ReaderPageSource readerPageSource, - Optional startRowPosition, - Optional endRowPosition) - { - this.readerPageSource = requireNonNull(readerPageSource, "readerPageSource is null"); - this.startRowPosition = requireNonNull(startRowPosition, "startRowPosition is null"); - this.endRowPosition = requireNonNull(endRowPosition, "endRowPosition is null"); - } - - public ReaderPageSource getReaderPageSource() - { - return readerPageSource; - } - - public Optional getStartRowPosition() + public ReaderPageSourceWithRowPositions { - return startRowPosition; - } - - public Optional getEndRowPosition() - { - return endRowPosition; + requireNonNull(pageSource, "pageSource is null"); + requireNonNull(startRowPosition, "startRowPosition is null"); + requireNonNull(endRowPosition, "endRowPosition is null"); } } @@ -1510,4 +1510,109 @@ public int hashCode() return Objects.hash(baseColumnIdentity, path); } } + + private record MergeRowIdTransform(VariableWidthBlock filePath, IntArrayBlock partitionSpecId, VariableWidthBlock partitionData) + implements Function + { + private static Function create(Slice filePath, int partitionSpecId, Slice partitionData) + { + return new MergeRowIdTransform( + new VariableWidthBlock(1, filePath, new int[] {0, filePath.length()}, Optional.empty()), + new IntArrayBlock(1, Optional.empty(), new int[] {partitionSpecId}), + new VariableWidthBlock(1, partitionData, new int[] {0, partitionData.length()}, Optional.empty())); + } + + @Override + public Block apply(Page page) + { + Block rowPosition = 
page.getBlock(page.getChannelCount() - 1); + Block[] fields = new Block[] { + RunLengthEncodedBlock.create(filePath, rowPosition.getPositionCount()), + rowPosition, + RunLengthEncodedBlock.create(partitionSpecId, rowPosition.getPositionCount()), + RunLengthEncodedBlock.create(partitionData, rowPosition.getPositionCount()) + }; + return fromFieldBlocks(rowPosition.getPositionCount(), fields); + } + } + + private record GetRowPositionFromSource() + implements Function + { + @Override + public Block apply(Page page) + { + return page.getBlock(page.getChannelCount() - 1); + } + } + + private record PrefixColumnsSourcePage(Page sourcePage, int channelCount, int[] channels) + { + private static final long INSTANCE_SIZE = instanceSize(PrefixColumnsSourcePage.class); + + private PrefixColumnsSourcePage + { + requireNonNull(sourcePage, "sourcePage is null"); + checkArgument(channelCount >= 0, "channelCount is negative"); + checkArgument(channelCount < sourcePage.getChannelCount(), "channelCount is greater than or equal to sourcePage channel count"); + checkArgument(channels.length == channelCount, "channels length does not match channelCount"); + } + + private PrefixColumnsSourcePage(Page sourcePage, int channelCount) + { + this(sourcePage, channelCount, IntStream.range(0, channelCount).toArray()); + } + + public int getPositionCount() + { + return sourcePage.getPositionCount(); + } + + public long getSizeInBytes() + { + return sourcePage.getSizeInBytes(); + } + + public long getRetainedSizeInBytes() + { + return INSTANCE_SIZE + + sizeOf(channels) + + sourcePage.getRetainedSizeInBytes(); + } + + public void retainedBytesForEachPart(ObjLongConsumer consumer) + { + consumer.accept(this, INSTANCE_SIZE); + consumer.accept(channels, sizeOf(channels)); + } + + public int getChannelCount() + { + return channelCount; + } + + public Block getBlock(int channel) + { + checkIndex(channel, channelCount); + return sourcePage.getBlock(channel); + } + + public Page getPage() + { + return sourcePage.getColumns(channels); + } + + public Page getColumns(int[] channels) + { + for (int channel : channels) { + checkIndex(channel, channelCount); + } + return sourcePage.getColumns(channels); + } + + public void selectPositions(int[] positions, int offset, int size) + { + sourcePage.getPositions(positions, offset, size); + } + } } diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergPageSourceProviderFactory.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergPageSourceProviderFactory.java new file mode 100644 index 000000000000..3b80aa3f5a64 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergPageSourceProviderFactory.java @@ -0,0 +1,61 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.iceberg; + +import com.google.inject.Inject; +import io.trino.orc.OrcReaderOptions; +import io.trino.parquet.ParquetReaderOptions; +import io.trino.plugin.base.metrics.FileFormatDataSourceStats; +import io.trino.plugin.hive.orc.OrcReaderConfig; +import io.trino.plugin.hive.parquet.ParquetReaderConfig; +import io.trino.plugin.iceberg.fileio.ForwardingFileIoFactory; +import io.trino.spi.connector.ConnectorPageSourceProvider; +import io.trino.spi.connector.ConnectorPageSourceProviderFactory; +import io.trino.spi.type.TypeManager; + +import static java.util.Objects.requireNonNull; + +public class IcebergPageSourceProviderFactory + implements ConnectorPageSourceProviderFactory +{ + private final IcebergFileSystemFactory fileSystemFactory; + private final ForwardingFileIoFactory fileIoFactory; + private final FileFormatDataSourceStats fileFormatDataSourceStats; + private final OrcReaderOptions orcReaderOptions; + private final ParquetReaderOptions parquetReaderOptions; + private final TypeManager typeManager; + + @Inject + public IcebergPageSourceProviderFactory( + IcebergFileSystemFactory fileSystemFactory, + ForwardingFileIoFactory fileIoFactory, + FileFormatDataSourceStats fileFormatDataSourceStats, + OrcReaderConfig orcReaderConfig, + ParquetReaderConfig parquetReaderConfig, + TypeManager typeManager) + { + this.fileSystemFactory = requireNonNull(fileSystemFactory, "fileSystemFactory is null"); + this.fileIoFactory = requireNonNull(fileIoFactory, "fileIoFactory is null"); + this.fileFormatDataSourceStats = requireNonNull(fileFormatDataSourceStats, "fileFormatDataSourceStats is null"); + this.orcReaderOptions = orcReaderConfig.toOrcReaderOptions(); + this.parquetReaderOptions = parquetReaderConfig.toParquetReaderOptions(); + this.typeManager = requireNonNull(typeManager, "typeManager is null"); + } + + @Override + public ConnectorPageSourceProvider createPageSourceProvider() + { + return new IcebergPageSourceProvider(fileSystemFactory, fileIoFactory, fileFormatDataSourceStats, orcReaderOptions, parquetReaderOptions, typeManager); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergPartitionFunction.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergPartitionFunction.java new file mode 100644 index 000000000000..a558c2363546 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergPartitionFunction.java @@ -0,0 +1,98 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package io.trino.plugin.iceberg;
+
+import com.google.common.collect.ImmutableList;
+import io.trino.spi.type.Type;
+
+import java.util.List;
+import java.util.OptionalInt;
+import java.util.regex.Matcher;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static io.trino.plugin.iceberg.PartitionFields.ICEBERG_BUCKET_PATTERN;
+import static io.trino.plugin.iceberg.PartitionFields.ICEBERG_TRUNCATE_PATTERN;
+import static java.util.Objects.requireNonNull;
+
+// NOTE: the partitioning function must contain the data path for nested fields because the partitioning columns
+// reference the top level column, not the nested column. This means that even though partitioning functions
+// that differ only in their data path should be compatible, the system does not consider them compatible.
+// Fortunately, partitioning on nested columns is not common.
+public record IcebergPartitionFunction(Transform transform, List<Integer> dataPath, Type type, OptionalInt size)
+{
+    public enum Transform
+    {
+        IDENTITY,
+        YEAR,
+        MONTH,
+        DAY,
+        HOUR,
+        VOID,
+        BUCKET,
+        TRUNCATE
+    }
+
+    public IcebergPartitionFunction(Transform transform, List<Integer> dataPath, Type type)
+    {
+        this(transform, dataPath, type, OptionalInt.empty());
+    }
+
+    public IcebergPartitionFunction
+    {
+        requireNonNull(transform, "transform is null");
+        requireNonNull(dataPath, "dataPath is null");
+        checkArgument(!dataPath.isEmpty(), "dataPath is empty");
+        requireNonNull(type, "type is null");
+        requireNonNull(size, "size is null");
+        checkArgument(size.orElse(0) >= 0, "size must be greater than or equal to zero");
+        checkArgument(size.isEmpty() || transform == Transform.BUCKET || transform == Transform.TRUNCATE, "size is only valid for BUCKET and TRUNCATE transforms");
+    }
+
+    public IcebergPartitionFunction withTopLevelColumnIndex(int newColumnIndex)
+    {
+        return new IcebergPartitionFunction(
+                transform,
+                ImmutableList.<Integer>builder()
+                        .add(newColumnIndex)
+                        .addAll(dataPath().subList(1, dataPath().size()))
+                        .build(),
+                type,
+                size);
+    }
+
+    public static IcebergPartitionFunction create(String transform, List<Integer> dataPath, Type type)
+    {
+        return switch (transform) {
+            case "identity" -> new IcebergPartitionFunction(Transform.IDENTITY, dataPath, type);
+            case "year" -> new IcebergPartitionFunction(Transform.YEAR, dataPath, type);
+            case "month" -> new IcebergPartitionFunction(Transform.MONTH, dataPath, type);
+            case "day" -> new IcebergPartitionFunction(Transform.DAY, dataPath, type);
+            case "hour" -> new IcebergPartitionFunction(Transform.HOUR, dataPath, type);
+            case "void" -> new IcebergPartitionFunction(Transform.VOID, dataPath, type);
+            default -> {
+                Matcher matcher = ICEBERG_BUCKET_PATTERN.matcher(transform);
+                if (matcher.matches()) {
+                    yield new IcebergPartitionFunction(Transform.BUCKET, dataPath, type, OptionalInt.of(Integer.parseInt(matcher.group(1))));
+                }
+
+                matcher = ICEBERG_TRUNCATE_PATTERN.matcher(transform);
+                if (matcher.matches()) {
+                    yield new IcebergPartitionFunction(Transform.TRUNCATE, dataPath, type, OptionalInt.of(Integer.parseInt(matcher.group(1))));
+                }
+
+                throw new UnsupportedOperationException("Unsupported partition transform: " + transform);
+            }
+        };
+    }
+}
diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergPartitioningHandle.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergPartitioningHandle.java
index 884ead60950e..a42d629db219 100644
--- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergPartitioningHandle.java
+++ 
b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergPartitioningHandle.java @@ -13,48 +13,141 @@ */ package io.trino.plugin.iceberg; -import com.fasterxml.jackson.annotation.JsonCreator; -import com.fasterxml.jackson.annotation.JsonProperty; import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.hash.Hasher; +import com.google.common.hash.Hashing; import io.trino.spi.connector.ConnectorPartitioningHandle; +import io.trino.spi.type.TypeManager; +import org.apache.iceberg.PartitionField; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.types.Types; +import java.util.ArrayDeque; +import java.util.HashMap; import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.atomic.AtomicInteger; -import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.collect.ImmutableList.toImmutableList; +import static com.google.common.collect.ImmutableSet.toImmutableSet; +import static io.trino.plugin.iceberg.TypeConverter.toTrinoType; +import static java.nio.charset.StandardCharsets.UTF_8; import static java.util.Objects.requireNonNull; -public class IcebergPartitioningHandle +public record IcebergPartitioningHandle(boolean update, List partitionFunctions) implements ConnectorPartitioningHandle { - private final List partitioning; - private final List partitioningColumns; + public IcebergPartitioningHandle + { + partitionFunctions = ImmutableList.copyOf(requireNonNull(partitionFunctions, "partitioning is null")); + } + + public IcebergPartitioningHandle forUpdate() + { + return new IcebergPartitioningHandle(true, partitionFunctions); + } - @JsonCreator - public IcebergPartitioningHandle( - @JsonProperty("partitioning") List partitioning, - @JsonProperty("partitioningColumns") List partitioningColumns) + public static IcebergPartitioningHandle create(PartitionSpec spec, TypeManager typeManager, List partitioningColumns) { - this.partitioning = ImmutableList.copyOf(requireNonNull(partitioning, "partitioning is null")); - this.partitioningColumns = ImmutableList.copyOf(requireNonNull(partitioningColumns, "partitioningColumns is null")); + Map> dataPaths = buildDataPaths(spec); + List partitionFields = spec.fields().stream() + .map(field -> IcebergPartitionFunction.create( + field.transform().toString(), + dataPaths.get(field.sourceId()), + toTrinoType(spec.schema().findType(field.sourceId()), typeManager))) + .collect(toImmutableList()); + + return new IcebergPartitioningHandle(false, partitionFields); } - @JsonProperty - public List getPartitioning() + /** + * Constructs a map of field IDs to data paths. + * The data path for root field is the ordinal position of the partition field under this root field, defined by {@link IcebergMetadata#getWriteLayout} + * The data path for non-root nested fields is the ordinal position in its parent's nested field. + * e.g. 
for a schema {f1: {f3, f4}, f2, f5} + * when partitioned by f1.f3 and f2, the data paths are {3 : [1,0], 2 : [0]} + * when partitioned by f1.f4 and f5, the data paths are {4 : [0, 1], 5 : [1]} + */ + private static Map> buildDataPaths(PartitionSpec spec) { - return partitioning; + Set partitionFieldIds = spec.fields().stream().map(PartitionField::sourceId).collect(toImmutableSet()); + + /* + * In this loop, the field ID acts as a placeholder in the first position + * Later, these placeholders will be replaced with the actual channel IDs by the order of its partitioned sub-field ID. + */ + Map> fieldInfo = new HashMap<>(); + for (Types.NestedField field : spec.schema().asStruct().fields()) { + // Partition fields can only be nested in a struct + if (field.type() instanceof Types.StructType nestedStruct) { + buildDataPaths(partitionFieldIds, nestedStruct, new ArrayDeque<>(ImmutableList.of(field.fieldId())), fieldInfo); + } + else if (field.type().isPrimitiveType() && partitionFieldIds.contains(field.fieldId())) { + fieldInfo.put(field.fieldId(), ImmutableList.of(field.fieldId())); + } + } + + /* + * Replace the root field ID with the actual channel ID. + * Transformation: {fieldId : rootFieldId.structOrdinalX.structOrdinalY} -> {fieldId : channel.structOrdinalX.structOrdinalY}. + * Root field's channelId is assigned sequentially based on the key fieldId. + */ + List sortedFieldIds = fieldInfo.keySet().stream() + .sorted() + .collect(toImmutableList()); + + ImmutableMap.Builder> builder = ImmutableMap + .builderWithExpectedSize(sortedFieldIds.size()); + + Map fieldChannels = new HashMap<>(); + AtomicInteger channel = new AtomicInteger(); + for (int sortedFieldId : sortedFieldIds) { + List dataPath = fieldInfo.get(sortedFieldId); + int fieldChannel = fieldChannels.computeIfAbsent(dataPath.get(0), ignore -> channel.getAndIncrement()); + List channelDataPath = ImmutableList.builder() + .add(fieldChannel) + .addAll(dataPath.stream() + .skip(1) + .iterator()) + .build(); + builder.put(sortedFieldId, channelDataPath); + } + + return builder.buildOrThrow(); } - @JsonProperty - public List getPartitioningColumns() + private static void buildDataPaths(Set partitionFieldIds, Types.StructType struct, ArrayDeque currentPaths, Map> dataPaths) { - return partitioningColumns; + List fields = struct.fields(); + for (int fieldOrdinal = 0; fieldOrdinal < fields.size(); fieldOrdinal++) { + Types.NestedField field = fields.get(fieldOrdinal); + int fieldId = field.fieldId(); + + currentPaths.addLast(fieldOrdinal); + org.apache.iceberg.types.Type type = field.type(); + if (type instanceof Types.StructType nestedStruct) { + buildDataPaths(partitionFieldIds, nestedStruct, currentPaths, dataPaths); + } + // Map and List types are not supported in partitioning + else if (type.isPrimitiveType() && partitionFieldIds.contains(fieldId)) { + dataPaths.put(fieldId, ImmutableList.copyOf(currentPaths)); + } + currentPaths.removeLast(); + } } - @Override - public String toString() + public long getCacheKeyHint() { - return toStringHelper(this) - .add("partitioning", partitioning) - .toString(); + Hasher hasher = Hashing.goodFastHash(64).newHasher(); + hasher.putBoolean(update); + for (IcebergPartitionFunction function : partitionFunctions) { + hasher.putInt(function.transform().ordinal()); + function.dataPath().forEach(hasher::putInt); + hasher.putString(function.type().getTypeSignature().toString(), UTF_8); + function.size().ifPresent(hasher::putInt); + } + return hasher.hash().asLong(); } } diff --git 
a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergSessionProperties.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergSessionProperties.java index a607c52b37cd..6866596fe010 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergSessionProperties.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergSessionProperties.java @@ -19,7 +19,6 @@ import io.airlift.units.Duration; import io.trino.orc.OrcWriteValidation.OrcWriteValidationMode; import io.trino.plugin.base.session.SessionPropertiesProvider; -import io.trino.plugin.hive.HiveCompressionCodec; import io.trino.plugin.hive.orc.OrcReaderConfig; import io.trino.plugin.hive.orc.OrcWriterConfig; import io.trino.plugin.hive.parquet.ParquetReaderConfig; @@ -27,12 +26,17 @@ import io.trino.spi.TrinoException; import io.trino.spi.connector.ConnectorSession; import io.trino.spi.session.PropertyMetadata; +import io.trino.spi.type.ArrayType; +import java.util.Collection; import java.util.List; import java.util.Optional; +import java.util.Set; import java.util.concurrent.ThreadLocalRandom; import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Strings.isNullOrEmpty; +import static com.google.common.collect.ImmutableSet.toImmutableSet; import static io.trino.plugin.base.session.PropertyMetadataUtil.dataSizeProperty; import static io.trino.plugin.base.session.PropertyMetadataUtil.durationProperty; import static io.trino.plugin.base.session.PropertyMetadataUtil.validateMaxDataSize; @@ -48,12 +52,15 @@ import static io.trino.spi.session.PropertyMetadata.enumProperty; import static io.trino.spi.session.PropertyMetadata.integerProperty; import static io.trino.spi.session.PropertyMetadata.stringProperty; +import static io.trino.spi.type.VarcharType.VARCHAR; import static java.lang.String.format; +import static java.util.Locale.ENGLISH; +import static java.util.Objects.requireNonNull; public final class IcebergSessionProperties implements SessionPropertiesProvider { - private static final String COMPRESSION_CODEC = "compression_codec"; + public static final String SPLIT_SIZE = "experimental_split_size"; private static final String USE_FILE_SIZE_FROM_METADATA = "use_file_size_from_metadata"; private static final String ORC_BLOOM_FILTERS_ENABLED = "orc_bloom_filters_enabled"; private static final String ORC_MAX_MERGE_DISTANCE = "orc_max_merge_distance"; @@ -69,20 +76,24 @@ public final class IcebergSessionProperties private static final String ORC_WRITER_MIN_STRIPE_SIZE = "orc_writer_min_stripe_size"; private static final String ORC_WRITER_MAX_STRIPE_SIZE = "orc_writer_max_stripe_size"; private static final String ORC_WRITER_MAX_STRIPE_ROWS = "orc_writer_max_stripe_rows"; + private static final String ORC_WRITER_MAX_ROW_GROUP_ROWS = "orc_writer_max_row_group_rows"; private static final String ORC_WRITER_MAX_DICTIONARY_MEMORY = "orc_writer_max_dictionary_memory"; private static final String PARQUET_MAX_READ_BLOCK_SIZE = "parquet_max_read_block_size"; private static final String PARQUET_USE_BLOOM_FILTER = "parquet_use_bloom_filter"; private static final String PARQUET_MAX_READ_BLOCK_ROW_COUNT = "parquet_max_read_block_row_count"; - private static final String PARQUET_OPTIMIZED_READER_ENABLED = "parquet_optimized_reader_enabled"; - private static final String PARQUET_OPTIMIZED_NESTED_READER_ENABLED = "parquet_optimized_nested_reader_enabled"; + private static final String PARQUET_SMALL_FILE_THRESHOLD = 
"parquet_small_file_threshold"; + private static final String PARQUET_IGNORE_STATISTICS = "parquet_ignore_statistics"; + private static final String PARQUET_VECTORIZED_DECODING_ENABLED = "parquet_vectorized_decoding_enabled"; private static final String PARQUET_WRITER_BLOCK_SIZE = "parquet_writer_block_size"; private static final String PARQUET_WRITER_PAGE_SIZE = "parquet_writer_page_size"; + private static final String PARQUET_WRITER_PAGE_VALUE_COUNT = "parquet_writer_page_value_count"; private static final String PARQUET_WRITER_BATCH_SIZE = "parquet_writer_batch_size"; - private static final String DYNAMIC_FILTERING_WAIT_TIMEOUT = "dynamic_filtering_wait_timeout"; + public static final String DYNAMIC_FILTERING_WAIT_TIMEOUT = "dynamic_filtering_wait_timeout"; private static final String STATISTICS_ENABLED = "statistics_enabled"; public static final String EXTENDED_STATISTICS_ENABLED = "extended_statistics_enabled"; private static final String PROJECTION_PUSHDOWN_ENABLED = "projection_pushdown_enabled"; private static final String TARGET_MAX_FILE_SIZE = "target_max_file_size"; + private static final String IDLE_WRITER_MIN_FILE_SIZE = "idle_writer_min_file_size"; public static final String COLLECT_EXTENDED_STATISTICS_ON_WRITE = "collect_extended_statistics_on_write"; private static final String HIVE_CATALOG_NAME = "hive_catalog_name"; private static final String MINIMUM_ASSIGNED_SPLIT_WEIGHT = "minimum_assigned_split_weight"; @@ -90,6 +101,12 @@ public final class IcebergSessionProperties public static final String REMOVE_ORPHAN_FILES_MIN_RETENTION = "remove_orphan_files_min_retention"; private static final String MERGE_MANIFESTS_ON_WRITE = "merge_manifests_on_write"; private static final String SORTED_WRITING_ENABLED = "sorted_writing_enabled"; + private static final String QUERY_PARTITION_FILTER_REQUIRED = "query_partition_filter_required"; + private static final String QUERY_PARTITION_FILTER_REQUIRED_SCHEMAS = "query_partition_filter_required_schemas"; + private static final String INCREMENTAL_REFRESH_ENABLED = "incremental_refresh_enabled"; + public static final String BUCKET_EXECUTION_ENABLED = "bucket_execution_enabled"; + public static final String FILE_BASED_CONFLICT_DETECTION_ENABLED = "file_based_conflict_detection_enabled"; + private static final String MAX_PARTITIONS_PER_WRITER = "max_partitions_per_writer"; private final List> sessionProperties; @@ -102,12 +119,13 @@ public IcebergSessionProperties( ParquetWriterConfig parquetWriterConfig) { sessionProperties = ImmutableList.>builder() - .add(enumProperty( - COMPRESSION_CODEC, - "Compression codec to use when writing files", - HiveCompressionCodec.class, - icebergConfig.getCompressionCodec(), - false)) + .add(dataSizeProperty( + SPLIT_SIZE, + "Target split size", + // Note: this is null by default & hidden, currently mainly for tests. + // See https://github.com/trinodb/trino/issues/9018#issuecomment-1752929193 for further discussion. 
+ null, + true)) .add(booleanProperty( USE_FILE_SIZE_FROM_METADATA, "Use file size stored in Iceberg metadata", @@ -192,6 +210,11 @@ public IcebergSessionProperties( "ORC: Max stripe row count", orcWriterConfig.getStripeMaxRowCount(), false)) + .add(integerProperty( + ORC_WRITER_MAX_ROW_GROUP_ROWS, + "ORC: Max number of rows in a row group", + orcWriterConfig.getRowGroupMaxRowCount(), + false)) .add(dataSizeProperty( ORC_WRITER_MAX_DICTIONARY_MEMORY, "ORC: Max dictionary memory", @@ -219,16 +242,22 @@ public IcebergSessionProperties( } }, false)) + //.add(dataSizeProperty( + // PARQUET_SMALL_FILE_THRESHOLD, + // "Parquet: Size below which a parquet file will be read entirely", + // parquetReaderConfig.getSmallFileThreshold(), + // value -> validateMaxDataSize(PARQUET_SMALL_FILE_THRESHOLD, value, DataSize.valueOf(PARQUET_READER_MAX_SMALL_FILE_THRESHOLD)), + // false)) .add(booleanProperty( - PARQUET_OPTIMIZED_READER_ENABLED, - "Use optimized Parquet reader", - parquetReaderConfig.isOptimizedReaderEnabled(), - false)) - .add(booleanProperty( - PARQUET_OPTIMIZED_NESTED_READER_ENABLED, - "Use optimized Parquet reader for nested columns", - parquetReaderConfig.isOptimizedNestedReaderEnabled(), - false)) + PARQUET_IGNORE_STATISTICS, + "Ignore statistics from Parquet to allow querying files with corrupted or incorrect statistics", + parquetReaderConfig.isIgnoreStatistics(), + false)) + //.add(booleanProperty( + // PARQUET_VECTORIZED_DECODING_ENABLED, + // "Enable using Java Vector API for faster decoding of parquet files", + // parquetReaderConfig.isVectorizedDecodingEnabled(), + // false)) .add(dataSizeProperty( PARQUET_WRITER_BLOCK_SIZE, "Parquet: Writer block size", @@ -244,6 +273,18 @@ public IcebergSessionProperties( validateMaxDataSize(PARQUET_WRITER_PAGE_SIZE, value, DataSize.valueOf(PARQUET_WRITER_MAX_PAGE_SIZE)); }, false)) + //.add(integerProperty( + // PARQUET_WRITER_PAGE_VALUE_COUNT, + // "Parquet: Writer page row count", + // parquetWriterConfig.getPageValueCount(), + // value -> { + // if (value < PARQUET_WRITER_MIN_PAGE_VALUE_COUNT || value > PARQUET_WRITER_MAX_PAGE_VALUE_COUNT) { + // throw new TrinoException( + // INVALID_SESSION_PROPERTY, + // format("%s must be between %s and %s: %s", PARQUET_WRITER_PAGE_VALUE_COUNT, PARQUET_WRITER_MIN_PAGE_VALUE_COUNT, PARQUET_WRITER_MAX_PAGE_VALUE_COUNT, value)); + // } + // }, + // false)) .add(integerProperty( PARQUET_WRITER_BATCH_SIZE, "Parquet: Maximum number of rows passed to the writer in each batch", @@ -274,6 +315,11 @@ public IcebergSessionProperties( "Target maximum size of written files; the actual size may be larger", icebergConfig.getTargetMaxFileSize(), false)) + .add(dataSizeProperty( + IDLE_WRITER_MIN_FILE_SIZE, + "Minimum data written by a single partition writer before it can be consider as 'idle' and could be closed by the engine", + icebergConfig.getIdleWriterMinFileSize(), + false)) .add(booleanProperty( COLLECT_EXTENDED_STATISTICS_ON_WRITE, COLLECT_EXTENDED_STATISTICS_ON_WRITE_DESCRIPTION, @@ -311,6 +357,55 @@ public IcebergSessionProperties( "Enable sorted writing to tables with a specified sort order", icebergConfig.isSortedWritingEnabled(), false)) + .add(booleanProperty( + QUERY_PARTITION_FILTER_REQUIRED, + "Require filter on partition column", + icebergConfig.isQueryPartitionFilterRequired(), + false)) + .add(new PropertyMetadata<>( + QUERY_PARTITION_FILTER_REQUIRED_SCHEMAS, + "List of schemas for which filter on partition column is enforced.", + new ArrayType(VARCHAR), + Set.class, + 
icebergConfig.getQueryPartitionFilterRequiredSchemas(), + false, + object -> ((Collection) object).stream() + .map(String.class::cast) + .peek(property -> { + if (isNullOrEmpty(property)) { + throw new TrinoException(INVALID_SESSION_PROPERTY, format("Invalid null or empty value in %s property", QUERY_PARTITION_FILTER_REQUIRED_SCHEMAS)); + } + }) + .map(schema -> schema.toLowerCase(ENGLISH)) + .collect(toImmutableSet()), + value -> value)) + .add(booleanProperty( + INCREMENTAL_REFRESH_ENABLED, + "Enable Incremental refresh for MVs backed by Iceberg tables, when possible.", + icebergConfig.isIncrementalRefreshEnabled(), + false)) + .add(booleanProperty( + BUCKET_EXECUTION_ENABLED, + "Enable bucket-aware execution: use physical bucketing information to optimize queries", + icebergConfig.isBucketExecutionEnabled(), + false)) + .add(booleanProperty( + FILE_BASED_CONFLICT_DETECTION_ENABLED, + "Enable file-based conflict detection: take partition information from the actual written files as a source for the conflict detection system", + icebergConfig.isFileBasedConflictDetectionEnabled(), + false)) + .add(integerProperty( + MAX_PARTITIONS_PER_WRITER, + "Maximum number of partitions per writer", + icebergConfig.getMaxPartitionsPerWriter(), + value -> { + if (value < 1 || value > icebergConfig.getMaxPartitionsPerWriter()) { + throw new TrinoException( + INVALID_SESSION_PROPERTY, + format("%s must be between 1 and %s", MAX_PARTITIONS_PER_WRITER, icebergConfig.getMaxPartitionsPerWriter())); + } + }, + false)) .build(); } @@ -397,14 +492,19 @@ public static int getOrcWriterMaxStripeRows(ConnectorSession session) return session.getProperty(ORC_WRITER_MAX_STRIPE_ROWS, Integer.class); } + public static int getOrcWriterMaxRowGroupRows(ConnectorSession session) + { + return session.getProperty(ORC_WRITER_MAX_ROW_GROUP_ROWS, Integer.class); + } + public static DataSize getOrcWriterMaxDictionaryMemory(ConnectorSession session) { return session.getProperty(ORC_WRITER_MAX_DICTIONARY_MEMORY, DataSize.class); } - public static HiveCompressionCodec getCompressionCodec(ConnectorSession session) + public static Optional getSplitSize(ConnectorSession session) { - return session.getProperty(COMPRESSION_CODEC, HiveCompressionCodec.class); + return Optional.ofNullable(session.getProperty(SPLIT_SIZE, DataSize.class)); } public static boolean isUseFileSizeFromMetadata(ConnectorSession session) @@ -422,14 +522,19 @@ public static int getParquetMaxReadBlockRowCount(ConnectorSession session) return session.getProperty(PARQUET_MAX_READ_BLOCK_ROW_COUNT, Integer.class); } - public static boolean isParquetOptimizedReaderEnabled(ConnectorSession session) + public static DataSize getParquetSmallFileThreshold(ConnectorSession session) { - return session.getProperty(PARQUET_OPTIMIZED_READER_ENABLED, Boolean.class); + return session.getProperty(PARQUET_SMALL_FILE_THRESHOLD, DataSize.class); } - public static boolean isParquetOptimizedNestedReaderEnabled(ConnectorSession session) + public static boolean isParquetIgnoreStatistics(ConnectorSession session) { - return session.getProperty(PARQUET_OPTIMIZED_NESTED_READER_ENABLED, Boolean.class); + return session.getProperty(PARQUET_IGNORE_STATISTICS, Boolean.class); + } + + public static boolean isParquetVectorizedDecodingEnabled(ConnectorSession session) + { + return session.getProperty(PARQUET_VECTORIZED_DECODING_ENABLED, Boolean.class); } public static DataSize getParquetWriterPageSize(ConnectorSession session) @@ -437,6 +542,11 @@ public static DataSize 
getParquetWriterPageSize(ConnectorSession session) return session.getProperty(PARQUET_WRITER_PAGE_SIZE, DataSize.class); } + public static int getParquetWriterPageValueCount(ConnectorSession session) + { + return session.getProperty(PARQUET_WRITER_PAGE_VALUE_COUNT, Integer.class); + } + public static DataSize getParquetWriterBlockSize(ConnectorSession session) { return session.getProperty(PARQUET_WRITER_BLOCK_SIZE, DataSize.class); @@ -482,6 +592,11 @@ public static long getTargetMaxFileSize(ConnectorSession session) return session.getProperty(TARGET_MAX_FILE_SIZE, DataSize.class).toBytes(); } + public static long getIdleWriterMinFileSize(ConnectorSession session) + { + return session.getProperty(IDLE_WRITER_MIN_FILE_SIZE, DataSize.class).toBytes(); + } + public static Optional getHiveCatalogName(ConnectorSession session) { return Optional.ofNullable(session.getProperty(HIVE_CATALOG_NAME, String.class)); @@ -511,4 +626,37 @@ public static boolean isSortedWritingEnabled(ConnectorSession session) { return session.getProperty(SORTED_WRITING_ENABLED, Boolean.class); } + + public static boolean isQueryPartitionFilterRequired(ConnectorSession session) + { + return session.getProperty(QUERY_PARTITION_FILTER_REQUIRED, Boolean.class); + } + + @SuppressWarnings("unchecked cast") + public static Set getQueryPartitionFilterRequiredSchemas(ConnectorSession session) + { + Set queryPartitionFilterRequiredSchemas = (Set) session.getProperty(QUERY_PARTITION_FILTER_REQUIRED_SCHEMAS, Set.class); + requireNonNull(queryPartitionFilterRequiredSchemas, "queryPartitionFilterRequiredSchemas is null"); + return queryPartitionFilterRequiredSchemas; + } + + public static boolean isIncrementalRefreshEnabled(ConnectorSession session) + { + return session.getProperty(INCREMENTAL_REFRESH_ENABLED, Boolean.class); + } + + public static boolean isBucketExecutionEnabled(ConnectorSession session) + { + return session.getProperty(BUCKET_EXECUTION_ENABLED, Boolean.class); + } + + public static boolean isFileBasedConflictDetectionEnabled(ConnectorSession session) + { + return session.getProperty(FILE_BASED_CONFLICT_DETECTION_ENABLED, Boolean.class); + } + + public static int maxPartitionsPerWriter(ConnectorSession session) + { + return session.getProperty(MAX_PARTITIONS_PER_WRITER, Integer.class); + } } diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergSplit.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergSplit.java index 20719a9d8dfb..a7813193c67a 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergSplit.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergSplit.java @@ -19,12 +19,16 @@ import com.google.common.base.MoreObjects.ToStringHelper; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; +import io.airlift.slice.SizeOf; import io.trino.plugin.iceberg.delete.DeleteFile; import io.trino.spi.HostAddress; import io.trino.spi.SplitWeight; import io.trino.spi.connector.ConnectorSplit; +import io.trino.spi.predicate.TupleDomain; import java.util.List; +import java.util.Map; +import java.util.Optional; import static com.google.common.base.MoreObjects.toStringHelper; import static io.airlift.slice.SizeOf.estimatedSizeOf; @@ -40,11 +44,17 @@ public class IcebergSplit private final long start; private final long length; private final long fileSize; + private final long fileRecordCount; private final IcebergFileFormat fileFormat; + private final Optional> partitionValues; private final 
String partitionSpecJson; private final String partitionDataJson; private final List deletes; private final SplitWeight splitWeight; + private final TupleDomain fileStatisticsDomain; + private final Map fileIoProperties; + private final long dataSequenceNumber; + private final List addresses; @JsonCreator public IcebergSplit( @@ -52,34 +62,73 @@ public IcebergSplit( @JsonProperty("start") long start, @JsonProperty("length") long length, @JsonProperty("fileSize") long fileSize, + @JsonProperty("fileRecordCount") long fileRecordCount, @JsonProperty("fileFormat") IcebergFileFormat fileFormat, @JsonProperty("partitionSpecJson") String partitionSpecJson, @JsonProperty("partitionDataJson") String partitionDataJson, @JsonProperty("deletes") List deletes, - @JsonProperty("splitWeight") SplitWeight splitWeight) + @JsonProperty("splitWeight") SplitWeight splitWeight, + @JsonProperty("fileStatisticsDomain") TupleDomain fileStatisticsDomain, + @JsonProperty("fileIoProperties") Map fileIoProperties, + @JsonProperty("dataSequenceNumber") long dataSequenceNumber) + { + this( + path, + start, + length, + fileSize, + fileRecordCount, + fileFormat, + Optional.empty(), + partitionSpecJson, + partitionDataJson, + deletes, + splitWeight, + fileStatisticsDomain, + fileIoProperties, + ImmutableList.of(), + dataSequenceNumber); + } + + public IcebergSplit( + String path, + long start, + long length, + long fileSize, + long fileRecordCount, + IcebergFileFormat fileFormat, + Optional> partitionValues, + String partitionSpecJson, + String partitionDataJson, + List deletes, + SplitWeight splitWeight, + TupleDomain fileStatisticsDomain, + Map fileIoProperties, + List addresses, + long dataSequenceNumber) { this.path = requireNonNull(path, "path is null"); this.start = start; this.length = length; this.fileSize = fileSize; + this.fileRecordCount = fileRecordCount; this.fileFormat = requireNonNull(fileFormat, "fileFormat is null"); + this.partitionValues = requireNonNull(partitionValues, "partitionValues is null"); this.partitionSpecJson = requireNonNull(partitionSpecJson, "partitionSpecJson is null"); this.partitionDataJson = requireNonNull(partitionDataJson, "partitionDataJson is null"); this.deletes = ImmutableList.copyOf(requireNonNull(deletes, "deletes is null")); this.splitWeight = requireNonNull(splitWeight, "splitWeight is null"); - } - - @Override - public boolean isRemotelyAccessible() - { - return true; + this.fileStatisticsDomain = requireNonNull(fileStatisticsDomain, "fileStatisticsDomain is null"); + this.fileIoProperties = ImmutableMap.copyOf(requireNonNull(fileIoProperties, "fileIoProperties is null")); + this.addresses = requireNonNull(addresses, "addresses is null"); + this.dataSequenceNumber = dataSequenceNumber; } @JsonIgnore @Override public List getAddresses() { - return ImmutableList.of(); + return addresses; } @JsonProperty @@ -106,6 +155,12 @@ public long getFileSize() return fileSize; } + @JsonProperty + public long getFileRecordCount() + { + return fileRecordCount; + } + @JsonProperty public IcebergFileFormat getFileFormat() { @@ -118,6 +173,16 @@ public String getPartitionSpecJson() return partitionSpecJson; } + /** + * Trino (stack) values of the partition columns. The values are the result of evaluating + * the partition expressions on the partition data. 
+ */ + @JsonIgnore + public Optional> getPartitionValues() + { + return partitionValues; + } + @JsonProperty public String getPartitionDataJson() { @@ -137,14 +202,22 @@ public SplitWeight getSplitWeight() return splitWeight; } - @Override - public Object getInfo() + @JsonProperty + public TupleDomain getFileStatisticsDomain() { - return ImmutableMap.builder() - .put("path", path) - .put("start", start) - .put("length", length) - .buildOrThrow(); + return fileStatisticsDomain; + } + + @JsonProperty + public Map getFileIoProperties() + { + return fileIoProperties; + } + + @JsonProperty + public long getDataSequenceNumber() + { + return dataSequenceNumber; } @Override @@ -154,8 +227,11 @@ public long getRetainedSizeInBytes() + estimatedSizeOf(path) + estimatedSizeOf(partitionSpecJson) + estimatedSizeOf(partitionDataJson) - + estimatedSizeOf(deletes, DeleteFile::getRetainedSizeInBytes) - + splitWeight.getRetainedSizeInBytes(); + + estimatedSizeOf(deletes, DeleteFile::retainedSizeInBytes) + + splitWeight.getRetainedSizeInBytes() + + fileStatisticsDomain.getRetainedSizeInBytes(IcebergColumnHandle::getRetainedSizeInBytes) + + estimatedSizeOf(fileIoProperties, SizeOf::estimatedSizeOf, SizeOf::estimatedSizeOf) + + estimatedSizeOf(addresses, HostAddress::getRetainedSizeInBytes); } @Override @@ -164,7 +240,8 @@ public String toString() ToStringHelper helper = toStringHelper(this) .addValue(path) .add("start", start) - .add("length", length); + .add("length", length) + .add("fileStatisticsDomain", fileStatisticsDomain); if (!deletes.isEmpty()) { helper.add("deleteFiles", deletes.size()); helper.add("deleteRecords", deletes.stream() @@ -172,4 +249,20 @@ public String toString() } return helper.toString(); } + + @Override + public boolean isRemotelyAccessible() + { + return true; + } + + @Override + public Object getInfo() + { + return ImmutableMap.builder() + .put("path", path) + .put("start", start) + .put("length", length) + .buildOrThrow(); + } } diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergStatistics.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergStatistics.java index 8b3c80053da5..661814c03679 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergStatistics.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergStatistics.java @@ -13,9 +13,7 @@ */ package io.trino.plugin.iceberg; -import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; -import com.google.errorprone.annotations.Immutable; import io.trino.spi.TrinoException; import io.trino.spi.type.TypeManager; import jakarta.annotation.Nullable; @@ -45,99 +43,43 @@ import static io.trino.spi.function.InvocationConvention.simpleConvention; import static java.util.Objects.requireNonNull; -@Immutable -final class IcebergStatistics +public record IcebergStatistics( + long recordCount, + long fileCount, + long size, + Map minValues, + Map maxValues, + Map nullCounts, + Map nanCounts, + Map columnSizes) { - private final long recordCount; - private final long fileCount; - private final long size; - private final Map minValues; - private final Map maxValues; - private final Map nullCounts; - private final Map nanCounts; - private final Map columnSizes; - - private IcebergStatistics( - long recordCount, - long fileCount, - long size, - Map minValues, - Map maxValues, - Map nullCounts, - Map nanCounts, - Map columnSizes) - { - this.recordCount = recordCount; - this.fileCount = fileCount; - this.size = size; - 
this.minValues = ImmutableMap.copyOf(requireNonNull(minValues, "minValues is null")); - this.maxValues = ImmutableMap.copyOf(requireNonNull(maxValues, "maxValues is null")); - this.nullCounts = ImmutableMap.copyOf(requireNonNull(nullCounts, "nullCounts is null")); - this.nanCounts = ImmutableMap.copyOf(requireNonNull(nanCounts, "nanCounts is null")); - this.columnSizes = ImmutableMap.copyOf(requireNonNull(columnSizes, "columnSizes is null")); - } - - public long getRecordCount() - { - return recordCount; - } - - public long getFileCount() - { - return fileCount; - } - - public long getSize() - { - return size; - } - - public Map getMinValues() - { - return minValues; - } - - public Map getMaxValues() - { - return maxValues; - } - - public Map getNullCounts() - { - return nullCounts; - } - - public Map getNanCounts() + public IcebergStatistics { - return nanCounts; - } - - public Map getColumnSizes() - { - return columnSizes; + minValues = ImmutableMap.copyOf(requireNonNull(minValues, "minValues is null")); + maxValues = ImmutableMap.copyOf(requireNonNull(maxValues, "maxValues is null")); + nullCounts = ImmutableMap.copyOf(requireNonNull(nullCounts, "nullCounts is null")); + nanCounts = ImmutableMap.copyOf(requireNonNull(nanCounts, "nanCounts is null")); + columnSizes = ImmutableMap.copyOf(requireNonNull(columnSizes, "columnSizes is null")); } public static class Builder { - private final List columns; private final TypeManager typeManager; - private final Map> nullCounts = new HashMap<>(); - private final Map> nanCounts = new HashMap<>(); - private final Map columnStatistics = new HashMap<>(); - private final Map columnSizes = new HashMap<>(); private final Map fieldIdToTrinoType; private long recordCount; private long fileCount; private long size; + private final Map columnStatistics = new HashMap<>(); + private final Map> nullCounts = new HashMap<>(); + private final Map> nanCounts = new HashMap<>(); + private final Map columnSizes = new HashMap<>(); public Builder( List columns, TypeManager typeManager) { - this.columns = ImmutableList.copyOf(requireNonNull(columns, "columns is null")); this.typeManager = requireNonNull(typeManager, "typeManager is null"); - this.fieldIdToTrinoType = columns.stream() .collect(toImmutableMap(Types.NestedField::fieldId, column -> toTrinoType(column.type(), typeManager))); } @@ -150,11 +92,10 @@ public void acceptDataFile(DataFile dataFile, PartitionSpec partitionSpec) Map newColumnSizes = dataFile.columnSizes(); if (newColumnSizes != null) { - for (Types.NestedField column : columns) { - int id = column.fieldId(); - Long addedSize = newColumnSizes.get(id); + for (Map.Entry entry : newColumnSizes.entrySet()) { + Long addedSize = entry.getValue(); if (addedSize != null) { - columnSizes.merge(id, addedSize, Long::sum); + columnSizes.merge(entry.getKey(), addedSize, Long::sum); } } } @@ -262,7 +203,7 @@ private void updateMinMaxStats( if (type.isOrderable() && (nullCount.isEmpty() || nullCount.get() != recordCount)) { // Capture the initial bounds during construction so there are always valid min/max values to compare to. This does make the first call to // `ColumnStatistics#updateMinMax` a no-op. 
- columnStatistics.computeIfAbsent(id, ignored -> { + columnStatistics.computeIfAbsent(id, ignore -> { MethodHandle comparisonHandle = typeManager.getTypeOperators() .getComparisonUnorderedLastOperator(type, simpleConvention(FAIL_ON_NULL, NEVER_NULL, NEVER_NULL)); return new ColumnStatistics(comparisonHandle, lowerBound, upperBound); diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergTableHandle.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergTableHandle.java index b3edb6165935..710d71eadd74 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergTableHandle.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergTableHandle.java @@ -18,8 +18,8 @@ import com.fasterxml.jackson.annotation.JsonProperty; import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; +import com.google.errorprone.annotations.DoNotCall; import io.airlift.units.DataSize; -import io.trino.spi.connector.CatalogHandle; import io.trino.spi.connector.ConnectorTableHandle; import io.trino.spi.connector.SchemaTableName; import io.trino.spi.predicate.TupleDomain; @@ -37,7 +37,6 @@ public class IcebergTableHandle implements ConnectorTableHandle { - private final CatalogHandle catalog; private final String schemaName; private final String tableName; private final TableType tableType; @@ -55,19 +54,28 @@ public class IcebergTableHandle // Filter guaranteed to be enforced by Iceberg connector private final TupleDomain enforcedPredicate; + // Columns that are present in {@link Constraint#predicate()} applied on the table scan + private final Set constraintColumns; + // semantically limit is applied after enforcedPredicate private final OptionalLong limit; private final Set projectedColumns; private final Optional nameMappingJson; + // Coordinator-only - table partitioning applied to the table splits if available and active + private final Optional tablePartitioning; + // OPTIMIZE only. Coordinator-only private final boolean recordScannedFiles; private final Optional maxScannedFileSize; + // ANALYZE only. 
Coordinator-only + private final Optional forAnalyze; + @JsonCreator + @DoNotCall // For JSON deserialization only public static IcebergTableHandle fromJsonForDeserializationOnly( - @JsonProperty("catalog") CatalogHandle catalog, @JsonProperty("schemaName") String schemaName, @JsonProperty("tableName") String tableName, @JsonProperty("tableType") TableType tableType, @@ -84,7 +92,6 @@ public static IcebergTableHandle fromJsonForDeserializationOnly( @JsonProperty("storageProperties") Map storageProperties) { return new IcebergTableHandle( - catalog, schemaName, tableName, tableType, @@ -99,12 +106,14 @@ public static IcebergTableHandle fromJsonForDeserializationOnly( nameMappingJson, tableLocation, storageProperties, + Optional.empty(), false, + Optional.empty(), + ImmutableSet.of(), Optional.empty()); } public IcebergTableHandle( - CatalogHandle catalog, String schemaName, String tableName, TableType tableType, @@ -119,10 +128,12 @@ public IcebergTableHandle( Optional nameMappingJson, String tableLocation, Map storageProperties, + Optional tablePartitioning, boolean recordScannedFiles, - Optional maxScannedFileSize) + Optional maxScannedFileSize, + Set constraintColumns, + Optional forAnalyze) { - this.catalog = requireNonNull(catalog, "catalog is null"); this.schemaName = requireNonNull(schemaName, "schemaName is null"); this.tableName = requireNonNull(tableName, "tableName is null"); this.tableType = requireNonNull(tableType, "tableType is null"); @@ -137,14 +148,11 @@ public IcebergTableHandle( this.nameMappingJson = requireNonNull(nameMappingJson, "nameMappingJson is null"); this.tableLocation = requireNonNull(tableLocation, "tableLocation is null"); this.storageProperties = ImmutableMap.copyOf(requireNonNull(storageProperties, "storageProperties is null")); + this.tablePartitioning = requireNonNull(tablePartitioning, "tablePartitioning is null"); this.recordScannedFiles = recordScannedFiles; this.maxScannedFileSize = requireNonNull(maxScannedFileSize, "maxScannedFileSize is null"); - } - - @JsonProperty - public CatalogHandle getCatalog() - { - return catalog; + this.constraintColumns = ImmutableSet.copyOf(requireNonNull(constraintColumns, "constraintColumns is null")); + this.forAnalyze = requireNonNull(forAnalyze, "forAnalyze is null"); } @JsonProperty @@ -232,6 +240,15 @@ public Map getStorageProperties() return storageProperties; } + /** + * Get the partitioning for the table splits. 
+ */ + @JsonIgnore + public Optional getTablePartitioning() + { + return tablePartitioning; + } + @JsonIgnore public boolean isRecordScannedFiles() { @@ -244,6 +261,18 @@ public Optional getMaxScannedFileSize() return maxScannedFileSize; } + @JsonIgnore + public Set getConstraintColumns() + { + return constraintColumns; + } + + @JsonIgnore + public Optional getForAnalyze() + { + return forAnalyze; + } + public SchemaTableName getSchemaTableName() { return new SchemaTableName(schemaName, tableName); @@ -257,7 +286,6 @@ public SchemaTableName getSchemaTableNameWithType() public IcebergTableHandle withProjectedColumns(Set projectedColumns) { return new IcebergTableHandle( - catalog, schemaName, tableName, tableType, @@ -272,14 +300,40 @@ public IcebergTableHandle withProjectedColumns(Set projecte nameMappingJson, tableLocation, storageProperties, + tablePartitioning, recordScannedFiles, - maxScannedFileSize); + maxScannedFileSize, + constraintColumns, + forAnalyze); + } + + public IcebergTableHandle forAnalyze() + { + return new IcebergTableHandle( + schemaName, + tableName, + tableType, + snapshotId, + tableSchemaJson, + partitionSpecJson, + formatVersion, + unenforcedPredicate, + enforcedPredicate, + limit, + projectedColumns, + nameMappingJson, + tableLocation, + storageProperties, + tablePartitioning, + recordScannedFiles, + maxScannedFileSize, + constraintColumns, + Optional.of(true)); } public IcebergTableHandle forOptimize(boolean recordScannedFiles, DataSize maxScannedFileSize) { return new IcebergTableHandle( - catalog, schemaName, tableName, tableType, @@ -294,8 +348,35 @@ public IcebergTableHandle forOptimize(boolean recordScannedFiles, DataSize maxSc nameMappingJson, tableLocation, storageProperties, + tablePartitioning, + recordScannedFiles, + Optional.of(maxScannedFileSize), + constraintColumns, + forAnalyze); + } + + public IcebergTableHandle withTablePartitioning(Optional requiredTablePartitioning) + { + return new IcebergTableHandle( + schemaName, + tableName, + tableType, + snapshotId, + tableSchemaJson, + partitionSpecJson, + formatVersion, + unenforcedPredicate, + enforcedPredicate, + limit, + projectedColumns, + nameMappingJson, + tableLocation, + storageProperties, + requiredTablePartitioning, recordScannedFiles, - Optional.of(maxScannedFileSize)); + maxScannedFileSize, + constraintColumns, + forAnalyze); } @Override @@ -310,7 +391,6 @@ public boolean equals(Object o) IcebergTableHandle that = (IcebergTableHandle) o; return recordScannedFiles == that.recordScannedFiles && - Objects.equals(catalog, that.catalog) && Objects.equals(schemaName, that.schemaName) && Objects.equals(tableName, that.tableName) && tableType == that.tableType && @@ -325,14 +405,15 @@ public boolean equals(Object o) Objects.equals(nameMappingJson, that.nameMappingJson) && Objects.equals(tableLocation, that.tableLocation) && Objects.equals(storageProperties, that.storageProperties) && - Objects.equals(maxScannedFileSize, that.maxScannedFileSize); + Objects.equals(maxScannedFileSize, that.maxScannedFileSize) && + Objects.equals(constraintColumns, that.constraintColumns) && + Objects.equals(forAnalyze, that.forAnalyze); } @Override public int hashCode() { return Objects.hash( - catalog, schemaName, tableName, tableType, @@ -348,7 +429,9 @@ public int hashCode() tableLocation, storageProperties, recordScannedFiles, - maxScannedFileSize); + maxScannedFileSize, + constraintColumns, + forAnalyze); } @Override diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergTableName.java 
b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergTableName.java index 32716889471d..63790f48bb05 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergTableName.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergTableName.java @@ -13,14 +13,15 @@ */ package io.trino.plugin.iceberg; -import io.trino.spi.TrinoException; - -import java.util.Optional; import java.util.regex.Matcher; import java.util.regex.Pattern; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Verify.verify; import static io.trino.plugin.iceberg.TableType.DATA; -import static io.trino.spi.StandardErrorCode.NOT_SUPPORTED; +import static io.trino.plugin.iceberg.TableType.MATERIALIZED_VIEW_STORAGE; import static java.util.Locale.ENGLISH; import static java.util.Objects.requireNonNull; @@ -28,9 +29,22 @@ public final class IcebergTableName { private IcebergTableName() {} - private static final Pattern TABLE_PATTERN = Pattern.compile("" + - "(?<table>[^$@]+)" + - "(?:\\$(?<type>[^@]+))?"); + private static final Pattern TABLE_PATTERN; + + static { + String referencableTableTypes = Stream.of(TableType.values()) + .filter(tableType -> tableType != DATA) + .map(tableType -> tableType.name().toLowerCase(ENGLISH)) + .collect(Collectors.joining("|")); + TABLE_PATTERN = Pattern.compile("" + + "(?<table>[^$@]+)" + + "(?:\\$(?<type>(?i:" + referencableTableTypes + ")))?"); + } + + public static boolean isIcebergTableName(String tableName) + { + return TABLE_PATTERN.matcher(tableName).matches(); + } public static String tableNameWithType(String tableName, TableType tableType) { @@ -38,46 +52,38 @@ public static String tableNameWithType(String tableName, TableType tableType) return tableName + "$" + tableType.name().toLowerCase(ENGLISH); } - public static String tableNameFrom(String name) + public static String tableNameFrom(String validIcebergTableName) { - Matcher match = TABLE_PATTERN.matcher(name); - if (!match.matches()) { - throw new TrinoException(NOT_SUPPORTED, "Invalid Iceberg table name: " + name); - } - + Matcher match = TABLE_PATTERN.matcher(validIcebergTableName); + checkArgument(match.matches(), "Invalid Iceberg table name: %s", validIcebergTableName); return match.group("table"); } - public static Optional<TableType> tableTypeFrom(String name) + public static TableType tableTypeFrom(String validIcebergTableName) { - Matcher match = TABLE_PATTERN.matcher(name); - if (!match.matches()) { - throw new TrinoException(NOT_SUPPORTED, "Invalid Iceberg table name: " + name); - } + Matcher match = TABLE_PATTERN.matcher(validIcebergTableName); + checkArgument(match.matches(), "Invalid Iceberg table name: %s", validIcebergTableName); + String typeString = match.group("type"); if (typeString == null) { - return Optional.of(DATA); - } - try { - TableType parsedType = TableType.valueOf(typeString.toUpperCase(ENGLISH)); - if (parsedType == DATA) { - // $data cannot be encoded in table name - return Optional.empty(); - } - return Optional.of(parsedType); - } - catch (IllegalArgumentException e) { - return Optional.empty(); + return DATA; } + TableType parsedType = TableType.valueOf(typeString.toUpperCase(ENGLISH)); + // $data cannot be encoded in table name + verify(parsedType != DATA, "parsedType is unexpectedly DATA"); + return parsedType; } - public static boolean isDataTable(String name) + public static boolean isDataTable(String validIcebergTableName) { - Matcher match = TABLE_PATTERN.matcher(name); - if (!match.matches()) { - throw new TrinoException(NOT_SUPPORTED, "Invalid Iceberg table name: " + name); - } + Matcher match = TABLE_PATTERN.matcher(validIcebergTableName); + checkArgument(match.matches(), "Invalid Iceberg table name: %s", validIcebergTableName); String typeString = match.group("type"); return typeString == null; } + + public static boolean isMaterializedViewStorage(String validIcebergTableName) + { + return tableTypeFrom(validIcebergTableName) == MATERIALIZED_VIEW_STORAGE; + } }
diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergTablePartitioning.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergTablePartitioning.java new file mode 100644 index 000000000000..5d5562e50c49 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergTablePartitioning.java @@ -0,0 +1,48 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.iceberg; + +import com.google.common.collect.ImmutableList; +import io.trino.spi.connector.ConnectorTablePartitioning; + +import java.util.List; +import java.util.Optional; + +import static com.google.common.base.Preconditions.checkArgument; +import static java.util.Objects.requireNonNull; + +public record IcebergTablePartitioning( + boolean active, + IcebergPartitioningHandle partitioningHandle, + List partitioningColumns, + List partitionStructFields) +{ + public IcebergTablePartitioning + { + requireNonNull(partitioningHandle, "partitioningHandle is null"); + partitioningColumns = ImmutableList.copyOf(requireNonNull(partitioningColumns, "partitioningColumns is null")); + partitionStructFields = ImmutableList.copyOf(requireNonNull(partitionStructFields, "partitionStructFields is null")); + checkArgument(partitioningHandle.partitionFunctions().size() == partitionStructFields.size(), "partitioningColumns and partitionStructFields must have the same size"); + } + + public IcebergTablePartitioning activate() + { + return new IcebergTablePartitioning(true, partitioningHandle, partitioningColumns, partitionStructFields); + } + + public Optional toConnectorTablePartitioning() + { + return active ? Optional.of(new ConnectorTablePartitioning(partitioningHandle, ImmutableList.copyOf(partitioningColumns))) : Optional.empty(); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergTableProperties.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergTableProperties.java index 9de78f2280e1..ef292b112291 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergTableProperties.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergTableProperties.java @@ -13,21 +13,36 @@ */ package io.trino.plugin.iceberg; +import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSet; import com.google.inject.Inject; +import io.trino.plugin.hive.HiveCompressionCodec; +import io.trino.plugin.hive.HiveCompressionCodecs; +import io.trino.plugin.hive.HiveCompressionOption; +import io.trino.plugin.hive.HiveStorageFormat; import io.trino.plugin.hive.orc.OrcWriterConfig; import io.trino.spi.TrinoException; import io.trino.spi.session.PropertyMetadata; import io.trino.spi.type.ArrayType; +import io.trino.spi.type.MapType; +import io.trino.spi.type.TypeManager; import java.util.List; import java.util.Map; import java.util.Optional; +import java.util.Set; +import static com.google.common.base.Preconditions.checkState; import static com.google.common.collect.ImmutableList.toImmutableList; +import static com.google.common.collect.ImmutableMap.toImmutableMap; import static io.trino.plugin.iceberg.IcebergConfig.FORMAT_VERSION_SUPPORT_MAX; import static io.trino.plugin.iceberg.IcebergConfig.FORMAT_VERSION_SUPPORT_MIN; +import static io.trino.plugin.iceberg.IcebergFileFormat.AVRO; +import static io.trino.plugin.iceberg.IcebergFileFormat.PARQUET; import static io.trino.spi.StandardErrorCode.INVALID_TABLE_PROPERTY; +import static io.trino.spi.StandardErrorCode.NOT_SUPPORTED; +import static io.trino.spi.session.PropertyMetadata.booleanProperty; import static io.trino.spi.session.PropertyMetadata.doubleProperty; import static io.trino.spi.session.PropertyMetadata.enumProperty; import static 
io.trino.spi.session.PropertyMetadata.integerProperty; @@ -35,6 +50,11 @@ import static io.trino.spi.type.VarcharType.VARCHAR; import static java.lang.String.format; import static java.util.Locale.ENGLISH; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.FORMAT_VERSION; +import static org.apache.iceberg.TableProperties.ORC_BLOOM_FILTER_COLUMNS; +import static org.apache.iceberg.TableProperties.ORC_BLOOM_FILTER_FPP; +import static org.apache.iceberg.TableProperties.RESERVED_PROPERTIES; public class IcebergTableProperties { @@ -43,15 +63,47 @@ public class IcebergTableProperties public static final String SORTED_BY_PROPERTY = "sorted_by"; public static final String LOCATION_PROPERTY = "location"; public static final String FORMAT_VERSION_PROPERTY = "format_version"; - public static final String ORC_BLOOM_FILTER_COLUMNS = "orc_bloom_filter_columns"; - public static final String ORC_BLOOM_FILTER_FPP = "orc_bloom_filter_fpp"; + public static final String COMPRESSION_CODEC = "compression_codec"; + public static final String MAX_COMMIT_RETRY = "max_commit_retry"; + public static final String ORC_BLOOM_FILTER_COLUMNS_PROPERTY = "orc_bloom_filter_columns"; + public static final String ORC_BLOOM_FILTER_FPP_PROPERTY = "orc_bloom_filter_fpp"; + public static final String PARQUET_BLOOM_FILTER_COLUMNS_PROPERTY = "parquet_bloom_filter_columns"; + public static final String OBJECT_STORE_LAYOUT_ENABLED_PROPERTY = "object_store_layout_enabled"; + public static final String DATA_LOCATION_PROPERTY = "data_location"; + public static final String EXTRA_PROPERTIES_PROPERTY = "extra_properties"; + + public static final Set SUPPORTED_PROPERTIES = ImmutableSet.builder() + .add(FILE_FORMAT_PROPERTY) + .add(COMPRESSION_CODEC) + .add(PARTITIONING_PROPERTY) + .add(SORTED_BY_PROPERTY) + .add(LOCATION_PROPERTY) + .add(FORMAT_VERSION_PROPERTY) + .add(MAX_COMMIT_RETRY) + .add(ORC_BLOOM_FILTER_COLUMNS_PROPERTY) + .add(ORC_BLOOM_FILTER_FPP_PROPERTY) + .add(OBJECT_STORE_LAYOUT_ENABLED_PROPERTY) + .add(DATA_LOCATION_PROPERTY) + .add(EXTRA_PROPERTIES_PROPERTY) + .add(PARQUET_BLOOM_FILTER_COLUMNS_PROPERTY) + .build(); + + // These properties are used by Trino or Iceberg internally and cannot be set directly by users through extra_properties + public static final Set PROTECTED_ICEBERG_NATIVE_PROPERTIES = ImmutableSet.builder() + .addAll(RESERVED_PROPERTIES) + .add(ORC_BLOOM_FILTER_COLUMNS) + .add(ORC_BLOOM_FILTER_FPP) + .add(DEFAULT_FILE_FORMAT) + .add(FORMAT_VERSION) + .build(); private final List> tableProperties; @Inject public IcebergTableProperties( IcebergConfig icebergConfig, - OrcWriterConfig orcWriterConfig) + OrcWriterConfig orcWriterConfig, + TypeManager typeManager) { tableProperties = ImmutableList.>builder() .add(enumProperty( @@ -60,6 +112,12 @@ public IcebergTableProperties( IcebergFileFormat.class, icebergConfig.getFileFormat(), false)) + .add(enumProperty( + COMPRESSION_CODEC, + "Write compression codec for the table", + HiveCompressionOption.class, + null, + false)) .add(new PropertyMetadata<>( PARTITIONING_PROPERTY, "Partition transforms", @@ -89,8 +147,18 @@ public IcebergTableProperties( icebergConfig.getFormatVersion(), IcebergTableProperties::validateFormatVersion, false)) + .add(integerProperty( + MAX_COMMIT_RETRY, + "Number of times to retry a commit before failing", + icebergConfig.getMaxCommitRetry().orElse(null), + value -> { + if (value < 0) { + throw new TrinoException(INVALID_TABLE_PROPERTY, "max_commit_retry must be greater 
than or equal to 0"); + } + }, + false)) .add(new PropertyMetadata<>( - ORC_BLOOM_FILTER_COLUMNS, + ORC_BLOOM_FILTER_COLUMNS_PROPERTY, "ORC Bloom filter index columns", new ArrayType(VARCHAR), List.class, @@ -102,12 +170,59 @@ public IcebergTableProperties( .collect(toImmutableList()), value -> value)) .add(doubleProperty( - ORC_BLOOM_FILTER_FPP, + ORC_BLOOM_FILTER_FPP_PROPERTY, "ORC Bloom filter false positive probability", orcWriterConfig.getDefaultBloomFilterFpp(), IcebergTableProperties::validateOrcBloomFilterFpp, false)) + .add(new PropertyMetadata<>( + PARQUET_BLOOM_FILTER_COLUMNS_PROPERTY, + "Parquet Bloom filter index columns", + new ArrayType(VARCHAR), + List.class, + ImmutableList.of(), + false, + value -> ((List) value).stream() + .map(String.class::cast) + .map(name -> name.toLowerCase(ENGLISH)) + .collect(toImmutableList()), + value -> value)) + .add(new PropertyMetadata<>( + EXTRA_PROPERTIES_PROPERTY, + "Extra table properties", + new MapType(VARCHAR, VARCHAR, typeManager.getTypeOperators()), + Map.class, + null, + true, // currently not shown in SHOW CREATE TABLE + value -> { + Map extraProperties = (Map) value; + if (extraProperties.containsValue(null)) { + throw new TrinoException(INVALID_TABLE_PROPERTY, format("Extra table property value cannot be null '%s'", extraProperties)); + } + if (extraProperties.containsKey(null)) { + throw new TrinoException(INVALID_TABLE_PROPERTY, format("Extra table property key cannot be null '%s'", extraProperties)); + } + + return extraProperties.entrySet().stream() + .collect(toImmutableMap(entry -> entry.getKey().toLowerCase(ENGLISH), Map.Entry::getValue)); + }, + value -> value)) + .add(booleanProperty( + OBJECT_STORE_LAYOUT_ENABLED_PROPERTY, + "Set to true to enable Iceberg object store file layout", + icebergConfig.isObjectStoreLayoutEnabled(), + false)) + .add(stringProperty( + DATA_LOCATION_PROPERTY, + "File system location URI for the table's data files", + null, + false)) .build(); + + checkState(SUPPORTED_PROPERTIES.containsAll(tableProperties.stream() + .map(PropertyMetadata::getName) + .collect(toImmutableList())), + "%s does not contain all supported properties", SUPPORTED_PROPERTIES); } public List> getTableProperties() @@ -120,6 +235,12 @@ public static IcebergFileFormat getFileFormat(Map tablePropertie return (IcebergFileFormat) tableProperties.get(FILE_FORMAT_PROPERTY); } + public static Optional getCompressionCodec(Map inputProperties) + { + return Optional.ofNullable((HiveCompressionOption) inputProperties.get(COMPRESSION_CODEC)) + .map(co -> HiveCompressionCodecs.selectCompressionCodec(co, HiveStorageFormat.PARQUET)); + } + @SuppressWarnings("unchecked") public static List getPartitioning(Map tableProperties) { @@ -152,15 +273,38 @@ private static void validateFormatVersion(int version) } } + public static void validateCompression(IcebergFileFormat fileFormat, Optional compressionCodec) + { + if (compressionCodec.isPresent()) { + if (!isCompressionCodecSupportedForFormat(fileFormat, compressionCodec.get())) { + throw new TrinoException(NOT_SUPPORTED, format("Compression codec %s not supported for %s", compressionCodec.get(), fileFormat.humanName())); + } + } + } + + @VisibleForTesting + static boolean isCompressionCodecSupportedForFormat(IcebergFileFormat fileFormat, HiveCompressionCodec codec) + { + return switch (codec) { + case LZ4 -> !(fileFormat == AVRO || fileFormat == PARQUET); + default -> true; + }; + } + + public static Optional getMaxCommitRetry(Map tableProperties) + { + return Optional.ofNullable((Integer) 
tableProperties.get(MAX_COMMIT_RETRY)); + } + public static List getOrcBloomFilterColumns(Map tableProperties) { - List orcBloomFilterColumns = (List) tableProperties.get(ORC_BLOOM_FILTER_COLUMNS); + List orcBloomFilterColumns = (List) tableProperties.get(ORC_BLOOM_FILTER_COLUMNS_PROPERTY); return orcBloomFilterColumns == null ? ImmutableList.of() : ImmutableList.copyOf(orcBloomFilterColumns); } public static Double getOrcBloomFilterFpp(Map tableProperties) { - return (Double) tableProperties.get(ORC_BLOOM_FILTER_FPP); + return (Double) tableProperties.get(ORC_BLOOM_FILTER_FPP_PROPERTY); } private static void validateOrcBloomFilterFpp(double fpp) @@ -169,4 +313,25 @@ private static void validateOrcBloomFilterFpp(double fpp) throw new TrinoException(INVALID_TABLE_PROPERTY, "Bloom filter fpp value must be between 0.0 and 1.0"); } } + + public static List getParquetBloomFilterColumns(Map tableProperties) + { + List parquetBloomFilterColumns = (List) tableProperties.get(PARQUET_BLOOM_FILTER_COLUMNS_PROPERTY); + return parquetBloomFilterColumns == null ? ImmutableList.of() : ImmutableList.copyOf(parquetBloomFilterColumns); + } + + public static boolean getObjectStoreLayoutEnabled(Map tableProperties) + { + return (boolean) tableProperties.getOrDefault(OBJECT_STORE_LAYOUT_ENABLED_PROPERTY, false); + } + + public static Optional getDataLocation(Map tableProperties) + { + return Optional.ofNullable((String) tableProperties.get(DATA_LOCATION_PROPERTY)); + } + + public static Optional> getExtraProperties(Map tableProperties) + { + return Optional.ofNullable((Map) tableProperties.get(EXTRA_PROPERTIES_PROPERTY)); + } } diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergTypes.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergTypes.java index 76d837dc4937..f5cdbd1b37f5 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergTypes.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergTypes.java @@ -60,8 +60,8 @@ private IcebergTypes() {} /** * Convert value from Trino representation to Iceberg representation. - * - * @apiNote This accepts a Trino type because, currently, no two Iceberg types translate to one Trino type. + *

+ * Note: This accepts a Trino type because, currently, no two Iceberg types translate to one Trino type. */ public static Object convertTrinoValueToIceberg(io.trino.spi.type.Type type, Object trinoNativeValue) { @@ -165,7 +165,7 @@ public static Object convertIcebergValueToTrino(Type icebergType, Object value) if (icebergType instanceof Types.StringType) { // Partition values are passed as String, but min/max values are passed as a CharBuffer if (value instanceof CharBuffer) { - value = new String(((CharBuffer) value).array()); + value = ((CharBuffer) value).toString(); } return utf8Slice(((String) value)); } diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergUtil.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergUtil.java index 78b6c1a254f4..0ffe27f51afa 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergUtil.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergUtil.java @@ -15,6 +15,7 @@ import com.google.common.base.Joiner; import com.google.common.base.Splitter; +import com.google.common.base.Throwables; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; @@ -22,18 +23,36 @@ import io.airlift.slice.Slice; import io.airlift.slice.SliceUtf8; import io.airlift.slice.Slices; +import io.trino.filesystem.FileEntry; +import io.trino.filesystem.FileIterator; +import io.trino.filesystem.Location; +import io.trino.filesystem.TrinoFileSystem; +import io.trino.filesystem.TrinoInputFile; +import io.trino.metastore.TableInfo; +import io.trino.plugin.hive.HiveCompressionCodec; +import io.trino.plugin.hive.HiveCompressionOption; import io.trino.plugin.iceberg.PartitionTransforms.ColumnTransform; import io.trino.plugin.iceberg.catalog.IcebergTableOperations; import io.trino.plugin.iceberg.catalog.IcebergTableOperationsProvider; import io.trino.plugin.iceberg.catalog.TrinoCatalog; +import io.trino.plugin.iceberg.util.DefaultLocationProvider; +import io.trino.plugin.iceberg.util.ObjectStoreLocationProvider; import io.trino.spi.TrinoException; +import io.trino.spi.block.Block; +import io.trino.spi.block.DictionaryBlock; +import io.trino.spi.block.RowBlock; +import io.trino.spi.block.RunLengthEncodedBlock; +import io.trino.spi.connector.ColumnHandle; import io.trino.spi.connector.ColumnMetadata; import io.trino.spi.connector.ConnectorSession; import io.trino.spi.connector.ConnectorTableMetadata; +import io.trino.spi.connector.ConnectorViewDefinition.ViewColumn; import io.trino.spi.connector.SchemaTableName; import io.trino.spi.function.InvocationConvention; import io.trino.spi.predicate.Domain; +import io.trino.spi.predicate.NullableValue; import io.trino.spi.predicate.Range; +import io.trino.spi.predicate.TupleDomain; import io.trino.spi.predicate.ValueSet; import io.trino.spi.type.DecimalType; import io.trino.spi.type.Int128; @@ -43,11 +62,16 @@ import io.trino.spi.type.UuidType; import io.trino.spi.type.VarbinaryType; import io.trino.spi.type.VarcharType; +import io.trino.util.Reflection; +import jakarta.annotation.Nullable; import org.apache.iceberg.BaseTable; +import org.apache.iceberg.ContentFile; import org.apache.iceberg.FileFormat; import org.apache.iceberg.FileScanTask; import org.apache.iceberg.HistoryEntry; -import org.apache.iceberg.MetadataTableType; +import org.apache.iceberg.ManifestFile; +import org.apache.iceberg.ManifestFiles; +import org.apache.iceberg.ManifestReader; import 
org.apache.iceberg.PartitionField; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; @@ -58,13 +82,16 @@ import org.apache.iceberg.Table; import org.apache.iceberg.TableMetadata; import org.apache.iceberg.TableOperations; -import org.apache.iceberg.TableScan; import org.apache.iceberg.Transaction; +import org.apache.iceberg.io.FileIO; import org.apache.iceberg.io.LocationProvider; import org.apache.iceberg.types.Type.PrimitiveType; +import org.apache.iceberg.types.TypeUtil; import org.apache.iceberg.types.Types.NestedField; import org.apache.iceberg.types.Types.StructType; +import java.io.IOException; +import java.io.UncheckedIOException; import java.lang.invoke.MethodHandle; import java.math.BigDecimal; import java.math.BigInteger; @@ -73,46 +100,59 @@ import java.util.ArrayList; import java.util.Base64; import java.util.List; -import java.util.Locale; import java.util.Map; import java.util.Map.Entry; +import java.util.Objects; import java.util.Optional; import java.util.Set; import java.util.UUID; import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.Predicate; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Stream; import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Verify.verify; +import static com.google.common.collect.ImmutableList.builderWithExpectedSize; import static com.google.common.collect.ImmutableList.toImmutableList; import static com.google.common.collect.ImmutableMap.toImmutableMap; import static com.google.common.collect.ImmutableSet.toImmutableSet; -import static com.google.common.collect.Maps.immutableEntry; -import static com.google.common.collect.Streams.mapWithIndex; +import static com.google.common.collect.Iterables.getOnlyElement; import static io.airlift.slice.Slices.utf8Slice; +import static io.trino.metastore.TableInfo.PRESTO_VIEW_COMMENT; import static io.trino.plugin.base.io.ByteBuffers.getWrappedBytes; import static io.trino.plugin.hive.HiveMetadata.TABLE_COMMENT; +import static io.trino.plugin.hive.ViewReaderUtil.PRESTO_VIEW_FLAG; import static io.trino.plugin.iceberg.ColumnIdentity.createColumnIdentity; +import static io.trino.plugin.iceberg.IcebergColumnHandle.fileModifiedTimeColumnHandle; import static io.trino.plugin.iceberg.IcebergColumnHandle.fileModifiedTimeColumnMetadata; +import static io.trino.plugin.iceberg.IcebergColumnHandle.partitionColumnHandle; +import static io.trino.plugin.iceberg.IcebergColumnHandle.partitionColumnMetadata; +import static io.trino.plugin.iceberg.IcebergColumnHandle.pathColumnHandle; import static io.trino.plugin.iceberg.IcebergColumnHandle.pathColumnMetadata; -import static io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_BAD_DATA; +import static io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_FILESYSTEM_ERROR; +import static io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_INVALID_METADATA; import static io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_INVALID_PARTITION_VALUE; -import static io.trino.plugin.iceberg.IcebergMetadata.ORC_BLOOM_FILTER_COLUMNS_KEY; -import static io.trino.plugin.iceberg.IcebergMetadata.ORC_BLOOM_FILTER_FPP_KEY; +import static io.trino.plugin.iceberg.IcebergFileFormat.PARQUET; +import static io.trino.plugin.iceberg.IcebergMetadata.calculateTableCompressionProperties; +import static io.trino.plugin.iceberg.IcebergTableProperties.COMPRESSION_CODEC; +import static io.trino.plugin.iceberg.IcebergTableProperties.DATA_LOCATION_PROPERTY; import static 
io.trino.plugin.iceberg.IcebergTableProperties.FILE_FORMAT_PROPERTY; import static io.trino.plugin.iceberg.IcebergTableProperties.FORMAT_VERSION_PROPERTY; import static io.trino.plugin.iceberg.IcebergTableProperties.LOCATION_PROPERTY; -import static io.trino.plugin.iceberg.IcebergTableProperties.ORC_BLOOM_FILTER_COLUMNS; -import static io.trino.plugin.iceberg.IcebergTableProperties.ORC_BLOOM_FILTER_FPP; +import static io.trino.plugin.iceberg.IcebergTableProperties.MAX_COMMIT_RETRY; +import static io.trino.plugin.iceberg.IcebergTableProperties.OBJECT_STORE_LAYOUT_ENABLED_PROPERTY; +import static io.trino.plugin.iceberg.IcebergTableProperties.ORC_BLOOM_FILTER_COLUMNS_PROPERTY; +import static io.trino.plugin.iceberg.IcebergTableProperties.ORC_BLOOM_FILTER_FPP_PROPERTY; +import static io.trino.plugin.iceberg.IcebergTableProperties.PARQUET_BLOOM_FILTER_COLUMNS_PROPERTY; import static io.trino.plugin.iceberg.IcebergTableProperties.PARTITIONING_PROPERTY; +import static io.trino.plugin.iceberg.IcebergTableProperties.PROTECTED_ICEBERG_NATIVE_PROPERTIES; import static io.trino.plugin.iceberg.IcebergTableProperties.SORTED_BY_PROPERTY; -import static io.trino.plugin.iceberg.IcebergTableProperties.getOrcBloomFilterColumns; -import static io.trino.plugin.iceberg.IcebergTableProperties.getOrcBloomFilterFpp; +import static io.trino.plugin.iceberg.IcebergTableProperties.SUPPORTED_PROPERTIES; import static io.trino.plugin.iceberg.IcebergTableProperties.getPartitioning; import static io.trino.plugin.iceberg.IcebergTableProperties.getSortOrder; -import static io.trino.plugin.iceberg.IcebergTableProperties.getTableLocation; +import static io.trino.plugin.iceberg.IcebergTableProperties.validateCompression; import static io.trino.plugin.iceberg.PartitionFields.parsePartitionFields; import static io.trino.plugin.iceberg.PartitionFields.toPartitionFields; import static io.trino.plugin.iceberg.SortFieldUtils.parseSortFields; @@ -140,6 +180,9 @@ import static io.trino.spi.type.TimestampWithTimeZoneType.TIMESTAMP_TZ_MICROS; import static io.trino.spi.type.Timestamps.PICOSECONDS_PER_MICROSECOND; import static io.trino.spi.type.UuidType.javaUuidToTrinoUuid; +import static io.trino.spi.type.VarbinaryType.VARBINARY; +import static io.trino.spi.type.VarcharType.VARCHAR; +import static java.lang.Boolean.parseBoolean; import static java.lang.Double.parseDouble; import static java.lang.Float.floatToRawIntBits; import static java.lang.Float.parseFloat; @@ -148,29 +191,44 @@ import static java.lang.String.format; import static java.math.RoundingMode.UNNECESSARY; import static java.util.Comparator.comparing; +import static java.util.Locale.ENGLISH; import static java.util.Objects.requireNonNull; -import static org.apache.iceberg.LocationProviders.locationsFor; -import static org.apache.iceberg.MetadataTableUtils.createMetadataTableInstance; +import static org.apache.iceberg.TableProperties.AVRO_COMPRESSION; +import static org.apache.iceberg.TableProperties.COMMIT_NUM_RETRIES; import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT_DEFAULT; import static org.apache.iceberg.TableProperties.FORMAT_VERSION; -import static org.apache.iceberg.TableProperties.OBJECT_STORE_PATH; +import static org.apache.iceberg.TableProperties.OBJECT_STORE_ENABLED; +import static org.apache.iceberg.TableProperties.OBJECT_STORE_ENABLED_DEFAULT; +import static org.apache.iceberg.TableProperties.ORC_BLOOM_FILTER_COLUMNS; +import static 
org.apache.iceberg.TableProperties.ORC_BLOOM_FILTER_FPP; +import static org.apache.iceberg.TableProperties.ORC_COMPRESSION; +import static org.apache.iceberg.TableProperties.PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX; +import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION; import static org.apache.iceberg.TableProperties.WRITE_DATA_LOCATION; import static org.apache.iceberg.TableProperties.WRITE_LOCATION_PROVIDER_IMPL; -import static org.apache.iceberg.TableProperties.WRITE_METADATA_LOCATION; +import static org.apache.iceberg.TableUtil.formatVersion; import static org.apache.iceberg.types.Type.TypeID.BINARY; import static org.apache.iceberg.types.Type.TypeID.FIXED; +import static org.apache.iceberg.util.LocationUtil.stripTrailingSlash; +import static org.apache.iceberg.util.PropertyUtil.propertyAsBoolean; public final class IcebergUtil { public static final String TRINO_TABLE_METADATA_INFO_VALID_FOR = "trino_table_metadata_info_valid_for"; + public static final String TRINO_TABLE_COMMENT_CACHE_PREVENTED = "trino_table_comment_cache_prevented"; public static final String COLUMN_TRINO_NOT_NULL_PROPERTY = "trino_not_null"; public static final String COLUMN_TRINO_TYPE_ID_PROPERTY = "trino_type_id"; public static final String METADATA_FOLDER_NAME = "metadata"; public static final String METADATA_FILE_EXTENSION = ".metadata.json"; + public static final String TRINO_QUERY_ID_NAME = "trino_query_id"; + public static final String TRINO_USER_NAME = "trino_user"; + // For backward compatibility only. DO NOT USE. + private static final String BROKEN_ORC_BLOOM_FILTER_FPP_KEY = "orc.bloom.filter.fpp"; + // For backward compatibility only. DO NOT USE. + private static final String BROKEN_ORC_BLOOM_FILTER_COLUMNS_KEY = "orc.bloom.filter.columns"; private static final Pattern SIMPLE_NAME = Pattern.compile("[a-z][a-z0-9]*"); - static final String TRINO_QUERY_ID_NAME = "trino_query_id"; // Metadata file name examples // - 00001-409702ba-4735-4645-8f14-09537cc0b2c8.metadata.json // - 00001-409702ba-4735-4645-8f14-09537cc0b2c8.gz.metadata.json (https://github.com/apache/iceberg/blob/ab398a0d5ff195f763f8c7a4358ac98fa38a8de7/core/src/main/java/org/apache/iceberg/TableMetadataParser.java#L141) @@ -184,7 +242,7 @@ public final class IcebergUtil private IcebergUtil() {} - public static Table loadIcebergTable(TrinoCatalog catalog, IcebergTableOperationsProvider tableOperationsProvider, ConnectorSession session, SchemaTableName table) + public static BaseTable loadIcebergTable(TrinoCatalog catalog, IcebergTableOperationsProvider tableOperationsProvider, ConnectorSession session, SchemaTableName table) { TableOperations operations = tableOperationsProvider.createTableOperations( catalog, @@ -196,7 +254,7 @@ public static Table loadIcebergTable(TrinoCatalog catalog, IcebergTableOperation return new BaseTable(operations, quotedTableName(table), TRINO_METRICS_REPORTER); } - public static Table getIcebergTableWithMetadata( + public static BaseTable getIcebergTableWithMetadata( TrinoCatalog catalog, IcebergTableOperationsProvider tableOperationsProvider, ConnectorSession session, @@ -214,14 +272,75 @@ public static Table getIcebergTableWithMetadata( return new BaseTable(operations, quotedTableName(table), TRINO_METRICS_REPORTER); } - public static Map getIcebergTableProperties(Table icebergTable) + public static List getProjectedColumns(Schema schema, TypeManager typeManager) + { + Map indexById = TypeUtil.indexById(schema.asStruct()); + return getProjectedColumns(schema, typeManager, indexById, 
indexById.keySet() /* project all columns */); + } + + public static List getProjectedColumns(Schema schema, TypeManager typeManager, Set fieldIds) + { + Map indexById = TypeUtil.indexById(schema.asStruct()); + return getProjectedColumns(schema, typeManager, indexById, fieldIds /* project selected columns */); + } + + private static List getProjectedColumns(Schema schema, TypeManager typeManager, Map indexById, Set fieldIds) + { + ImmutableList.Builder columns = builderWithExpectedSize(fieldIds.size()); + Map indexParents = TypeUtil.indexParents(schema.asStruct()); + Map> indexPaths = indexById.entrySet().stream() + .collect(toImmutableMap(Entry::getKey, entry -> ImmutableList.copyOf(buildPath(indexParents, entry.getKey())))); + + for (int fieldId : fieldIds) { + columns.add(createColumnHandle(typeManager, fieldId, indexById, indexPaths)); + } + return columns.build(); + } + + public static IcebergColumnHandle createColumnHandle(TypeManager typeManager, int fieldId, Map indexById, Map> indexPaths) + { + NestedField childField = indexById.get(fieldId); + NestedField baseField = childField; + + List path = requireNonNull(indexPaths.get(fieldId)); + if (!path.isEmpty()) { + baseField = indexById.get(path.get(0)); + path = ImmutableList.builder() + .addAll(path.subList(1, path.size())) // Base column id shouldn't exist in IcebergColumnHandle.path + .add(fieldId) // Append the leaf field id + .build(); + } + return createColumnHandle(baseField, childField, typeManager, path); + } + + public static List buildPath(Map indexParents, int fieldId) + { + List path = new ArrayList<>(); + while (indexParents.containsKey(fieldId)) { + int parentId = indexParents.get(fieldId); + path.add(parentId); + fieldId = parentId; + } + List reversedPath = new ArrayList<>(path); + java.util.Collections.reverse(reversedPath); + return ImmutableList.copyOf(reversedPath); + } + + public static Map getIcebergTableProperties(BaseTable icebergTable) { ImmutableMap.Builder properties = ImmutableMap.builder(); - properties.put(FILE_FORMAT_PROPERTY, getFileFormat(icebergTable)); + IcebergFileFormat fileFormat = getFileFormat(icebergTable); + properties.put(FILE_FORMAT_PROPERTY, fileFormat); if (!icebergTable.spec().fields().isEmpty()) { properties.put(PARTITIONING_PROPERTY, toPartitionFields(icebergTable.spec())); } + Optional compressionCodec = getHiveCompressionCodec(fileFormat, icebergTable.properties()); + + validateCompression(fileFormat, compressionCodec); + + compressionCodec.ifPresent(hiveCompressionCodec -> properties.put(COMPRESSION_CODEC, HiveCompressionOption.valueOf(hiveCompressionCodec.name()))); + SortOrder sortOrder = icebergTable.sortOrder(); // TODO: Support sort column transforms (https://github.com/trinodb/trino/issues/15088) if (sortOrder.isSorted() && sortOrder.fields().stream().allMatch(sortField -> sortField.transform().isIdentity())) { @@ -233,23 +352,71 @@ public static Map getIcebergTableProperties(Table icebergTable) properties.put(LOCATION_PROPERTY, icebergTable.location()); } - int formatVersion = ((BaseTable) icebergTable).operations().current().formatVersion(); + int formatVersion = formatVersion(icebergTable); properties.put(FORMAT_VERSION_PROPERTY, formatVersion); + if (icebergTable.properties().containsKey(COMMIT_NUM_RETRIES)) { + int commitNumRetries = parseInt(icebergTable.properties().get(COMMIT_NUM_RETRIES)); + properties.put(MAX_COMMIT_RETRY, commitNumRetries); + } + + // iceberg ORC format bloom filter properties + Optional orcBloomFilterColumns = 
getOrcBloomFilterColumns(icebergTable.properties()); + if (orcBloomFilterColumns.isPresent()) { + properties.put(ORC_BLOOM_FILTER_COLUMNS_PROPERTY, Splitter.on(',').trimResults().omitEmptyStrings().splitToList(orcBloomFilterColumns.get())); + } // iceberg ORC format bloom filter properties - String orcBloomFilterColumns = icebergTable.properties().get(ORC_BLOOM_FILTER_COLUMNS_KEY); - if (orcBloomFilterColumns != null) { - properties.put(ORC_BLOOM_FILTER_COLUMNS, Splitter.on(',').trimResults().omitEmptyStrings().splitToList(orcBloomFilterColumns)); + Optional orcBloomFilterFpp = getOrcBloomFilterFpp(icebergTable.properties()); + if (orcBloomFilterFpp.isPresent()) { + properties.put(ORC_BLOOM_FILTER_FPP_PROPERTY, Double.parseDouble(orcBloomFilterFpp.get())); } - String orcBloomFilterFpp = icebergTable.properties().get(ORC_BLOOM_FILTER_FPP_KEY); - if (orcBloomFilterFpp != null) { - properties.put(ORC_BLOOM_FILTER_FPP, Double.parseDouble(orcBloomFilterFpp)); + + // iceberg Parquet format bloom filter properties + Set parquetBloomFilterColumns = getParquetBloomFilterColumns(icebergTable.properties()); + if (!parquetBloomFilterColumns.isEmpty()) { + properties.put(PARQUET_BLOOM_FILTER_COLUMNS_PROPERTY, ImmutableList.copyOf(parquetBloomFilterColumns)); } + if (parseBoolean(icebergTable.properties().getOrDefault(OBJECT_STORE_ENABLED, "false"))) { + properties.put(OBJECT_STORE_LAYOUT_ENABLED_PROPERTY, true); + } + + Optional dataLocation = Optional.ofNullable(icebergTable.properties().get(WRITE_DATA_LOCATION)); + dataLocation.ifPresent(location -> properties.put(DATA_LOCATION_PROPERTY, location)); + return properties.buildOrThrow(); } - public static List getColumns(Schema schema, TypeManager typeManager) + // Version 382-438 set incorrect table properties: https://github.com/trinodb/trino/commit/b89aac68c43e5392f23b8d6ba053bbeb6df85028#diff-2af3e19a6b656640a7d0bb73114ef224953a2efa04e569b1fe4da953b2cc6d15R418-R419 + // `orc.bloom.filter.columns` was set instead of `write.orc.bloom.filter.columns`, and `orc.bloom.filter.fpp` instead of `write.orc.bloom.filter.fpp` + // These methods maintain backward compatibility for existing table. 
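
The fallback described in the comment above can be seen in isolation in the following standalone sketch: prefer the canonical `write.orc.bloom.filter.*` key and only fall back to the legacy key written by Trino 382-438. The key strings mirror the constants in the diff; the class name and the map contents are illustrative, not real table properties.

```java
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.Stream;

public class LegacyOrcBloomFilterLookup
{
    // Canonical Iceberg property key (ORC_BLOOM_FILTER_COLUMNS in the diff).
    private static final String CANONICAL_KEY = "write.orc.bloom.filter.columns";
    // Legacy key written by Trino 382-438 (BROKEN_ORC_BLOOM_FILTER_COLUMNS_KEY in the diff).
    private static final String LEGACY_KEY = "orc.bloom.filter.columns";

    public static Optional<String> orcBloomFilterColumns(Map<String, String> tableProperties)
    {
        // Check the canonical key first; fall back to the legacy key only when the canonical one is absent.
        return Stream.of(tableProperties.get(CANONICAL_KEY), tableProperties.get(LEGACY_KEY))
                .filter(Objects::nonNull)
                .findFirst();
    }

    public static void main(String[] args)
    {
        // A table written by an affected version carries only the legacy key.
        System.out.println(orcBloomFilterColumns(Map.of(LEGACY_KEY, "a,b,c"))); // Optional[a,b,c]
        // A table carrying both keys resolves to the canonical value.
        System.out.println(orcBloomFilterColumns(Map.of(CANONICAL_KEY, "x", LEGACY_KEY, "y"))); // Optional[x]
    }
}
```
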
+ public static Optional getOrcBloomFilterColumns(Map properties) + { + return Stream.of( + properties.get(ORC_BLOOM_FILTER_COLUMNS), + properties.get(BROKEN_ORC_BLOOM_FILTER_COLUMNS_KEY)) + .filter(Objects::nonNull) + .findFirst(); + } + + public static Set getParquetBloomFilterColumns(Map properties) + { + return properties.entrySet().stream() + .filter(entry -> entry.getKey().startsWith(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX) && "true".equals(entry.getValue())) + .map(entry -> entry.getKey().substring(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX.length())) + .collect(toImmutableSet()); + } + + public static Optional getOrcBloomFilterFpp(Map properties) + { + return Stream.of( + properties.get(ORC_BLOOM_FILTER_FPP), + properties.get(BROKEN_ORC_BLOOM_FILTER_FPP_KEY)) + .filter(Objects::nonNull) + .findFirst(); + } + + public static List getTopLevelColumns(Schema schema, TypeManager typeManager) { return schema.columns().stream() .map(column -> getColumnHandle(column, typeManager)) @@ -259,7 +426,7 @@ public static List getColumns(Schema schema, TypeManager ty public static List getColumnMetadatas(Schema schema, TypeManager typeManager) { List icebergColumns = schema.columns(); - ImmutableList.Builder columns = ImmutableList.builderWithExpectedSize(icebergColumns.size() + 2); + ImmutableList.Builder columns = builderWithExpectedSize(icebergColumns.size() + 2); icebergColumns.stream() .map(column -> @@ -270,28 +437,50 @@ public static List getColumnMetadatas(Schema schema, TypeManager .setComment(Optional.ofNullable(column.doc())) .build()) .forEach(columns::add); + columns.add(partitionColumnMetadata()); columns.add(pathColumnMetadata()); columns.add(fileModifiedTimeColumnMetadata()); return columns.build(); } + public static Schema updateColumnComment(Schema schema, String columnName, String comment) + { + NestedField fieldToUpdate = schema.findField(columnName); + checkArgument(fieldToUpdate != null, "Field %s does not exist", columnName); + NestedField updatedField = NestedField.from(fieldToUpdate).withDoc(comment).build(); + List newFields = schema.columns().stream() + .map(field -> (field.fieldId() == updatedField.fieldId()) ? 
updatedField : field) + .toList(); + + return new Schema(newFields, schema.getAliases(), schema.identifierFieldIds()); + } + public static IcebergColumnHandle getColumnHandle(NestedField column, TypeManager typeManager) { - Type type = toTrinoType(column.type(), typeManager); - return new IcebergColumnHandle( - createColumnIdentity(column), - type, - ImmutableList.of(), - type, - Optional.ofNullable(column.doc())); + return createColumnHandle(column, column, typeManager, ImmutableList.of()); + } + + private static IcebergColumnHandle createColumnHandle(NestedField baseColumn, NestedField childColumn, TypeManager typeManager, List path) + { + return IcebergColumnHandle.builder(createColumnIdentity(baseColumn)) + .fieldType(toTrinoType(baseColumn.type(), typeManager), toTrinoType(childColumn.type(), typeManager)) + .path(path) + .nullable(childColumn.isOptional()) + .comment(childColumn.doc()) + .build(); } public static Schema schemaFromHandles(List columns) + { + return structTypeFromHandles(columns).asSchema(); + } + + public static StructType structTypeFromHandles(List columns) { List icebergColumns = columns.stream() .map(column -> NestedField.optional(column.getId(), column.getName(), toIcebergType(column.getType(), column.getColumnIdentity()))) .collect(toImmutableList()); - return new Schema(StructType.of(icebergColumns).asStructType().fields()); + return StructType.of(icebergColumns); } public static Map getIdentityPartitions(PartitionSpec partitionSpec) @@ -307,6 +496,33 @@ public static Map getIdentityPartitions(PartitionSpec p return columns.buildOrThrow(); } + public static List primitiveFields(Schema schema) + { + return primitiveFields(schema.columns()) + .collect(toImmutableList()); + } + + private static Stream primitiveFields(List nestedFields) + { + return nestedFields.stream() + .flatMap(IcebergUtil::primitiveFields); + } + + private static Stream primitiveFields(NestedField nestedField) + { + org.apache.iceberg.types.Type type = nestedField.type(); + if (type.isPrimitiveType()) { + return Stream.of(nestedField); + } + + if (type.isNestedType()) { + return primitiveFields(type.asNestedType().fields()) + .map(field -> NestedField.from(field).withName(nestedField.name() + "." 
+ field.name()).build()); + } + + throw new IllegalStateException("Unsupported field type: " + nestedField); + } + public static Map primitiveFieldTypes(Schema schema) { return primitiveFieldTypes(schema.columns()) @@ -342,7 +558,7 @@ public static IcebergFileFormat getFileFormat(Map storagePropert { return IcebergFileFormat.fromIceberg(FileFormat.valueOf(storageProperties .getOrDefault(DEFAULT_FILE_FORMAT, DEFAULT_FILE_FORMAT_DEFAULT) - .toUpperCase(Locale.ENGLISH))); + .toUpperCase(ENGLISH))); } public static Optional getTableComment(Table table) @@ -370,9 +586,15 @@ public static boolean canEnforceColumnConstraintInSpecs( IcebergColumnHandle columnHandle, Domain domain) { - return table.specs().values().stream() + List partitionSpecs = table.specs().values().stream() .filter(partitionSpec -> partitionSpecIds.contains(partitionSpec.specId())) - .allMatch(spec -> canEnforceConstraintWithinPartitioningSpec(typeOperators, spec, columnHandle, domain)); + .collect(toImmutableList()); + + if (partitionSpecs.isEmpty()) { + return canEnforceConstraintWithinPartitioningSpec(typeOperators, table.spec(), columnHandle, domain); + } + + return partitionSpecs.stream().allMatch(spec -> canEnforceConstraintWithinPartitioningSpec(typeOperators, spec, columnHandle, domain)); } private static boolean canEnforceConstraintWithinPartitioningSpec(TypeOperators typeOperators, PartitionSpec spec, IcebergColumnHandle column, Domain domain) @@ -407,10 +629,10 @@ private static boolean canEnforceConstraintWithPartitionField(TypeOperators type } ValueSet valueSet = domain.getValues(); - boolean canEnforce = valueSet.getValuesProcessor().transform( + return valueSet.getValuesProcessor().transform( ranges -> { MethodHandle targetTypeEqualOperator = typeOperators.getEqualOperator( - transform.getType(), InvocationConvention.simpleConvention(FAIL_ON_NULL, NEVER_NULL, NEVER_NULL)); + transform.type(), InvocationConvention.simpleConvention(FAIL_ON_NULL, NEVER_NULL, NEVER_NULL)); for (Range range : ranges.getOrderedRanges()) { if (!canEnforceRangeWithPartitioningField(field, transform, range, targetTypeEqualOperator)) { return false; @@ -420,12 +642,11 @@ private static boolean canEnforceConstraintWithPartitionField(TypeOperators type }, discreteValues -> false, allOrNone -> true); - return canEnforce; } private static boolean canEnforceRangeWithPartitioningField(PartitionField field, ColumnTransform transform, Range range, MethodHandle targetTypeEqualOperator) { - if (!transform.isMonotonic()) { + if (!transform.monotonic()) { // E.g. bucketing transform return false; } @@ -460,8 +681,8 @@ private static boolean yieldSamePartitioningValue( { requireNonNull(first, "first is null"); requireNonNull(second, "second is null"); - Object firstTransformed = transform.getValueTransform().apply(nativeValueToBlock(sourceType, first), 0); - Object secondTransformed = transform.getValueTransform().apply(nativeValueToBlock(sourceType, second), 0); + Object firstTransformed = transform.valueTransform().apply(nativeValueToBlock(sourceType, first), 0); + Object secondTransformed = transform.valueTransform().apply(nativeValueToBlock(sourceType, second), 0); // The pushdown logic assumes NULLs and non-NULLs are segregated, so that we have to think about non-null values only. 
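
As a side note on the range check in this area: a range predicate can only be enforced through a monotonic partition transform when both range bounds map to the same partition value. The toy sketch below (an integer truncate transform standing in for Iceberg's ColumnTransform; all names hypothetical) illustrates the equality test that yieldSamePartitioningValue performs with the type's equal operator.

```java
import java.util.function.LongUnaryOperator;

public class MonotonicRangeEnforcement
{
    // True when both bounds of a closed range land in the same partition value,
    // which is the condition under which the predicate can be enforced by partition pruning.
    static boolean sameTransformedValue(LongUnaryOperator transform, long low, long high)
    {
        return transform.applyAsLong(low) == transform.applyAsLong(high);
    }

    public static void main(String[] args)
    {
        // Toy monotonic transform: truncate to buckets of width 10 (unlike a bucketing hash, which is non-monotonic).
        LongUnaryOperator truncate10 = value -> Math.floorDiv(value, 10) * 10;
        System.out.println(sameTransformedValue(truncate10, 12, 17)); // true  -> range stays within one partition
        System.out.println(sameTransformedValue(truncate10, 12, 27)); // false -> range spans partitions
    }
}
```
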
verify(firstTransformed != null && secondTransformed != null, "Transform for %s returned null for non-null input", field); try { @@ -472,6 +693,7 @@ private static boolean yieldSamePartitioningValue( } } + @Nullable public static Object deserializePartitionValue(Type type, String valueString, String name) { if (valueString == null) { @@ -585,13 +807,34 @@ public static Map> getPartitionKeys(StructLike partiti return partitionKeys.buildOrThrow(); } + public static Map getPartitionValues( + Set identityPartitionColumns, + Map> partitionKeys) + { + ImmutableMap.Builder bindings = ImmutableMap.builder(); + for (IcebergColumnHandle partitionColumn : identityPartitionColumns) { + Object partitionValue = deserializePartitionValue( + partitionColumn.getType(), + partitionKeys.get(partitionColumn.getId()).orElse(null), + partitionColumn.getName()); + NullableValue bindingValue = new NullableValue(partitionColumn.getType(), partitionValue); + bindings.put(partitionColumn, bindingValue); + } + return bindings.buildOrThrow(); + } + public static LocationProvider getLocationProvider(SchemaTableName schemaTableName, String tableLocation, Map storageProperties) { if (storageProperties.containsKey(WRITE_LOCATION_PROVIDER_IMPL)) { throw new TrinoException(NOT_SUPPORTED, "Table " + schemaTableName + " specifies " + storageProperties.get(WRITE_LOCATION_PROVIDER_IMPL) + " as a location provider. Writing to Iceberg tables with custom location provider is not supported."); } - return locationsFor(tableLocation, storageProperties); + + if (propertyAsBoolean(storageProperties, OBJECT_STORE_ENABLED, OBJECT_STORE_ENABLED_DEFAULT)) { + return new ObjectStoreLocationProvider(tableLocation, storageProperties); + } + + return new DefaultLocationProvider(tableLocation, storageProperties); } public static Schema schemaFromMetadata(List columns) @@ -603,7 +846,13 @@ public static Schema schemaFromMetadata(List columns) if (!column.isHidden()) { int index = icebergColumns.size() + 1; org.apache.iceberg.types.Type type = toIcebergTypeForNewColumn(column.getType(), nextFieldId); - NestedField field = NestedField.of(index, column.isNullable(), column.getName(), type, column.getComment()); + NestedField field = NestedField.builder() + .withId(index) + .isOptional(column.isNullable()) + .withName(column.getName()) + .ofType(type) + .withDoc(column.getComment()) + .build(); icebergColumns.add(field); } } @@ -611,34 +860,171 @@ public static Schema schemaFromMetadata(List columns) return new Schema(icebergSchema.asStructType().fields()); } - public static Transaction newCreateTableTransaction(TrinoCatalog catalog, ConnectorTableMetadata tableMetadata, ConnectorSession session) + public static Schema schemaFromViewColumns(TypeManager typeManager, List columns) + { + List icebergColumns = new ArrayList<>(); + AtomicInteger nextFieldId = new AtomicInteger(1); + for (ViewColumn column : columns) { + Type trinoType = typeManager.getType(column.getType()); + org.apache.iceberg.types.Type type = toIcebergTypeForNewColumn(trinoType, nextFieldId); + NestedField field = NestedField.required(nextFieldId.getAndIncrement(), column.getName(), type, column.getComment().orElse(null)); + icebergColumns.add(field); + } + org.apache.iceberg.types.Type icebergSchema = StructType.of(icebergColumns); + return new Schema(icebergSchema.asStructType().fields()); + } + + public static List viewColumnsFromSchema(TypeManager typeManager, Schema schema) + { + return IcebergUtil.getTopLevelColumns(schema, typeManager).stream() + .map(column -> new 
ViewColumn(column.getName(), column.getType().getTypeId(), column.getComment())) + .toList(); + } + + public static Transaction newCreateTableTransaction(TrinoCatalog catalog, ConnectorTableMetadata tableMetadata, ConnectorSession session, boolean replace, String tableLocation, Predicate allowedExtraProperties) { SchemaTableName schemaTableName = tableMetadata.getTable(); Schema schema = schemaFromMetadata(tableMetadata.getColumns()); PartitionSpec partitionSpec = parsePartitionFields(schema, getPartitioning(tableMetadata.getProperties())); SortOrder sortOrder = parseSortFields(schema, getSortOrder(tableMetadata.getProperties())); - String targetPath = getTableLocation(tableMetadata.getProperties()) - .orElseGet(() -> catalog.defaultTableLocation(session, schemaTableName)); + Transaction transaction; + + if (replace) { + transaction = catalog.newCreateOrReplaceTableTransaction(session, schemaTableName, schema, partitionSpec, sortOrder, tableLocation, createTableProperties(tableMetadata, allowedExtraProperties)); + } + else { + transaction = catalog.newCreateTableTransaction(session, schemaTableName, schema, partitionSpec, sortOrder, Optional.ofNullable(tableLocation), createTableProperties(tableMetadata, allowedExtraProperties)); + } + + // If user doesn't set compression-codec for parquet, we need to remove write.parquet.compression-codec property, + // Otherwise Iceberg will set write.parquet.compression-codec to zstd by default. + String parquetCompressionValue = transaction.table().properties().get(PARQUET_COMPRESSION); + if (parquetCompressionValue != null && parquetCompressionValue.isEmpty()) { + transaction.updateProperties() + .remove(PARQUET_COMPRESSION) + .commit(); + } + + return transaction; + } + + public static Map createTableProperties(ConnectorTableMetadata tableMetadata, Predicate allowedExtraProperties) + { ImmutableMap.Builder propertiesBuilder = ImmutableMap.builder(); IcebergFileFormat fileFormat = IcebergTableProperties.getFileFormat(tableMetadata.getProperties()); propertiesBuilder.put(DEFAULT_FILE_FORMAT, fileFormat.toIceberg().toString()); propertiesBuilder.put(FORMAT_VERSION, Integer.toString(IcebergTableProperties.getFormatVersion(tableMetadata.getProperties()))); + IcebergTableProperties.getMaxCommitRetry(tableMetadata.getProperties()) + .ifPresent(value -> propertiesBuilder.put(COMMIT_NUM_RETRIES, Integer.toString(value))); + + Optional compressionCodec = IcebergTableProperties.getCompressionCodec(tableMetadata.getProperties()); + + validateCompression(fileFormat, compressionCodec); + + Map tableCompressionProperties = calculateTableCompressionProperties(fileFormat, fileFormat, ImmutableMap.of(), tableMetadata.getProperties()); + + tableCompressionProperties.forEach(propertiesBuilder::put); + + // Iceberg will set write.parquet.compression-codec to zstd by default if this property is not set: https://github.com/trinodb/trino/issues/20401, + // but we don't want to set this property if this is not explicitly set by customer via set table properties. 
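
The comment above describes a small workaround; a minimal standalone sketch of it follows, assuming the behavior the comment states (Iceberg defaulting write.parquet.compression-codec to zstd when the key is absent). An empty-string sentinel is staged at create time only when the user did not pick a Parquet codec, and newCreateTableTransaction strips the empty value again once the transaction exists. Class and method names here are hypothetical.

```java
import java.util.HashMap;
import java.util.Map;

public class ParquetCompressionSentinel
{
    // Mirrors org.apache.iceberg.TableProperties.PARQUET_COMPRESSION as used in the diff.
    private static final String PARQUET_COMPRESSION = "write.parquet.compression-codec";

    // Stage CREATE TABLE properties: add the sentinel unless the user explicitly chose a Parquet codec.
    static Map<String, String> stageCreateProperties(boolean isParquet, boolean userSetCompressionCodec)
    {
        Map<String, String> properties = new HashMap<>();
        if (!(isParquet && userSetCompressionCodec)) {
            // Sentinel meaning "no codec chosen by the user"; it keeps Iceberg from writing its zstd default.
            properties.put(PARQUET_COMPRESSION, "");
        }
        return properties;
    }

    // Once the create-table transaction exists, drop the empty value so the table ends up
    // with no explicit Parquet compression property at all.
    static void dropSentinel(Map<String, String> tableProperties)
    {
        if ("".equals(tableProperties.get(PARQUET_COMPRESSION))) {
            tableProperties.remove(PARQUET_COMPRESSION);
        }
    }

    public static void main(String[] args)
    {
        Map<String, String> staged = stageCreateProperties(true, false);
        dropSentinel(staged);
        System.out.println(staged); // {} -> no codec is forced onto the new table
    }
}
```
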
+ if (!(fileFormat == PARQUET && compressionCodec.isPresent())) { + propertiesBuilder.put(PARQUET_COMPRESSION, ""); + } + + boolean objectStoreLayoutEnabled = IcebergTableProperties.getObjectStoreLayoutEnabled(tableMetadata.getProperties()); + if (objectStoreLayoutEnabled) { + propertiesBuilder.put(OBJECT_STORE_ENABLED, "true"); + } + Optional dataLocation = IcebergTableProperties.getDataLocation(tableMetadata.getProperties()); + dataLocation.ifPresent(location -> { + if (!objectStoreLayoutEnabled) { + throw new TrinoException(INVALID_TABLE_PROPERTY, "Data location can only be set when object store layout is enabled"); + } + propertiesBuilder.put(WRITE_DATA_LOCATION, location); + }); // iceberg ORC format bloom filter properties used by create table - List columns = getOrcBloomFilterColumns(tableMetadata.getProperties()); - if (!columns.isEmpty()) { - checkFormatForProperty(fileFormat.toIceberg(), FileFormat.ORC, ORC_BLOOM_FILTER_COLUMNS); - validateOrcBloomFilterColumns(tableMetadata, columns); - propertiesBuilder.put(ORC_BLOOM_FILTER_COLUMNS_KEY, Joiner.on(",").join(columns)); - propertiesBuilder.put(ORC_BLOOM_FILTER_FPP_KEY, String.valueOf(getOrcBloomFilterFpp(tableMetadata.getProperties()))); + List orcBloomFilterColumns = IcebergTableProperties.getOrcBloomFilterColumns(tableMetadata.getProperties()); + if (!orcBloomFilterColumns.isEmpty()) { + checkFormatForProperty(fileFormat.toIceberg(), FileFormat.ORC, ORC_BLOOM_FILTER_COLUMNS_PROPERTY); + validateOrcBloomFilterColumns(tableMetadata.getColumns(), orcBloomFilterColumns); + propertiesBuilder.put(ORC_BLOOM_FILTER_COLUMNS, Joiner.on(",").join(orcBloomFilterColumns)); + propertiesBuilder.put(ORC_BLOOM_FILTER_FPP, String.valueOf(IcebergTableProperties.getOrcBloomFilterFpp(tableMetadata.getProperties()))); + } + + // iceberg Parquet format bloom filter properties used by create table + List parquetBloomFilterColumns = IcebergTableProperties.getParquetBloomFilterColumns(tableMetadata.getProperties()); + if (!parquetBloomFilterColumns.isEmpty()) { + checkFormatForProperty(fileFormat.toIceberg(), FileFormat.PARQUET, PARQUET_BLOOM_FILTER_COLUMNS_PROPERTY); + validateParquetBloomFilterColumns(tableMetadata.getColumns(), parquetBloomFilterColumns); + for (String column : parquetBloomFilterColumns) { + propertiesBuilder.put(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + column, "true"); + } } if (tableMetadata.getComment().isPresent()) { propertiesBuilder.put(TABLE_COMMENT, tableMetadata.getComment().get()); } - return catalog.newCreateTableTransaction(session, schemaTableName, schema, partitionSpec, sortOrder, targetPath, propertiesBuilder.buildOrThrow()); + Map baseProperties = propertiesBuilder.buildOrThrow(); + Map extraProperties = IcebergTableProperties.getExtraProperties(tableMetadata.getProperties()).orElseGet(ImmutableMap::of); + + verifyExtraProperties(baseProperties.keySet(), extraProperties, allowedExtraProperties); + + return ImmutableMap.builder() + .putAll(baseProperties) + .putAll(extraProperties) + .buildOrThrow(); + } + + public static void verifyExtraProperties(Set basePropertyKeys, Map extraProperties, Predicate allowedExtraProperties) + { + Set illegalExtraProperties = ImmutableSet.builder() + .addAll(Sets.intersection( + ImmutableSet.builder() + .add(TABLE_COMMENT) + .addAll(basePropertyKeys) + .addAll(SUPPORTED_PROPERTIES) + .addAll(PROTECTED_ICEBERG_NATIVE_PROPERTIES) + .build(), + extraProperties.keySet())) + .addAll(extraProperties.keySet().stream() + .filter(name -> !allowedExtraProperties.test(name)) + 
.collect(toImmutableSet())) + .build(); + + if (!illegalExtraProperties.isEmpty()) { + throw new TrinoException( + INVALID_TABLE_PROPERTY, + format("Illegal keys in extra_properties: %s", illegalExtraProperties)); + } + } + + public static Optional getHiveCompressionCodec(IcebergFileFormat icebergFileFormat, Map storageProperties) + { + String compressionProperty = getCompressionPropertyName(icebergFileFormat); + + return Optional.ofNullable(storageProperties.get(compressionProperty)) + .filter(value -> !value.isEmpty()) + .map(value -> { + try { + return HiveCompressionCodec.valueOf(value.toUpperCase(ENGLISH)); + } + catch (IllegalArgumentException e) { + throw new TrinoException(INVALID_TABLE_PROPERTY, + format("Compression codec %s is unsupported.", value)); + } + }); + } + + public static String getCompressionPropertyName(IcebergFileFormat fileFormat) + { + return switch (fileFormat) { + case AVRO -> AVRO_COMPRESSION; + case ORC -> ORC_COMPRESSION; + case PARQUET -> PARQUET_COMPRESSION; + }; } /** @@ -707,27 +1093,16 @@ public static long getSnapshotIdAsOfTime(Table table, long epochMillis) .snapshotId(); } - public static void validateTableCanBeDropped(Table table) - { - // TODO: support path override in Iceberg table creation: https://github.com/trinodb/trino/issues/8861 - if (table.properties().containsKey(OBJECT_STORE_PATH) || - table.properties().containsKey("write.folder-storage.path") || // Removed from Iceberg as of 0.14.0, but preserved for backward compatibility - table.properties().containsKey(WRITE_METADATA_LOCATION) || - table.properties().containsKey(WRITE_DATA_LOCATION)) { - throw new TrinoException(NOT_SUPPORTED, "Table contains Iceberg path override properties and cannot be dropped from Trino: " + table.name()); - } - } - - private static void checkFormatForProperty(FileFormat actualStorageFormat, FileFormat expectedStorageFormat, String propertyName) + public static void checkFormatForProperty(FileFormat actualStorageFormat, FileFormat expectedStorageFormat, String propertyName) { if (actualStorageFormat != expectedStorageFormat) { throw new TrinoException(INVALID_TABLE_PROPERTY, format("Cannot specify %s table property for storage format: %s", propertyName, actualStorageFormat)); } } - private static void validateOrcBloomFilterColumns(ConnectorTableMetadata tableMetadata, List orcBloomFilterColumns) + public static void validateOrcBloomFilterColumns(List columns, List orcBloomFilterColumns) { - Set allColumns = tableMetadata.getColumns().stream() + Set allColumns = columns.stream() .map(ColumnMetadata::getName) .collect(toImmutableSet()); if (!allColumns.containsAll(orcBloomFilterColumns)) { @@ -735,6 +1110,23 @@ private static void validateOrcBloomFilterColumns(ConnectorTableMetadata tableMe } } + public static final List SUPPORTED_BLOOM_FILTER_TYPES = ImmutableList.of(BIGINT, DOUBLE, INTEGER, REAL, UuidType.UUID, VARBINARY, VARCHAR); + + public static void validateParquetBloomFilterColumns(List columns, List parquetBloomFilterColumns) + { + Map columnTypes = columns.stream() + .collect(toImmutableMap(ColumnMetadata::getName, ColumnMetadata::getType)); + for (String column : parquetBloomFilterColumns) { + Type type = columnTypes.get(column); + if (type == null) { + throw new TrinoException(INVALID_TABLE_PROPERTY, format("Parquet Bloom filter column %s not present in schema", column)); + } + if (SUPPORTED_BLOOM_FILTER_TYPES.contains(type)) { + throw new TrinoException(INVALID_TABLE_PROPERTY, format("Parquet Bloom filter column %s has unsupported type %s", column, 
type.getDisplayName())); + } + } + } + public static int parseVersion(String metadataFileName) throws TrinoException { @@ -747,7 +1139,7 @@ public static int parseVersion(String metadataFileName) if (matcher.matches()) { return parseInt(matcher.group("version")); } - throw new TrinoException(ICEBERG_BAD_DATA, "Invalid metadata file name: " + metadataFileName); + throw new TrinoException(ICEBERG_INVALID_METADATA, "Invalid metadata file name: " + metadataFileName); } public static String fixBrokenMetadataLocation(String location) @@ -774,18 +1166,295 @@ public static String fileName(String path) public static void commit(SnapshotUpdate update, ConnectorSession session) { update.set(TRINO_QUERY_ID_NAME, session.getQueryId()); + update.set(TRINO_USER_NAME, session.getUser()); update.commit(); } - public static TableScan buildTableScan(Table icebergTable, MetadataTableType metadataTableType) + public static String getLatestMetadataLocation(TrinoFileSystem fileSystem, String location) { - return createMetadataTableInstance(icebergTable, metadataTableType).newScan(); + List latestMetadataLocations = new ArrayList<>(); + String metadataDirectoryLocation = format("%s/%s", stripTrailingSlash(location), METADATA_FOLDER_NAME); + try { + int latestMetadataVersion = -1; + FileIterator fileIterator = fileSystem.listFiles(Location.of(metadataDirectoryLocation)); + while (fileIterator.hasNext()) { + FileEntry fileEntry = fileIterator.next(); + Location fileLocation = fileEntry.location(); + String fileName = fileLocation.fileName(); + if (fileName.endsWith(METADATA_FILE_EXTENSION)) { + int versionNumber = parseVersion(fileName); + if (versionNumber > latestMetadataVersion) { + latestMetadataVersion = versionNumber; + latestMetadataLocations.clear(); + latestMetadataLocations.add(fileLocation); + } + else if (versionNumber == latestMetadataVersion) { + latestMetadataLocations.add(fileLocation); + } + } + } + if (latestMetadataLocations.isEmpty()) { + throw new TrinoException(ICEBERG_INVALID_METADATA, "No versioned metadata file exists at location: " + metadataDirectoryLocation); + } + if (latestMetadataLocations.size() > 1) { + throw new TrinoException(ICEBERG_INVALID_METADATA, format( + "More than one latest metadata file found at location: %s, latest metadata files are %s", + metadataDirectoryLocation, + latestMetadataLocations)); + } + } + catch (IOException | UncheckedIOException e) { + throw new TrinoException(ICEBERG_FILESYSTEM_ERROR, "Failed checking table location: " + location, e); + } + return getOnlyElement(latestMetadataLocations).toString(); } - public static Map columnNameToPositionInSchema(Schema schema) + public static Domain getPartitionDomain(TupleDomain effectivePredicate) { - return mapWithIndex(schema.columns().stream(), - (column, position) -> immutableEntry(column.name(), Long.valueOf(position).intValue())) - .collect(toImmutableMap(Entry::getKey, Entry::getValue)); + IcebergColumnHandle partitionColumn = partitionColumnHandle(); + Domain domain = effectivePredicate.getDomains().orElseThrow(() -> new IllegalArgumentException("Unexpected NONE tuple domain")) + .get(partitionColumn); + if (domain == null) { + return Domain.all(partitionColumn.getType()); + } + return domain; + } + + public static Domain getPathDomain(TupleDomain effectivePredicate) + { + IcebergColumnHandle pathColumn = pathColumnHandle(); + Domain domain = effectivePredicate.getDomains().orElseThrow(() -> new IllegalArgumentException("Unexpected NONE tuple domain")) + .get(pathColumn); + if (domain == null) { + return 
Domain.all(pathColumn.getType()); + } + return domain; + } + + public static Domain getFileModifiedTimeDomain(TupleDomain effectivePredicate) + { + IcebergColumnHandle fileModifiedTimeColumn = fileModifiedTimeColumnHandle(); + Domain domain = effectivePredicate.getDomains().orElseThrow(() -> new IllegalArgumentException("Unexpected NONE tuple domain")) + .get(fileModifiedTimeColumn); + if (domain == null) { + return Domain.all(fileModifiedTimeColumn.getType()); + } + return domain; + } + + public static long getModificationTime(String path, TrinoFileSystem fileSystem) + { + try { + TrinoInputFile inputFile = fileSystem.newInputFile(Location.of(path)); + return inputFile.lastModified().toEpochMilli(); + } + catch (IOException | UncheckedIOException e) { + throw new TrinoException(ICEBERG_FILESYSTEM_ERROR, "Failed to get file modification time: " + path, e); + } + } + + public static ManifestReader> readerForManifest(ManifestFile manifest, Table table) + { + return readerForManifest(manifest, table.io(), table.specs()); + } + + public static ManifestReader> readerForManifest(ManifestFile manifest, FileIO fileIO, Map specsById) + { + return switch (manifest.content()) { + case DATA -> ManifestFiles.read(manifest, fileIO); + case DELETES -> ManifestFiles.readDeleteManifest(manifest, fileIO, specsById); + }; + } + + /** + * Returns true if table represents a Hive view, Trino/Presto view, materialized view or anything + * else that gets registered using table type "VIRTUAL_VIEW". + * Note: this method returns false for a table that represents Hive's own materialized view + * ("MATERIALIZED_VIEW" table type). Hive own's materialized views are currently treated as ordinary + * tables by Trino. + */ + public static boolean isSomeKindOfAView(io.trino.plugin.hive.metastore.Table table) + { + return table.getTableType().equals("VIRTUAL_VIEW"); + } + + public static boolean isHiveView(io.trino.plugin.hive.metastore.Table table) + { + return table.getTableType().equals("VIRTUAL_VIEW") && + !table.getParameters().containsKey(PRESTO_VIEW_FLAG); + } + + /** + * Returns true when the table represents a "Trino view" (AKA "presto view"). + * Returns false for Hive views or Trino materialized views. + */ + public static boolean isTrinoView(io.trino.plugin.hive.metastore.Table table) + { + return isTrinoView(table.getTableType(), table.getParameters()); + } + + /** + * Returns true when the table represents a "Trino view" (AKA "presto view"). + * Returns false for Hive views or Trino materialized views. + */ + public static boolean isTrinoView(String tableType, Map tableParameters) + { + // A Trino view can be recognized by table type "VIRTUAL_VIEW" and table parameters presto_view="true" and comment="Presto View" since their first implementation see + // https://github.com/trinodb/trino/blame/38bd0dff736024f3ae01dbbe7d1db5bd1d50c43e/presto-hive/src/main/java/com/facebook/presto/hive/HiveMetadata.java#L902. 
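
The recognition rules spelled out in the surrounding comments boil down to three table parameters. The standalone sketch below inlines values that mirror PRESTO_VIEW_FLAG, TABLE_COMMENT, and the view comments referenced in the diff; the parameter maps are made-up examples showing how a Trino view is separated from a Trino materialized view or a Hive view.

```java
import java.util.Map;

public class ViewKindCheck
{
    // Values mirror the constants referenced in the diff (PRESTO_VIEW_FLAG, TABLE_COMMENT).
    private static final String PRESTO_VIEW_FLAG = "presto_view";
    private static final String TABLE_COMMENT = "comment";

    static boolean isTrinoView(String tableType, Map<String, String> parameters)
    {
        return tableType.equals("VIRTUAL_VIEW")
                && "true".equals(parameters.get(PRESTO_VIEW_FLAG))
                && "Presto View".equalsIgnoreCase(parameters.get(TABLE_COMMENT));
    }

    static boolean isTrinoMaterializedView(String tableType, Map<String, String> parameters)
    {
        return tableType.equals("VIRTUAL_VIEW")
                && "true".equals(parameters.get(PRESTO_VIEW_FLAG))
                && "Presto Materialized View".equalsIgnoreCase(parameters.get(TABLE_COMMENT));
    }

    public static void main(String[] args)
    {
        Map<String, String> trinoView = Map.of(PRESTO_VIEW_FLAG, "true", TABLE_COMMENT, "Presto View");
        Map<String, String> hiveView = Map.of(); // no presto_view flag at all
        System.out.println(isTrinoView("VIRTUAL_VIEW", trinoView));             // true
        System.out.println(isTrinoMaterializedView("VIRTUAL_VIEW", trinoView)); // false
        System.out.println(isTrinoView("VIRTUAL_VIEW", hiveView));              // false -> treated as a Hive view
    }
}
```
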
+ return tableType.equals("VIRTUAL_VIEW") && + "true".equals(tableParameters.get(PRESTO_VIEW_FLAG)) && + PRESTO_VIEW_COMMENT.equalsIgnoreCase(tableParameters.get(TABLE_COMMENT)); + } + + public static boolean isTrinoMaterializedView(String tableType, Map tableParameters) + { + // A Trino materialized view can be recognized by table type "VIRTUAL_VIEW" and table parameters presto_view="true" and comment="Presto Materialized View" + // since their first implementation see + // https://github.com/trinodb/trino/blame/ff4a1e31fb9cb49f1b960abfc16ad469e7126a64/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergMetadata.java#L898 + return tableType.equals("VIRTUAL_VIEW") && + "true".equals(tableParameters.get(PRESTO_VIEW_FLAG)) && + TableInfo.ICEBERG_MATERIALIZED_VIEW_COMMENT.equalsIgnoreCase(tableParameters.get(TABLE_COMMENT)); + } + + /** + * Returns the row fields from the specified block. The block maybe a RunLengthEncodedBlock, or + * DictionaryBlock, but the underlying block must be a RowBlock. The returned field blocks will be the same + * length as the specified block, which means they are not null suppressed. + */ + public static List getRowFieldBlocks(RowBlock rowBlock) + { + MethodHandle getFieldBlocksMethod = Reflection.methodHandle(RowBlock.class, "getFieldBlocks"); + try { + return List.of((Block[]) getFieldBlocksMethod.invoke(rowBlock)); + } + catch (Throwable e) { + Throwables.throwIfUnchecked(e); + throw new RuntimeException(e); + } + } + + public static int getDictionaryIdsOffset(DictionaryBlock rowBlock) + { + MethodHandle getFieldBlocksMethod = Reflection.methodHandle(DictionaryBlock.class, "getRawIdsOffset"); + try { + return (int) getFieldBlocksMethod.invoke(rowBlock); + } + catch (Throwable e) { + Throwables.throwIfUnchecked(e); + throw new RuntimeException(e); + } + } + + public static Block createProjection(DictionaryBlock sourceBlock, Block newDictionary) + { + Block dictionary = sourceBlock.getDictionary(); + int positionCount = sourceBlock.getPositionCount(); + if (newDictionary.getPositionCount() != dictionary.getPositionCount()) { + throw new IllegalArgumentException("newDictionary must have the same position count"); + } + + if (newDictionary instanceof RunLengthEncodedBlock rle) { + return RunLengthEncodedBlock.create(rle.getValue(), positionCount); + } + + // unwrap dictionary in dictionary + int[] newIds = new int[positionCount]; + for (int position = 0; position < positionCount; position++) { + newIds[position] = ((DictionaryBlock) newDictionary).getId(position); + } + return DictionaryBlock.create(positionCount, dictionary, newIds); + } + + public static List getRowFieldsFromBlock(Block block) + { + if (block instanceof RunLengthEncodedBlock runLengthEncodedBlock) { + RowBlock rowBlock = (RowBlock) runLengthEncodedBlock.getValue(); + return getRowFieldBlocks(rowBlock).stream() + .map(fieldBlock -> RunLengthEncodedBlock.create(fieldBlock, runLengthEncodedBlock.getPositionCount())) + .toList(); + } + if (block instanceof DictionaryBlock dictionaryBlock) { + RowBlock rowBlock = (RowBlock) dictionaryBlock.getDictionary(); + return getRowFieldBlocks(rowBlock).stream() + .map(fieldBlock -> DictionaryBlock.createProjectedDictionaryBlock(dictionaryBlock.getPositionCount(), dictionaryBlock, null, null)) + .toList(); + } + if (block instanceof RowBlock rowBlock) { + return getRowFieldBlocks(rowBlock); + } + throw new IllegalArgumentException("Unexpected block type: " + block.getClass().getSimpleName()); + } + + public static Block createNullBlock(Type type) 
+ { + return type.createBlockBuilder(null, 1, 0) + .appendNull() + .build(); + } + + /** + * Create a row block directly from field blocks that are not null-suppressed. The field value of a null row must be null. + */ + public static RowBlock fromNotNullSuppressedFieldBlocks(int positionCount, Optional rowIsNullOptional, Block[] fieldBlocks) + { + // verify that field values for null rows are null + if (rowIsNullOptional.isPresent()) { + boolean[] rowIsNull = rowIsNullOptional.get(); + checkArrayRange(rowIsNull, 0, positionCount); + + for (int fieldIndex = 0; fieldIndex < fieldBlocks.length; fieldIndex++) { + Block field = fieldBlocks[fieldIndex]; + for (int position = 0; position < positionCount; position++) { + if (rowIsNull[position] && !field.isNull(position)) { + throw new IllegalArgumentException(format("Field value for null row must be null: field %s, position %s", fieldIndex, position)); + } + } + } + } + return createRowBlockInternal(positionCount, null, fieldBlocks); + } + + /** + * Create a row block directly from field blocks. The returned RowBlock will not contain any null rows, although the fields may contain null values +. + */ + public static RowBlock fromFieldBlocks(int positionCount, Block[] fieldBlocks) + { + return createRowBlockInternal(positionCount, null, fieldBlocks); + } + + private static RowBlock createRowBlockInternal(int positionCount, boolean[] rowIsNull, Block[] fieldBlocks) + { + MethodHandle createRowBlockInternalMethod = Reflection.methodHandle(RowBlock.class, "createRowBlockInternal", int.class, boolean[].class, Block[].class); + try { + return (RowBlock) createRowBlockInternalMethod.invoke(null, positionCount, rowIsNull, fieldBlocks); + } + catch (Throwable e) { + Throwables.throwIfUnchecked(e); + throw new RuntimeException(e); + } + } + + static void checkArrayRange(boolean[] array, int offset, int length) + { + requireNonNull(array, "array is null"); + if (offset < 0 || length < 0 || offset + length > array.length) { + throw new IndexOutOfBoundsException(format("Invalid offset %s and length %s in array with %s elements", offset, length, array.length)); + } + } + + public static Domain getDomain(TupleDomain tupleDomain, T column, Type type) + { + if (tupleDomain.getDomains().isEmpty()) { + return Domain.none(type); + } + Domain domain = tupleDomain.getDomains().get().get(column); + if (domain != null && !domain.getType().equals(type)) { + throw new IllegalArgumentException("Provided type %s does not match domain type %s for column %s".formatted(type, domain.getType(), column)); + } + if (domain == null) { + return Domain.all(type); + } + return domain; } } diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergWritableTableHandle.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergWritableTableHandle.java index 22c885b4c465..7f347564c3b7 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergWritableTableHandle.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/IcebergWritableTableHandle.java @@ -13,8 +13,6 @@ */ package io.trino.plugin.iceberg; -import com.fasterxml.jackson.annotation.JsonCreator; -import com.fasterxml.jackson.annotation.JsonProperty; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import io.trino.spi.connector.ConnectorInsertTableHandle; @@ -28,104 +26,33 @@ import static com.google.common.base.Preconditions.checkArgument; import static java.util.Objects.requireNonNull; -public class 
IcebergWritableTableHandle +public record IcebergWritableTableHandle( + SchemaTableName name, + String schemaAsJson, + Map partitionsSpecsAsJson, + int partitionSpecId, + List sortOrder, + List inputColumns, + String outputPath, + IcebergFileFormat fileFormat, + Map storageProperties, + RetryMode retryMode, + Map fileIoProperties) implements ConnectorInsertTableHandle, ConnectorOutputTableHandle { - private final SchemaTableName name; - private final String schemaAsJson; - private final Map partitionsSpecsAsJson; - private final int partitionSpecId; - private final List sortOrder; - private final List inputColumns; - private final String outputPath; - private final IcebergFileFormat fileFormat; - private final Map storageProperties; - private final RetryMode retryMode; - - @JsonCreator - public IcebergWritableTableHandle( - @JsonProperty("name") SchemaTableName name, - @JsonProperty("schemaAsJson") String schemaAsJson, - @JsonProperty("partitionSpecsAsJson") Map partitionsSpecsAsJson, - @JsonProperty("partitionSpecId") int partitionSpecId, - @JsonProperty("sortOrder") List sortOrder, - @JsonProperty("inputColumns") List inputColumns, - @JsonProperty("outputPath") String outputPath, - @JsonProperty("fileFormat") IcebergFileFormat fileFormat, - @JsonProperty("properties") Map storageProperties, - @JsonProperty("retryMode") RetryMode retryMode) - { - this.name = requireNonNull(name, "name is null"); - this.schemaAsJson = requireNonNull(schemaAsJson, "schemaAsJson is null"); - this.partitionsSpecsAsJson = ImmutableMap.copyOf(requireNonNull(partitionsSpecsAsJson, "partitionsSpecsAsJson is null")); - this.partitionSpecId = partitionSpecId; - this.sortOrder = ImmutableList.copyOf(requireNonNull(sortOrder, "sortOrder is null")); - this.inputColumns = ImmutableList.copyOf(requireNonNull(inputColumns, "inputColumns is null")); - this.outputPath = requireNonNull(outputPath, "outputPath is null"); - this.fileFormat = requireNonNull(fileFormat, "fileFormat is null"); - this.storageProperties = ImmutableMap.copyOf(requireNonNull(storageProperties, "storageProperties is null")); - this.retryMode = requireNonNull(retryMode, "retryMode is null"); + public IcebergWritableTableHandle + { + requireNonNull(name, "name is null"); + requireNonNull(schemaAsJson, "schemaAsJson is null"); + partitionsSpecsAsJson = ImmutableMap.copyOf(requireNonNull(partitionsSpecsAsJson, "partitionsSpecsAsJson is null")); + sortOrder = ImmutableList.copyOf(requireNonNull(sortOrder, "sortOrder is null")); + inputColumns = ImmutableList.copyOf(requireNonNull(inputColumns, "inputColumns is null")); + requireNonNull(outputPath, "outputPath is null"); + requireNonNull(fileFormat, "fileFormat is null"); + storageProperties = ImmutableMap.copyOf(requireNonNull(storageProperties, "storageProperties is null")); + requireNonNull(retryMode, "retryMode is null"); checkArgument(partitionsSpecsAsJson.containsKey(partitionSpecId), "partitionSpecId missing from partitionSpecs"); - } - - @JsonProperty - public SchemaTableName getName() - { - return name; - } - - @JsonProperty - public String getSchemaAsJson() - { - return schemaAsJson; - } - - @JsonProperty - public Map getPartitionsSpecsAsJson() - { - return partitionsSpecsAsJson; - } - - @JsonProperty - public int getPartitionSpecId() - { - return partitionSpecId; - } - - @JsonProperty - public List getSortOrder() - { - return sortOrder; - } - - @JsonProperty - public List getInputColumns() - { - return inputColumns; - } - - @JsonProperty - public String getOutputPath() - { - return outputPath; - 
} - - @JsonProperty - public IcebergFileFormat getFileFormat() - { - return fileFormat; - } - - @JsonProperty - public Map getStorageProperties() - { - return storageProperties; - } - - @JsonProperty - public RetryMode getRetryMode() - { - return retryMode; + fileIoProperties = ImmutableMap.copyOf(requireNonNull(fileIoProperties, "fileIoProperties is null")); } @Override diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/PartitionFields.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/PartitionFields.java index 8c7a2bf64fac..ff332e38e281 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/PartitionFields.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/PartitionFields.java @@ -51,8 +51,8 @@ public final class PartitionFields private static final Pattern TRUNCATE_PATTERN = Pattern.compile("truncate" + FUNCTION_ARGUMENT_NAME_AND_INT, CASE_INSENSITIVE); private static final Pattern VOID_PATTERN = Pattern.compile("void" + FUNCTION_ARGUMENT_NAME, CASE_INSENSITIVE); - private static final Pattern ICEBERG_BUCKET_PATTERN = Pattern.compile("bucket\\[(\\d+)]"); - private static final Pattern ICEBERG_TRUNCATE_PATTERN = Pattern.compile("truncate\\[(\\d+)]"); + static final Pattern ICEBERG_BUCKET_PATTERN = Pattern.compile("bucket\\[(\\d+)]"); + static final Pattern ICEBERG_TRUNCATE_PATTERN = Pattern.compile("truncate\\[(\\d+)]"); private PartitionFields() {} @@ -61,7 +61,7 @@ public static PartitionSpec parsePartitionFields(Schema schema, List fie try { PartitionSpec.Builder builder = PartitionSpec.builderFor(schema); for (String field : fields) { - parsePartitionField(builder, field); + parsePartitionFields(schema, fields, builder, field); } return builder.build(); } @@ -70,19 +70,59 @@ public static PartitionSpec parsePartitionFields(Schema schema, List fie } } - public static void parsePartitionField(PartitionSpec.Builder builder, String field) + private static void parsePartitionFields(Schema schema, List fields, PartitionSpec.Builder builder, String field) { - @SuppressWarnings("PointlessBooleanExpression") - boolean matched = false || - tryMatch(field, IDENTITY_PATTERN, match -> builder.identity(fromIdentifierToColumn(match.group()))) || - tryMatch(field, YEAR_PATTERN, match -> builder.year(fromIdentifierToColumn(match.group(1)))) || - tryMatch(field, MONTH_PATTERN, match -> builder.month(fromIdentifierToColumn(match.group(1)))) || - tryMatch(field, DAY_PATTERN, match -> builder.day(fromIdentifierToColumn(match.group(1)))) || - tryMatch(field, HOUR_PATTERN, match -> builder.hour(fromIdentifierToColumn(match.group(1)))) || - tryMatch(field, BUCKET_PATTERN, match -> builder.bucket(fromIdentifierToColumn(match.group(1)), parseInt(match.group(2)))) || - tryMatch(field, TRUNCATE_PATTERN, match -> builder.truncate(fromIdentifierToColumn(match.group(1)), parseInt(match.group(2)))) || - tryMatch(field, VOID_PATTERN, match -> builder.alwaysNull(fromIdentifierToColumn(match.group(1)))) || - false; + for (int i = 1; i < schema.columns().size() + fields.size(); i++) { + try { + parsePartitionField(builder, field, i == 1 ? 
"" : "_" + i); + return; + } + catch (IllegalArgumentException e) { + if (e.getMessage().contains("Cannot create partition from name that exists in schema") + || e.getMessage().contains("Cannot create identity partition sourced from different field in schema")) { + continue; + } + throw e; + } + } + throw new IllegalArgumentException("Cannot resolve partition field: " + field); + } + + public static void parsePartitionField(PartitionSpec.Builder builder, String field, String suffix) + { + boolean matched = + tryMatch(field, IDENTITY_PATTERN, match -> { + // identity doesn't allow specifying an alias + builder.identity(fromIdentifierToColumn(match.group())); + }) || + tryMatch(field, YEAR_PATTERN, match -> { + String column = fromIdentifierToColumn(match.group(1)); + builder.year(column, column + "_year" + suffix); + }) || + tryMatch(field, MONTH_PATTERN, match -> { + String column = fromIdentifierToColumn(match.group(1)); + builder.month(column, column + "_month" + suffix); + }) || + tryMatch(field, DAY_PATTERN, match -> { + String column = fromIdentifierToColumn(match.group(1)); + builder.day(column, column + "_day" + suffix); + }) || + tryMatch(field, HOUR_PATTERN, match -> { + String column = fromIdentifierToColumn(match.group(1)); + builder.hour(column, column + "_hour" + suffix); + }) || + tryMatch(field, BUCKET_PATTERN, match -> { + String column = fromIdentifierToColumn(match.group(1)); + builder.bucket(column, parseInt(match.group(2)), column + "_bucket" + suffix); + }) || + tryMatch(field, TRUNCATE_PATTERN, match -> { + String column = fromIdentifierToColumn(match.group(1)); + builder.truncate(column, parseInt(match.group(2)), column + "_trunc" + suffix); + }) || + tryMatch(field, VOID_PATTERN, match -> { + String column = fromIdentifierToColumn(match.group(1)); + builder.alwaysNull(column, column + "_null" + suffix); + }); if (!matched) { throw new IllegalArgumentException("Invalid partition field declaration: " + field); } @@ -91,13 +131,6 @@ public static void parsePartitionField(PartitionSpec.Builder builder, String fie public static String fromIdentifierToColumn(String identifier) { if (QUOTED_IDENTIFIER_PATTERN.matcher(identifier).matches()) { - // We only support lowercase quoted identifiers for now. - // See https://github.com/trinodb/trino/issues/12226#issuecomment-1128839259 - // TODO: Enhance quoted identifiers support in Iceberg partitioning to support mixed case identifiers - // See https://github.com/trinodb/trino/issues/12668 - if (!identifier.toLowerCase(ENGLISH).equals(identifier)) { - throw new IllegalArgumentException(format("Uppercase characters in identifier '%s' are not supported.", identifier)); - } return identifier.substring(1, identifier.length() - 1).replace("\"\"", "\""); } // Currently, all Iceberg columns are stored in lowercase in the Iceberg metadata files. 
@@ -159,7 +192,7 @@ private static String fromColumnToIdentifier(String column) public static String quotedName(String name) { - if (UNQUOTED_IDENTIFIER_PATTERN.matcher(name).matches()) { + if (UNQUOTED_IDENTIFIER_PATTERN.matcher(name).matches() && name.toLowerCase(ENGLISH).equals(name)) { return name; } return '"' + name.replace("\"", "\"\"") + '"'; diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/PartitionTransforms.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/PartitionTransforms.java index f22c64aebe2d..4dc1dbfc3422 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/PartitionTransforms.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/PartitionTransforms.java @@ -160,6 +160,83 @@ public static ColumnTransform getColumnTransform(PartitionField field, Type sour throw new UnsupportedOperationException("Unsupported partition transform: " + field); } + public static ColumnTransform getColumnTransform(IcebergPartitionFunction field) + { + Type type = field.type(); + return switch (field.transform()) { + case IDENTITY -> identity(type); + case YEAR -> { + if (type.equals(DATE)) { + yield yearsFromDate(); + } + if (type.equals(TIMESTAMP_MICROS)) { + yield yearsFromTimestamp(); + } + if (type.equals(TIMESTAMP_TZ_MICROS)) { + yield yearsFromTimestampWithTimeZone(); + } + throw new UnsupportedOperationException("Unsupported type for 'year': " + field); + } + case MONTH -> { + if (type.equals(DATE)) { + yield monthsFromDate(); + } + if (type.equals(TIMESTAMP_MICROS)) { + yield monthsFromTimestamp(); + } + if (type.equals(TIMESTAMP_TZ_MICROS)) { + yield monthsFromTimestampWithTimeZone(); + } + throw new UnsupportedOperationException("Unsupported type for 'month': " + field); + } + case DAY -> { + if (type.equals(DATE)) { + yield daysFromDate(); + } + if (type.equals(TIMESTAMP_MICROS)) { + yield daysFromTimestamp(); + } + if (type.equals(TIMESTAMP_TZ_MICROS)) { + yield daysFromTimestampWithTimeZone(); + } + throw new UnsupportedOperationException("Unsupported type for 'day': " + field); + } + case HOUR -> { + if (type.equals(TIMESTAMP_MICROS)) { + yield hoursFromTimestamp(); + } + if (type.equals(TIMESTAMP_TZ_MICROS)) { + yield hoursFromTimestampWithTimeZone(); + } + throw new UnsupportedOperationException("Unsupported type for 'hour': " + field); + } + case VOID -> voidTransform(type); + case BUCKET -> bucket(type, field.size().orElseThrow()); + case TRUNCATE -> { + int width = field.size().orElseThrow(); + if (type.equals(INTEGER)) { + yield truncateInteger(width); + } + if (type.equals(BIGINT)) { + yield truncateBigint(width); + } + if (type instanceof DecimalType decimalType) { + if (decimalType.isShort()) { + yield truncateShortDecimal(type, width, decimalType); + } + yield truncateLongDecimal(type, width, decimalType); + } + if (type instanceof VarcharType) { + yield truncateVarchar(width); + } + if (type.equals(VARBINARY)) { + yield truncateVarbinary(width); + } + throw new UnsupportedOperationException("Unsupported type for 'truncate': " + field); + } + }; + } + private static ColumnTransform identity(Type type) { return new ColumnTransform(type, false, true, false, Function.identity(), ValueTransform.identity(type)); @@ -547,7 +624,7 @@ private static ColumnTransform truncateShortDecimal(Type type, int width, Decima private static Block truncateShortDecimal(DecimalType type, Block block, BigInteger unscaledWidth) { - BlockBuilder builder = type.createBlockBuilder(null, block.getPositionCount()); + 
BlockBuilder builder = type.createFixedSizeBlockBuilder(block.getPositionCount()); for (int position = 0; position < block.getPositionCount(); position++) { if (block.isNull(position)) { builder.appendNull(); @@ -585,7 +662,7 @@ private static ColumnTransform truncateLongDecimal(Type type, int width, Decimal private static Block truncateLongDecimal(DecimalType type, Block block, BigInteger unscaledWidth) { - BlockBuilder builder = type.createBlockBuilder(null, block.getPositionCount()); + BlockBuilder builder = type.createFixedSizeBlockBuilder(block.getPositionCount()); for (int position = 0; position < block.getPositionCount(); position++) { if (block.isNull(position)) { builder.appendNull(); @@ -752,56 +829,22 @@ private interface Hasher int hash(Block block, int position); } - public static class ColumnTransform + /** + * @param type Result type. + */ + public record ColumnTransform( + Type type, + boolean preservesNonNull, + boolean monotonic, + boolean temporal, + Function blockTransform, + ValueTransform valueTransform) { - private final Type type; - private final boolean preservesNonNull; - private final boolean monotonic; - private final boolean temporal; - private final Function blockTransform; - private final ValueTransform valueTransform; - - public ColumnTransform(Type type, boolean preservesNonNull, boolean monotonic, boolean temporal, Function blockTransform, ValueTransform valueTransform) - { - this.type = requireNonNull(type, "type is null"); - this.preservesNonNull = preservesNonNull; - this.monotonic = monotonic; - this.temporal = temporal; - this.blockTransform = requireNonNull(blockTransform, "transform is null"); - this.valueTransform = requireNonNull(valueTransform, "valueTransform is null"); - } - - /** - * Result type. - */ - public Type getType() - { - return type; - } - - public boolean preservesNonNull() - { - return preservesNonNull; - } - - public boolean isMonotonic() - { - return monotonic; - } - - public boolean isTemporal() - { - return temporal; - } - - public Function getBlockTransform() - { - return blockTransform; - } - - public ValueTransform getValueTransform() + public ColumnTransform { - return valueTransform; + requireNonNull(type, "type is null"); + requireNonNull(blockTransform, "transform is null"); + requireNonNull(valueTransform, "valueTransform is null"); } } diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/StructLikeWrapperWithFieldIdToIndex.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/StructLikeWrapperWithFieldIdToIndex.java new file mode 100644 index 000000000000..4fa5c08f1a9c --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/StructLikeWrapperWithFieldIdToIndex.java @@ -0,0 +1,79 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.iceberg; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableMap; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.StructLikeWrapper; + +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.stream.IntStream; + +public class StructLikeWrapperWithFieldIdToIndex +{ + private final StructLikeWrapper structLikeWrapper; + private final Map fieldIdToIndex; + + public static StructLikeWrapperWithFieldIdToIndex createStructLikeWrapper(FileScanTask fileScanTask) + { + Types.StructType structType = fileScanTask.spec().partitionType(); + StructLikeWrapper partitionWrapper = StructLikeWrapper.forType(structType).set(fileScanTask.file().partition()); + return new StructLikeWrapperWithFieldIdToIndex(partitionWrapper, structType); + } + + @VisibleForTesting + StructLikeWrapperWithFieldIdToIndex(StructLikeWrapper structLikeWrapper, Types.StructType structType) + { + this.structLikeWrapper = structLikeWrapper; + ImmutableMap.Builder fieldIdToIndex = ImmutableMap.builder(); + List fields = structType.fields(); + IntStream.range(0, fields.size()) + .forEach(i -> fieldIdToIndex.put(fields.get(i).fieldId(), i)); + this.fieldIdToIndex = fieldIdToIndex.buildOrThrow(); + } + + public StructLikeWrapper getStructLikeWrapper() + { + return structLikeWrapper; + } + + public Map getFieldIdToIndex() + { + return fieldIdToIndex; + } + + @Override + public boolean equals(Object o) + { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + StructLikeWrapperWithFieldIdToIndex that = (StructLikeWrapperWithFieldIdToIndex) o; + // Due to bogus implementation of equals in StructLikeWrapper https://github.com/apache/iceberg/issues/5064 order here matters. 
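// (Aside: comparing the cheaper fieldIdToIndex map first also means StructLikeWrapper#equals is
// only consulted for wrappers whose partition structs have the same field ids and arity.)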
+ return Objects.equals(fieldIdToIndex, that.fieldIdToIndex) && Objects.equals(structLikeWrapper, that.structLikeWrapper); + } + + @Override + public int hashCode() + { + return Objects.hash(fieldIdToIndex, structLikeWrapper); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/TableStatisticsReader.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/TableStatisticsReader.java index 8e00d7e0c3da..d88c461e203d 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/TableStatisticsReader.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/TableStatisticsReader.java @@ -18,9 +18,11 @@ import com.google.common.collect.ImmutableMap; import com.google.common.collect.Maps; import io.airlift.log.Logger; +import io.trino.filesystem.TrinoFileSystem; import io.trino.spi.TrinoException; import io.trino.spi.connector.ColumnHandle; import io.trino.spi.connector.ConnectorSession; +import io.trino.spi.predicate.Domain; import io.trino.spi.predicate.TupleDomain; import io.trino.spi.statistics.ColumnStatistics; import io.trino.spi.statistics.DoubleRange; @@ -38,12 +40,10 @@ import org.apache.iceberg.TableScan; import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.puffin.StandardBlobTypes; -import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types; import java.io.IOException; import java.io.UncheckedIOException; -import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; @@ -51,18 +51,25 @@ import java.util.Objects; import java.util.Optional; import java.util.Set; -import java.util.regex.Matcher; -import java.util.regex.Pattern; +import java.util.concurrent.ExecutorService; import static com.google.common.base.Verify.verifyNotNull; +import static com.google.common.collect.ImmutableList.toImmutableList; import static com.google.common.collect.ImmutableMap.toImmutableMap; +import static com.google.common.collect.ImmutableSet.toImmutableSet; import static com.google.common.collect.Iterables.getOnlyElement; import static com.google.common.collect.Streams.stream; +import static io.airlift.slice.Slices.utf8Slice; import static io.trino.plugin.iceberg.ExpressionConverter.toIcebergExpression; import static io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_INVALID_METADATA; import static io.trino.plugin.iceberg.IcebergMetadataColumn.isMetadataColumnId; import static io.trino.plugin.iceberg.IcebergSessionProperties.isExtendedStatisticsEnabled; -import static io.trino.plugin.iceberg.IcebergUtil.getColumns; +import static io.trino.plugin.iceberg.IcebergUtil.getFileModifiedTimeDomain; +import static io.trino.plugin.iceberg.IcebergUtil.getModificationTime; +import static io.trino.plugin.iceberg.IcebergUtil.getPartitionDomain; +import static io.trino.plugin.iceberg.IcebergUtil.getPathDomain; +import static io.trino.spi.type.DateTimeEncoding.packDateTimeWithZone; +import static io.trino.spi.type.TimeZoneKey.UTC_KEY; import static io.trino.spi.type.VarbinaryType.VARBINARY; import static io.trino.spi.type.VarcharType.VARCHAR; import static java.lang.Long.parseLong; @@ -70,6 +77,7 @@ import static java.util.function.Function.identity; import static java.util.stream.Collectors.toMap; import static java.util.stream.Collectors.toUnmodifiableMap; +import static org.apache.iceberg.util.SnapshotUtil.schemaFor; public final class TableStatisticsReader { @@ -77,22 +85,16 @@ private TableStatisticsReader() {} private static final Logger log = Logger.get(TableStatisticsReader.class); - // 
TODO (https://github.com/trinodb/trino/issues/15397): remove support for Trino-specific statistics properties - @Deprecated - public static final String TRINO_STATS_PREFIX = "trino.stats.ndv."; - // TODO (https://github.com/trinodb/trino/issues/15397): remove support for Trino-specific statistics properties - @Deprecated - public static final String TRINO_STATS_NDV_FORMAT = TRINO_STATS_PREFIX + "%d.ndv"; - // TODO (https://github.com/trinodb/trino/issues/15397): remove support for Trino-specific statistics properties - @Deprecated - public static final Pattern TRINO_STATS_COLUMN_ID_PATTERN = Pattern.compile(Pattern.quote(TRINO_STATS_PREFIX) + "(?\\d+)\\..*"); - // TODO (https://github.com/trinodb/trino/issues/15397): remove support for Trino-specific statistics properties - @Deprecated - public static final Pattern TRINO_STATS_NDV_PATTERN = Pattern.compile(Pattern.quote(TRINO_STATS_PREFIX) + "(?\\d+)\\.ndv"); - public static final String APACHE_DATASKETCHES_THETA_V1_NDV_PROPERTY = "ndv"; - public static TableStatistics getTableStatistics(TypeManager typeManager, ConnectorSession session, IcebergTableHandle tableHandle, Table icebergTable) + public static TableStatistics getTableStatistics( + TypeManager typeManager, + ConnectorSession session, + IcebergTableHandle tableHandle, + Set projectedColumns, + Table icebergTable, + ExecutorService icebergPlanningExecutor, + TrinoFileSystem fileSystem) { return makeTableStatistics( typeManager, @@ -100,7 +102,10 @@ public static TableStatistics getTableStatistics(TypeManager typeManager, Connec tableHandle.getSnapshotId(), tableHandle.getEnforcedPredicate(), tableHandle.getUnenforcedPredicate(), - isExtendedStatisticsEnabled(session)); + projectedColumns, + isExtendedStatisticsEnabled(session), + icebergPlanningExecutor, + fileSystem); } @VisibleForTesting @@ -110,7 +115,10 @@ public static TableStatistics makeTableStatistics( Optional snapshot, TupleDomain enforcedConstraint, TupleDomain unenforcedConstraint, - boolean extendedStatisticsEnabled) + Set projectedColumns, + boolean extendedStatisticsEnabled, + ExecutorService icebergPlanningExecutor, + TrinoFileSystem fileSystem) { if (snapshot.isEmpty()) { // No snapshot, so no data. 
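// Aside: a standalone sketch (hypothetical class name) of the $path-based pruning applied in the
// planFiles loop in the hunk below: a Domain over VARCHAR describes the allowed file locations,
// and a data file is skipped when its location is not included in that domain.
import io.airlift.slice.Slices;
import io.trino.spi.predicate.Domain;
import io.trino.spi.type.VarcharType;

class PathDomainPruningSketch
{
    public static void main(String[] args)
    {
        Domain pathDomain = Domain.singleValue(VarcharType.VARCHAR, Slices.utf8Slice("s3://bucket/data/a.parquet"));

        // kept: the file location satisfies the predicate on $path
        System.out.println(pathDomain.includesNullableValue(Slices.utf8Slice("s3://bucket/data/a.parquet"))); // true
        // pruned: any other location falls outside the domain
        System.out.println(pathDomain.includesNullableValue(Slices.utf8Slice("s3://bucket/data/b.parquet"))); // false
    }
}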
@@ -130,25 +138,47 @@ public static TableStatistics makeTableStatistics( .build(); } - Schema icebergTableSchema = icebergTable.schema(); - List columns = icebergTableSchema.columns(); - - List columnHandles = getColumns(icebergTableSchema, typeManager); - Map idToColumnHandle = columnHandles.stream() - .collect(toUnmodifiableMap(IcebergColumnHandle::getId, identity())); + List columns = icebergTable.schema().columns(); Map idToType = columns.stream() .map(column -> Maps.immutableEntry(column.fieldId(), column.type())) .collect(toUnmodifiableMap(Map.Entry::getKey, Map.Entry::getValue)); + Set columnIds = projectedColumns.stream() + .map(IcebergColumnHandle::getId) + .collect(toImmutableSet()); + + Domain partitionDomain = getPartitionDomain(effectivePredicate); + Domain pathDomain = getPathDomain(effectivePredicate); + Domain fileModifiedTimeDomain = getFileModifiedTimeDomain(effectivePredicate); + Schema snapshotSchema = schemaFor(icebergTable, snapshotId); TableScan tableScan = icebergTable.newScan() - // Table enforced constraint may include eg $path column predicate which is not handled by Iceberg library TODO apply $path and $file_modified_time filters here .filter(toIcebergExpression(effectivePredicate.filter((column, domain) -> !isMetadataColumnId(column.getId())))) .useSnapshot(snapshotId) - .includeColumnStats(); + .includeColumnStats( + columnIds.stream() + .map(snapshotSchema::findColumnName) + .filter(Objects::nonNull) + .collect(toImmutableList())) + .planWith(icebergPlanningExecutor); IcebergStatistics.Builder icebergStatisticsBuilder = new IcebergStatistics.Builder(columns, typeManager); try (CloseableIterable fileScanTasks = tableScan.planFiles()) { - fileScanTasks.forEach(fileScanTask -> icebergStatisticsBuilder.acceptDataFile(fileScanTask.file(), fileScanTask.spec())); + fileScanTasks.forEach(fileScanTask -> { + if (!partitionDomain.isAll() && !partitionDomain.includesNullableValue(utf8Slice(fileScanTask.spec().partitionToPath(fileScanTask.partition())))) { + return; + } + if (!pathDomain.isAll() && !pathDomain.includesNullableValue(utf8Slice(fileScanTask.file().location()))) { + return; + } + if (!fileModifiedTimeDomain.isAll()) { + long fileModifiedTime = getModificationTime(fileScanTask.file().location(), fileSystem); + if (!fileModifiedTimeDomain.includesNullableValue(packDateTimeWithZone(fileModifiedTime, UTC_KEY))) { + return; + } + } + + icebergStatisticsBuilder.acceptDataFile(fileScanTask.file(), fileScanTask.spec()); + }); } catch (IOException e) { throw new UncheckedIOException(e); @@ -156,7 +186,7 @@ public static TableStatistics makeTableStatistics( IcebergStatistics summary = icebergStatisticsBuilder.build(); - if (summary.getFileCount() == 0) { + if (summary.fileCount() == 0) { return TableStatistics.builder() .setRowCount(Estimate.of(0)) .build(); @@ -165,28 +195,24 @@ public static TableStatistics makeTableStatistics( Map ndvs = readNdvs( icebergTable, snapshotId, - // TODO We don't need NDV information for columns not involved in filters/joins. Engine should provide set of columns - // it makes sense to find NDV information for. 
- idToColumnHandle.keySet(), + columnIds, extendedStatisticsEnabled); ImmutableMap.Builder columnHandleBuilder = ImmutableMap.builder(); - double recordCount = summary.getRecordCount(); - for (Entry columnHandleTuple : idToColumnHandle.entrySet()) { - IcebergColumnHandle columnHandle = columnHandleTuple.getValue(); + double recordCount = summary.recordCount(); + for (IcebergColumnHandle columnHandle : projectedColumns) { int fieldId = columnHandle.getId(); ColumnStatistics.Builder columnBuilder = new ColumnStatistics.Builder(); - Long nullCount = summary.getNullCounts().get(fieldId); + Long nullCount = summary.nullCounts().get(fieldId); if (nullCount != null) { columnBuilder.setNullsFraction(Estimate.of(nullCount / recordCount)); } - if (idToType.get(columnHandleTuple.getKey()).typeId() == Type.TypeID.FIXED) { - Types.FixedType fixedType = (Types.FixedType) idToType.get(columnHandleTuple.getKey()); + if (idToType.get(columnHandle.getId()) instanceof Types.FixedType fixedType) { long columnSize = fixedType.length(); columnBuilder.setDataSize(Estimate.of(columnSize)); } - else if (summary.getColumnSizes() != null) { - Long columnSize = summary.getColumnSizes().get(fieldId); + else if (summary.columnSizes() != null) { + Long columnSize = summary.columnSizes().get(fieldId); if (columnSize != null) { // columnSize is the size on disk and Trino column stats is size in memory. // The relation between the two is type and data dependent. @@ -210,8 +236,8 @@ else if (columnHandle.getBaseType() == VARBINARY) { } } } - Object min = summary.getMinValues().get(fieldId); - Object max = summary.getMaxValues().get(fieldId); + Object min = summary.minValues().get(fieldId); + Object max = summary.maxValues().get(fieldId); if (min != null && max != null) { columnBuilder.setRange(DoubleRange.from(columnHandle.getType(), min, max)); } @@ -224,20 +250,19 @@ else if (columnHandle.getBaseType() == VARBINARY) { return new TableStatistics(Estimate.of(recordCount), columnHandleBuilder.buildOrThrow()); } - private static Map readNdvs(Table icebergTable, long snapshotId, Set columnIds, boolean extendedStatisticsEnabled) + public static Map readNdvs(Table icebergTable, long snapshotId, Set columnIds, boolean extendedStatisticsEnabled) { if (!extendedStatisticsEnabled) { return ImmutableMap.of(); } ImmutableMap.Builder ndvByColumnId = ImmutableMap.builder(); - Set remainingColumnIds = new HashSet<>(columnIds); getLatestStatisticsFile(icebergTable, snapshotId).ifPresent(statisticsFile -> { Map thetaBlobsByFieldId = statisticsFile.blobMetadata().stream() .filter(blobMetadata -> blobMetadata.type().equals(StandardBlobTypes.APACHE_DATASKETCHES_THETA_V1)) .filter(blobMetadata -> blobMetadata.fields().size() == 1) - .filter(blobMetadata -> remainingColumnIds.contains(getOnlyElement(blobMetadata.fields()))) + .filter(blobMetadata -> columnIds.contains(getOnlyElement(blobMetadata.fields()))) // Fail loud upon duplicates (there must be none) .collect(toImmutableMap(blobMetadata -> getOnlyElement(blobMetadata.fields()), identity())); @@ -247,33 +272,13 @@ private static Map readNdvs(Table icebergTable, long snapshotId, String ndv = blobMetadata.properties().get(APACHE_DATASKETCHES_THETA_V1_NDV_PROPERTY); if (ndv == null) { log.debug("Blob %s is missing %s property", blobMetadata.type(), APACHE_DATASKETCHES_THETA_V1_NDV_PROPERTY); - remainingColumnIds.remove(fieldId); } else { - remainingColumnIds.remove(fieldId); ndvByColumnId.put(fieldId, parseLong(ndv)); } } }); - // TODO (https://github.com/trinodb/trino/issues/15397): remove 
support for Trino-specific statistics properties - Iterator> properties = icebergTable.properties().entrySet().iterator(); - while (!remainingColumnIds.isEmpty() && properties.hasNext()) { - Entry entry = properties.next(); - String key = entry.getKey(); - String value = entry.getValue(); - if (key.startsWith(TRINO_STATS_PREFIX)) { - Matcher matcher = TRINO_STATS_NDV_PATTERN.matcher(key); - if (matcher.matches()) { - int columnId = Integer.parseInt(matcher.group("columnId")); - if (remainingColumnIds.remove(columnId)) { - long ndv = parseLong(value); - ndvByColumnId.put(columnId, ndv); - } - } - } - } - return ndvByColumnId.buildOrThrow(); } diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/TableStatisticsWriter.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/TableStatisticsWriter.java index 473df30b3316..69674474326c 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/TableStatisticsWriter.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/TableStatisticsWriter.java @@ -98,12 +98,8 @@ public StatisticsFile writeStatisticsFile( StatsUpdateMode updateMode, CollectedStatistics collectedStatistics) { - Snapshot snapshot = table.snapshot(snapshotId); TableOperations operations = ((HasTableOperations) table).operations(); FileIO fileIO = operations.io(); - long snapshotSequenceNumber = snapshot.sequenceNumber(); - Schema schema = table.schemas().get(snapshot.schemaId()); - collectedStatistics = mergeStatisticsIfNecessary( table, snapshotId, @@ -112,6 +108,23 @@ public StatisticsFile writeStatisticsFile( collectedStatistics); Map ndvSketches = collectedStatistics.ndvSketches(); + return writeStatisticsFile(session, table, fileIO, snapshotId, ndvSketches); + } + + public StatisticsFile rewriteStatisticsFile(ConnectorSession session, Table table, long snapshotId) + { + TableOperations operations = ((HasTableOperations) table).operations(); + FileIO fileIO = operations.io(); + // This will rewrite old statistics file as ndvSketches map is empty + return writeStatisticsFile(session, table, fileIO, snapshotId, Map.of()); + } + + private GenericStatisticsFile writeStatisticsFile(ConnectorSession session, Table table, FileIO fileIO, long snapshotId, Map ndvSketches) + { + Snapshot snapshot = table.snapshot(snapshotId); + long snapshotSequenceNumber = snapshot.sequenceNumber(); + TableOperations operations = ((HasTableOperations) table).operations(); + Schema schema = table.schemas().get(snapshot.schemaId()); Set validFieldIds = stream( Traverser.forTree((Types.NestedField nestedField) -> { Type type = nestedField.type(); diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/TableType.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/TableType.java index 7141f488d7da..14dcbe856e82 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/TableType.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/TableType.java @@ -17,10 +17,15 @@ public enum TableType { DATA, HISTORY, + METADATA_LOG_ENTRIES, SNAPSHOTS, + ALL_MANIFESTS, MANIFESTS, PARTITIONS, FILES, + ALL_ENTRIES, + ENTRIES, PROPERTIES, - REFS + REFS, + MATERIALIZED_VIEW_STORAGE, } diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/AbstractIcebergTableOperations.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/AbstractIcebergTableOperations.java index 36d09f2d0d19..fe21f1b1ef79 100644 --- 
a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/AbstractIcebergTableOperations.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/AbstractIcebergTableOperations.java @@ -17,8 +17,11 @@ import dev.failsafe.RetryPolicy; import io.trino.annotation.NotThreadSafe; import io.trino.filesystem.Location; +import io.trino.filesystem.TrinoFileSystem; +import io.trino.plugin.hive.HiveType; import io.trino.plugin.hive.metastore.Column; import io.trino.plugin.hive.metastore.StorageFormat; +import io.trino.plugin.iceberg.IcebergExceptions; import io.trino.plugin.iceberg.util.HiveSchemaUtil; import io.trino.spi.TrinoException; import io.trino.spi.connector.ConnectorSession; @@ -27,27 +30,25 @@ import org.apache.iceberg.TableMetadata; import org.apache.iceberg.TableMetadataParser; import org.apache.iceberg.exceptions.CommitFailedException; -import org.apache.iceberg.exceptions.ValidationException; import org.apache.iceberg.io.FileIO; import org.apache.iceberg.io.LocationProvider; import org.apache.iceberg.io.OutputFile; import org.apache.iceberg.types.Types.NestedField; -import java.io.FileNotFoundException; +import java.io.UncheckedIOException; import java.time.Duration; import java.util.List; +import java.util.Map; import java.util.Objects; import java.util.Optional; import java.util.OptionalInt; +import java.util.function.Function; import static com.google.common.base.Preconditions.checkState; import static com.google.common.collect.ImmutableList.toImmutableList; -import static io.trino.plugin.hive.HiveType.toHiveType; -import static io.trino.plugin.hive.util.HiveClassNames.FILE_INPUT_FORMAT_CLASS; -import static io.trino.plugin.hive.util.HiveClassNames.FILE_OUTPUT_FORMAT_CLASS; -import static io.trino.plugin.hive.util.HiveClassNames.LAZY_SIMPLE_SERDE_CLASS; import static io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_INVALID_METADATA; -import static io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_MISSING_METADATA; +import static io.trino.plugin.iceberg.IcebergExceptions.translateMetadataException; +import static io.trino.plugin.iceberg.IcebergTableName.isMaterializedViewStorage; import static io.trino.plugin.iceberg.IcebergUtil.METADATA_FOLDER_NAME; import static io.trino.plugin.iceberg.IcebergUtil.fixBrokenMetadataLocation; import static io.trino.plugin.iceberg.IcebergUtil.getLocationProvider; @@ -59,6 +60,7 @@ import static java.util.Objects.requireNonNull; import static java.util.UUID.randomUUID; import static org.apache.iceberg.BaseMetastoreTableOperations.METADATA_LOCATION_PROP; +import static org.apache.iceberg.CatalogUtil.deleteRemovedMetadataFiles; import static org.apache.iceberg.TableMetadataParser.getFileExtension; import static org.apache.iceberg.TableProperties.METADATA_COMPRESSION; import static org.apache.iceberg.TableProperties.METADATA_COMPRESSION_DEFAULT; @@ -69,6 +71,9 @@ public abstract class AbstractIcebergTableOperations implements IcebergTableOperations { + public static final String FILE_INPUT_FORMAT_CLASS = "org.apache.hadoop.mapred.FileInputFormat"; + public static final String FILE_OUTPUT_FORMAT_CLASS = "org.apache.hadoop.mapred.FileOutputFormat"; + public static final String LAZY_SIMPLE_SERDE_CLASS = "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe"; public static final StorageFormat ICEBERG_METASTORE_STORAGE_FORMAT = StorageFormat.create( LAZY_SIMPLE_SERDE_CLASS, FILE_INPUT_FORMAT_CLASS, @@ -152,6 +157,11 @@ public void commit(@Nullable TableMetadata base, TableMetadata metadata) return; } + if 
(isMaterializedViewStorage(tableName)) { + commitMaterializedViewRefresh(base, metadata); + return; + } + if (base == null) { if (PROVIDER_PROPERTY_VALUE.equals(metadata.properties().get(PROVIDER_PROPERTY_KEY))) { // Assume this is a table executing migrate procedure @@ -165,6 +175,7 @@ public void commit(@Nullable TableMetadata base, TableMetadata metadata) } else { commitToExistingTable(base, metadata); + deleteRemovedMetadataFiles(fileIo, base, metadata); } shouldRefresh = true; @@ -176,6 +187,8 @@ public void commit(@Nullable TableMetadata base, TableMetadata metadata) protected abstract void commitToExistingTable(TableMetadata base, TableMetadata metadata); + protected abstract void commitMaterializedViewRefresh(TableMetadata base, TableMetadata metadata); + @Override public FileIO io() { @@ -224,6 +237,13 @@ protected String writeNewMetadata(TableMetadata metadata, int newVersion) } protected void refreshFromMetadataLocation(String newLocation) + { + refreshFromMetadataLocation( + newLocation, + metadataLocation -> TableMetadataParser.read(fileIo, fileIo.newInputFile(metadataLocation))); + } + + protected void refreshFromMetadataLocation(String newLocation, Function metadataLoader) { // use null-safe equality check because new tables have a null metadata location if (Objects.equals(currentMetadataLocation, newLocation)) { @@ -231,24 +251,27 @@ protected void refreshFromMetadataLocation(String newLocation) return; } + // a table that is replaced doesn't need its metadata reloaded + if (newLocation == null) { + shouldRefresh = false; + return; + } + TableMetadata newMetadata; try { newMetadata = Failsafe.with(RetryPolicy.builder() - .withMaxRetries(20) + .withMaxRetries(3) .withBackoff(100, 5000, MILLIS, 4.0) - .withMaxDuration(Duration.ofMinutes(10)) - .abortOn(failure -> failure instanceof ValidationException || isNotFoundException(failure)) + .withMaxDuration(Duration.ofMinutes(3)) + .abortOn(throwable -> TrinoFileSystem.isUnrecoverableException(throwable) || IcebergExceptions.isFatalException(throwable)) .build()) - .get(() -> TableMetadataParser.read(fileIo, io().newInputFile(newLocation))); + .get(() -> metadataLoader.apply(newLocation)); + } + catch (UncheckedIOException e) { + throw new TrinoException(ICEBERG_INVALID_METADATA, "Error accessing metadata file for table %s".formatted(getSchemaTableName().toString()), e); } catch (Throwable failure) { - if (isNotFoundException(failure)) { - throw new TrinoException(ICEBERG_MISSING_METADATA, "Metadata not found in metadata location for table " + getSchemaTableName(), failure); - } - if (failure instanceof ValidationException) { - throw new TrinoException(ICEBERG_INVALID_METADATA, "Invalid metadata file for table " + getSchemaTableName(), failure); - } - throw failure; + throw translateMetadataException(failure, getSchemaTableName().toString()); } String newUUID = newMetadata.uuid(); @@ -263,14 +286,6 @@ protected void refreshFromMetadataLocation(String newLocation) shouldRefresh = false; } - private static boolean isNotFoundException(Throwable failure) - { - // qualified name, as this is NOT the io.trino.spi.connector.NotFoundException - return failure instanceof org.apache.iceberg.exceptions.NotFoundException || - // This is used in context where the code cannot throw a checked exception, so FileNotFoundException would need to be wrapped - failure.getCause() instanceof FileNotFoundException; - } - protected static String newTableMetadataFilePath(TableMetadata meta, int newVersion) { String codec = 
meta.property(METADATA_COMPRESSION, METADATA_COMPRESSION_DEFAULT); @@ -291,8 +306,9 @@ public static List toHiveColumns(List columns) return columns.stream() .map(column -> new Column( column.name(), - toHiveType(HiveSchemaUtil.convert(column.type())), - Optional.empty())) + HiveType.toHiveType(HiveSchemaUtil.convert(column.type())), + Optional.empty(), + Map.of())) .collect(toImmutableList()); } } diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/AbstractTrinoCatalog.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/AbstractTrinoCatalog.java index 6747414e60c8..c5675380f34d 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/AbstractTrinoCatalog.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/AbstractTrinoCatalog.java @@ -14,16 +14,18 @@ package io.trino.plugin.iceberg.catalog; import com.google.common.collect.ImmutableMap; -import dev.failsafe.Failsafe; -import dev.failsafe.RetryPolicy; import io.trino.filesystem.Location; import io.trino.filesystem.TrinoFileSystem; +import io.trino.filesystem.TrinoFileSystemFactory; +import io.trino.metastore.TableInfo; import io.trino.plugin.base.CatalogName; import io.trino.plugin.hive.HiveMetadata; import io.trino.plugin.iceberg.ColumnIdentity; import io.trino.plugin.iceberg.IcebergMaterializedViewDefinition; import io.trino.plugin.iceberg.IcebergUtil; import io.trino.plugin.iceberg.PartitionTransforms.ColumnTransform; +import io.trino.plugin.iceberg.fileio.ForwardingFileIoFactory; +import io.trino.plugin.iceberg.fileio.ForwardingOutputFile; import io.trino.spi.TrinoException; import io.trino.spi.connector.CatalogSchemaTableName; import io.trino.spi.connector.ColumnMetadata; @@ -32,6 +34,8 @@ import io.trino.spi.connector.ConnectorTableMetadata; import io.trino.spi.connector.ConnectorViewDefinition; import io.trino.spi.connector.SchemaTableName; +import io.trino.spi.connector.TableNotFoundException; +import io.trino.spi.connector.ViewNotFoundException; import io.trino.spi.type.ArrayType; import io.trino.spi.type.CharType; import io.trino.spi.type.MapType; @@ -43,46 +47,52 @@ import io.trino.spi.type.Type; import io.trino.spi.type.TypeManager; import org.apache.iceberg.AppendFiles; +import org.apache.iceberg.BaseTable; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; import org.apache.iceberg.SortOrder; import org.apache.iceberg.Table; import org.apache.iceberg.TableMetadata; +import org.apache.iceberg.TableMetadataParser; import org.apache.iceberg.TableOperations; import org.apache.iceberg.Transaction; import org.apache.iceberg.types.Types; import java.io.IOException; -import java.time.Duration; -import java.time.temporal.ChronoUnit; -import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Optional; import java.util.Set; import java.util.stream.Stream; -import static com.google.common.base.Throwables.throwIfUnchecked; import static com.google.common.collect.ImmutableList.toImmutableList; import static com.google.common.collect.ImmutableSet.toImmutableSet; +import static io.trino.metastore.TableInfo.ICEBERG_MATERIALIZED_VIEW_COMMENT; import static io.trino.plugin.hive.HiveMetadata.STORAGE_TABLE; import static io.trino.plugin.hive.HiveMetadata.TABLE_COMMENT; -import static io.trino.plugin.hive.ViewReaderUtil.ICEBERG_MATERIALIZED_VIEW_COMMENT; import static io.trino.plugin.hive.ViewReaderUtil.PRESTO_VIEW_FLAG; -import static 
io.trino.plugin.hive.metastore.glue.converter.GlueToTrinoConverter.mappedCopy; import static io.trino.plugin.hive.util.HiveUtil.escapeTableName; import static io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_FILESYSTEM_ERROR; -import static io.trino.plugin.iceberg.IcebergMaterializedViewAdditionalProperties.STORAGE_SCHEMA; -import static io.trino.plugin.iceberg.IcebergMaterializedViewAdditionalProperties.getStorageSchema; +import static io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_INVALID_METADATA; import static io.trino.plugin.iceberg.IcebergMaterializedViewDefinition.decodeMaterializedViewData; -import static io.trino.plugin.iceberg.IcebergTableProperties.FILE_FORMAT_PROPERTY; +import static io.trino.plugin.iceberg.IcebergMaterializedViewProperties.STORAGE_SCHEMA; +import static io.trino.plugin.iceberg.IcebergMaterializedViewProperties.getStorageSchema; +import static io.trino.plugin.iceberg.IcebergSessionProperties.isUseFileSizeFromMetadata; +import static io.trino.plugin.iceberg.IcebergTableName.tableNameWithType; import static io.trino.plugin.iceberg.IcebergTableProperties.getPartitioning; +import static io.trino.plugin.iceberg.IcebergTableProperties.getSortOrder; +import static io.trino.plugin.iceberg.IcebergTableProperties.getTableLocation; +import static io.trino.plugin.iceberg.IcebergUtil.METADATA_FOLDER_NAME; import static io.trino.plugin.iceberg.IcebergUtil.commit; +import static io.trino.plugin.iceberg.IcebergUtil.createTableProperties; import static io.trino.plugin.iceberg.IcebergUtil.getIcebergTableProperties; import static io.trino.plugin.iceberg.IcebergUtil.schemaFromMetadata; import static io.trino.plugin.iceberg.PartitionFields.parsePartitionFields; import static io.trino.plugin.iceberg.PartitionTransforms.getColumnTransform; +import static io.trino.plugin.iceberg.SortFieldUtils.parseSortFields; +import static io.trino.plugin.iceberg.TableType.MATERIALIZED_VIEW_STORAGE; import static io.trino.plugin.iceberg.TypeConverter.toTrinoType; +import static io.trino.spi.StandardErrorCode.NOT_SUPPORTED; import static io.trino.spi.StandardErrorCode.TABLE_NOT_FOUND; import static io.trino.spi.type.IntegerType.INTEGER; import static io.trino.spi.type.SmallintType.SMALLINT; @@ -94,32 +104,43 @@ import static java.lang.String.format; import static java.util.Objects.requireNonNull; import static java.util.UUID.randomUUID; +import static org.apache.iceberg.BaseMetastoreTableOperations.METADATA_LOCATION_PROP; import static org.apache.iceberg.TableMetadata.newTableMetadata; -import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT_DEFAULT; +import static org.apache.iceberg.TableMetadataParser.getFileExtension; +import static org.apache.iceberg.TableProperties.METADATA_COMPRESSION_DEFAULT; +import static org.apache.iceberg.Transactions.createOrReplaceTableTransaction; import static org.apache.iceberg.Transactions.createTableTransaction; public abstract class AbstractTrinoCatalog implements TrinoCatalog { public static final String TRINO_CREATED_BY_VALUE = "Trino Iceberg connector"; + public static final String ICEBERG_VIEW_RUN_AS_OWNER = "trino.run-as-owner"; + protected static final String TRINO_CREATED_BY = HiveMetadata.TRINO_CREATED_BY; - protected static final String PRESTO_QUERY_ID_NAME = HiveMetadata.PRESTO_QUERY_ID_NAME; + protected static final String TRINO_QUERY_ID_NAME = "trino_query_id"; private final CatalogName catalogName; - private final TypeManager typeManager; - protected final IcebergTableOperationsProvider tableOperationsProvider; private final boolean 
useUniqueTableLocation; + protected final TypeManager typeManager; + protected final IcebergTableOperationsProvider tableOperationsProvider; + protected final TrinoFileSystemFactory fileSystemFactory; + protected final ForwardingFileIoFactory fileIoFactory; protected AbstractTrinoCatalog( CatalogName catalogName, + boolean useUniqueTableLocation, TypeManager typeManager, IcebergTableOperationsProvider tableOperationsProvider, - boolean useUniqueTableLocation) + TrinoFileSystemFactory fileSystemFactory, + ForwardingFileIoFactory fileIoFactory) { this.catalogName = requireNonNull(catalogName, "catalogName is null"); + this.useUniqueTableLocation = useUniqueTableLocation; this.typeManager = requireNonNull(typeManager, "typeManager is null"); this.tableOperationsProvider = requireNonNull(tableOperationsProvider, "tableOperationsProvider is null"); - this.useUniqueTableLocation = useUniqueTableLocation; + this.fileSystemFactory = requireNonNull(fileSystemFactory, "fileSystemFactory is null"); + this.fileIoFactory = requireNonNull(fileIoFactory, "fileIoFactory is null"); } @Override @@ -132,6 +153,7 @@ public void updateTableComment(ConnectorSession session, SchemaTableName schemaT else { icebergTable.updateProperties().set(TABLE_COMMENT, comment.get()).commit(); } + invalidateTableCache(schemaTableName); } @Override @@ -139,18 +161,23 @@ public void updateColumnComment(ConnectorSession session, SchemaTableName schema { Table icebergTable = loadTable(session, schemaTableName); icebergTable.updateSchema().updateColumnDoc(columnIdentity.getName(), comment.orElse(null)).commit(); + invalidateTableCache(schemaTableName); } @Override public Map getViews(ConnectorSession session, Optional namespace) { ImmutableMap.Builder views = ImmutableMap.builder(); - for (SchemaTableName name : listViews(session, namespace)) { + for (TableInfo tableInfo : listTables(session, namespace)) { + if (tableInfo.extendedRelationType() != TableInfo.ExtendedRelationType.TRINO_VIEW) { + continue; + } + SchemaTableName name = tableInfo.tableName(); try { getView(session, name).ifPresent(view -> views.put(name, view)); } catch (TrinoException e) { - if (e.getErrorCode().equals(TABLE_NOT_FOUND.toErrorCode())) { + if (e.getErrorCode().equals(TABLE_NOT_FOUND.toErrorCode()) || e instanceof TableNotFoundException || e instanceof ViewNotFoundException) { // Ignore view that was dropped during query execution (race condition) } else { @@ -164,44 +191,91 @@ public Map getViews(ConnectorSession s @Override public Optional getMaterializedView(ConnectorSession session, SchemaTableName schemaViewName) { + return doGetMaterializedView(session, schemaViewName); + } + + protected abstract Optional doGetMaterializedView(ConnectorSession session, SchemaTableName schemaViewName); + + @Override + public Map getMaterializedViewProperties(ConnectorSession session, SchemaTableName viewName, ConnectorMaterializedViewDefinition definition) + { + SchemaTableName storageTableName = definition.getStorageTable() + .orElseThrow(() -> new TrinoException(ICEBERG_INVALID_METADATA, "Materialized view definition is missing a storage table")) + .getSchemaTableName(); + try { - return Failsafe.with(RetryPolicy.builder() - .withMaxAttempts(10) - .withBackoff(1, 5_000, ChronoUnit.MILLIS, 4) - .withMaxDuration(Duration.ofSeconds(30)) - .abortOn(failure -> !(failure instanceof MaterializedViewMayBeBeingRemovedException)) - .build()) - .get(() -> doGetMaterializedView(session, schemaViewName)); + BaseTable storageTable = loadTable(session, 
definition.getStorageTable().orElseThrow().getSchemaTableName()); + return ImmutableMap.builder() + .putAll(getIcebergTableProperties(storageTable)) + .put(STORAGE_SCHEMA, storageTableName.getSchemaName()) + .buildOrThrow(); } - catch (MaterializedViewMayBeBeingRemovedException e) { - throwIfUnchecked(e.getCause()); - throw new RuntimeException(e.getCause()); + catch (RuntimeException e) { + throw new TrinoException(ICEBERG_FILESYSTEM_ERROR, "Unable to load storage table metadata for materialized view: " + viewName); } } - protected abstract Optional doGetMaterializedView(ConnectorSession session, SchemaTableName schemaViewName); - protected Transaction newCreateTableTransaction( ConnectorSession session, SchemaTableName schemaTableName, Schema schema, PartitionSpec partitionSpec, SortOrder sortOrder, - String location, + Optional location, Map properties, Optional owner) { - TableMetadata metadata = newTableMetadata(schema, partitionSpec, sortOrder, location, properties); + TableMetadata metadata = newTableMetadata(schema, partitionSpec, sortOrder, location.orElse(null), properties); TableOperations ops = tableOperationsProvider.createTableOperations( this, session, schemaTableName.getSchemaName(), schemaTableName.getTableName(), owner, - Optional.of(location)); + location); return createTableTransaction(schemaTableName.toString(), ops, metadata); } + protected Transaction newCreateOrReplaceTableTransaction( + ConnectorSession session, + SchemaTableName schemaTableName, + Schema schema, + PartitionSpec partitionSpec, + SortOrder sortOrder, + String location, + Map properties, + Optional owner) + { + BaseTable table; + Optional metadata = Optional.empty(); + try { + table = loadTable(session, new SchemaTableName(schemaTableName.getSchemaName(), schemaTableName.getTableName())); + metadata = Optional.of(table.operations().current()); + } + catch (TableNotFoundException ignore) { + // ignored + } + IcebergTableOperations operations = tableOperationsProvider.createTableOperations( + this, + session, + schemaTableName.getSchemaName(), + schemaTableName.getTableName(), + owner, + Optional.of(location)); + TableMetadata newMetaData; + if (metadata.isPresent()) { + operations.initializeFromMetadata(metadata.get()); + newMetaData = operations.current() + // don't inherit table properties from earlier snapshots + .replaceProperties(properties) + .buildReplacement(schema, partitionSpec, sortOrder, location, properties); + } + else { + newMetaData = newTableMetadata(schema, partitionSpec, sortOrder, location, properties); + } + return createOrReplaceTableTransaction(schemaTableName.toString(), operations, newMetaData); + } + protected String createNewTableName(String baseTableName) { String tableNameLocationComponent = escapeTableName(baseTableName); @@ -221,20 +295,72 @@ protected void deleteTableDirectory(TrinoFileSystem fileSystem, SchemaTableName } } - protected SchemaTableName createMaterializedViewStorageTable(ConnectorSession session, SchemaTableName viewName, ConnectorMaterializedViewDefinition definition) + protected Location createMaterializedViewStorage( + ConnectorSession session, + SchemaTableName viewName, + ConnectorMaterializedViewDefinition definition, + Map materializedViewProperties) + { + if (getStorageSchema(materializedViewProperties).isPresent()) { + throw new TrinoException(NOT_SUPPORTED, "Materialized view property '%s' is not supported when hiding materialized view storage tables is enabled".formatted(STORAGE_SCHEMA)); + } + SchemaTableName storageTableName = new 
SchemaTableName(viewName.getSchemaName(), tableNameWithType(viewName.getTableName(), MATERIALIZED_VIEW_STORAGE)); + String tableLocation = getTableLocation(materializedViewProperties) + .orElseGet(() -> defaultTableLocation(session, viewName)); + List columns = columnsForMaterializedView(definition, materializedViewProperties); + + Schema schema = schemaFromMetadata(columns); + PartitionSpec partitionSpec = parsePartitionFields(schema, getPartitioning(materializedViewProperties)); + SortOrder sortOrder = parseSortFields(schema, getSortOrder(materializedViewProperties)); + Map properties = createTableProperties(new ConnectorTableMetadata(storageTableName, columns, materializedViewProperties, Optional.empty()), ignore -> false); + + TableMetadata metadata = newTableMetadata(schema, partitionSpec, sortOrder, tableLocation, properties); + + String fileName = format("%05d-%s%s", 0, randomUUID(), getFileExtension(METADATA_COMPRESSION_DEFAULT)); + Location metadataFileLocation = Location.of(tableLocation).appendPath(METADATA_FOLDER_NAME).appendPath(fileName); + + TrinoFileSystem fileSystem = fileSystemFactory.create(session); + TableMetadataParser.write(metadata, new ForwardingOutputFile(fileSystem, metadataFileLocation)); + + return metadataFileLocation; + } + + protected void dropMaterializedViewStorage(ConnectorSession session, TrinoFileSystem fileSystem, String storageMetadataLocation) + throws IOException + { + TableMetadata metadata = TableMetadataParser.read(fileIoFactory.create(fileSystem, isUseFileSizeFromMetadata(session)), storageMetadataLocation); + String storageLocation = metadata.location(); + fileSystem.deleteDirectory(Location.of(storageLocation)); + } + + protected SchemaTableName createMaterializedViewStorageTable( + ConnectorSession session, + SchemaTableName viewName, + ConnectorMaterializedViewDefinition definition, + Map materializedViewProperties) { // Generate a storage table name and create a storage table. The properties in the definition are table properties for the // storage table as indicated in the materialized view definition. 
String storageTableName = "st_" + randomUUID().toString().replace("-", ""); - Map storageTableProperties = new HashMap<>(definition.getProperties()); - storageTableProperties.putIfAbsent(FILE_FORMAT_PROPERTY, DEFAULT_FILE_FORMAT_DEFAULT); - String storageSchema = getStorageSchema(definition.getProperties()).orElse(viewName.getSchemaName()); + String storageSchema = getStorageSchema(materializedViewProperties).orElse(viewName.getSchemaName()); SchemaTableName storageTable = new SchemaTableName(storageSchema, storageTableName); + List columns = columnsForMaterializedView(definition, materializedViewProperties); + + ConnectorTableMetadata tableMetadata = new ConnectorTableMetadata(storageTable, columns, materializedViewProperties, Optional.empty()); + String tableLocation = getTableLocation(tableMetadata.getProperties()) + .orElseGet(() -> defaultTableLocation(session, tableMetadata.getTable())); + Transaction transaction = IcebergUtil.newCreateTableTransaction(this, tableMetadata, session, false, tableLocation, ignore -> false); + AppendFiles appendFiles = transaction.newAppend(); + commit(appendFiles, session); + transaction.commitTransaction(); + return storageTable; + } - Schema schemaWithTimestampTzPreserved = schemaFromMetadata(mappedCopy( - definition.getColumns(), - column -> { + private List columnsForMaterializedView(ConnectorMaterializedViewDefinition definition, Map materializedViewProperties) + { + Schema schemaWithTimestampTzPreserved = schemaFromMetadata(definition.getColumns().stream() + .map(column -> { Type type = typeManager.getType(column.getType()); if (type instanceof TimestampWithTimeZoneType timestampTzType && timestampTzType.getPrecision() <= 6) { // For now preserve timestamptz columns so that we can parse partitioning @@ -244,23 +370,23 @@ protected SchemaTableName createMaterializedViewStorageTable(ConnectorSession se type = typeForMaterializedViewStorageTable(type); } return new ColumnMetadata(column.getName(), type); - })); - PartitionSpec partitionSpec = parsePartitionFields(schemaWithTimestampTzPreserved, getPartitioning(definition.getProperties())); + }) + .collect(toImmutableList())); + PartitionSpec partitionSpec = parsePartitionFields(schemaWithTimestampTzPreserved, getPartitioning(materializedViewProperties)); Set temporalPartitioningSources = partitionSpec.fields().stream() .flatMap(partitionField -> { Types.NestedField sourceField = schemaWithTimestampTzPreserved.findField(partitionField.sourceId()); Type sourceType = toTrinoType(sourceField.type(), typeManager); ColumnTransform columnTransform = getColumnTransform(partitionField, sourceType); - if (!columnTransform.isTemporal()) { + if (!columnTransform.temporal()) { return Stream.of(); } return Stream.of(sourceField.name()); }) .collect(toImmutableSet()); - List columns = mappedCopy( - definition.getColumns(), - column -> { + return definition.getColumns().stream() + .map(column -> { Type type = typeManager.getType(column.getType()); if (type instanceof TimestampWithTimeZoneType timestampTzType && timestampTzType.getPrecision() <= 6 && temporalPartitioningSources.contains(column.getName())) { // Apply point-in-time semantics to maintain partitioning capabilities @@ -270,14 +396,8 @@ protected SchemaTableName createMaterializedViewStorageTable(ConnectorSession se type = typeForMaterializedViewStorageTable(type); } return new ColumnMetadata(column.getName(), type); - }); - - ConnectorTableMetadata tableMetadata = new ConnectorTableMetadata(storageTable, columns, storageTableProperties, Optional.empty()); 
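The timestamp handling in columnsForMaterializedView above comes down to one decision per column: a short-precision TIMESTAMP WITH TIME ZONE column that feeds a temporal partition transform keeps its type (point-in-time semantics preserve the partitioning), and everything else goes through the storage-table coercion. A hedged sketch of just that decision, with the connector's typeForMaterializedViewStorageTable coercion abstracted into a parameter:

    import io.trino.spi.type.TimestampWithTimeZoneType;
    import io.trino.spi.type.Type;

    import java.util.Set;
    import java.util.function.UnaryOperator;

    final class StorageColumnTypeSketch
    {
        private StorageColumnTypeSketch() {}

        static Type storageColumnType(
                String columnName,
                Type type,
                Set<String> temporalPartitioningSources,
                UnaryOperator<Type> storageCoercion)
        {
            if (type instanceof TimestampWithTimeZoneType timestampTzType
                    && timestampTzType.getPrecision() <= 6
                    && temporalPartitioningSources.contains(columnName)) {
                // Keep the original type so the declared partition transforms still parse and apply
                return type;
            }
            // Everything else is mapped to a type the storage table can hold
            return storageCoercion.apply(type);
        }
    }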
- Transaction transaction = IcebergUtil.newCreateTableTransaction(this, tableMetadata, session); - AppendFiles appendFiles = transaction.newAppend(); - commit(appendFiles, session); - transaction.commitTransaction(); - return storageTable; + }) + .collect(toImmutableList()); } /** @@ -334,38 +454,34 @@ private Type typeForMaterializedViewStorageTable(Type type) } protected ConnectorMaterializedViewDefinition getMaterializedViewDefinition( - Table icebergTable, Optional owner, String viewOriginalText, SchemaTableName storageTableName) { IcebergMaterializedViewDefinition definition = decodeMaterializedViewData(viewOriginalText); return new ConnectorMaterializedViewDefinition( - definition.getOriginalSql(), + definition.originalSql(), Optional.of(new CatalogSchemaTableName(catalogName.toString(), storageTableName)), - definition.getCatalog(), - definition.getSchema(), - toSpiMaterializedViewColumns(definition.getColumns()), - definition.getGracePeriod(), - definition.getComment(), + definition.catalog(), + definition.schema(), + toSpiMaterializedViewColumns(definition.columns()), + definition.gracePeriod(), + definition.comment(), owner, - ImmutableMap.builder() - .putAll(getIcebergTableProperties(icebergTable)) - .put(STORAGE_SCHEMA, storageTableName.getSchemaName()) - .buildOrThrow()); + Map.of("path", definition.path())); } protected List toSpiMaterializedViewColumns(List columns) { return columns.stream() - .map(column -> new ConnectorMaterializedViewDefinition.Column(column.getName(), column.getType(), column.getComment())) + .map(column -> new ConnectorMaterializedViewDefinition.Column(column.name(), column.type(), column.comment())) .collect(toImmutableList()); } protected Map createMaterializedViewProperties(ConnectorSession session, SchemaTableName storageTableName) { return ImmutableMap.builder() - .put(PRESTO_QUERY_ID_NAME, session.getQueryId()) + .put(TRINO_QUERY_ID_NAME, session.getQueryId()) .put(STORAGE_SCHEMA, storageTableName.getSchemaName()) .put(STORAGE_TABLE, storageTableName.getTableName()) .put(PRESTO_VIEW_FLAG, "true") @@ -374,12 +490,16 @@ protected Map createMaterializedViewProperties(ConnectorSession .buildOrThrow(); } - protected static class MaterializedViewMayBeBeingRemovedException - extends RuntimeException + protected Map createMaterializedViewProperties(ConnectorSession session, Location storageMetadataLocation) { - public MaterializedViewMayBeBeingRemovedException(Throwable cause) - { - super(requireNonNull(cause, "cause is null")); - } + return ImmutableMap.builder() + .put(TRINO_QUERY_ID_NAME, session.getQueryId()) + .put(METADATA_LOCATION_PROP, storageMetadataLocation.toString()) + .put(PRESTO_VIEW_FLAG, "true") + .put(TRINO_CREATED_BY, TRINO_CREATED_BY_VALUE) + .put(TABLE_COMMENT, ICEBERG_MATERIALIZED_VIEW_COMMENT) + .buildOrThrow(); } + + protected abstract void invalidateTableCache(SchemaTableName schemaTableName); } diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/TrinoCatalog.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/TrinoCatalog.java index 46282a4524aa..12d8419cbac3 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/TrinoCatalog.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/TrinoCatalog.java @@ -13,6 +13,7 @@ */ package io.trino.plugin.iceberg.catalog; +import io.trino.metastore.TableInfo; import io.trino.plugin.iceberg.ColumnIdentity; import io.trino.plugin.iceberg.UnknownTableTypeException; import 
io.trino.spi.connector.CatalogSchemaTableName; @@ -24,6 +25,8 @@ import io.trino.spi.connector.RelationCommentMetadata; import io.trino.spi.connector.SchemaTableName; import io.trino.spi.security.TrinoPrincipal; +import jakarta.annotation.Nullable; +import org.apache.iceberg.BaseTable; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; import org.apache.iceberg.SortOrder; @@ -39,6 +42,8 @@ import java.util.function.Predicate; import java.util.function.UnaryOperator; +import static com.google.common.collect.ImmutableList.toImmutableList; + /** * An interface to allow different Iceberg catalog implementations in IcebergMetadata. *

<p>
@@ -62,6 +67,11 @@ public interface TrinoCatalog void dropNamespace(ConnectorSession session, String namespace); + default Optional getNamespaceSeparator() + { + return Optional.empty(); + } + Map loadNamespaceMetadata(ConnectorSession session, String namespace); Optional getNamespacePrincipal(ConnectorSession session, String namespace); @@ -72,7 +82,17 @@ public interface TrinoCatalog void renameNamespace(ConnectorSession session, String source, String target); - List listTables(ConnectorSession session, Optional namespace); + List listTables(ConnectorSession session, Optional namespace); + + List listIcebergTables(ConnectorSession session, Optional namespace); + + default List listViews(ConnectorSession session, Optional namespace) + { + return listTables(session, namespace).stream() + .filter(info -> info.extendedRelationType() == TableInfo.ExtendedRelationType.TRINO_VIEW) + .map(TableInfo::tableName) + .collect(toImmutableList()); + } Optional> streamRelationColumns( ConnectorSession session, @@ -86,7 +106,21 @@ Optional> streamRelationComments( UnaryOperator> relationFilter, Predicate isRedirected); + default Transaction newTransaction(Table icebergTable) + { + return icebergTable.newTransaction(); + } + Transaction newCreateTableTransaction( + ConnectorSession session, + SchemaTableName schemaTableName, + Schema schema, + PartitionSpec partitionSpec, + SortOrder sortOrder, + Optional location, + Map properties); + + Transaction newCreateOrReplaceTableTransaction( ConnectorSession session, SchemaTableName schemaTableName, Schema schema, @@ -113,7 +147,7 @@ Transaction newCreateTableTransaction( * @return Iceberg table loaded * @throws UnknownTableTypeException if table is not of Iceberg type in the metastore */ - Table loadTable(ConnectorSession session, SchemaTableName schemaTableName); + BaseTable loadTable(ConnectorSession session, SchemaTableName schemaTableName); /** * Bulk load column metadata. The returned map may contain fewer entries then asked for. 
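The interface now funnels all relation listing through TableInfo, and view listing becomes a default method that filters on the extended relation type (see the default listViews above). A small consumer-side sketch of the same idiom, bucketing a single TableInfo listing by relation type instead of issuing separate metastore calls per relation kind; TableInfo.tableName(), extendedRelationType() and the TRINO_VIEW constant appear in the patch, the rest is illustrative:

    import com.google.common.collect.ImmutableListMultimap;
    import com.google.common.collect.ListMultimap;
    import io.trino.metastore.TableInfo;
    import io.trino.spi.connector.SchemaTableName;

    import java.util.List;

    final class RelationListingSketch
    {
        private RelationListingSketch() {}

        // One pass over the unified listing, grouped by relation type; callers can then ask
        // for plain tables, views, or materialized views without extra catalog round trips.
        static ListMultimap<TableInfo.ExtendedRelationType, SchemaTableName> byRelationType(List<TableInfo> relations)
        {
            return relations.stream()
                    .collect(ImmutableListMultimap.toImmutableListMultimap(
                            TableInfo::extendedRelationType,
                            TableInfo::tableName));
        }
    }

With that grouping in hand, the default listViews() above amounts to selecting the TRINO_VIEW bucket.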
@@ -126,8 +160,7 @@ Transaction newCreateTableTransaction( void updateViewColumnComment(ConnectorSession session, SchemaTableName schemaViewName, String columnName, Optional comment); - void updateMaterializedViewColumnComment(ConnectorSession session, SchemaTableName schemaViewName, String columnName, Optional comment); - + @Nullable String defaultTableLocation(ConnectorSession session, SchemaTableName schemaTableName); void setTablePrincipal(ConnectorSession session, SchemaTableName schemaTableName, TrinoPrincipal principal); @@ -140,25 +173,28 @@ Transaction newCreateTableTransaction( void dropView(ConnectorSession session, SchemaTableName schemaViewName); - List listViews(ConnectorSession session, Optional namespace); - Map getViews(ConnectorSession session, Optional namespace); Optional getView(ConnectorSession session, SchemaTableName viewName); - List listMaterializedViews(ConnectorSession session, Optional namespace); - void createMaterializedView( ConnectorSession session, SchemaTableName viewName, ConnectorMaterializedViewDefinition definition, + Map materializedViewProperties, boolean replace, boolean ignoreExisting); + void updateMaterializedViewColumnComment(ConnectorSession session, SchemaTableName schemaViewName, String columnName, Optional comment); + void dropMaterializedView(ConnectorSession session, SchemaTableName viewName); Optional getMaterializedView(ConnectorSession session, SchemaTableName viewName); + Map getMaterializedViewProperties(ConnectorSession session, SchemaTableName viewName, ConnectorMaterializedViewDefinition definition); + + Optional getMaterializedViewStorageTable(ConnectorSession session, SchemaTableName viewName); + void renameMaterializedView(ConnectorSession session, SchemaTableName source, SchemaTableName target); void updateColumnComment(ConnectorSession session, SchemaTableName schemaTableName, ColumnIdentity columnIdentity, Optional comment); diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/file/FileMetastoreTableOperations.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/file/FileMetastoreTableOperations.java index e25fac198f77..05e4b1357b26 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/file/FileMetastoreTableOperations.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/file/FileMetastoreTableOperations.java @@ -27,11 +27,14 @@ import org.apache.iceberg.io.FileIO; import java.util.Optional; +import java.util.function.BiFunction; import static com.google.common.base.Preconditions.checkState; import static io.trino.plugin.hive.HiveErrorCode.HIVE_CONCURRENT_MODIFICATION_DETECTED; import static io.trino.plugin.hive.metastore.PrincipalPrivileges.NO_PRIVILEGES; +import static io.trino.plugin.iceberg.IcebergTableName.tableNameFrom; import static org.apache.iceberg.BaseMetastoreTableOperations.METADATA_LOCATION_PROP; +import static org.apache.iceberg.BaseMetastoreTableOperations.PREVIOUS_METADATA_LOCATION_PROP; @NotThreadSafe public class FileMetastoreTableOperations @@ -53,9 +56,24 @@ public FileMetastoreTableOperations( protected void commitToExistingTable(TableMetadata base, TableMetadata metadata) { Table currentTable = getTable(); + commitTableUpdate(currentTable, metadata, (table, newMetadataLocation) -> Table.builder(table) + .apply(builder -> updateMetastoreTable(builder, metadata, newMetadataLocation, Optional.of(currentMetadataLocation))) + .build()); + } + + @Override + protected void 
commitMaterializedViewRefresh(TableMetadata base, TableMetadata metadata) + { + Table materializedView = getTable(database, tableNameFrom(tableName)); + commitTableUpdate(materializedView, metadata, (table, newMetadataLocation) -> Table.builder(table) + .apply(builder -> builder.setParameter(METADATA_LOCATION_PROP, newMetadataLocation).setParameter(PREVIOUS_METADATA_LOCATION_PROP, currentMetadataLocation)) + .build()); + } + private void commitTableUpdate(Table table, TableMetadata metadata, BiFunction tableUpdateFunction) + { checkState(currentMetadataLocation != null, "No current metadata location for existing table"); - String metadataLocation = currentTable.getParameters().get(METADATA_LOCATION_PROP); + String metadataLocation = table.getParameters().get(METADATA_LOCATION_PROP); if (!currentMetadataLocation.equals(metadataLocation)) { throw new CommitFailedException("Metadata location [%s] is not same as table metadata location [%s] for %s", currentMetadataLocation, metadataLocation, getSchemaTableName()); @@ -63,15 +81,13 @@ protected void commitToExistingTable(TableMetadata base, TableMetadata metadata) String newMetadataLocation = writeNewMetadata(metadata, version.orElseThrow() + 1); - Table table = Table.builder(currentTable) - .apply(builder -> updateMetastoreTable(builder, metadata, newMetadataLocation, Optional.of(currentMetadataLocation))) - .build(); + Table updatedTable = tableUpdateFunction.apply(table, newMetadataLocation); // todo privileges should not be replaced for an alter PrincipalPrivileges privileges = table.getOwner().map(MetastoreUtil::buildInitialPrivilegeSet).orElse(NO_PRIVILEGES); try { - metastore.replaceTable(database, tableName, table, privileges); + metastore.replaceTable(database, table.getTableName(), updatedTable, privileges); } catch (RuntimeException e) { if (e instanceof TrinoException trinoException && diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/file/FileMetastoreTableOperationsProvider.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/file/FileMetastoreTableOperationsProvider.java index b14f952b0e94..6f99baaffc57 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/file/FileMetastoreTableOperationsProvider.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/file/FileMetastoreTableOperationsProvider.java @@ -19,22 +19,27 @@ import io.trino.plugin.iceberg.catalog.IcebergTableOperationsProvider; import io.trino.plugin.iceberg.catalog.TrinoCatalog; import io.trino.plugin.iceberg.catalog.hms.TrinoHiveCatalog; -import io.trino.plugin.iceberg.fileio.ForwardingFileIo; +import io.trino.plugin.iceberg.fileio.ForwardingFileIoFactory; import io.trino.spi.connector.ConnectorSession; import java.util.Optional; +import static io.trino.plugin.iceberg.IcebergSessionProperties.isUseFileSizeFromMetadata; import static java.util.Objects.requireNonNull; public class FileMetastoreTableOperationsProvider implements IcebergTableOperationsProvider { private final TrinoFileSystemFactory fileSystemFactory; + private final ForwardingFileIoFactory fileIoFactory; @Inject - public FileMetastoreTableOperationsProvider(TrinoFileSystemFactory fileSystemFactory) + public FileMetastoreTableOperationsProvider( + TrinoFileSystemFactory fileSystemFactory, + ForwardingFileIoFactory fileIoFactory) { this.fileSystemFactory = requireNonNull(fileSystemFactory, "fileSystemFactory is null"); + this.fileIoFactory = requireNonNull(fileIoFactory, "fileIoFactory is null"); } @Override 
@@ -47,7 +52,7 @@ public IcebergTableOperations createTableOperations( Optional location) { return new FileMetastoreTableOperations( - new ForwardingFileIo(fileSystemFactory.create(session)), + fileIoFactory.create(fileSystemFactory.create(session), isUseFileSizeFromMetadata(session)), ((TrinoHiveCatalog) catalog).getMetastore(), session, database, diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/glue/GlueIcebergTableOperations.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/glue/GlueIcebergTableOperations.java index 04cc5645f934..3e64cd588a92 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/glue/GlueIcebergTableOperations.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/glue/GlueIcebergTableOperations.java @@ -13,16 +13,6 @@ */ package io.trino.plugin.iceberg.catalog.glue; -import com.amazonaws.services.glue.AWSGlueAsync; -import com.amazonaws.services.glue.model.AlreadyExistsException; -import com.amazonaws.services.glue.model.ConcurrentModificationException; -import com.amazonaws.services.glue.model.CreateTableRequest; -import com.amazonaws.services.glue.model.EntityNotFoundException; -import com.amazonaws.services.glue.model.InvalidInputException; -import com.amazonaws.services.glue.model.ResourceNumberLimitExceededException; -import com.amazonaws.services.glue.model.Table; -import com.amazonaws.services.glue.model.TableInput; -import com.amazonaws.services.glue.model.UpdateTableRequest; import com.google.common.collect.ImmutableMap; import io.trino.plugin.hive.metastore.glue.GlueMetastoreStats; import io.trino.plugin.iceberg.UnknownTableTypeException; @@ -37,18 +27,33 @@ import org.apache.iceberg.exceptions.CommitFailedException; import org.apache.iceberg.exceptions.CommitStateUnknownException; import org.apache.iceberg.io.FileIO; +import software.amazon.awssdk.services.glue.GlueClient; +import software.amazon.awssdk.services.glue.model.AlreadyExistsException; +import software.amazon.awssdk.services.glue.model.ConcurrentModificationException; +import software.amazon.awssdk.services.glue.model.EntityNotFoundException; +import software.amazon.awssdk.services.glue.model.GlueException; +import software.amazon.awssdk.services.glue.model.InvalidInputException; +import software.amazon.awssdk.services.glue.model.ResourceNumberLimitExceededException; +import software.amazon.awssdk.services.glue.model.StorageDescriptor; +import software.amazon.awssdk.services.glue.model.Table; +import software.amazon.awssdk.services.glue.model.TableInput; +import java.util.HashMap; import java.util.Map; import java.util.Optional; +import java.util.function.BiFunction; import static com.google.common.base.Verify.verify; -import static io.trino.plugin.hive.ViewReaderUtil.isHiveOrPrestoView; -import static io.trino.plugin.hive.ViewReaderUtil.isPrestoView; -import static io.trino.plugin.hive.metastore.glue.converter.GlueToTrinoConverter.getTableParameters; -import static io.trino.plugin.hive.metastore.glue.converter.GlueToTrinoConverter.getTableType; import static io.trino.plugin.hive.util.HiveUtil.isIcebergTable; +import static io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_COMMIT_ERROR; import static io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_INVALID_METADATA; +import static io.trino.plugin.iceberg.IcebergTableName.isMaterializedViewStorage; +import static io.trino.plugin.iceberg.IcebergTableName.tableNameFrom; +import static io.trino.plugin.iceberg.IcebergUtil.isTrinoMaterializedView; 
+import static io.trino.plugin.iceberg.IcebergUtil.isTrinoView; +import static io.trino.plugin.iceberg.catalog.glue.GlueIcebergUtil.getMaterializedViewTableInput; import static io.trino.plugin.iceberg.catalog.glue.GlueIcebergUtil.getTableInput; +import static io.trino.plugin.iceberg.catalog.glue.GlueIcebergUtil.getTableType; import static java.lang.String.format; import static java.util.Objects.requireNonNull; import static org.apache.iceberg.BaseMetastoreTableOperations.METADATA_LOCATION_PROP; @@ -59,7 +64,7 @@ public class GlueIcebergTableOperations { private final TypeManager typeManager; private final boolean cacheTableMetadata; - private final AWSGlueAsync glueClient; + private final GlueClient glueClient; private final GlueMetastoreStats stats; private final GetGlueTable getGlueTable; @@ -69,7 +74,7 @@ public class GlueIcebergTableOperations protected GlueIcebergTableOperations( TypeManager typeManager, boolean cacheTableMetadata, - AWSGlueAsync glueClient, + GlueClient glueClient, GlueMetastoreStats stats, GetGlueTable getGlueTable, FileIO fileIo, @@ -90,15 +95,25 @@ protected GlueIcebergTableOperations( @Override protected String getRefreshedLocation(boolean invalidateCaches) { - Table table = getTable(invalidateCaches); - glueVersionId = table.getVersionId(); + boolean isMaterializedViewStorageTable = isMaterializedViewStorage(tableName); - Map parameters = getTableParameters(table); - if (isPrestoView(parameters) && isHiveOrPrestoView(getTableType(table))) { - // this is a Presto Hive view, hence not a table + Table table; + if (isMaterializedViewStorageTable) { + table = getTable(database, tableNameFrom(tableName), invalidateCaches); + } + else { + table = getTable(database, tableName, invalidateCaches); + } + glueVersionId = table.versionId(); + + String tableType = getTableType(table); + Map parameters = table.parameters(); + if (!isMaterializedViewStorageTable && (isTrinoView(tableType, parameters) || isTrinoMaterializedView(tableType, parameters))) { + // this is a Hive view or Trino/Presto view, or Trino materialized view, hence not a table + // TODO table operations should not be constructed for views (remove exception-driven code path) throw new TableNotFoundException(getSchemaTableName()); } - if (!isIcebergTable(parameters)) { + if (!isMaterializedViewStorageTable && !isIcebergTable(parameters)) { throw new UnknownTableTypeException(getSchemaTableName()); } @@ -114,21 +129,22 @@ protected void commitNewTable(TableMetadata metadata) { verify(version.isEmpty(), "commitNewTable called on a table which already exists"); String newMetadataLocation = writeNewMetadata(metadata, 0); - TableInput tableInput = getTableInput(typeManager, tableName, owner, metadata, newMetadataLocation, ImmutableMap.of(), cacheTableMetadata); + TableInput tableInput = getTableInput(typeManager, tableName, owner, metadata, metadata.location(), newMetadataLocation, ImmutableMap.of(), cacheTableMetadata); - CreateTableRequest createTableRequest = new CreateTableRequest() - .withDatabaseName(database) - .withTableInput(tableInput); try { - stats.getCreateTable().call(() -> glueClient.createTable(createTableRequest)); + stats.getCreateTable().call(() -> glueClient.createTable(x -> x + .databaseName(database) + .tableInput(tableInput))); } - catch (AlreadyExistsException - | EntityNotFoundException - | InvalidInputException - | ResourceNumberLimitExceededException e) { - // clean up metadata files corresponding to the current transaction - fileIo.deleteFile(newMetadataLocation); - throw e; + catch 
(GlueException e) { + if (e instanceof AlreadyExistsException || + e instanceof EntityNotFoundException || + e instanceof InvalidInputException || + e instanceof ResourceNumberLimitExceededException) { + // clean up metadata files corresponding to the current transaction + fileIo.deleteFile(newMetadataLocation); + } + throw new TrinoException(ICEBERG_COMMIT_ERROR, "Cannot commit table creation", e); } shouldRefresh = true; } @@ -136,22 +152,50 @@ protected void commitNewTable(TableMetadata metadata) @Override protected void commitToExistingTable(TableMetadata base, TableMetadata metadata) { - String newMetadataLocation = writeNewMetadata(metadata, version.orElseThrow() + 1); - TableInput tableInput = getTableInput( - typeManager, - tableName, - owner, + commitTableUpdate( + getTable(database, tableName, false), metadata, - newMetadataLocation, - ImmutableMap.of(PREVIOUS_METADATA_LOCATION_PROP, currentMetadataLocation), - cacheTableMetadata); - - UpdateTableRequest updateTableRequest = new UpdateTableRequest() - .withDatabaseName(database) - .withTableInput(tableInput) - .withVersionId(glueVersionId); + (table, newMetadataLocation) -> + getTableInput( + typeManager, + tableName, + owner, + metadata, + Optional.ofNullable(table.storageDescriptor()).map(StorageDescriptor::location).orElse(null), + newMetadataLocation, + ImmutableMap.of(PREVIOUS_METADATA_LOCATION_PROP, currentMetadataLocation), + cacheTableMetadata)); + } + + @Override + protected void commitMaterializedViewRefresh(TableMetadata base, TableMetadata metadata) + { + commitTableUpdate( + getTable(database, tableNameFrom(tableName), false), + metadata, + (table, newMetadataLocation) -> { + Map parameters = new HashMap<>(table.parameters()); + parameters.put(METADATA_LOCATION_PROP, newMetadataLocation); + parameters.put(PREVIOUS_METADATA_LOCATION_PROP, currentMetadataLocation); + + return getMaterializedViewTableInput( + table.name(), + table.viewOriginalText(), + table.owner(), + parameters); + }); + } + + private void commitTableUpdate(Table table, TableMetadata metadata, BiFunction tableUpdateFunction) + { + String newMetadataLocation = writeNewMetadata(metadata, version.orElseThrow() + 1); + TableInput tableInput = tableUpdateFunction.apply(table, newMetadataLocation); + try { - stats.getUpdateTable().call(() -> glueClient.updateTable(updateTableRequest)); + stats.getUpdateTable().call(() -> glueClient.updateTable(x -> x + .databaseName(database) + .tableInput(tableInput) + .versionId(glueVersionId))); } catch (ConcurrentModificationException e) { // CommitFailedException is handled as a special case in the Iceberg library. 
This commit will automatically retry @@ -159,7 +203,7 @@ protected void commitToExistingTable(TableMetadata base, TableMetadata metadata) } catch (EntityNotFoundException | InvalidInputException | ResourceNumberLimitExceededException e) { // Signal a non-retriable commit failure and eventually clean up metadata files corresponding to the current transaction - throw e; + throw new TrinoException(ICEBERG_COMMIT_ERROR, "Cannot commit table update", e); } catch (RuntimeException e) { // Cannot determine whether the `updateTable` operation was successful, @@ -169,7 +213,7 @@ protected void commitToExistingTable(TableMetadata base, TableMetadata metadata) shouldRefresh = true; } - private Table getTable(boolean invalidateCaches) + private Table getTable(String database, String tableName, boolean invalidateCaches) { return getGlueTable.get(new SchemaTableName(database, tableName), invalidateCaches); } diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/glue/GlueIcebergTableOperationsProvider.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/glue/GlueIcebergTableOperationsProvider.java index 4b54259edf7a..095597884d7b 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/glue/GlueIcebergTableOperationsProvider.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/glue/GlueIcebergTableOperationsProvider.java @@ -13,41 +13,45 @@ */ package io.trino.plugin.iceberg.catalog.glue; -import com.amazonaws.services.glue.AWSGlueAsync; import com.google.inject.Inject; import io.trino.filesystem.TrinoFileSystemFactory; import io.trino.plugin.hive.metastore.glue.GlueMetastoreStats; import io.trino.plugin.iceberg.catalog.IcebergTableOperations; import io.trino.plugin.iceberg.catalog.IcebergTableOperationsProvider; import io.trino.plugin.iceberg.catalog.TrinoCatalog; -import io.trino.plugin.iceberg.fileio.ForwardingFileIo; +import io.trino.plugin.iceberg.fileio.ForwardingFileIoFactory; import io.trino.spi.connector.ConnectorSession; import io.trino.spi.type.TypeManager; +import software.amazon.awssdk.services.glue.GlueClient; import java.util.Optional; +import static io.trino.plugin.iceberg.IcebergSessionProperties.isUseFileSizeFromMetadata; import static java.util.Objects.requireNonNull; public class GlueIcebergTableOperationsProvider implements IcebergTableOperationsProvider { + private final TrinoFileSystemFactory fileSystemFactory; + private final ForwardingFileIoFactory fileIoFactory; private final TypeManager typeManager; private final boolean cacheTableMetadata; - private final TrinoFileSystemFactory fileSystemFactory; - private final AWSGlueAsync glueClient; + private final GlueClient glueClient; private final GlueMetastoreStats stats; @Inject public GlueIcebergTableOperationsProvider( + TrinoFileSystemFactory fileSystemFactory, + ForwardingFileIoFactory fileIoFactory, TypeManager typeManager, IcebergGlueCatalogConfig catalogConfig, - TrinoFileSystemFactory fileSystemFactory, GlueMetastoreStats stats, - AWSGlueAsync glueClient) + GlueClient glueClient) { + this.fileSystemFactory = requireNonNull(fileSystemFactory, "fileSystemFactory is null"); + this.fileIoFactory = requireNonNull(fileIoFactory, "fileIoFactory is null"); this.typeManager = requireNonNull(typeManager, "typeManager is null"); this.cacheTableMetadata = catalogConfig.isCacheTableMetadata(); - this.fileSystemFactory = requireNonNull(fileSystemFactory, "fileSystemFactory is null"); this.stats = requireNonNull(stats, "stats is null"); 
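The Glue operations above also switch to the AWS SDK v2 calling convention: requests are assembled by consumer-style builder lambdas, and Glue service exceptions are translated rather than rethrown raw. A minimal sketch of that shape, assuming only the SDK v2 GlueClient and the Iceberg CommitFailedException already used in this patch; the class name and message text are illustrative:

    import org.apache.iceberg.exceptions.CommitFailedException;
    import software.amazon.awssdk.services.glue.GlueClient;
    import software.amazon.awssdk.services.glue.model.ConcurrentModificationException;
    import software.amazon.awssdk.services.glue.model.TableInput;

    final class GlueCommitSketch
    {
        private GlueCommitSketch() {}

        static void updateGlueTable(GlueClient glueClient, String databaseName, TableInput tableInput, String expectedVersionId)
        {
            try {
                // SDK v2: no request POJO with withX() setters; the builder is populated in a lambda
                glueClient.updateTable(request -> request
                        .databaseName(databaseName)
                        .tableInput(tableInput)
                        .versionId(expectedVersionId));
            }
            catch (ConcurrentModificationException e) {
                // Another writer advanced the table version; CommitFailedException lets Iceberg retry the commit
                throw new CommitFailedException("Glue table %s.%s changed while committing", databaseName, tableInput.name());
            }
        }
    }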
this.glueClient = requireNonNull(glueClient, "glueClient is null"); } @@ -69,7 +73,7 @@ public IcebergTableOperations createTableOperations( // Share Glue Table cache between Catalog and TableOperations so that, when doing metadata queries (e.g. information_schema.columns) // the GetTableRequest is issued once per table. ((TrinoGlueCatalog) catalog)::getTable, - new ForwardingFileIo(fileSystemFactory.create(session)), + fileIoFactory.create(fileSystemFactory.create(session), isUseFileSizeFromMetadata(session)), session, database, table, diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/glue/GlueIcebergUtil.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/glue/GlueIcebergUtil.java index b8988d3112d2..fd495979f208 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/glue/GlueIcebergUtil.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/glue/GlueIcebergUtil.java @@ -13,9 +13,6 @@ */ package io.trino.plugin.iceberg.catalog.glue; -import com.amazonaws.services.glue.model.Column; -import com.amazonaws.services.glue.model.StorageDescriptor; -import com.amazonaws.services.glue.model.TableInput; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import io.trino.plugin.iceberg.TypeConverter; @@ -24,6 +21,10 @@ import org.apache.iceberg.TableMetadata; import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types; +import org.gaul.modernizer_maven_annotations.SuppressModernizer; +import software.amazon.awssdk.services.glue.model.Column; +import software.amazon.awssdk.services.glue.model.StorageDescriptor; +import software.amazon.awssdk.services.glue.model.TableInput; import java.util.HashMap; import java.util.List; @@ -34,14 +35,16 @@ import static com.google.common.base.MoreObjects.firstNonNull; import static com.google.common.collect.ImmutableList.builderWithExpectedSize; +import static io.trino.metastore.TableInfo.ICEBERG_MATERIALIZED_VIEW_COMMENT; import static io.trino.plugin.hive.HiveMetadata.PRESTO_VIEW_EXPANDED_TEXT_MARKER; import static io.trino.plugin.hive.HiveMetadata.TABLE_COMMENT; import static io.trino.plugin.hive.TableType.EXTERNAL_TABLE; import static io.trino.plugin.hive.TableType.VIRTUAL_VIEW; -import static io.trino.plugin.hive.ViewReaderUtil.ICEBERG_MATERIALIZED_VIEW_COMMENT; import static io.trino.plugin.iceberg.IcebergUtil.COLUMN_TRINO_NOT_NULL_PROPERTY; import static io.trino.plugin.iceberg.IcebergUtil.COLUMN_TRINO_TYPE_ID_PROPERTY; +import static io.trino.plugin.iceberg.IcebergUtil.TRINO_TABLE_COMMENT_CACHE_PREVENTED; import static io.trino.plugin.iceberg.IcebergUtil.TRINO_TABLE_METADATA_INFO_VALID_FOR; +import static java.lang.String.format; import static java.util.Locale.ENGLISH; import static org.apache.iceberg.BaseMetastoreTableOperations.ICEBERG_TABLE_TYPE_VALUE; import static org.apache.iceberg.BaseMetastoreTableOperations.METADATA_LOCATION_PROP; @@ -67,6 +70,7 @@ public static TableInput getTableInput( String tableName, Optional owner, TableMetadata metadata, + @Nullable String tableLocation, String newMetadataLocation, Map parameters, boolean cacheTableMetadata) @@ -76,38 +80,44 @@ public static TableInput getTableInput( parameters.put(METADATA_LOCATION_PROP, newMetadataLocation); parameters.remove(TRINO_TABLE_METADATA_INFO_VALID_FOR); // no longer valid - TableInput tableInput = new TableInput() - .withName(tableName) - .withOwner(owner.orElse(null)) + StorageDescriptor.Builder storageDescriptor = 
StorageDescriptor.builder() + .location(tableLocation); + + TableInput.Builder tableInput = TableInput.builder() + .name(tableName) + .owner(owner.orElse(null)) // Iceberg does not distinguish managed and external tables, all tables are treated the same and marked as EXTERNAL - .withTableType(EXTERNAL_TABLE.name()); + .tableType(EXTERNAL_TABLE.name()) + .storageDescriptor(storageDescriptor.build()); if (cacheTableMetadata) { // Store table metadata sufficient to answer information_schema.columns and system.metadata.table_comments queries, which are often queried in bulk by e.g. BI tools - String comment = metadata.properties().get(TABLE_COMMENT); Optional> glueColumns = glueColumns(typeManager, metadata); - boolean canPersistComment = (comment == null || comment.length() <= GLUE_TABLE_PARAMETER_LENGTH_LIMIT); - boolean canPersistColumnInfo = glueColumns.isPresent(); - boolean canPersistMetadata = canPersistComment && canPersistColumnInfo; - - if (canPersistMetadata) { - tableInput.withStorageDescriptor(new StorageDescriptor() - .withColumns(glueColumns.get())); + glueColumns.ifPresent(columns -> tableInput.storageDescriptor( + storageDescriptor.columns(columns).build())); - if (comment != null) { + String comment = metadata.properties().get(TABLE_COMMENT); + if (comment != null) { + if (comment.length() <= GLUE_TABLE_PARAMETER_LENGTH_LIMIT) { parameters.put(TABLE_COMMENT, comment); + parameters.remove(TRINO_TABLE_COMMENT_CACHE_PREVENTED); } else { parameters.remove(TABLE_COMMENT); + parameters.put(TRINO_TABLE_COMMENT_CACHE_PREVENTED, "true"); } - parameters.put(TRINO_TABLE_METADATA_INFO_VALID_FOR, newMetadataLocation); } + else { + parameters.remove(TABLE_COMMENT); + parameters.remove(TRINO_TABLE_COMMENT_CACHE_PREVENTED); + } + parameters.put(TRINO_TABLE_METADATA_INFO_VALID_FOR, newMetadataLocation); } - tableInput.withParameters(parameters); + tableInput.parameters(parameters); - return tableInput; + return tableInput.build(); } private static Optional> glueColumns(TypeManager typeManager, TableMetadata metadata) @@ -124,10 +134,10 @@ private static Optional> glueColumns(TypeManager typeManager, Table return Optional.empty(); } String trinoTypeId = TypeConverter.toTrinoType(icebergColumn.type(), typeManager).getTypeId().getId(); - Column column = new Column() - .withName(icebergColumn.name()) - .withType(glueTypeString) - .withComment(icebergColumn.doc()); + Column.Builder column = Column.builder() + .name(icebergColumn.name()) + .type(glueTypeString) + .comment(icebergColumn.doc()); ImmutableMap.Builder parameters = ImmutableMap.builder(); if (icebergColumn.isRequired()) { @@ -140,8 +150,8 @@ private static Optional> glueColumns(TypeManager typeManager, Table // Store type parameter for some (first) column so that we can later detect whether column parameters weren't erased by something. 
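The cacheTableMetadata branch above no longer treats the cached comment and columns as all-or-nothing: columns are stored whenever they can be expressed, and the comment is stored only if it fits Glue's parameter length limit, otherwise a marker records that caching was deliberately skipped. A sketch of just the comment guard, with placeholder parameter keys and a placeholder limit (the real constants are TABLE_COMMENT, TRINO_TABLE_COMMENT_CACHE_PREVENTED, and GLUE_TABLE_PARAMETER_LENGTH_LIMIT, whose value is not shown in this patch):

    import java.util.Map;

    final class CommentCacheGuardSketch
    {
        // Placeholder limit; Glue rejects parameter values beyond a fixed length
        private static final int PARAMETER_LENGTH_LIMIT = 1000;

        private CommentCacheGuardSketch() {}

        static void cacheComment(Map<String, String> parameters, String comment)
        {
            if (comment == null) {
                parameters.remove("comment");
                parameters.remove("comment_cache_prevented");
            }
            else if (comment.length() <= PARAMETER_LENGTH_LIMIT) {
                parameters.put("comment", comment);
                parameters.remove("comment_cache_prevented");
            }
            else {
                parameters.remove("comment");
                parameters.put("comment_cache_prevented", "true");
            }
        }
    }

Readers of the cached parameters can then distinguish "no comment" from "comment too long to cache" via the marker.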
parameters.put(COLUMN_TRINO_TYPE_ID_PROPERTY, trinoTypeId); } - column.setParameters(parameters.buildOrThrow()); - glueColumns.add(column); + column.parameters(parameters.buildOrThrow()); + glueColumns.add(column.build()); firstColumn = false; } @@ -176,20 +186,20 @@ private static String toGlueTypeStringLossy(Type type) return "binary"; case DECIMAL: final Types.DecimalType decimalType = (Types.DecimalType) type; - return String.format("decimal(%s,%s)", decimalType.precision(), decimalType.scale()); + return format("decimal(%s,%s)", decimalType.precision(), decimalType.scale()); case STRUCT: final Types.StructType structType = type.asStructType(); final String nameToType = structType.fields().stream() - .map(f -> String.format("%s:%s", f.name(), toGlueTypeStringLossy(f.type()))) + .map(f -> format("%s:%s", f.name(), toGlueTypeStringLossy(f.type()))) .collect(Collectors.joining(",")); - return String.format("struct<%s>", nameToType); + return format("struct<%s>", nameToType); case LIST: final Types.ListType listType = type.asListType(); - return String.format("array<%s>", toGlueTypeStringLossy(listType.elementType())); + return format("array<%s>", toGlueTypeStringLossy(listType.elementType())); case MAP: final Types.MapType mapType = type.asMapType(); - return String.format( + return format( "map<%s,%s>", toGlueTypeStringLossy(mapType.keyType()), toGlueTypeStringLossy(mapType.valueType())); default: return type.typeId().name().toLowerCase(Locale.ENGLISH); @@ -198,23 +208,38 @@ private static String toGlueTypeStringLossy(Type type) public static TableInput getViewTableInput(String viewName, String viewOriginalText, @Nullable String owner, Map parameters) { - return new TableInput() - .withName(viewName) - .withTableType(VIRTUAL_VIEW.name()) - .withViewOriginalText(viewOriginalText) - .withViewExpandedText(PRESTO_VIEW_EXPANDED_TEXT_MARKER) - .withOwner(owner) - .withParameters(parameters); + return TableInput.builder() + .name(viewName) + .tableType(VIRTUAL_VIEW.name()) + .viewOriginalText(viewOriginalText) + .viewExpandedText(PRESTO_VIEW_EXPANDED_TEXT_MARKER) + .owner(owner) + .parameters(parameters) + .build(); } public static TableInput getMaterializedViewTableInput(String viewName, String viewOriginalText, String owner, Map parameters) { - return new TableInput() - .withName(viewName) - .withTableType(VIRTUAL_VIEW.name()) - .withViewOriginalText(viewOriginalText) - .withViewExpandedText(ICEBERG_MATERIALIZED_VIEW_COMMENT) - .withOwner(owner) - .withParameters(parameters); + return TableInput.builder() + .name(viewName) + .tableType(VIRTUAL_VIEW.name()) + .viewOriginalText(viewOriginalText) + .viewExpandedText(ICEBERG_MATERIALIZED_VIEW_COMMENT) + .owner(owner) + .parameters(parameters) + .build(); + } + + public static String getTableType(software.amazon.awssdk.services.glue.model.Table glueTable) + { + // Athena treats a missing table type as EXTERNAL_TABLE. + return firstNonNull(getTableTypeNullable(glueTable), "EXTERNAL_TABLE"); + } + + @Nullable + @SuppressModernizer // Usage of `Table.tableType` is not allowed. Only this method can call that. 
+ public static String getTableTypeNullable(software.amazon.awssdk.services.glue.model.Table glueTable) + { + return glueTable.tableType(); } } diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/glue/TrinoGlueCatalog.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/glue/TrinoGlueCatalog.java index a3a0a2cb96f7..ed0a7103c200 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/glue/TrinoGlueCatalog.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/glue/TrinoGlueCatalog.java @@ -13,27 +13,8 @@ */ package io.trino.plugin.iceberg.catalog.glue; -import com.amazonaws.AmazonServiceException; -import com.amazonaws.services.glue.AWSGlueAsync; -import com.amazonaws.services.glue.model.AccessDeniedException; -import com.amazonaws.services.glue.model.AlreadyExistsException; -import com.amazonaws.services.glue.model.Column; -import com.amazonaws.services.glue.model.CreateDatabaseRequest; -import com.amazonaws.services.glue.model.CreateTableRequest; -import com.amazonaws.services.glue.model.Database; -import com.amazonaws.services.glue.model.DatabaseInput; -import com.amazonaws.services.glue.model.DeleteDatabaseRequest; -import com.amazonaws.services.glue.model.DeleteTableRequest; -import com.amazonaws.services.glue.model.EntityNotFoundException; -import com.amazonaws.services.glue.model.GetDatabaseRequest; -import com.amazonaws.services.glue.model.GetDatabasesRequest; -import com.amazonaws.services.glue.model.GetDatabasesResult; -import com.amazonaws.services.glue.model.GetTableRequest; -import com.amazonaws.services.glue.model.GetTablesRequest; -import com.amazonaws.services.glue.model.GetTablesResult; -import com.amazonaws.services.glue.model.TableInput; -import com.amazonaws.services.glue.model.UpdateTableRequest; import com.google.common.cache.Cache; +import com.google.common.collect.AbstractIterator; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.common.util.concurrent.UncheckedExecutionException; @@ -41,7 +22,10 @@ import dev.failsafe.RetryPolicy; import io.airlift.log.Logger; import io.trino.cache.EvictableCacheBuilder; +import io.trino.filesystem.Location; +import io.trino.filesystem.TrinoFileSystem; import io.trino.filesystem.TrinoFileSystemFactory; +import io.trino.metastore.TableInfo; import io.trino.plugin.base.CatalogName; import io.trino.plugin.hive.SchemaAlreadyExistsException; import io.trino.plugin.hive.TrinoViewUtil; @@ -52,7 +36,9 @@ import io.trino.plugin.iceberg.IcebergMetadata; import io.trino.plugin.iceberg.UnknownTableTypeException; import io.trino.plugin.iceberg.catalog.AbstractTrinoCatalog; +import io.trino.plugin.iceberg.catalog.IcebergTableOperations; import io.trino.plugin.iceberg.catalog.IcebergTableOperationsProvider; +import io.trino.plugin.iceberg.fileio.ForwardingFileIoFactory; import io.trino.spi.TrinoException; import io.trino.spi.connector.CatalogSchemaTableName; import io.trino.spi.connector.ColumnMetadata; @@ -75,14 +61,29 @@ import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; import org.apache.iceberg.SortOrder; -import org.apache.iceberg.Table; import org.apache.iceberg.TableMetadata; import org.apache.iceberg.TableMetadataParser; import org.apache.iceberg.TableOperations; import org.apache.iceberg.Transaction; +import org.apache.iceberg.exceptions.NotFoundException; import org.apache.iceberg.io.FileIO; - +import software.amazon.awssdk.core.exception.SdkException; 
+import software.amazon.awssdk.services.glue.GlueClient; +import software.amazon.awssdk.services.glue.model.AccessDeniedException; +import software.amazon.awssdk.services.glue.model.AlreadyExistsException; +import software.amazon.awssdk.services.glue.model.Column; +import software.amazon.awssdk.services.glue.model.Database; +import software.amazon.awssdk.services.glue.model.DatabaseInput; +import software.amazon.awssdk.services.glue.model.EntityNotFoundException; +import software.amazon.awssdk.services.glue.model.GetDatabasesResponse; +import software.amazon.awssdk.services.glue.model.GetTablesResponse; +import software.amazon.awssdk.services.glue.model.StorageDescriptor; +import software.amazon.awssdk.services.glue.model.Table; +import software.amazon.awssdk.services.glue.model.TableInput; + +import java.io.IOException; import java.time.Duration; +import java.util.Collection; import java.util.HashMap; import java.util.Iterator; import java.util.List; @@ -90,7 +91,9 @@ import java.util.Objects; import java.util.Optional; import java.util.Set; -import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Executor; import java.util.function.Consumer; import java.util.function.Predicate; import java.util.function.UnaryOperator; @@ -100,10 +103,14 @@ import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Preconditions.checkState; import static com.google.common.base.Throwables.throwIfInstanceOf; +import static com.google.common.base.Throwables.throwIfUnchecked; +import static com.google.common.base.Verify.verify; import static com.google.common.collect.ImmutableList.toImmutableList; import static com.google.common.collect.ImmutableSet.toImmutableSet; +import static com.google.common.collect.Streams.stream; import static io.trino.cache.CacheUtils.uncheckedCacheGet; import static io.trino.filesystem.Locations.appendPath; +import static io.trino.plugin.base.util.ExecutorUtil.processWithAdditionalThreads; import static io.trino.plugin.hive.HiveErrorCode.HIVE_DATABASE_LOCATION_ERROR; import static io.trino.plugin.hive.HiveErrorCode.HIVE_METASTORE_ERROR; import static io.trino.plugin.hive.HiveMetadata.STORAGE_TABLE; @@ -111,34 +118,34 @@ import static io.trino.plugin.hive.TableType.VIRTUAL_VIEW; import static io.trino.plugin.hive.TrinoViewUtil.createViewProperties; import static io.trino.plugin.hive.ViewReaderUtil.encodeViewData; -import static io.trino.plugin.hive.ViewReaderUtil.isPrestoView; import static io.trino.plugin.hive.ViewReaderUtil.isTrinoMaterializedView; -import static io.trino.plugin.hive.metastore.glue.AwsSdkUtil.getPaginatedResults; -import static io.trino.plugin.hive.metastore.glue.converter.GlueToTrinoConverter.getColumnParameters; -import static io.trino.plugin.hive.metastore.glue.converter.GlueToTrinoConverter.getTableParameters; -import static io.trino.plugin.hive.metastore.glue.converter.GlueToTrinoConverter.getTableType; -import static io.trino.plugin.hive.metastore.glue.converter.GlueToTrinoConverter.getTableTypeNullable; import static io.trino.plugin.hive.util.HiveUtil.isHiveSystemSchema; import static io.trino.plugin.hive.util.HiveUtil.isIcebergTable; import static io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_BAD_DATA; import static io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_CATALOG_ERROR; import static io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_INVALID_METADATA; -import static 
io.trino.plugin.iceberg.IcebergMaterializedViewAdditionalProperties.STORAGE_SCHEMA; import static io.trino.plugin.iceberg.IcebergMaterializedViewDefinition.decodeMaterializedViewData; import static io.trino.plugin.iceberg.IcebergMaterializedViewDefinition.encodeMaterializedViewData; import static io.trino.plugin.iceberg.IcebergMaterializedViewDefinition.fromConnectorMaterializedViewDefinition; +import static io.trino.plugin.iceberg.IcebergMaterializedViewProperties.STORAGE_SCHEMA; import static io.trino.plugin.iceberg.IcebergSchemaProperties.LOCATION_PROPERTY; +import static io.trino.plugin.iceberg.IcebergSessionProperties.isUseFileSizeFromMetadata; +import static io.trino.plugin.iceberg.IcebergTableName.tableNameWithType; import static io.trino.plugin.iceberg.IcebergUtil.COLUMN_TRINO_NOT_NULL_PROPERTY; import static io.trino.plugin.iceberg.IcebergUtil.COLUMN_TRINO_TYPE_ID_PROPERTY; +import static io.trino.plugin.iceberg.IcebergUtil.TRINO_TABLE_COMMENT_CACHE_PREVENTED; import static io.trino.plugin.iceberg.IcebergUtil.TRINO_TABLE_METADATA_INFO_VALID_FOR; import static io.trino.plugin.iceberg.IcebergUtil.getColumnMetadatas; import static io.trino.plugin.iceberg.IcebergUtil.getIcebergTableWithMetadata; import static io.trino.plugin.iceberg.IcebergUtil.getTableComment; +import static io.trino.plugin.iceberg.IcebergUtil.isTrinoView; import static io.trino.plugin.iceberg.IcebergUtil.quotedTableName; -import static io.trino.plugin.iceberg.IcebergUtil.validateTableCanBeDropped; +import static io.trino.plugin.iceberg.TableType.MATERIALIZED_VIEW_STORAGE; import static io.trino.plugin.iceberg.TrinoMetricsReporter.TRINO_METRICS_REPORTER; import static io.trino.plugin.iceberg.catalog.glue.GlueIcebergUtil.getMaterializedViewTableInput; import static io.trino.plugin.iceberg.catalog.glue.GlueIcebergUtil.getTableInput; +import static io.trino.plugin.iceberg.catalog.glue.GlueIcebergUtil.getTableType; +import static io.trino.plugin.iceberg.catalog.glue.GlueIcebergUtil.getTableTypeNullable; import static io.trino.plugin.iceberg.catalog.glue.GlueIcebergUtil.getViewTableInput; import static io.trino.spi.StandardErrorCode.ALREADY_EXISTS; import static io.trino.spi.StandardErrorCode.GENERIC_INTERNAL_ERROR; @@ -148,6 +155,7 @@ import static java.lang.Boolean.parseBoolean; import static java.lang.String.format; import static java.util.Locale.ENGLISH; +import static java.util.Map.entry; import static java.util.Objects.requireNonNull; import static org.apache.iceberg.BaseMetastoreTableOperations.METADATA_LOCATION_PROP; import static org.apache.iceberg.CatalogUtil.dropTableData; @@ -157,44 +165,57 @@ public class TrinoGlueCatalog { private static final Logger LOG = Logger.get(TrinoGlueCatalog.class); - private static final int PER_QUERY_CACHE_SIZE = 1000; + private static final int PER_QUERY_CACHES_SIZE = 1000; private final String trinoVersion; - private final TypeManager typeManager; private final boolean cacheTableMetadata; - private final TrinoFileSystemFactory fileSystemFactory; private final Optional defaultSchemaLocation; - private final AWSGlueAsync glueClient; + private final GlueClient glueClient; private final GlueMetastoreStats stats; + private final boolean hideMaterializedViewStorageTable; + private final boolean isUsingSystemSecurity; + private final Executor metadataFetchingExecutor; - private final Cache glueTableCache = EvictableCacheBuilder.newBuilder() + private final Cache glueTableCache = EvictableCacheBuilder.newBuilder() // Even though this is query-scoped, this still needs to be bounded. 
information_schema queries can access large number of tables. - .maximumSize(Math.max(PER_QUERY_CACHE_SIZE, IcebergMetadata.GET_METADATA_BATCH_SIZE)) + .maximumSize(Math.max(PER_QUERY_CACHES_SIZE, IcebergMetadata.GET_METADATA_BATCH_SIZE)) + .build(); + + private final Cache tableMetadataCache = EvictableCacheBuilder.newBuilder() + .maximumSize(PER_QUERY_CACHES_SIZE) + .build(); + private final Cache viewCache = EvictableCacheBuilder.newBuilder() + .maximumSize(PER_QUERY_CACHES_SIZE) + .build(); + private final Cache materializedViewCache = EvictableCacheBuilder.newBuilder() + .maximumSize(PER_QUERY_CACHES_SIZE) .build(); - private final Map tableMetadataCache = new ConcurrentHashMap<>(); - private final Map viewCache = new ConcurrentHashMap<>(); - private final Map materializedViewCache = new ConcurrentHashMap<>(); public TrinoGlueCatalog( CatalogName catalogName, TrinoFileSystemFactory fileSystemFactory, + ForwardingFileIoFactory fileIoFactory, TypeManager typeManager, boolean cacheTableMetadata, IcebergTableOperationsProvider tableOperationsProvider, String trinoVersion, - AWSGlueAsync glueClient, + GlueClient glueClient, GlueMetastoreStats stats, + boolean isUsingSystemSecurity, Optional defaultSchemaLocation, - boolean useUniqueTableLocation) + boolean useUniqueTableLocation, + boolean hideMaterializedViewStorageTable, + Executor metadataFetchingExecutor) { - super(catalogName, typeManager, tableOperationsProvider, useUniqueTableLocation); + super(catalogName, useUniqueTableLocation, typeManager, tableOperationsProvider, fileSystemFactory, fileIoFactory); this.trinoVersion = requireNonNull(trinoVersion, "trinoVersion is null"); - this.typeManager = requireNonNull(typeManager, "typeManager is null"); this.cacheTableMetadata = cacheTableMetadata; - this.fileSystemFactory = requireNonNull(fileSystemFactory, "fileSystemFactory is null"); this.glueClient = requireNonNull(glueClient, "glueClient is null"); this.stats = requireNonNull(stats, "stats is null"); + this.isUsingSystemSecurity = isUsingSystemSecurity; this.defaultSchemaLocation = requireNonNull(defaultSchemaLocation, "defaultSchemaLocation is null"); + this.hideMaterializedViewStorageTable = hideMaterializedViewStorageTable; + this.metadataFetchingExecutor = requireNonNull(metadataFetchingExecutor, "metadataFetchingExecutor is null"); } @Override @@ -207,13 +228,13 @@ public boolean namespaceExists(ConnectorSession session, String namespace) } return stats.getGetDatabase().call(() -> { try { - glueClient.getDatabase(new GetDatabaseRequest().withName(namespace)); + glueClient.getDatabase(x -> x.name(namespace)); return true; } catch (EntityNotFoundException e) { return false; } - catch (AmazonServiceException e) { + catch (SdkException e) { throw new TrinoException(ICEBERG_CATALOG_ERROR, e); } }); @@ -223,18 +244,14 @@ public boolean namespaceExists(ConnectorSession session, String namespace) public List listNamespaces(ConnectorSession session) { try { - return getPaginatedResults( - glueClient::getDatabases, - new GetDatabasesRequest(), - GetDatabasesRequest::setNextToken, - GetDatabasesResult::getNextToken, - stats.getGetDatabases()) - .map(GetDatabasesResult::getDatabaseList) - .flatMap(List::stream) - .map(com.amazonaws.services.glue.model.Database::getName) - .collect(toImmutableList()); + return stats.getGetDatabases().call(() -> + glueClient.getDatabasesPaginator(ignore -> {}).stream() + .map(GetDatabasesResponse::databaseList) + .flatMap(List::stream) + .map(Database::name) + .collect(toImmutableList())); } - catch 
(AmazonServiceException e) { + catch (SdkException e) { throw new TrinoException(ICEBERG_CATALOG_ERROR, e); } } @@ -253,12 +270,12 @@ public void dropNamespace(ConnectorSession session, String namespace) try { glueTableCache.invalidateAll(); stats.getDeleteDatabase().call(() -> - glueClient.deleteDatabase(new DeleteDatabaseRequest().withName(namespace))); + glueClient.deleteDatabase(x -> x.name(namespace))); } catch (EntityNotFoundException e) { - throw new SchemaNotFoundException(namespace); + throw new SchemaNotFoundException(namespace, e); } - catch (AmazonServiceException e) { + catch (SdkException e) { throw new TrinoException(ICEBERG_CATALOG_ERROR, e); } } @@ -267,22 +284,21 @@ public void dropNamespace(ConnectorSession session, String namespace) public Map loadNamespaceMetadata(ConnectorSession session, String namespace) { try { - GetDatabaseRequest getDatabaseRequest = new GetDatabaseRequest().withName(namespace); Database database = stats.getGetDatabase().call(() -> - glueClient.getDatabase(getDatabaseRequest).getDatabase()); + glueClient.getDatabase(x -> x.name(namespace)).database()); ImmutableMap.Builder metadata = ImmutableMap.builder(); - if (database.getLocationUri() != null) { - metadata.put(LOCATION_PROPERTY, database.getLocationUri()); + if (database.locationUri() != null) { + metadata.put(LOCATION_PROPERTY, database.locationUri()); } - if (database.getParameters() != null) { - metadata.putAll(database.getParameters()); + if (database.parameters() != null) { + metadata.putAll(database.parameters()); } return metadata.buildOrThrow(); } catch (EntityNotFoundException e) { - throw new SchemaNotFoundException(namespace); + throw new SchemaNotFoundException(namespace, e); } - catch (AmazonServiceException e) { + catch (SdkException e) { throw new TrinoException(ICEBERG_CATALOG_ERROR, e); } } @@ -301,28 +317,28 @@ public void createNamespace(ConnectorSession session, String namespace, Map - glueClient.createDatabase(new CreateDatabaseRequest() - .withDatabaseInput(createDatabaseInput(namespace, properties)))); + glueClient.createDatabase(x -> x + .databaseInput(createDatabaseInput(namespace, properties)))); } catch (AlreadyExistsException e) { throw new SchemaAlreadyExistsException(namespace); } - catch (AmazonServiceException e) { + catch (SdkException e) { throw new TrinoException(ICEBERG_CATALOG_ERROR, e); } } - private DatabaseInput createDatabaseInput(String namespace, Map properties) + private static DatabaseInput createDatabaseInput(String namespace, Map properties) { - DatabaseInput databaseInput = new DatabaseInput().withName(namespace); + DatabaseInput.Builder databaseInput = DatabaseInput.builder().name(namespace); properties.forEach((property, value) -> { switch (property) { - case LOCATION_PROPERTY -> databaseInput.setLocationUri((String) value); + case LOCATION_PROPERTY -> databaseInput.locationUri((String) value); default -> throw new IllegalArgumentException("Unrecognized property: " + property); } }); - return databaseInput; + return databaseInput.build(); } @Override @@ -338,27 +354,45 @@ public void renameNamespace(ConnectorSession session, String source, String targ } @Override - public List listTables(ConnectorSession session, Optional namespace) + public List listTables(ConnectorSession session, Optional namespace) + { + return listTables(session, namespace, ignore -> true); + } + + @Override + public List listIcebergTables(ConnectorSession session, Optional namespace) { - ImmutableList.Builder tables = ImmutableList.builder(); + return listTables(session, 
namespace, table -> isIcebergTable(table.parameters())).stream() + .map(TableInfo::tableName) + .collect(toImmutableList()); + } + + private List listTables( + ConnectorSession session, + Optional namespace, + Predicate
<Table>
tablePredicate) + { + List>> tasks = listNamespaces(session, namespace).stream() + .map(glueNamespace -> (Callable>) () -> getGlueTablesWithExceptionHandling(glueNamespace) + .filter(tablePredicate) + .map(table -> mapToTableInfo(glueNamespace, table)) + .collect(toImmutableList())) + .collect(toImmutableList()); try { - List namespaces = listNamespaces(session, namespace); - for (String glueNamespace : namespaces) { - try { - // Add all tables from a namespace together, in case it is removed while fetching paginated results - tables.addAll(getGlueTables(glueNamespace) - .map(table -> new SchemaTableName(glueNamespace, table.getName())) - .collect(toImmutableList())); - } - catch (EntityNotFoundException | AccessDeniedException e) { - // Namespace may have been deleted or permission denied - } - } + return processWithAdditionalThreads(tasks, metadataFetchingExecutor).stream() + .flatMap(Collection::stream) + .collect(toImmutableList()); } - catch (AmazonServiceException e) { - throw new TrinoException(ICEBERG_CATALOG_ERROR, e); + catch (ExecutionException e) { + throw new RuntimeException(e.getCause()); } - return tables.build(); + } + + private TableInfo mapToTableInfo(String glueNamespace, Table table) + { + return new TableInfo( + new SchemaTableName(glueNamespace, table.name()), + TableInfo.ExtendedRelationType.fromTableTypeAndComment(getTableType(table), table.parameters().get(TABLE_COMMENT))); } @Override @@ -370,22 +404,22 @@ public Optional> streamRelationColumns( { ImmutableList.Builder unfilteredResult = ImmutableList.builder(); ImmutableList.Builder filteredResult = ImmutableList.builder(); - Map unprocessed = new HashMap<>(); + Map unprocessed = new HashMap<>(); listNamespaces(session, namespace).stream() .flatMap(glueNamespace -> getGlueTables(glueNamespace) - .map(table -> Map.entry(new SchemaTableName(glueNamespace, table.getName()), table))) + .map(table -> entry(new SchemaTableName(glueNamespace, table.name()), table))) .forEach(entry -> { SchemaTableName name = entry.getKey(); - com.amazonaws.services.glue.model.Table table = entry.getValue(); + Table table = entry.getValue(); String tableType = getTableType(table); - Map tableParameters = getTableParameters(table); + Map tableParameters = table.parameters(); if (isTrinoMaterializedView(tableType, tableParameters)) { - IcebergMaterializedViewDefinition definition = decodeMaterializedViewData(table.getViewOriginalText()); - unfilteredResult.add(RelationColumnsMetadata.forMaterializedView(name, toSpiMaterializedViewColumns(definition.getColumns()))); + IcebergMaterializedViewDefinition definition = decodeMaterializedViewData(table.viewOriginalText()); + unfilteredResult.add(RelationColumnsMetadata.forMaterializedView(name, toSpiMaterializedViewColumns(definition.columns()))); } - else if (isPrestoView(tableParameters)) { - ConnectorViewDefinition definition = ViewReaderUtil.PrestoViewReader.decodeViewData(table.getViewOriginalText()); + else if (isTrinoView(tableType, tableParameters)) { + ConnectorViewDefinition definition = ViewReaderUtil.PrestoViewReader.decodeViewData(table.viewOriginalText()); unfilteredResult.add(RelationColumnsMetadata.forView(name, definition.getColumns())); } else if (isRedirected.test(name)) { @@ -401,7 +435,7 @@ else if (!isIcebergTable(tableParameters)) { } else { unprocessed.put(name, table); - if (unprocessed.size() >= PER_QUERY_CACHE_SIZE) { + if (unprocessed.size() >= PER_QUERY_CACHES_SIZE) { getColumnsFromIcebergMetadata(session, unprocessed, relationFilter, filteredResult::add); 
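The listing refactor above fans table enumeration out as one task per namespace and tolerates namespaces that disappear or deny access mid-listing. The following is a minimal sketch of that fan-out shape using a plain `ExecutorService`; the Trino helper `processWithAdditionalThreads`, the injected `metadataFetchingExecutor`, and the Glue exception types are not reproduced here, so the names and the `IllegalStateException` stand-in are assumptions for illustration only.

```java
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.function.Function;

// Illustrative only: list each namespace on its own task and flatten the results,
// silently skipping namespaces that vanish while the listing is in flight.
final class ParallelListingSketch
{
    static List<String> listAll(List<String> namespaces, Function<String, List<String>> listOneNamespace)
            throws InterruptedException, ExecutionException
    {
        ExecutorService executor = Executors.newFixedThreadPool(4);
        try {
            List<Callable<List<String>>> tasks = namespaces.stream()
                    .map(namespace -> (Callable<List<String>>) () -> {
                        try {
                            return listOneNamespace.apply(namespace);
                        }
                        catch (IllegalStateException e) {
                            // stand-in for EntityNotFoundException / AccessDeniedException:
                            // the namespace may have been dropped or become inaccessible concurrently
                            return List.of();
                        }
                    })
                    .toList();
            List<String> result = new ArrayList<>();
            for (Future<List<String>> future : executor.invokeAll(tasks)) {
                result.addAll(future.get());
            }
            return result;
        }
        finally {
            executor.shutdown();
        }
    }
}
```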
unprocessed.clear(); } @@ -427,12 +461,12 @@ else if (!isIcebergTable(tableParameters)) { private void getColumnsFromIcebergMetadata( ConnectorSession session, - Map glueTables, // only Iceberg tables + Map glueTables, // only Iceberg tables UnaryOperator> relationFilter, Consumer resultsCollector) { for (SchemaTableName tableName : relationFilter.apply(glueTables.keySet())) { - com.amazonaws.services.glue.model.Table table = glueTables.get(tableName); + Table table = glueTables.get(tableName); // potentially racy with invalidation, but TrinoGlueCatalog is session-scoped uncheckedCacheGet(glueTableCache, tableName, () -> table); List columns; @@ -441,6 +475,7 @@ private void getColumnsFromIcebergMetadata( } catch (RuntimeException e) { // Table may be concurrently deleted + // TODO detect file not found failure when reading metadata file and silently skip table in such case. Avoid logging warnings for legitimate situations. LOG.warn(e, "Failed to get metadata for table: %s", tableName); return; } @@ -461,22 +496,22 @@ public Optional> streamRelationComments( ImmutableList.Builder unfilteredResult = ImmutableList.builder(); ImmutableList.Builder filteredResult = ImmutableList.builder(); - Map unprocessed = new HashMap<>(); + Map unprocessed = new HashMap<>(); listNamespaces(session, namespace).stream() .flatMap(glueNamespace -> getGlueTables(glueNamespace) - .map(table -> Map.entry(new SchemaTableName(glueNamespace, table.getName()), table))) + .map(table -> entry(new SchemaTableName(glueNamespace, table.name()), table))) .forEach(entry -> { SchemaTableName name = entry.getKey(); - com.amazonaws.services.glue.model.Table table = entry.getValue(); + Table table = entry.getValue(); String tableType = getTableType(table); - Map tableParameters = getTableParameters(table); + Map tableParameters = table.parameters(); if (isTrinoMaterializedView(tableType, tableParameters)) { - Optional comment = decodeMaterializedViewData(table.getViewOriginalText()).getComment(); + Optional comment = decodeMaterializedViewData(table.viewOriginalText()).comment(); unfilteredResult.add(RelationCommentMetadata.forTable(name, comment)); } - else if (isPrestoView(tableParameters)) { - Optional comment = ViewReaderUtil.PrestoViewReader.decodeViewData(table.getViewOriginalText()).getComment(); + else if (isTrinoView(tableType, tableParameters)) { + Optional comment = ViewReaderUtil.PrestoViewReader.decodeViewData(table.viewOriginalText()).getComment(); unfilteredResult.add(RelationCommentMetadata.forTable(name, comment)); } else if (isRedirected.test(name)) { @@ -489,13 +524,14 @@ else if (!isIcebergTable(tableParameters)) { else { String metadataLocation = tableParameters.get(METADATA_LOCATION_PROP); String metadataValidForMetadata = tableParameters.get(TRINO_TABLE_METADATA_INFO_VALID_FOR); - if (metadataValidForMetadata != null && metadataValidForMetadata.equals(metadataLocation)) { + boolean tableCommentWasCached = tableParameters.getOrDefault(TRINO_TABLE_COMMENT_CACHE_PREVENTED, "false").equals("false"); + if (metadataValidForMetadata != null && metadataValidForMetadata.equals(metadataLocation) && tableCommentWasCached) { Optional comment = Optional.ofNullable(tableParameters.get(TABLE_COMMENT)); unfilteredResult.add(RelationCommentMetadata.forTable(name, comment)); } else { unprocessed.put(name, table); - if (unprocessed.size() >= PER_QUERY_CACHE_SIZE) { + if (unprocessed.size() >= PER_QUERY_CACHES_SIZE) { getCommentsFromIcebergMetadata(session, unprocessed, relationFilter, filteredResult::add); 
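Most of the mechanical changes in this file follow the AWS SDK for Java v2 conventions: request objects built with `new XxxRequest().withY(...)` become consumer lambdas on the request builder, response getters lose the `get` prefix, and manual pagination is replaced by `*Paginator` iterables. A hedged sketch of that call shape is below; `GlueClient.create()` with default credentials and the `my_db` database name are placeholders, and the real catalog additionally routes calls through stats wrappers and maps `SdkException` to `TrinoException`.

```java
import software.amazon.awssdk.core.exception.SdkException;
import software.amazon.awssdk.services.glue.GlueClient;
import software.amazon.awssdk.services.glue.model.EntityNotFoundException;
import software.amazon.awssdk.services.glue.model.GetTablesResponse;
import software.amazon.awssdk.services.glue.model.Table;

import java.util.List;

final class GlueV2CallShape
{
    static void example()
    {
        try (GlueClient glue = GlueClient.create()) {
            // v1: glueClient.getDatabase(new GetDatabaseRequest().withName("my_db")).getDatabase().getLocationUri()
            String location = glue.getDatabase(x -> x.name("my_db"))
                    .database()
                    .locationUri();

            // v1 paginated getTables loop becomes a paginator stream
            List<Table> tables = glue.getTablesPaginator(x -> x.databaseName("my_db"))
                    .stream()
                    .map(GetTablesResponse::tableList)
                    .flatMap(List::stream)
                    .toList();
            System.out.println(location + ": " + tables.size() + " tables");
        }
        catch (EntityNotFoundException e) {
            // database does not exist; callers typically translate this to SchemaNotFoundException
        }
        catch (SdkException e) {
            // transport or service failure; callers typically translate this to TrinoException
            throw e;
        }
    }
}
```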
unprocessed.clear(); } @@ -521,12 +557,12 @@ else if (!isIcebergTable(tableParameters)) { private void getCommentsFromIcebergMetadata( ConnectorSession session, - Map glueTables, // only Iceberg tables + Map glueTables, // only Iceberg tables UnaryOperator> relationFilter, Consumer resultsCollector) { for (SchemaTableName tableName : relationFilter.apply(glueTables.keySet())) { - com.amazonaws.services.glue.model.Table table = glueTables.get(tableName); + Table table = glueTables.get(tableName); // potentially racy with invalidation, but TrinoGlueCatalog is session-scoped uncheckedCacheGet(glueTableCache, tableName, () -> table); Optional comment; @@ -535,6 +571,7 @@ private void getCommentsFromIcebergMetadata( } catch (RuntimeException e) { // Table may be concurrently deleted + // TODO detect file not found failure when reading metadata file and silently skip table in such case. Avoid logging warnings for legitimate situations. LOG.warn(e, "Failed to get metadata for table: %s", tableName); return; } @@ -543,24 +580,32 @@ private void getCommentsFromIcebergMetadata( } @Override - public Table loadTable(ConnectorSession session, SchemaTableName table) + public BaseTable loadTable(ConnectorSession session, SchemaTableName table) { - if (viewCache.containsKey(table) || materializedViewCache.containsKey(table)) { + if (viewCache.asMap().containsKey(table) || materializedViewCache.asMap().containsKey(table)) { throw new TableNotFoundException(table); } - TableMetadata metadata = tableMetadataCache.computeIfAbsent( - table, - ignore -> { - TableOperations operations = tableOperationsProvider.createTableOperations( - this, - session, - table.getSchemaName(), - table.getTableName(), - Optional.empty(), - Optional.empty()); - return new BaseTable(operations, quotedTableName(table), TRINO_METRICS_REPORTER).operations().current(); - }); + TableMetadata metadata; + try { + metadata = uncheckedCacheGet( + tableMetadataCache, + table, + () -> { + TableOperations operations = tableOperationsProvider.createTableOperations( + this, + session, + table.getSchemaName(), + table.getTableName(), + Optional.empty(), + Optional.empty()); + return new BaseTable(operations, quotedTableName(table), TRINO_METRICS_REPORTER).operations().current(); + }); + } + catch (UncheckedExecutionException e) { + throwIfUnchecked(e.getCause()); + throw e; + } return getIcebergTableWithMetadata( this, @@ -599,45 +644,46 @@ public Map> tryGetColumnMetadata(Connector private Optional> getCachedColumnMetadata(SchemaTableName tableName) { - if (!cacheTableMetadata || viewCache.containsKey(tableName) || materializedViewCache.containsKey(tableName)) { + if (!cacheTableMetadata || viewCache.asMap().containsKey(tableName) || materializedViewCache.asMap().containsKey(tableName)) { return Optional.empty(); } - com.amazonaws.services.glue.model.Table glueTable = getTable(tableName, false); + Table glueTable = getTable(tableName, false); return getCachedColumnMetadata(glueTable); } - private Optional> getCachedColumnMetadata(com.amazonaws.services.glue.model.Table glueTable) + private Optional> getCachedColumnMetadata(Table glueTable) { if (!cacheTableMetadata) { return Optional.empty(); } - Map tableParameters = getTableParameters(glueTable); + Map tableParameters = glueTable.parameters(); String metadataLocation = tableParameters.get(METADATA_LOCATION_PROP); String metadataValidForMetadata = tableParameters.get(TRINO_TABLE_METADATA_INFO_VALID_FOR); + Optional storageDescriptor = Optional.ofNullable(glueTable.storageDescriptor()); if 
(metadataLocation == null || !metadataLocation.equals(metadataValidForMetadata) || - glueTable.getStorageDescriptor() == null || - glueTable.getStorageDescriptor().getColumns() == null) { + storageDescriptor.isEmpty() || + !storageDescriptor.get().hasColumns()) { return Optional.empty(); } - List glueColumns = glueTable.getStorageDescriptor().getColumns(); - if (glueColumns.stream().noneMatch(column -> getColumnParameters(column).containsKey(COLUMN_TRINO_TYPE_ID_PROPERTY))) { + List glueColumns = storageDescriptor.get().columns(); + if (glueColumns.stream().noneMatch(column -> column.parameters().containsKey(COLUMN_TRINO_TYPE_ID_PROPERTY))) { // No column has type parameter, maybe the parameters were erased return Optional.empty(); } ImmutableList.Builder columns = ImmutableList.builderWithExpectedSize(glueColumns.size()); for (Column glueColumn : glueColumns) { - Map columnParameters = getColumnParameters(glueColumn); - String trinoTypeId = columnParameters.getOrDefault(COLUMN_TRINO_TYPE_ID_PROPERTY, glueColumn.getType()); + Map columnParameters = glueColumn.parameters(); + String trinoTypeId = columnParameters.getOrDefault(COLUMN_TRINO_TYPE_ID_PROPERTY, glueColumn.type()); boolean notNull = parseBoolean(columnParameters.getOrDefault(COLUMN_TRINO_NOT_NULL_PROPERTY, "false")); Type type = typeManager.getType(TypeId.of(trinoTypeId)); columns.add(ColumnMetadata.builder() - .setName(glueColumn.getName()) + .setName(glueColumn.name()) .setType(type) - .setComment(Optional.ofNullable(glueColumn.getComment())) + .setComment(Optional.ofNullable(glueColumn.comment())) .setNullable(!notNull) .build()); } @@ -647,12 +693,11 @@ private Optional> getCachedColumnMetadata(com.amazonaws.ser @Override public void dropTable(ConnectorSession session, SchemaTableName schemaTableName) { - BaseTable table = (BaseTable) loadTable(session, schemaTableName); - validateTableCanBeDropped(table); + BaseTable table = loadTable(session, schemaTableName); try { deleteTable(schemaTableName.getSchemaName(), schemaTableName.getTableName()); } - catch (AmazonServiceException e) { + catch (SdkException e) { throw new TrinoException(HIVE_METASTORE_ERROR, e); } try { @@ -664,18 +709,20 @@ public void dropTable(ConnectorSession session, SchemaTableName schemaTableName) LOG.warn(e, "Failed to delete table data referenced by metadata"); } deleteTableDirectory(fileSystemFactory.create(session), schemaTableName, table.location()); + invalidateTableCache(schemaTableName); } @Override public void dropCorruptedTable(ConnectorSession session, SchemaTableName schemaTableName) { - com.amazonaws.services.glue.model.Table table = dropTableFromMetastore(session, schemaTableName); - String metadataLocation = getTableParameters(table).get(METADATA_LOCATION_PROP); + Table table = dropTableFromMetastore(session, schemaTableName); + String metadataLocation = table.parameters().get(METADATA_LOCATION_PROP); if (metadataLocation == null) { throw new TrinoException(ICEBERG_INVALID_METADATA, format("Table %s is missing [%s] property", schemaTableName, METADATA_LOCATION_PROP)); } String tableLocation = metadataLocation.replaceFirst("/metadata/[^/]*$", ""); deleteTableDirectory(fileSystemFactory.create(session), schemaTableName, tableLocation); + invalidateTableCache(schemaTableName); } @Override @@ -685,7 +732,7 @@ public Transaction newCreateTableTransaction( Schema schema, PartitionSpec partitionSpec, SortOrder sortOrder, - String location, + Optional location, Map properties) { return newCreateTableTransaction( @@ -699,6 +746,27 @@ public 
Transaction newCreateTableTransaction( Optional.of(session.getUser())); } + @Override + public Transaction newCreateOrReplaceTableTransaction( + ConnectorSession session, + SchemaTableName schemaTableName, + Schema schema, + PartitionSpec partitionSpec, + SortOrder sortOrder, + String location, + Map properties) + { + return newCreateOrReplaceTableTransaction( + session, + schemaTableName, + schema, + partitionSpec, + sortOrder, + location, + properties, + Optional.of(session.getUser())); + } + @Override public void registerTable(ConnectorSession session, SchemaTableName schemaTableName, TableMetadata tableMetadata) throws TrinoException @@ -708,6 +776,7 @@ public void registerTable(ConnectorSession session, SchemaTableName schemaTableN schemaTableName.getTableName(), Optional.of(session.getUser()), tableMetadata, + tableMetadata.location(), tableMetadata.metadataFileLocation(), ImmutableMap.of(), cacheTableMetadata); @@ -718,20 +787,21 @@ public void registerTable(ConnectorSession session, SchemaTableName schemaTableN public void unregisterTable(ConnectorSession session, SchemaTableName schemaTableName) { dropTableFromMetastore(session, schemaTableName); + invalidateTableCache(schemaTableName); } - private com.amazonaws.services.glue.model.Table dropTableFromMetastore(ConnectorSession session, SchemaTableName schemaTableName) + private Table dropTableFromMetastore(ConnectorSession session, SchemaTableName schemaTableName) { - com.amazonaws.services.glue.model.Table table = getTableAndCacheMetadata(session, schemaTableName) + Table table = getTableAndCacheMetadata(session, schemaTableName) .orElseThrow(() -> new TableNotFoundException(schemaTableName)); - if (!isIcebergTable(getTableParameters(table))) { + if (!isIcebergTable(table.parameters())) { throw new UnknownTableTypeException(schemaTableName); } try { deleteTable(schemaTableName.getSchemaName(), schemaTableName.getTableName()); } - catch (AmazonServiceException e) { + catch (SdkException e) { throw new TrinoException(HIVE_METASTORE_ERROR, e); } return table; @@ -742,9 +812,9 @@ public void renameTable(ConnectorSession session, SchemaTableName from, SchemaTa { boolean newTableCreated = false; try { - com.amazonaws.services.glue.model.Table table = getTableAndCacheMetadata(session, from) + Table table = getTableAndCacheMetadata(session, from) .orElseThrow(() -> new TableNotFoundException(from)); - Map tableParameters = new HashMap<>(getTableParameters(table)); + Map tableParameters = new HashMap<>(table.parameters()); FileIO io = loadTable(session, from).io(); String metadataLocation = tableParameters.remove(METADATA_LOCATION_PROP); if (metadataLocation == null) { @@ -754,14 +824,16 @@ public void renameTable(ConnectorSession session, SchemaTableName from, SchemaTa TableInput tableInput = getTableInput( typeManager, to.getTableName(), - Optional.ofNullable(table.getOwner()), + Optional.ofNullable(table.owner()), metadata, + Optional.ofNullable(table.storageDescriptor()).map(StorageDescriptor::location).orElse(null), metadataLocation, tableParameters, cacheTableMetadata); createTable(to.getSchemaName(), tableInput); newTableCreated = true; deleteTable(from.getSchemaName(), from.getTableName()); + invalidateTableCache(from); } catch (RuntimeException e) { if (newTableCreated) { @@ -778,9 +850,9 @@ public void renameTable(ConnectorSession session, SchemaTableName from, SchemaTa } } - private Optional getTableAndCacheMetadata(ConnectorSession session, SchemaTableName schemaTableName) + private Optional
getTableAndCacheMetadata(ConnectorSession session, SchemaTableName schemaTableName) { - com.amazonaws.services.glue.model.Table table; + Table table; try { table = getTable(schemaTableName, false); } @@ -788,54 +860,57 @@ private Optional getTableAndCacheMetada return Optional.empty(); } - Map parameters = getTableParameters(table); - if (isIcebergTable(parameters) && !tableMetadataCache.containsKey(schemaTableName)) { - if (viewCache.containsKey(schemaTableName) || materializedViewCache.containsKey(schemaTableName)) { + String tableType = getTableType(table); + Map parameters = table.parameters(); + if (isIcebergTable(parameters) && !tableMetadataCache.asMap().containsKey(schemaTableName)) { + if (viewCache.asMap().containsKey(schemaTableName) || materializedViewCache.asMap().containsKey(schemaTableName)) { throw new TrinoException(GENERIC_INTERNAL_ERROR, "Glue table cache inconsistency. Table cannot also be a view/materialized view"); } String metadataLocation = parameters.get(METADATA_LOCATION_PROP); try { // Cache the TableMetadata while we have the Table retrieved anyway - TableOperations operations = tableOperationsProvider.createTableOperations( - this, - session, - schemaTableName.getSchemaName(), - schemaTableName.getTableName(), - Optional.empty(), - Optional.empty()); - FileIO io = operations.io(); - tableMetadataCache.put(schemaTableName, TableMetadataParser.read(io, io.newInputFile(metadataLocation))); + // Note: this is racy from cache invalidation perspective, but it should not matter here + uncheckedCacheGet(tableMetadataCache, schemaTableName, () -> TableMetadataParser.read(fileIoFactory.create(fileSystemFactory.create(session), isUseFileSizeFromMetadata(session)), metadataLocation)); } catch (RuntimeException e) { LOG.warn(e, "Failed to cache table metadata from table at %s", metadataLocation); } } - else if (isTrinoMaterializedView(getTableType(table), parameters)) { - if (viewCache.containsKey(schemaTableName) || tableMetadataCache.containsKey(schemaTableName)) { + else if (isTrinoMaterializedView(tableType, parameters)) { + if (viewCache.asMap().containsKey(schemaTableName) || tableMetadataCache.asMap().containsKey(schemaTableName)) { throw new TrinoException(GENERIC_INTERNAL_ERROR, "Glue table cache inconsistency. Materialized View cannot also be a table or view"); } try { - createMaterializedViewDefinition(session, schemaTableName, table) - .ifPresent(materializedView -> materializedViewCache.put(schemaTableName, materializedView)); + // Note: this is racy from cache invalidation perspective, but it should not matter here + uncheckedCacheGet(materializedViewCache, schemaTableName, () -> { + ConnectorMaterializedViewDefinition materializedView = createMaterializedViewDefinition(schemaTableName, table); + return new MaterializedViewData( + materializedView, + Optional.ofNullable(parameters.get(METADATA_LOCATION_PROP))); + }); } catch (RuntimeException e) { LOG.warn(e, "Failed to cache materialized view from %s", schemaTableName); } } - else if (isPrestoView(parameters) && !viewCache.containsKey(schemaTableName)) { - if (materializedViewCache.containsKey(schemaTableName) || tableMetadataCache.containsKey(schemaTableName)) { + else if (isTrinoView(tableType, parameters) && !viewCache.asMap().containsKey(schemaTableName)) { + if (materializedViewCache.asMap().containsKey(schemaTableName) || tableMetadataCache.asMap().containsKey(schemaTableName)) { throw new TrinoException(GENERIC_INTERNAL_ERROR, "Glue table cache inconsistency. 
View cannot also be a materialized view or table"); } try { - TrinoViewUtil.getView(schemaTableName, - Optional.ofNullable(table.getViewOriginalText()), - getTableType(table), + TrinoViewUtil.getView( + schemaTableName, + Optional.ofNullable(table.viewOriginalText()), + tableType, parameters, - Optional.ofNullable(table.getOwner())) - .ifPresent(viewDefinition -> viewCache.put(schemaTableName, viewDefinition)); + Optional.ofNullable(table.owner())) + .ifPresent(viewDefinition -> { + // Note: this is racy from cache invalidation perspective, but it should not matter here + uncheckedCacheGet(viewCache, schemaTableName, () -> viewDefinition); + }); } catch (RuntimeException e) { LOG.warn(e, "Failed to cache view from %s", schemaTableName); @@ -848,12 +923,9 @@ else if (isPrestoView(parameters) && !viewCache.containsKey(schemaTableName)) { @Override public String defaultTableLocation(ConnectorSession session, SchemaTableName schemaTableName) { - GetDatabaseRequest getDatabaseRequest = new GetDatabaseRequest() - .withName(schemaTableName.getSchemaName()); String databaseLocation = stats.getGetDatabase().call(() -> - glueClient.getDatabase(getDatabaseRequest) - .getDatabase() - .getLocationUri()); + glueClient.getDatabase(x -> x.name(schemaTableName.getSchemaName())) + .database().locationUri()); String tableName = createNewTableName(schemaTableName.getTableName()); @@ -891,16 +963,17 @@ public void createView(ConnectorSession session, SchemaTableName schemaViewName, Failsafe.with(RetryPolicy.builder() .withMaxRetries(3) .withDelay(Duration.ofMillis(100)) - .abortIf(throwable -> !replace || throwable instanceof ViewAlreadyExistsException) + .handleIf(throwable -> replace && !(throwable instanceof ViewAlreadyExistsException)) + //.abortOn(TrinoFileSystem::isUnrecoverableException) .build()) .run(() -> doCreateView(session, schemaViewName, viewTableInput, replace)); } private void doCreateView(ConnectorSession session, SchemaTableName schemaViewName, TableInput viewTableInput, boolean replace) { - Optional existing = getTableAndCacheMetadata(session, schemaViewName); + Optional
existing = getTableAndCacheMetadata(session, schemaViewName); if (existing.isPresent()) { - if (!replace || !isPrestoView(getTableParameters(existing.get()))) { + if (!replace || !isTrinoView(getTableType(existing.get()), existing.get().parameters())) { // TODO: ViewAlreadyExists is misleading if the name is used by a table https://github.com/trinodb/trino/issues/10037 throw new ViewAlreadyExistsException(schemaViewName); } @@ -922,13 +995,13 @@ public void renameView(ConnectorSession session, SchemaTableName source, SchemaT { boolean newTableCreated = false; try { - com.amazonaws.services.glue.model.Table existingView = getTableAndCacheMetadata(session, source) + Table existingView = getTableAndCacheMetadata(session, source) .orElseThrow(() -> new TableNotFoundException(source)); - viewCache.remove(source); + viewCache.invalidate(source); TableInput viewTableInput = getViewTableInput( target.getTableName(), - existingView.getViewOriginalText(), - existingView.getOwner(), + existingView.viewOriginalText(), + existingView.owner(), createViewProperties(session, trinoVersion, TRINO_CREATED_BY_VALUE)); createTable(target.getSchemaName(), viewTableInput); newTableCreated = true; @@ -963,62 +1036,38 @@ public void dropView(ConnectorSession session, SchemaTableName schemaViewName) } try { - viewCache.remove(schemaViewName); + viewCache.invalidate(schemaViewName); deleteTable(schemaViewName.getSchemaName(), schemaViewName.getTableName()); } - catch (AmazonServiceException e) { + catch (SdkException e) { throw new TrinoException(HIVE_METASTORE_ERROR, e); } } - @Override - public List listViews(ConnectorSession session, Optional namespace) - { - ImmutableList.Builder views = ImmutableList.builder(); - try { - List namespaces = listNamespaces(session, namespace); - for (String glueNamespace : namespaces) { - try { - views.addAll(getGlueTables(glueNamespace) - .filter(table -> isPrestoView(getTableParameters(table)) && !isTrinoMaterializedView(getTableType(table), getTableParameters(table))) // TODO isTrinoMaterializedView should not be needed, isPrestoView should not return true for materialized views - .map(table -> new SchemaTableName(glueNamespace, table.getName())) - .collect(toImmutableList())); - } - catch (EntityNotFoundException | AccessDeniedException e) { - // Namespace may have been deleted or permission denied - } - } - } - catch (AmazonServiceException e) { - throw new TrinoException(ICEBERG_CATALOG_ERROR, e); - } - return views.build(); - } - @Override public Optional getView(ConnectorSession session, SchemaTableName viewName) { - ConnectorViewDefinition cachedView = viewCache.get(viewName); + ConnectorViewDefinition cachedView = viewCache.getIfPresent(viewName); if (cachedView != null) { return Optional.of(cachedView); } - if (tableMetadataCache.containsKey(viewName) || materializedViewCache.containsKey(viewName)) { + if (tableMetadataCache.asMap().containsKey(viewName) || materializedViewCache.asMap().containsKey(viewName)) { // Entries in these caches are not views return Optional.empty(); } - Optional table = getTableAndCacheMetadata(session, viewName); + Optional
table = getTableAndCacheMetadata(session, viewName); if (table.isEmpty()) { return Optional.empty(); } - com.amazonaws.services.glue.model.Table viewDefinition = table.get(); + Table viewDefinition = table.get(); return TrinoViewUtil.getView( viewName, - Optional.ofNullable(viewDefinition.getViewOriginalText()), + Optional.ofNullable(viewDefinition.viewOriginalText()), getTableType(viewDefinition), - getTableParameters(viewDefinition), - Optional.ofNullable(viewDefinition.getOwner())); + viewDefinition.parameters(), + Optional.ofNullable(viewDefinition.owner())); } @Override @@ -1068,33 +1117,9 @@ private void updateView(ConnectorSession session, SchemaTableName viewName, Conn try { updateTable(viewName.getSchemaName(), viewTableInput); } - catch (AmazonServiceException e) { - throw new TrinoException(ICEBERG_CATALOG_ERROR, e); - } - } - - @Override - public List listMaterializedViews(ConnectorSession session, Optional namespace) - { - ImmutableList.Builder materializedViews = ImmutableList.builder(); - try { - List namespaces = listNamespaces(session, namespace); - for (String glueNamespace : namespaces) { - try { - materializedViews.addAll(getGlueTables(glueNamespace) - .filter(table -> isTrinoMaterializedView(getTableType(table), getTableParameters(table))) - .map(table -> new SchemaTableName(glueNamespace, table.getName())) - .collect(toImmutableList())); - } - catch (EntityNotFoundException | AccessDeniedException e) { - // Namespace may have been deleted or permission denied - } - } - } - catch (AmazonServiceException e) { + catch (SdkException e) { throw new TrinoException(ICEBERG_CATALOG_ERROR, e); } - return materializedViews.build(); } @Override @@ -1102,13 +1127,14 @@ public void createMaterializedView( ConnectorSession session, SchemaTableName viewName, ConnectorMaterializedViewDefinition definition, + Map materializedViewProperties, boolean replace, boolean ignoreExisting) { - Optional existing = getTableAndCacheMetadata(session, viewName); + Optional
existing = getTableAndCacheMetadata(session, viewName); if (existing.isPresent()) { - if (!isTrinoMaterializedView(getTableType(existing.get()), getTableParameters(existing.get()))) { + if (!isTrinoMaterializedView(getTableType(existing.get()), existing.get().parameters())) { throw new TrinoException(UNSUPPORTED_TABLE_TYPE, "Existing table is not a Materialized View: " + viewName); } if (!replace) { @@ -1119,13 +1145,55 @@ public void createMaterializedView( } } + if (hideMaterializedViewStorageTable) { + Location storageMetadataLocation = createMaterializedViewStorage(session, viewName, definition, materializedViewProperties); + TableInput materializedViewTableInput = getMaterializedViewTableInput( + viewName.getTableName(), + encodeMaterializedViewData(fromConnectorMaterializedViewDefinition(definition)), + isUsingSystemSecurity ? null : session.getUser(), + createMaterializedViewProperties(session, storageMetadataLocation)); + try { + if (existing.isPresent()) { + updateTable(viewName.getSchemaName(), materializedViewTableInput); + } + else { + createTable(viewName.getSchemaName(), materializedViewTableInput); + } + } + catch (RuntimeException e) { + try { + dropMaterializedViewStorage(session, fileSystemFactory.create(session), storageMetadataLocation.toString()); + } + catch (Exception suppressed) { + LOG.warn(suppressed, "Failed to clean up metadata '%s' for materialized view '%s'", storageMetadataLocation, viewName); + if (e != suppressed) { + e.addSuppressed(suppressed); + } + } + throw e; + } + + existing.ifPresent(existingView -> dropMaterializedViewStorage(session, existingView)); + } + else { + createMaterializedViewWithStorageTable(session, viewName, definition, materializedViewProperties, existing); + } + } + + private void createMaterializedViewWithStorageTable( + ConnectorSession session, + SchemaTableName viewName, + ConnectorMaterializedViewDefinition definition, + Map materializedViewProperties, + Optional
existing) + { // Create the storage table - SchemaTableName storageTable = createMaterializedViewStorageTable(session, viewName, definition); + SchemaTableName storageTable = createMaterializedViewStorageTable(session, viewName, definition, materializedViewProperties); // Create a view indicating the storage table TableInput materializedViewTableInput = getMaterializedViewTableInput( viewName.getTableName(), encodeMaterializedViewData(fromConnectorMaterializedViewDefinition(definition)), - session.getUser(), + isUsingSystemSecurity ? null : session.getUser(), createMaterializedViewProperties(session, storageTable)); if (existing.isPresent()) { @@ -1144,7 +1212,7 @@ public void createMaterializedView( } } } - dropStorageTable(session, existing.get()); + dropMaterializedViewStorage(session, existing.get()); } else { createTable(viewName.getSchemaName(), materializedViewTableInput); @@ -1167,23 +1235,23 @@ public void updateMaterializedViewColumnComment(ConnectorSession session, Schema definition.getGracePeriod(), definition.getComment(), definition.getOwner(), - definition.getProperties()); + Map.of()); //definition.getPath()); - updateMaterializedView(session, viewName, newDefinition); + updateMaterializedView(viewName, newDefinition); } - private void updateMaterializedView(ConnectorSession session, SchemaTableName viewName, ConnectorMaterializedViewDefinition newDefinition) + private void updateMaterializedView(SchemaTableName viewName, ConnectorMaterializedViewDefinition newDefinition) { + Table table = getTable(viewName, false); TableInput materializedViewTableInput = getMaterializedViewTableInput( viewName.getTableName(), encodeMaterializedViewData(fromConnectorMaterializedViewDefinition(newDefinition)), - session.getUser(), - createMaterializedViewProperties(session, newDefinition.getStorageTable().orElseThrow().getSchemaTableName())); - + table.owner(), + table.parameters()); try { updateTable(viewName.getSchemaName(), materializedViewTableInput); } - catch (AmazonServiceException e) { + catch (SdkException e) { throw new TrinoException(ICEBERG_CATALOG_ERROR, e); } } @@ -1191,29 +1259,39 @@ private void updateMaterializedView(ConnectorSession session, SchemaTableName vi @Override public void dropMaterializedView(ConnectorSession session, SchemaTableName viewName) { - com.amazonaws.services.glue.model.Table view = getTableAndCacheMetadata(session, viewName) + Table view = getTableAndCacheMetadata(session, viewName) .orElseThrow(() -> new MaterializedViewNotFoundException(viewName)); - if (!isTrinoMaterializedView(getTableType(view), getTableParameters(view))) { - throw new TrinoException(UNSUPPORTED_TABLE_TYPE, "Not a Materialized View: " + view.getDatabaseName() + "." + view.getName()); + if (!isTrinoMaterializedView(getTableType(view), view.parameters())) { + throw new TrinoException(UNSUPPORTED_TABLE_TYPE, "Not a Materialized View: " + view.databaseName() + "." 
+ view.name()); } - materializedViewCache.remove(viewName); - dropStorageTable(session, view); - deleteTable(view.getDatabaseName(), view.getName()); + materializedViewCache.invalidate(viewName); + dropMaterializedViewStorage(session, view); + deleteTable(view.databaseName(), view.name()); } - private void dropStorageTable(ConnectorSession session, com.amazonaws.services.glue.model.Table view) + private void dropMaterializedViewStorage(ConnectorSession session, Table view) { - Map parameters = getTableParameters(view); + Map parameters = view.parameters(); String storageTableName = parameters.get(STORAGE_TABLE); if (storageTableName != null) { String storageSchema = Optional.ofNullable(parameters.get(STORAGE_SCHEMA)) - .orElse(view.getDatabaseName()); + .orElse(view.databaseName()); try { dropTable(session, new SchemaTableName(storageSchema, storageTableName)); } catch (TrinoException e) { - LOG.warn(e, "Failed to drop storage table '%s.%s' for materialized view '%s'", storageSchema, storageTableName, view.getName()); + LOG.warn(e, "Failed to drop storage table '%s.%s' for materialized view '%s'", storageSchema, storageTableName, view.name()); + } + } + else { + String storageMetadataLocation = parameters.get(METADATA_LOCATION_PROP); + checkState(storageMetadataLocation != null, "Storage location missing in definition of materialized view " + view.name()); + try { + dropMaterializedViewStorage(session, fileSystemFactory.create(session), storageMetadataLocation); + } + catch (IOException e) { + LOG.warn(e, "Failed to delete storage table metadata '%s' for materialized view '%s'", storageMetadataLocation, view.name()); } } } @@ -1221,63 +1299,116 @@ private void dropStorageTable(ConnectorSession session, com.amazonaws.services.g @Override protected Optional doGetMaterializedView(ConnectorSession session, SchemaTableName viewName) { - ConnectorMaterializedViewDefinition materializedViewDefinition = materializedViewCache.get(viewName); - if (materializedViewDefinition != null) { - return Optional.of(materializedViewDefinition); + MaterializedViewData materializedViewData = materializedViewCache.getIfPresent(viewName); + if (materializedViewData != null) { + return Optional.of(materializedViewData.connectorMaterializedViewDefinition); } - if (tableMetadataCache.containsKey(viewName) || viewCache.containsKey(viewName)) { + if (tableMetadataCache.asMap().containsKey(viewName) || viewCache.asMap().containsKey(viewName)) { // Entries in these caches are not materialized views. return Optional.empty(); } - Optional maybeTable = getTableAndCacheMetadata(session, viewName); + Optional
maybeTable = getTableAndCacheMetadata(session, viewName); if (maybeTable.isEmpty()) { return Optional.empty(); } - com.amazonaws.services.glue.model.Table table = maybeTable.get(); - if (!isTrinoMaterializedView(getTableType(table), getTableParameters(table))) { + Table table = maybeTable.get(); + if (!isTrinoMaterializedView(getTableType(table), table.parameters())) { return Optional.empty(); } - return createMaterializedViewDefinition(session, viewName, table); + return Optional.of(createMaterializedViewDefinition(viewName, table)); } - private Optional createMaterializedViewDefinition( - ConnectorSession session, + private ConnectorMaterializedViewDefinition createMaterializedViewDefinition( SchemaTableName viewName, - com.amazonaws.services.glue.model.Table table) + Table table) { - Map materializedViewParameters = getTableParameters(table); + Map materializedViewParameters = table.parameters(); String storageTable = materializedViewParameters.get(STORAGE_TABLE); - checkState(storageTable != null, "Storage table missing in definition of materialized view " + viewName); - String storageSchema = Optional.ofNullable(materializedViewParameters.get(STORAGE_SCHEMA)) - .orElse(viewName.getSchemaName()); - SchemaTableName storageTableName = new SchemaTableName(storageSchema, storageTable); + String storageMetadataLocation = materializedViewParameters.get(METADATA_LOCATION_PROP); + if ((storageTable == null) == (storageMetadataLocation == null)) { + throw new TrinoException(ICEBERG_BAD_DATA, "Materialized view should have exactly one of the %s properties set: %s".formatted( + ImmutableList.of(STORAGE_TABLE, METADATA_LOCATION_PROP), + materializedViewParameters)); + } + + SchemaTableName storageTableName; + if (storageTable != null) { + String storageSchema = Optional.ofNullable(materializedViewParameters.get(STORAGE_SCHEMA)) + .orElse(viewName.getSchemaName()); + storageTableName = new SchemaTableName(storageSchema, storageTable); + + if (table.viewOriginalText() == null) { + throw new TrinoException(ICEBERG_BAD_DATA, "Materialized view did not have original text " + viewName); + } + } + else { + storageTableName = new SchemaTableName(viewName.getSchemaName(), tableNameWithType(viewName.getTableName(), MATERIALIZED_VIEW_STORAGE)); + } + + return getMaterializedViewDefinition( + Optional.ofNullable(table.owner()), + table.viewOriginalText(), + storageTableName); + } + + @Override + public Optional getMaterializedViewStorageTable(ConnectorSession session, SchemaTableName viewName) + { + String storageMetadataLocation; + MaterializedViewData materializedViewData = materializedViewCache.getIfPresent(viewName); + if (materializedViewData == null) { + Optional
maybeTable = getTableAndCacheMetadata(session, viewName); + if (maybeTable.isEmpty()) { + return Optional.empty(); + } + Table materializedView = maybeTable.get(); + verify(isTrinoMaterializedView(getTableType(materializedView), materializedView.parameters()), + "getMaterializedViewStorageTable received a table, not a materialized view"); + + // TODO getTableAndCacheMetadata saved the value in materializedViewCache, so we could just use that, except when conversion fails + storageMetadataLocation = materializedView.parameters().get(METADATA_LOCATION_PROP); + checkState(storageMetadataLocation != null, "Storage location missing in definition of materialized view " + materializedView.name()); + } + else { + storageMetadataLocation = materializedViewData.storageMetadataLocation + .orElseThrow(() -> new IllegalStateException("Storage location not defined for materialized view " + viewName)); + } + + SchemaTableName storageTableName = new SchemaTableName(viewName.getSchemaName(), tableNameWithType(viewName.getTableName(), MATERIALIZED_VIEW_STORAGE)); + IcebergTableOperations operations = tableOperationsProvider.createTableOperations( + this, + session, + storageTableName.getSchemaName(), + storageTableName.getTableName(), + Optional.empty(), + Optional.empty()); - Table icebergTable; try { - icebergTable = loadTable(session, storageTableName); + TableMetadata metadata = getMaterializedViewTableMetadata(session, storageTableName, storageMetadataLocation); + operations.initializeFromMetadata(metadata); + return Optional.of(new BaseTable(operations, quotedTableName(storageTableName), TRINO_METRICS_REPORTER)); } - catch (RuntimeException e) { - // The materialized view could be removed concurrently. This may manifest in a number of ways, e.g. - // - io.trino.spi.connector.TableNotFoundException - // - org.apache.iceberg.exceptions.NotFoundException when accessing manifest file - // - other failures when reading storage table's metadata files - // Retry, as we're catching broadly. 
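The new storage-table path memoizes parsed table metadata in a bounded, invalidatable cache and unwraps the cache wrapper exception so callers see the original failure. The diff uses Trino's `EvictableCacheBuilder` and `uncheckedCacheGet`; the sketch below shows the same shape with plain Guava as an assumption, using `String` values in place of `TableMetadata`, and is not the code above.

```java
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;

import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;

import static com.google.common.base.Throwables.throwIfUnchecked;

// Illustrative memoization of an expensive metadata load, keyed by table name.
final class MetadataCacheSketch
{
    private final Cache<String, String> metadataCache = CacheBuilder.newBuilder()
            .maximumSize(1_000)
            .build();

    String loadMetadata(String tableName, Callable<String> loader)
    {
        try {
            return metadataCache.get(tableName, loader);
        }
        catch (ExecutionException e) {
            // surface the original runtime failure instead of the cache wrapper
            throwIfUnchecked(e.getCause());
            throw new RuntimeException(e.getCause());
        }
    }

    void invalidate(String tableName)
    {
        // mirrors invalidateTableCache(schemaTableName) after drop/rename/unregister
        metadataCache.invalidate(tableName);
    }
}
```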
- throw new MaterializedViewMayBeBeingRemovedException(e); - } - - String viewOriginalText = table.getViewOriginalText(); - if (viewOriginalText == null) { - throw new TrinoException(ICEBERG_BAD_DATA, "Materialized view did not have original text " + viewName); - } - return Optional.of(getMaterializedViewDefinition( - icebergTable, - Optional.ofNullable(table.getOwner()), - viewOriginalText, - storageTableName)); + catch (UncheckedExecutionException e) { + // Removed during reading + if (e.getCause() instanceof NotFoundException) { + return Optional.empty(); + } + throw e; + } + } + + private TableMetadata getMaterializedViewTableMetadata(ConnectorSession session, SchemaTableName storageTableName, String storageMetadataLocation) + { + requireNonNull(storageTableName, "storageTableName is null"); + requireNonNull(storageMetadataLocation, "storageMetadataLocation is null"); + return uncheckedCacheGet(tableMetadataCache, storageTableName, () -> { + TrinoFileSystem fileSystem = fileSystemFactory.create(session); + return TableMetadataParser.read(fileIoFactory.create(fileSystem, isUseFileSizeFromMetadata(session)), storageMetadataLocation); + }); } @Override @@ -1285,14 +1416,14 @@ public void renameMaterializedView(ConnectorSession session, SchemaTableName sou { boolean newTableCreated = false; try { - com.amazonaws.services.glue.model.Table glueTable = getTableAndCacheMetadata(session, source) + Table glueTable = getTableAndCacheMetadata(session, source) .orElseThrow(() -> new TableNotFoundException(source)); - materializedViewCache.remove(source); - Map tableParameters = getTableParameters(glueTable); + materializedViewCache.invalidate(source); + Map tableParameters = glueTable.parameters(); if (!isTrinoMaterializedView(getTableType(glueTable), tableParameters)) { throw new TrinoException(UNSUPPORTED_TABLE_TYPE, "Not a Materialized View: " + source); } - TableInput tableInput = getMaterializedViewTableInput(target.getTableName(), glueTable.getViewOriginalText(), glueTable.getOwner(), tableParameters); + TableInput tableInput = getMaterializedViewTableInput(target.getTableName(), glueTable.viewOriginalText(), glueTable.owner(), tableParameters); createTable(target.getSchemaName(), tableInput); newTableCreated = true; deleteTable(source.getSchemaName(), source.getTableName()); @@ -1329,19 +1460,25 @@ public Optional redirectTable(ConnectorSession session, tableName.getSchemaName(), tableName.getTableName().substring(0, metadataMarkerIndex)); - Optional table = getTableAndCacheMetadata(session, new SchemaTableName(tableNameBase.getSchemaName(), tableNameBase.getTableName())); + Optional
table = getTableAndCacheMetadata(session, new SchemaTableName(tableNameBase.getSchemaName(), tableNameBase.getTableName())); if (table.isEmpty() || VIRTUAL_VIEW.name().equals(getTableTypeNullable(table.get()))) { return Optional.empty(); } - if (!isIcebergTable(getTableParameters(table.get()))) { + if (!isIcebergTable(table.get().parameters())) { // After redirecting, use the original table name, with "$partitions" and similar suffixes return Optional.of(new CatalogSchemaTableName(hiveCatalogName, tableName)); } return Optional.empty(); } - com.amazonaws.services.glue.model.Table getTable(SchemaTableName tableName, boolean invalidateCaches) + @Override + protected void invalidateTableCache(SchemaTableName schemaTableName) + { + tableMetadataCache.invalidate(schemaTableName); + } + + Table getTable(SchemaTableName tableName, boolean invalidateCaches) { if (invalidateCaches) { glueTableCache.invalidate(tableName); @@ -1350,14 +1487,18 @@ com.amazonaws.services.glue.model.Table getTable(SchemaTableName tableName, bool try { return uncheckedCacheGet(glueTableCache, tableName, () -> { try { - GetTableRequest getTableRequest = new GetTableRequest() - .withDatabaseName(tableName.getSchemaName()) - .withName(tableName.getTableName()); - return stats.getGetTable().call(() -> glueClient.getTable(getTableRequest).getTable()); + return stats.getGetTable().call(() -> + glueClient.getTable(x -> x + .databaseName(tableName.getSchemaName()) + .name(tableName.getTableName())) + .table()); } catch (EntityNotFoundException e) { throw new TableNotFoundException(tableName, e); } + catch (SdkException e) { + throw new TrinoException(ICEBERG_CATALOG_ERROR, e); + } }); } catch (UncheckedExecutionException e) { @@ -1366,42 +1507,89 @@ com.amazonaws.services.glue.model.Table getTable(SchemaTableName tableName, bool } } - private Stream getGlueTables(String glueNamespace) + private Stream
getGlueTablesWithExceptionHandling(String glueNamespace) + { + return stream(new AbstractIterator<>() + { + private Iterator
delegate; + + @Override + protected Table computeNext() + { + boolean firstCall = (delegate == null); + try { + if (delegate == null) { + delegate = getGlueTables(glueNamespace) + .iterator(); + } + + if (!delegate.hasNext()) { + return endOfData(); + } + return delegate.next(); + } + catch (EntityNotFoundException e) { + // database does not exist or deleted during iteration + return endOfData(); + } + catch (AccessDeniedException e) { + // permission denied may actually mean "does not exist" + if (!firstCall) { + LOG.warn(e, "Permission denied when getting next batch of tables from namespace %s", glueNamespace); + } + return endOfData(); + } + catch (SdkException e) { + throw new TrinoException(ICEBERG_CATALOG_ERROR, e); + } + } + }); + } + + private Stream
getGlueTables(String glueNamespace) { - return getPaginatedResults( - glueClient::getTables, - new GetTablesRequest().withDatabaseName(glueNamespace), - GetTablesRequest::setNextToken, - GetTablesResult::getNextToken, - stats.getGetTables()) - .map(GetTablesResult::getTableList) - .flatMap(List::stream); + return stats.getGetTables().call(() -> + glueClient.getTablesPaginator(x -> x.databaseName(glueNamespace)) + .stream() + .map(GetTablesResponse::tableList) + .flatMap(List::stream)); } private void createTable(String schemaName, TableInput tableInput) { glueTableCache.invalidateAll(); stats.getCreateTable().call(() -> - glueClient.createTable(new CreateTableRequest() - .withDatabaseName(schemaName) - .withTableInput(tableInput))); + glueClient.createTable(x -> x + .databaseName(schemaName) + .tableInput(tableInput))); } private void updateTable(String schemaName, TableInput tableInput) { glueTableCache.invalidateAll(); stats.getUpdateTable().call(() -> - glueClient.updateTable(new UpdateTableRequest() - .withDatabaseName(schemaName) - .withTableInput(tableInput))); + glueClient.updateTable(x -> x + .databaseName(schemaName) + .tableInput(tableInput))); } private void deleteTable(String schema, String table) { glueTableCache.invalidateAll(); stats.getDeleteTable().call(() -> - glueClient.deleteTable(new DeleteTableRequest() - .withDatabaseName(schema) - .withName(table))); + glueClient.deleteTable(x -> x + .databaseName(schema) + .name(table))); + } + + private record MaterializedViewData( + ConnectorMaterializedViewDefinition connectorMaterializedViewDefinition, + Optional storageMetadataLocation) + { + private MaterializedViewData + { + requireNonNull(connectorMaterializedViewDefinition, "connectorMaterializedViewDefinition is null"); + requireNonNull(storageMetadataLocation, "storageMetadataLocation is null"); + } } } diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/hms/AbstractMetastoreTableOperations.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/hms/AbstractMetastoreTableOperations.java index 0774d7103c2e..c1fb8e4c8ce3 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/hms/AbstractMetastoreTableOperations.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/hms/AbstractMetastoreTableOperations.java @@ -14,16 +14,15 @@ package io.trino.plugin.iceberg.catalog.hms; import io.trino.annotation.NotThreadSafe; -import io.trino.plugin.hive.TableAlreadyExistsException; import io.trino.plugin.hive.metastore.MetastoreUtil; import io.trino.plugin.hive.metastore.PrincipalPrivileges; import io.trino.plugin.hive.metastore.Table; import io.trino.plugin.hive.metastore.cache.CachingHiveMetastore; +import io.trino.plugin.iceberg.CreateTableException; import io.trino.plugin.iceberg.UnknownTableTypeException; import io.trino.plugin.iceberg.catalog.AbstractIcebergTableOperations; import io.trino.spi.TrinoException; import io.trino.spi.connector.ConnectorSession; -import io.trino.spi.connector.SchemaNotFoundException; import io.trino.spi.connector.TableNotFoundException; import org.apache.iceberg.TableMetadata; import org.apache.iceberg.io.FileIO; @@ -33,11 +32,13 @@ import static com.google.common.base.Verify.verify; import static io.trino.plugin.hive.HiveMetadata.TABLE_COMMENT; import static io.trino.plugin.hive.TableType.EXTERNAL_TABLE; -import static io.trino.plugin.hive.ViewReaderUtil.isHiveOrPrestoView; -import static io.trino.plugin.hive.ViewReaderUtil.isPrestoView; +import static 
io.trino.plugin.hive.ViewReaderUtil.isTrinoMaterializedView; import static io.trino.plugin.hive.metastore.PrincipalPrivileges.NO_PRIVILEGES; import static io.trino.plugin.hive.util.HiveUtil.isIcebergTable; import static io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_INVALID_METADATA; +import static io.trino.plugin.iceberg.IcebergTableName.isMaterializedViewStorage; +import static io.trino.plugin.iceberg.IcebergTableName.tableNameFrom; +import static io.trino.plugin.iceberg.IcebergUtil.isTrinoView; import static java.lang.String.format; import static java.util.Locale.ENGLISH; import static java.util.Objects.requireNonNull; @@ -45,6 +46,8 @@ import static org.apache.iceberg.BaseMetastoreTableOperations.METADATA_LOCATION_PROP; import static org.apache.iceberg.BaseMetastoreTableOperations.PREVIOUS_METADATA_LOCATION_PROP; import static org.apache.iceberg.BaseMetastoreTableOperations.TABLE_TYPE_PROP; +import static org.apache.iceberg.TableProperties.CURRENT_SNAPSHOT_ID; +import static org.apache.iceberg.TableProperties.CURRENT_SNAPSHOT_TIMESTAMP; @NotThreadSafe public abstract class AbstractMetastoreTableOperations @@ -66,18 +69,28 @@ protected AbstractMetastoreTableOperations( } @Override - protected final String getRefreshedLocation(boolean invalidateCaches) + protected String getRefreshedLocation(boolean invalidateCaches) { if (invalidateCaches) { metastore.invalidateTable(database, tableName); } - Table table = getTable(); - if (isPrestoView(table) && isHiveOrPrestoView(table)) { - // this is a Hive view, hence not a table + boolean isMaterializedViewStorageTable = isMaterializedViewStorage(tableName); + + Table table; + if (isMaterializedViewStorageTable) { + table = getTable(database, tableNameFrom(tableName)); + } + else { + table = getTable(); + } + + if (!isMaterializedViewStorageTable && (isTrinoView(table) || isTrinoMaterializedView(table))) { + // this is a Hive view or Trino/Presto view, or Trino materialized view, hence not a table + // TODO table operations should not be constructed for views (remove exception-driven code path) throw new TableNotFoundException(getSchemaTableName()); } - if (!isIcebergTable(table)) { + if (!isMaterializedViewStorageTable && !isIcebergTable(table)) { throw new UnknownTableTypeException(getSchemaTableName()); } @@ -112,25 +125,36 @@ protected final void commitNewTable(TableMetadata metadata) try { metastore.createTable(table, privileges); } - catch (SchemaNotFoundException - | TableAlreadyExistsException e) { - // clean up metadata files corresponding to the current transaction + catch (Exception e) { + // clean up metadata file corresponding to the current transaction fileIo.deleteFile(newMetadataLocation); - throw e; + // wrap exception in CleanableFailure to ensure that manifest list Avro files are also cleaned up + throw new CreateTableException(e, getSchemaTableName()); } } protected Table.Builder updateMetastoreTable(Table.Builder builder, TableMetadata metadata, String metadataLocation, Optional previousMetadataLocation) { - return builder + builder .setDataColumns(toHiveColumns(metadata.schema().columns())) .withStorage(storage -> storage.setLocation(metadata.location())) .setParameter(METADATA_LOCATION_PROP, metadataLocation) .setParameter(PREVIOUS_METADATA_LOCATION_PROP, previousMetadataLocation) .setParameter(TABLE_COMMENT, Optional.ofNullable(metadata.properties().get(TABLE_COMMENT))); + if (metadata.currentSnapshot() != null) { + builder + .setParameter(CURRENT_SNAPSHOT_ID, String.valueOf(metadata.currentSnapshot().snapshotId())) + 
.setParameter(CURRENT_SNAPSHOT_TIMESTAMP, String.valueOf(metadata.currentSnapshot().timestampMillis())); + } + return builder; } protected Table getTable() + { + return getTable(database, tableName); + } + + protected Table getTable(String database, String tableName) { return metastore.getTable(database, tableName) .orElseThrow(() -> new TableNotFoundException(getSchemaTableName())); diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/hms/TrinoHiveCatalog.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/hms/TrinoHiveCatalog.java index 65c89fe3f517..954977ccad10 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/hms/TrinoHiveCatalog.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/hms/TrinoHiveCatalog.java @@ -13,12 +13,17 @@ */ package io.trino.plugin.iceberg.catalog.hms; +import com.google.common.cache.Cache; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; -import com.google.common.collect.ImmutableSet; +import com.google.common.util.concurrent.UncheckedExecutionException; import io.airlift.log.Logger; +import io.trino.cache.EvictableCacheBuilder; import io.trino.filesystem.Location; +import io.trino.filesystem.TrinoFileSystem; import io.trino.filesystem.TrinoFileSystemFactory; +import io.trino.metastore.TableInfo; +import io.trino.metastore.TableInfo.ExtendedRelationType; import io.trino.plugin.base.CatalogName; import io.trino.plugin.hive.HiveSchemaProperties; import io.trino.plugin.hive.TrinoViewHiveMetastore; @@ -29,9 +34,12 @@ import io.trino.plugin.hive.metastore.PrincipalPrivileges; import io.trino.plugin.hive.metastore.cache.CachingHiveMetastore; import io.trino.plugin.hive.util.HiveUtil; +import io.trino.plugin.iceberg.IcebergTableName; import io.trino.plugin.iceberg.UnknownTableTypeException; import io.trino.plugin.iceberg.catalog.AbstractTrinoCatalog; +import io.trino.plugin.iceberg.catalog.IcebergTableOperations; import io.trino.plugin.iceberg.catalog.IcebergTableOperationsProvider; +import io.trino.plugin.iceberg.fileio.ForwardingFileIoFactory; import io.trino.spi.TrinoException; import io.trino.spi.connector.CatalogSchemaTableName; import io.trino.spi.connector.ColumnMetadata; @@ -51,48 +59,58 @@ import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; import org.apache.iceberg.SortOrder; -import org.apache.iceberg.Table; import org.apache.iceberg.TableMetadata; +import org.apache.iceberg.TableMetadataParser; import org.apache.iceberg.Transaction; +import org.apache.iceberg.exceptions.NotFoundException; import java.io.IOException; +import java.util.Collection; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Objects; import java.util.Optional; import java.util.Set; -import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Executor; import java.util.function.Predicate; import java.util.function.UnaryOperator; import java.util.stream.Collectors; import static com.google.common.base.Preconditions.checkState; +import static com.google.common.base.Throwables.throwIfUnchecked; +import static com.google.common.base.Verify.verify; import static com.google.common.collect.ImmutableList.toImmutableList; +import static io.trino.cache.CacheUtils.uncheckedCacheGet; import static io.trino.filesystem.Locations.appendPath; +import static 
io.trino.metastore.TableInfo.ICEBERG_MATERIALIZED_VIEW_COMMENT; +import static io.trino.plugin.base.util.ExecutorUtil.processWithAdditionalThreads; import static io.trino.plugin.hive.HiveErrorCode.HIVE_DATABASE_LOCATION_ERROR; import static io.trino.plugin.hive.HiveErrorCode.HIVE_INVALID_METADATA; import static io.trino.plugin.hive.HiveMetadata.STORAGE_TABLE; -import static io.trino.plugin.hive.HiveMetadata.TABLE_COMMENT; import static io.trino.plugin.hive.HiveType.HIVE_STRING; import static io.trino.plugin.hive.TableType.EXTERNAL_TABLE; import static io.trino.plugin.hive.TableType.VIRTUAL_VIEW; -import static io.trino.plugin.hive.ViewReaderUtil.ICEBERG_MATERIALIZED_VIEW_COMMENT; -import static io.trino.plugin.hive.ViewReaderUtil.encodeViewData; -import static io.trino.plugin.hive.ViewReaderUtil.isHiveOrPrestoView; import static io.trino.plugin.hive.ViewReaderUtil.isTrinoMaterializedView; import static io.trino.plugin.hive.metastore.MetastoreUtil.buildInitialPrivilegeSet; import static io.trino.plugin.hive.metastore.PrincipalPrivileges.NO_PRIVILEGES; import static io.trino.plugin.hive.metastore.StorageFormat.VIEW_STORAGE_FORMAT; import static io.trino.plugin.hive.util.HiveUtil.isHiveSystemSchema; import static io.trino.plugin.hive.util.HiveUtil.isIcebergTable; -import static io.trino.plugin.iceberg.IcebergMaterializedViewAdditionalProperties.STORAGE_SCHEMA; +import static io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_BAD_DATA; import static io.trino.plugin.iceberg.IcebergMaterializedViewDefinition.encodeMaterializedViewData; import static io.trino.plugin.iceberg.IcebergMaterializedViewDefinition.fromConnectorMaterializedViewDefinition; +import static io.trino.plugin.iceberg.IcebergMaterializedViewProperties.STORAGE_SCHEMA; import static io.trino.plugin.iceberg.IcebergSchemaProperties.LOCATION_PROPERTY; +import static io.trino.plugin.iceberg.IcebergSessionProperties.isUseFileSizeFromMetadata; import static io.trino.plugin.iceberg.IcebergUtil.getIcebergTableWithMetadata; +import static io.trino.plugin.iceberg.IcebergUtil.isSomeKindOfAView; import static io.trino.plugin.iceberg.IcebergUtil.loadIcebergTable; -import static io.trino.plugin.iceberg.IcebergUtil.validateTableCanBeDropped; +import static io.trino.plugin.iceberg.IcebergUtil.quotedTableName; +import static io.trino.plugin.iceberg.TableType.MATERIALIZED_VIEW_STORAGE; +import static io.trino.plugin.iceberg.TrinoMetricsReporter.TRINO_METRICS_REPORTER; import static io.trino.plugin.iceberg.catalog.AbstractIcebergTableOperations.ICEBERG_METASTORE_STORAGE_FORMAT; import static io.trino.plugin.iceberg.catalog.AbstractIcebergTableOperations.toHiveColumns; import static io.trino.spi.StandardErrorCode.ALREADY_EXISTS; @@ -113,35 +131,40 @@ public class TrinoHiveCatalog extends AbstractTrinoCatalog { private static final Logger log = Logger.get(TrinoHiveCatalog.class); - public static final String DEPENDS_ON_TABLES = "dependsOnTables"; - // Value should be ISO-8601 formatted time instant - public static final String TRINO_QUERY_START_TIME = "trino-query-start-time"; + private static final int PER_QUERY_CACHE_SIZE = 1000; private final CachingHiveMetastore metastore; private final TrinoViewHiveMetastore trinoViewHiveMetastore; - private final TrinoFileSystemFactory fileSystemFactory; private final boolean isUsingSystemSecurity; private final boolean deleteSchemaLocationsFallback; + private final boolean hideMaterializedViewStorageTable; + private final Executor metadataFetchingExecutor; - private final Map tableMetadataCache = new 
ConcurrentHashMap<>(); + private final Cache tableMetadataCache = EvictableCacheBuilder.newBuilder() + .maximumSize(PER_QUERY_CACHE_SIZE) + .build(); public TrinoHiveCatalog( CatalogName catalogName, CachingHiveMetastore metastore, TrinoViewHiveMetastore trinoViewHiveMetastore, TrinoFileSystemFactory fileSystemFactory, + ForwardingFileIoFactory fileIoFactory, TypeManager typeManager, IcebergTableOperationsProvider tableOperationsProvider, boolean useUniqueTableLocation, boolean isUsingSystemSecurity, - boolean deleteSchemaLocationsFallback) + boolean deleteSchemaLocationsFallback, + boolean hideMaterializedViewStorageTable, + Executor metadataFetchingExecutor) { - super(catalogName, typeManager, tableOperationsProvider, useUniqueTableLocation); + super(catalogName, useUniqueTableLocation, typeManager, tableOperationsProvider, fileSystemFactory, fileIoFactory); this.metastore = requireNonNull(metastore, "metastore is null"); this.trinoViewHiveMetastore = requireNonNull(trinoViewHiveMetastore, "trinoViewHiveMetastore is null"); - this.fileSystemFactory = requireNonNull(fileSystemFactory, "fileSystemFactory is null"); this.isUsingSystemSecurity = isUsingSystemSecurity; this.deleteSchemaLocationsFallback = deleteSchemaLocationsFallback; + this.hideMaterializedViewStorageTable = hideMaterializedViewStorageTable; + this.metadataFetchingExecutor = requireNonNull(metadataFetchingExecutor, "metadataFetchingExecutor is null"); } public CachingHiveMetastore getMetastore() @@ -266,7 +289,7 @@ public Transaction newCreateTableTransaction( Schema schema, PartitionSpec partitionSpec, SortOrder sortOrder, - String location, + Optional location, Map properties) { return newCreateTableTransaction( @@ -280,6 +303,26 @@ public Transaction newCreateTableTransaction( isUsingSystemSecurity ? Optional.empty() : Optional.of(session.getUser())); } + @Override + public Transaction newCreateOrReplaceTableTransaction( + ConnectorSession session, + SchemaTableName schemaTableName, + Schema schema, PartitionSpec partitionSpec, + SortOrder sortOrder, + String location, + Map properties) + { + return newCreateOrReplaceTableTransaction( + session, + schemaTableName, + schema, + partitionSpec, + sortOrder, + location, + properties, + isUsingSystemSecurity ? 
Optional.empty() : Optional.of(session.getUser())); + } + @Override public void registerTable(ConnectorSession session, SchemaTableName schemaTableName, TableMetadata tableMetadata) throws TrinoException @@ -297,6 +340,7 @@ public void registerTable(ConnectorSession session, SchemaTableName schemaTableN .withStorage(storage -> storage.setStorageFormat(ICEBERG_METASTORE_STORAGE_FORMAT)) // This is a must-have property for the EXTERNAL_TABLE table type .setParameter("EXTERNAL", "TRUE") + .setParameter(TRINO_QUERY_ID_NAME, session.getQueryId()) .setParameter(TABLE_TYPE_PROP, ICEBERG_TABLE_TYPE_VALUE.toUpperCase(ENGLISH)) .setParameter(METADATA_LOCATION_PROP, tableMetadata.metadataFileLocation()); @@ -308,16 +352,48 @@ public void registerTable(ConnectorSession session, SchemaTableName schemaTableN public void unregisterTable(ConnectorSession session, SchemaTableName schemaTableName) { dropTableFromMetastore(schemaTableName); + invalidateTableCache(schemaTableName); + } + + private List getTables(String schema) + { + return metastore.getAllTables(schema).stream() + .map(table -> new TableInfo(new SchemaTableName(schema, table), ExtendedRelationType.TABLE)) + .toList(); + } + + @Override + public List listTables(ConnectorSession session, Optional namespace) + { + List>> tasks = listNamespaces(session, namespace).stream() + .map(schema -> (Callable>) () -> getTables(schema)) + .collect(toImmutableList()); + try { + return processWithAdditionalThreads(tasks, metadataFetchingExecutor).stream() + .flatMap(Collection::stream) + .collect(toImmutableList()); + } + catch (ExecutionException e) { + throw new RuntimeException(e.getCause()); + } } @Override - public List listTables(ConnectorSession session, Optional namespace) + public List listIcebergTables(ConnectorSession session, Optional namespace) { - ImmutableSet.Builder tablesListBuilder = ImmutableSet.builder(); - for (String schemaName : listNamespaces(session, namespace)) { - metastore.getAllTables(schemaName).forEach(tableName -> tablesListBuilder.add(new SchemaTableName(schemaName, tableName))); + List>> tasks = listNamespaces(session, namespace).stream() + .map(schema -> (Callable>) () -> metastore.getTablesWithParameter(schema, TABLE_TYPE_PROP, ICEBERG_TABLE_TYPE_VALUE.toLowerCase(ENGLISH)).stream() + .map(tableName -> new SchemaTableName(schema, tableName)) + .collect(toImmutableList())) + .collect(toImmutableList()); + try { + return processWithAdditionalThreads(tasks, metadataFetchingExecutor).stream() + .flatMap(Collection::stream) + .collect(toImmutableList()); + } + catch (ExecutionException e) { + throw new RuntimeException(e.getCause()); } - return tablesListBuilder.build().asList(); } @Override @@ -343,9 +419,8 @@ public Optional> streamRelationComments( @Override public void dropTable(ConnectorSession session, SchemaTableName schemaTableName) { - BaseTable table = (BaseTable) loadTable(session, schemaTableName); + BaseTable table = loadTable(session, schemaTableName); TableMetadata metadata = table.operations().current(); - validateTableCanBeDropped(table); io.trino.plugin.hive.metastore.Table metastoreTable = metastore.getTable(schemaTableName.getSchemaName(), schemaTableName.getTableName()) .orElseThrow(() -> new TableNotFoundException(schemaTableName)); @@ -364,6 +439,7 @@ public void dropTable(ConnectorSession session, SchemaTableName schemaTableName) log.warn(e, "Failed to delete table data referenced by metadata"); } deleteTableDirectory(fileSystemFactory.create(session), schemaTableName, 
metastoreTable.getStorage().getLocation()); + invalidateTableCache(schemaTableName); } @Override @@ -371,6 +447,7 @@ public void dropCorruptedTable(ConnectorSession session, SchemaTableName schemaT { io.trino.plugin.hive.metastore.Table table = dropTableFromMetastore(schemaTableName); deleteTableDirectory(fileSystemFactory.create(session), schemaTableName, table.getStorage().getLocation()); + invalidateTableCache(schemaTableName); } private io.trino.plugin.hive.metastore.Table dropTableFromMetastore(SchemaTableName schemaTableName) @@ -392,14 +469,23 @@ private io.trino.plugin.hive.metastore.Table dropTableFromMetastore(SchemaTableN public void renameTable(ConnectorSession session, SchemaTableName from, SchemaTableName to) { metastore.renameTable(from.getSchemaName(), from.getTableName(), to.getSchemaName(), to.getTableName()); + invalidateTableCache(from); } @Override - public Table loadTable(ConnectorSession session, SchemaTableName schemaTableName) + public BaseTable loadTable(ConnectorSession session, SchemaTableName schemaTableName) { - TableMetadata metadata = tableMetadataCache.computeIfAbsent( - schemaTableName, - ignore -> ((BaseTable) loadIcebergTable(this, tableOperationsProvider, session, schemaTableName)).operations().current()); + TableMetadata metadata; + try { + metadata = uncheckedCacheGet( + tableMetadataCache, + schemaTableName, + () -> loadIcebergTable(this, tableOperationsProvider, session, schemaTableName).operations().current()); + } + catch (UncheckedExecutionException e) { + throwIfUnchecked(e.getCause()); + throw e; + } return getIcebergTableWithMetadata(this, tableOperationsProvider, session, schemaTableName, metadata); } @@ -422,16 +508,6 @@ public void updateViewColumnComment(ConnectorSession session, SchemaTableName vi trinoViewHiveMetastore.updateViewColumnComment(session, viewName, columnName, comment); } - private void replaceView(ConnectorSession session, SchemaTableName viewName, io.trino.plugin.hive.metastore.Table view, ConnectorViewDefinition newDefinition) - { - io.trino.plugin.hive.metastore.Table.Builder viewBuilder = io.trino.plugin.hive.metastore.Table.builder(view) - .setViewOriginalText(Optional.of(encodeViewData(newDefinition))); - - PrincipalPrivileges principalPrivileges = isUsingSystemSecurity ? 
NO_PRIVILEGES : buildInitialPrivilegeSet(session.getUser()); - - metastore.replaceTable(viewName.getSchemaName(), viewName.getTableName(), viewBuilder.build(), principalPrivileges); - } - @Override public String defaultTableLocation(ConnectorSession session, SchemaTableName schemaTableName) { @@ -475,33 +551,18 @@ public void dropView(ConnectorSession session, SchemaTableName schemaViewName) trinoViewHiveMetastore.dropView(schemaViewName); } - @Override - public List listViews(ConnectorSession session, Optional namespace) - { - return trinoViewHiveMetastore.listViews(namespace); - } - @Override public Optional getView(ConnectorSession session, SchemaTableName viewName) { return trinoViewHiveMetastore.getView(viewName); } - @Override - public List listMaterializedViews(ConnectorSession session, Optional namespace) - { - // Filter on ICEBERG_MATERIALIZED_VIEW_COMMENT is used to avoid listing hive views in case of a shared HMS and to distinguish from standard views - return listNamespaces(session, namespace).stream() - .flatMap(schema -> metastore.getTablesWithParameter(schema, TABLE_COMMENT, ICEBERG_MATERIALIZED_VIEW_COMMENT).stream() - .map(table -> new SchemaTableName(schema, table))) - .collect(toImmutableList()); - } - @Override public void createMaterializedView( ConnectorSession session, SchemaTableName viewName, ConnectorMaterializedViewDefinition definition, + Map materializedViewProperties, boolean replace, boolean ignoreExisting) { @@ -519,11 +580,67 @@ public void createMaterializedView( } } - SchemaTableName storageTable = createMaterializedViewStorageTable(session, viewName, definition); + if (hideMaterializedViewStorageTable) { + Location storageMetadataLocation = createMaterializedViewStorage(session, viewName, definition, materializedViewProperties); + + Map viewProperties = createMaterializedViewProperties(session, storageMetadataLocation); + Column dummyColumn = new Column("dummy", HIVE_STRING, Optional.empty(), ImmutableMap.of()); + io.trino.plugin.hive.metastore.Table.Builder tableBuilder = io.trino.plugin.hive.metastore.Table.builder() + .setDatabaseName(viewName.getSchemaName()) + .setTableName(viewName.getTableName()) + .setOwner(isUsingSystemSecurity ? Optional.empty() : Optional.of(session.getUser())) + .setTableType(VIRTUAL_VIEW.name()) + .setDataColumns(ImmutableList.of(dummyColumn)) + .setPartitionColumns(ImmutableList.of()) + .setParameters(viewProperties) + .withStorage(storage -> storage.setStorageFormat(VIEW_STORAGE_FORMAT)) + .withStorage(storage -> storage.setLocation("")) + .setViewOriginalText(Optional.of( + encodeMaterializedViewData(fromConnectorMaterializedViewDefinition(definition)))) + .setViewExpandedText(Optional.of("/* " + ICEBERG_MATERIALIZED_VIEW_COMMENT + " */")); + io.trino.plugin.hive.metastore.Table table = tableBuilder.build(); + PrincipalPrivileges principalPrivileges = isUsingSystemSecurity ? 
NO_PRIVILEGES : buildInitialPrivilegeSet(session.getUser()); + + try { + if (existing.isPresent()) { + metastore.replaceTable(viewName.getSchemaName(), viewName.getTableName(), table, principalPrivileges); + } + else { + metastore.createTable(table, principalPrivileges); + } + } + catch (RuntimeException e) { + try { + dropMaterializedViewStorage(session, fileSystemFactory.create(session), storageMetadataLocation.toString()); + } + catch (Exception suppressed) { + log.warn(suppressed, "Failed to clean up metadata '%s' for materialized view '%s'", storageMetadataLocation, viewName); + if (e != suppressed) { + e.addSuppressed(suppressed); + } + } + throw e; + } + + existing.ifPresent(existingView -> dropMaterializedViewStorage(session, existingView)); + } + else { + createMaterializedViewWithStorageTable(session, viewName, definition, materializedViewProperties, existing); + } + } + + private void createMaterializedViewWithStorageTable( + ConnectorSession session, + SchemaTableName viewName, + ConnectorMaterializedViewDefinition definition, + Map materializedViewProperties, + Optional existing) + { + SchemaTableName storageTable = createMaterializedViewStorageTable(session, viewName, definition, materializedViewProperties); // Create a view indicating the storage table Map viewProperties = createMaterializedViewProperties(session, storageTable); - Column dummyColumn = new Column("dummy", HIVE_STRING, Optional.empty()); + Column dummyColumn = new Column("dummy", HIVE_STRING, Optional.empty(), Map.of()); io.trino.plugin.hive.metastore.Table.Builder tableBuilder = io.trino.plugin.hive.metastore.Table.builder() .setDatabaseName(viewName.getSchemaName()) @@ -581,7 +698,7 @@ public void updateMaterializedViewColumnComment(ConnectorSession session, Schema definition.getGracePeriod(), definition.getComment(), definition.getOwner(), - definition.getProperties()); + Map.of()); replaceMaterializedView(session, viewName, existing, newDefinition); } @@ -607,18 +724,34 @@ public void dropMaterializedView(ConnectorSession session, SchemaTableName viewN throw new TrinoException(UNSUPPORTED_TABLE_TYPE, "Not a Materialized View: " + viewName); } + dropMaterializedViewStorage(session, view); + metastore.dropTable(viewName.getSchemaName(), viewName.getTableName(), true); + } + + private void dropMaterializedViewStorage(ConnectorSession session, io.trino.plugin.hive.metastore.Table view) + { + SchemaTableName viewName = view.getSchemaTableName(); String storageTableName = view.getParameters().get(STORAGE_TABLE); if (storageTableName != null) { String storageSchema = Optional.ofNullable(view.getParameters().get(STORAGE_SCHEMA)) .orElse(viewName.getSchemaName()); try { - metastore.dropTable(storageSchema, storageTableName, true); + dropTable(session, new SchemaTableName(storageSchema, storageTableName)); } catch (TrinoException e) { log.warn(e, "Failed to drop storage table '%s.%s' for materialized view '%s'", storageSchema, storageTableName, viewName); } } - metastore.dropTable(viewName.getSchemaName(), viewName.getTableName(), true); + else { + String storageMetadataLocation = view.getParameters().get(METADATA_LOCATION_PROP); + checkState(storageMetadataLocation != null, "Storage location missing in definition of materialized view " + viewName); + try { + dropMaterializedViewStorage(session, fileSystemFactory.create(session), storageMetadataLocation); + } + catch (IOException e) { + log.warn(e, "Failed to delete storage table metadata '%s' for materialized view '%s'", storageMetadataLocation, viewName); + } + } 
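
The hidden-storage-table path above writes the storage metadata file first and only then creates the view in the metastore; if the metastore write fails, the freshly written metadata is dropped and any cleanup failure is attached to the primary exception instead of masking it. A minimal sketch of that idiom, using generic stand-in method names rather than the connector's API:

    // Sketch of the cleanup-on-failure idiom used in createMaterializedView above.
    // The Runnables are hypothetical stand-ins for the metastore write and the storage cleanup.
    static void commitWithCleanup(Runnable metastoreWrite, Runnable dropStorageMetadata)
    {
        try {
            metastoreWrite.run();
        }
        catch (RuntimeException e) {
            try {
                dropStorageMetadata.run();
            }
            catch (RuntimeException suppressed) {
                if (e != suppressed) {
                    // keep the cleanup failure visible without hiding the original error
                    e.addSuppressed(suppressed);
                }
            }
            throw e;
        }
    }
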
} @Override @@ -629,40 +762,81 @@ protected Optional doGetMaterializedView(Co return Optional.empty(); } - io.trino.plugin.hive.metastore.Table table = tableOptional.get(); - if (!isTrinoMaterializedView(table.getTableType(), table.getParameters())) { + io.trino.plugin.hive.metastore.Table materializedView = tableOptional.get(); + if (!isTrinoMaterializedView(materializedView.getTableType(), materializedView.getParameters())) { return Optional.empty(); } - io.trino.plugin.hive.metastore.Table materializedView = tableOptional.get(); String storageTable = materializedView.getParameters().get(STORAGE_TABLE); - checkState(storageTable != null, "Storage table missing in definition of materialized view " + viewName); - String storageSchema = Optional.ofNullable(materializedView.getParameters().get(STORAGE_SCHEMA)) - .orElse(viewName.getSchemaName()); - SchemaTableName storageTableName = new SchemaTableName(storageSchema, storageTable); + String storageMetadataLocation = materializedView.getParameters().get(METADATA_LOCATION_PROP); + if ((storageTable == null) == (storageMetadataLocation == null)) { + throw new TrinoException(ICEBERG_BAD_DATA, "Materialized view should have exactly one of the %s properties set: %s".formatted( + ImmutableList.of(STORAGE_TABLE, METADATA_LOCATION_PROP), + materializedView.getParameters())); + } - Table icebergTable; - try { - icebergTable = loadTable(session, storageTableName); + SchemaTableName storageTableName; + if (storageTable != null) { + String storageSchema = Optional.ofNullable(materializedView.getParameters().get(STORAGE_SCHEMA)) + .orElse(viewName.getSchemaName()); + storageTableName = new SchemaTableName(storageSchema, storageTable); } - catch (RuntimeException e) { - // The materialized view could be removed concurrently. This may manifest in a number of ways, e.g. - // - io.trino.spi.connector.TableNotFoundException - // - org.apache.iceberg.exceptions.NotFoundException when accessing manifest file - // - other failures when reading storage table's metadata files - // Retry, as we're catching broadly. 
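
The tableMetadataCache introduced earlier in TrinoHiveCatalog replaces an unbounded ConcurrentHashMap with a bounded, explicitly invalidated cache (EvictableCacheBuilder plus uncheckedCacheGet). A minimal sketch of the same load-through-and-invalidate pattern, written against plain Guava so it stays self-contained; the PR itself uses Trino's cache utilities, not this class:

    import com.google.common.cache.Cache;
    import com.google.common.cache.CacheBuilder;
    import com.google.common.util.concurrent.UncheckedExecutionException;

    import java.util.concurrent.Callable;
    import java.util.concurrent.ExecutionException;

    import static com.google.common.base.Throwables.throwIfUnchecked;

    // Bounded per-query cache with explicit invalidation (illustration only)
    class PerQueryMetadataCache<K, V>
    {
        private final Cache<K, V> cache = CacheBuilder.newBuilder()
                .maximumSize(1_000) // mirrors PER_QUERY_CACHE_SIZE in the diff
                .build();

        V getOrLoad(K key, Callable<V> loader)
        {
            try {
                return cache.get(key, loader);
            }
            catch (ExecutionException | UncheckedExecutionException e) {
                // rethrow the loader's unchecked failure directly, as the diff does
                throwIfUnchecked(e.getCause());
                throw new RuntimeException(e.getCause());
            }
        }

        void invalidate(K key)
        {
            // dropTable, renameTable and unregisterTable call the equivalent of this
            cache.invalidate(key);
        }
    }
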
- metastore.invalidateTable(viewName.getSchemaName(), viewName.getTableName()); - metastore.invalidateTable(storageSchema, storageTable); - throw new MaterializedViewMayBeBeingRemovedException(e); + else { + storageTableName = new SchemaTableName(viewName.getSchemaName(), IcebergTableName.tableNameWithType(viewName.getTableName(), MATERIALIZED_VIEW_STORAGE)); } + return Optional.of(getMaterializedViewDefinition( - icebergTable, - table.getOwner(), + materializedView.getOwner(), materializedView.getViewOriginalText() .orElseThrow(() -> new TrinoException(HIVE_INVALID_METADATA, "No view original text: " + viewName)), storageTableName)); } + @Override + public Optional getMaterializedViewStorageTable(ConnectorSession session, SchemaTableName viewName) + { + Optional tableOptional = metastore.getTable(viewName.getSchemaName(), viewName.getTableName()); + if (tableOptional.isEmpty()) { + return Optional.empty(); + } + + io.trino.plugin.hive.metastore.Table materializedView = tableOptional.get(); + verify(isTrinoMaterializedView(materializedView.getTableType(), materializedView.getParameters()), + "getMaterializedViewStorageTable received a table, not a materialized view"); + + SchemaTableName storageTableName = new SchemaTableName(viewName.getSchemaName(), IcebergTableName.tableNameWithType(viewName.getTableName(), MATERIALIZED_VIEW_STORAGE)); + IcebergTableOperations operations = tableOperationsProvider.createTableOperations( + this, + session, + storageTableName.getSchemaName(), + storageTableName.getTableName(), + Optional.empty(), + Optional.empty()); + + try { + TableMetadata metadata = getMaterializedViewTableMetadata(session, storageTableName, materializedView); + operations.initializeFromMetadata(metadata); + return Optional.of(new BaseTable(operations, quotedTableName(storageTableName), TRINO_METRICS_REPORTER)); + } + catch (UncheckedExecutionException e) { + // Removed during reading + if (e.getCause() instanceof NotFoundException) { + return Optional.empty(); + } + throw e; + } + } + + private TableMetadata getMaterializedViewTableMetadata(ConnectorSession session, SchemaTableName storageTableName, io.trino.plugin.hive.metastore.Table materializedView) + { + return uncheckedCacheGet(tableMetadataCache, storageTableName, () -> { + String storageMetadataLocation = materializedView.getParameters().get(METADATA_LOCATION_PROP); + checkState(storageMetadataLocation != null, "Storage location missing in definition of materialized view " + materializedView.getTableName()); + TrinoFileSystem fileSystem = fileSystemFactory.create(session); + return TableMetadataParser.read(fileIoFactory.create(fileSystem, isUseFileSizeFromMetadata(session)), storageMetadataLocation); + }); + } + @Override public void renameMaterializedView(ConnectorSession session, SchemaTableName source, SchemaTableName target) { @@ -699,7 +873,7 @@ public Optional redirectTable(ConnectorSession session, Optional table = metastore.getTable(tableNameBase.getSchemaName(), tableNameBase.getTableName()); - if (table.isEmpty() || isHiveOrPrestoView(table.get().getTableType())) { + if (table.isEmpty() || isSomeKindOfAView(table.get())) { return Optional.empty(); } if (!isIcebergTable(table.get())) { @@ -708,4 +882,10 @@ public Optional redirectTable(ConnectorSession session, } return Optional.empty(); } + + @Override + protected void invalidateTableCache(SchemaTableName schemaTableName) + { + tableMetadataCache.invalidate(schemaTableName); + } } diff --git 
a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/jdbc/IcebergJdbcTableOperationsProvider.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/jdbc/IcebergJdbcTableOperationsProvider.java index ad8c861afc08..3adb3b2c70be 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/jdbc/IcebergJdbcTableOperationsProvider.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/jdbc/IcebergJdbcTableOperationsProvider.java @@ -18,24 +18,30 @@ import io.trino.plugin.iceberg.catalog.IcebergTableOperations; import io.trino.plugin.iceberg.catalog.IcebergTableOperationsProvider; import io.trino.plugin.iceberg.catalog.TrinoCatalog; -import io.trino.plugin.iceberg.fileio.ForwardingFileIo; +import io.trino.plugin.iceberg.fileio.ForwardingFileIoFactory; import io.trino.spi.connector.ConnectorSession; import java.util.Optional; +import static io.trino.plugin.iceberg.IcebergSessionProperties.isUseFileSizeFromMetadata; import static java.util.Objects.requireNonNull; public class IcebergJdbcTableOperationsProvider implements IcebergTableOperationsProvider { private final TrinoFileSystemFactory fileSystemFactory; + private final ForwardingFileIoFactory fileIoFactory; private final IcebergJdbcClient jdbcClient; @Inject - public IcebergJdbcTableOperationsProvider(IcebergJdbcClient jdbcClient, TrinoFileSystemFactory fileSystemFactory) + public IcebergJdbcTableOperationsProvider( + TrinoFileSystemFactory fileSystemFactory, + ForwardingFileIoFactory fileIoFactory, + IcebergJdbcClient jdbcClient) { - this.jdbcClient = requireNonNull(jdbcClient, "jdbcClient is null"); this.fileSystemFactory = requireNonNull(fileSystemFactory, "fileSystemFactory is null"); + this.fileIoFactory = requireNonNull(fileIoFactory, "fileIoFactory is null"); + this.jdbcClient = requireNonNull(jdbcClient, "jdbcClient is null"); } @Override @@ -48,7 +54,7 @@ public IcebergTableOperations createTableOperations( Optional location) { return new IcebergJdbcTableOperations( - new ForwardingFileIo(fileSystemFactory.create(session)), + fileIoFactory.create(fileSystemFactory.create(session), isUseFileSizeFromMetadata(session)), jdbcClient, session, database, diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/jdbc/TrinoJdbcCatalog.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/jdbc/TrinoJdbcCatalog.java index 092aef4da5b7..de619ed15484 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/jdbc/TrinoJdbcCatalog.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/jdbc/TrinoJdbcCatalog.java @@ -207,7 +207,7 @@ public Transaction newCreateTableTransaction( Schema schema, PartitionSpec partitionSpec, SortOrder sortOrder, - String location, + Optional location, Map properties) { if (!listNamespaces(session, Optional.of(schemaTableName.getSchemaName())).contains(schemaTableName.getSchemaName())) { diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/nessie/IcebergNessieCatalogConfig.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/nessie/IcebergNessieCatalogConfig.java index 492d0b3fa289..a69d662222c8 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/nessie/IcebergNessieCatalogConfig.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/nessie/IcebergNessieCatalogConfig.java @@ -15,16 +15,48 @@ import io.airlift.configuration.Config; import 
io.airlift.configuration.ConfigDescription; +import io.airlift.configuration.ConfigSecuritySensitive; +import io.airlift.units.Duration; +import io.airlift.units.MinDuration; +import jakarta.validation.constraints.AssertTrue; import jakarta.validation.constraints.NotEmpty; import jakarta.validation.constraints.NotNull; import java.net.URI; +import java.util.Optional; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import static com.google.common.base.Preconditions.checkArgument; +import static io.trino.plugin.iceberg.catalog.nessie.IcebergNessieCatalogConfig.Security.BEARER; +import static java.util.concurrent.TimeUnit.MILLISECONDS; +import static java.util.function.Predicate.isEqual; +import static org.projectnessie.client.NessieConfigConstants.DEFAULT_CONNECT_TIMEOUT_MILLIS; +import static org.projectnessie.client.NessieConfigConstants.DEFAULT_READ_TIMEOUT_MILLIS; public class IcebergNessieCatalogConfig { + public enum Security + { + BEARER, + } + + public enum ClientApiVersion + { + V1, + V2, + } + private String defaultReferenceName = "main"; private String defaultWarehouseDir; private URI serverUri; + private Duration readTimeout = new Duration(DEFAULT_READ_TIMEOUT_MILLIS, MILLISECONDS); + private Duration connectionTimeout = new Duration(DEFAULT_CONNECT_TIMEOUT_MILLIS, MILLISECONDS); + private boolean enableCompression = true; + private Security security; + private Optional bearerToken = Optional.empty(); + private Optional clientAPIVersion = Optional.empty(); + private static final Pattern VERSION_PATTERN = Pattern.compile("/v(\\d+)$"); @NotNull public String getDefaultReferenceName() @@ -67,4 +99,113 @@ public IcebergNessieCatalogConfig setDefaultWarehouseDir(String defaultWarehouse this.defaultWarehouseDir = defaultWarehouseDir; return this; } + + @MinDuration("1ms") + public Duration getReadTimeout() + { + return readTimeout; + } + + @Config("iceberg.nessie-catalog.read-timeout") + @ConfigDescription("The read timeout for the client.") + public IcebergNessieCatalogConfig setReadTimeout(Duration readTimeout) + { + this.readTimeout = readTimeout; + return this; + } + + @MinDuration("1ms") + public Duration getConnectionTimeout() + { + return connectionTimeout; + } + + @Config("iceberg.nessie-catalog.connection-timeout") + @ConfigDescription("The connection timeout for the client.") + public IcebergNessieCatalogConfig setConnectionTimeout(Duration connectionTimeout) + { + this.connectionTimeout = connectionTimeout; + return this; + } + + public boolean isCompressionEnabled() + { + return enableCompression; + } + + @Config("iceberg.nessie-catalog.enable-compression") + @ConfigDescription("Configure whether compression should be enabled or not.") + public IcebergNessieCatalogConfig setCompressionEnabled(boolean enableCompression) + { + this.enableCompression = enableCompression; + return this; + } + + public Optional getSecurity() + { + return Optional.ofNullable(security); + } + + @Config("iceberg.nessie-catalog.authentication.type") + @ConfigDescription("The authentication type to use") + public IcebergNessieCatalogConfig setSecurity(Security security) + { + this.security = security; + return this; + } + + public Optional getBearerToken() + { + return bearerToken; + } + + @Config("iceberg.nessie-catalog.authentication.token") + @ConfigDescription("The token to use with BEARER authentication") + @ConfigSecuritySensitive + public IcebergNessieCatalogConfig setBearerToken(String token) + { + this.bearerToken = Optional.ofNullable(token); + return this; + } + + 
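
The bearer-token support is validated by the two @AssertTrue methods that follow: a token is only legal together with authentication type BEARER, and BEARER requires a token. A small usage sketch; the token value is a placeholder, everything else is taken from this class:

    // Illustration only; "example-token" is a made-up value.
    IcebergNessieCatalogConfig config = new IcebergNessieCatalogConfig()
            .setSecurity(IcebergNessieCatalogConfig.Security.BEARER)
            .setBearerToken("example-token");

    // Both validation methods defined just below return true for this pairing.
    // Setting a token without BEARER security would fail isTokenConfiguredWithoutType().
    if (!(config.isTokenConfiguredWithoutType() && config.isMissingTokenForBearerAuth())) {
        throw new IllegalStateException("bearer token requires iceberg.nessie-catalog.authentication.type=BEARER");
    }
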
@AssertTrue(message = "'iceberg.nessie-catalog.authentication.token' must be configured only with 'iceberg.nessie-catalog.authentication.type' BEARER") + public boolean isTokenConfiguredWithoutType() + { + return getSecurity().filter(isEqual(BEARER)).isPresent() || getBearerToken().isEmpty(); + } + + @AssertTrue(message = "'iceberg.nessie-catalog.authentication.token' must be configured with 'iceberg.nessie-catalog.authentication.type' BEARER") + public boolean isMissingTokenForBearerAuth() + { + return getSecurity().filter(isEqual(BEARER)).isEmpty() || getBearerToken().isPresent(); + } + + public Optional getClientAPIVersion() + { + return clientAPIVersion; + } + + @Config("iceberg.nessie-catalog.client-api-version") + @ConfigDescription("Client API version to use") + public IcebergNessieCatalogConfig setClientAPIVersion(ClientApiVersion version) + { + this.clientAPIVersion = Optional.ofNullable(version); + return this; + } + + protected IcebergNessieCatalogConfig.ClientApiVersion inferVersionFromURI() + { + checkArgument(serverUri != null, "URI is not specified in the catalog properties"); + // match for uri ending with /v1, /v2 etc + Matcher matcher = VERSION_PATTERN.matcher(serverUri.toString()); + if (!matcher.find()) { + throw new IllegalArgumentException("URI doesn't end with the version: %s. Please configure `client-api-version` in the catalog properties explicitly.".formatted(serverUri)); + } + + return switch (matcher.group(1)) { + case "1" -> ClientApiVersion.V1; + case "2" -> ClientApiVersion.V2; + default -> throw new IllegalArgumentException("Unknown API version in the URI: " + matcher.group(1)); + }; + } } diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/nessie/IcebergNessieCatalogModule.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/nessie/IcebergNessieCatalogModule.java index 809c39589bd5..81378918c22c 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/nessie/IcebergNessieCatalogModule.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/nessie/IcebergNessieCatalogModule.java @@ -22,10 +22,13 @@ import io.trino.plugin.iceberg.catalog.IcebergTableOperationsProvider; import io.trino.plugin.iceberg.catalog.TrinoCatalogFactory; import org.apache.iceberg.nessie.NessieIcebergClient; +import org.projectnessie.client.NessieClientBuilder; import org.projectnessie.client.api.NessieApiV1; -import org.projectnessie.client.http.HttpClientBuilder; +import org.projectnessie.client.api.NessieApiV2; +import org.projectnessie.client.auth.BearerAuthenticationProvider; import static io.airlift.configuration.ConfigBinder.configBinder; +import static java.lang.Math.toIntExact; import static org.weakref.jmx.guice.ExportBinder.newExporter; public class IcebergNessieCatalogModule @@ -45,11 +48,23 @@ protected void setup(Binder binder) @Singleton public static NessieIcebergClient createNessieIcebergClient(IcebergNessieCatalogConfig icebergNessieCatalogConfig) { - return new NessieIcebergClient( - HttpClientBuilder.builder() - .withUri(icebergNessieCatalogConfig.getServerUri()) - .withEnableApiCompatibilityCheck(false) - .build(NessieApiV1.class), + NessieClientBuilder builder = NessieClientBuilder.createClientBuilderFromSystemSettings() + .withUri(icebergNessieCatalogConfig.getServerUri()) + .withDisableCompression(!icebergNessieCatalogConfig.isCompressionEnabled()) + .withReadTimeout(toIntExact(icebergNessieCatalogConfig.getReadTimeout().toMillis())) + 
.withConnectionTimeout(toIntExact(icebergNessieCatalogConfig.getConnectionTimeout().toMillis())); + + icebergNessieCatalogConfig.getBearerToken() + .ifPresent(token -> builder.withAuthentication(BearerAuthenticationProvider.create(token))); + + IcebergNessieCatalogConfig.ClientApiVersion clientApiVersion = icebergNessieCatalogConfig.getClientAPIVersion() + .orElseGet(icebergNessieCatalogConfig::inferVersionFromURI); + NessieApiV1 api = switch (clientApiVersion) { + case V1 -> builder.build(NessieApiV1.class); + case V2 -> builder.build(NessieApiV2.class); + }; + + return new NessieIcebergClient(api, icebergNessieCatalogConfig.getDefaultReferenceName(), null, ImmutableMap.of()); diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/nessie/IcebergNessieTableOperations.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/nessie/IcebergNessieTableOperations.java index 75be83f8293d..28dd3f3b4ccb 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/nessie/IcebergNessieTableOperations.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/nessie/IcebergNessieTableOperations.java @@ -96,8 +96,14 @@ protected String getRefreshedLocation(boolean invalidateCaches) protected void commitNewTable(TableMetadata metadata) { verify(version.isEmpty(), "commitNewTable called on a table which already exists"); + String contentId = table == null ? null : table.getId(); try { - nessieClient.commitTable(null, metadata, writeNewMetadata(metadata, 0), table, toKey(new SchemaTableName(database, this.tableName))); + nessieClient.commitTable( + null, + metadata, + writeNewMetadata(metadata, 0), + contentId, + toKey(database, tableName)); } catch (NessieNotFoundException e) { throw new TrinoException(ICEBERG_COMMIT_ERROR, format("Cannot commit: ref '%s' no longer exists", nessieClient.refName()), e); @@ -113,8 +119,16 @@ protected void commitNewTable(TableMetadata metadata) protected void commitToExistingTable(TableMetadata base, TableMetadata metadata) { verify(version.orElseThrow() >= 0, "commitToExistingTable called on a new table"); + if (table == null) { + table = nessieClient.table(toIdentifier(new SchemaTableName(database, tableName))); + } try { - nessieClient.commitTable(base, metadata, writeNewMetadata(metadata, version.getAsInt() + 1), table, toKey(new SchemaTableName(database, this.tableName))); + nessieClient.commitTable( + base, + metadata, + writeNewMetadata(metadata, version.getAsInt() + 1), + table.getId(), + toKey(database, tableName)); } catch (NessieNotFoundException e) { throw new TrinoException(ICEBERG_COMMIT_ERROR, format("Cannot commit: ref '%s' no longer exists", nessieClient.refName()), e); @@ -126,8 +140,9 @@ protected void commitToExistingTable(TableMetadata base, TableMetadata metadata) shouldRefresh = true; } - private static ContentKey toKey(SchemaTableName tableName) + private static ContentKey toKey(String databaseName, String tableName) { - return ContentKey.of(Namespace.parse(tableName.getSchemaName()), tableName.getTableName()); + SchemaTableName schemaTableName = new SchemaTableName(databaseName, tableName); + return ContentKey.of(Namespace.parse(schemaTableName.getSchemaName()), schemaTableName.getTableName()); } } diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/nessie/TrinoNessieCatalog.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/nessie/TrinoNessieCatalog.java index 36ae7c6fac00..5fb0b6446767 100644 --- 
a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/nessie/TrinoNessieCatalog.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/nessie/TrinoNessieCatalog.java @@ -239,7 +239,7 @@ public Transaction newCreateTableTransaction( Schema schema, PartitionSpec partitionSpec, SortOrder sortOrder, - String location, + Optional location, Map properties) { return newCreateTableTransaction( diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/rest/AwsProperties.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/rest/AwsProperties.java new file mode 100644 index 000000000000..c28e7e35e5b0 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/rest/AwsProperties.java @@ -0,0 +1,22 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.iceberg.catalog.rest; + +import java.util.Map; + +@FunctionalInterface +public interface AwsProperties +{ + Map get(); +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/rest/IcebergRestCatalogConfig.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/rest/IcebergRestCatalogConfig.java index af8e3a5fe0b4..0f6672f2925e 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/rest/IcebergRestCatalogConfig.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/rest/IcebergRestCatalogConfig.java @@ -15,17 +15,29 @@ import io.airlift.configuration.Config; import io.airlift.configuration.ConfigDescription; +import io.airlift.configuration.DefunctConfig; +import io.airlift.units.Duration; +import io.airlift.units.MinDuration; import jakarta.validation.constraints.NotNull; +import org.apache.iceberg.CatalogProperties; import java.net.URI; import java.util.Optional; +import static java.util.concurrent.TimeUnit.MILLISECONDS; +import static java.util.concurrent.TimeUnit.MINUTES; + +@DefunctConfig({ + "iceberg.rest-catalog.parent-namespace", + "iceberg.rest-catalog.sigv4-enabled", +}) public class IcebergRestCatalogConfig { public enum Security { NONE, OAUTH2, + SIGV4, } public enum SessionType @@ -35,9 +47,16 @@ public enum SessionType } private URI restUri; + private Optional prefix = Optional.empty(); private Optional warehouse = Optional.empty(); + private boolean nestedNamespaceEnabled; private Security security = Security.NONE; private SessionType sessionType = SessionType.NONE; + private Duration sessionTimeout = new Duration(CatalogProperties.AUTH_SESSION_TIMEOUT_MS_DEFAULT, MILLISECONDS); + private boolean vendedCredentialsEnabled; + private boolean viewEndpointsEnabled = true; + private boolean caseInsensitiveNameMatching; + private Duration caseInsensitiveNameMatchingCacheTtl = new Duration(1, MINUTES); @NotNull public URI getBaseUri() @@ -55,6 +74,45 @@ public IcebergRestCatalogConfig setBaseUri(String uri) return this; } + public Optional getPrefix() + { + return prefix; + } + + 
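
For reference, the new REST catalog options introduced in this class chain like the existing setters; the setters for prefix, warehouse, nested namespaces and vended credentials are defined just below. A usage sketch with placeholder values; note that the module wiring later in this diff rejects vended credentials when the register_table procedure is enabled:

    // Illustration only; the prefix and warehouse values are placeholders.
    IcebergRestCatalogConfig config = new IcebergRestCatalogConfig()
            .setPrefix("tenant-a")                  // iceberg.rest-catalog.prefix
            .setWarehouse("s3://example-warehouse") // iceberg.rest-catalog.warehouse
            .setNestedNamespaceEnabled(true)        // iceberg.rest-catalog.nested-namespace-enabled
            .setVendedCredentialsEnabled(true);     // iceberg.rest-catalog.vended-credentials-enabled
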
@Config("iceberg.rest-catalog.prefix") + @ConfigDescription("The prefix for the resource path to use with the REST catalog server") + public IcebergRestCatalogConfig setPrefix(String prefix) + { + this.prefix = Optional.ofNullable(prefix); + return this; + } + + public Optional getWarehouse() + { + return warehouse; + } + + @Config("iceberg.rest-catalog.warehouse") + @ConfigDescription("The warehouse location/identifier to use with the REST catalog server") + public IcebergRestCatalogConfig setWarehouse(String warehouse) + { + this.warehouse = Optional.ofNullable(warehouse); + return this; + } + + public boolean isNestedNamespaceEnabled() + { + return nestedNamespaceEnabled; + } + + @Config("iceberg.rest-catalog.nested-namespace-enabled") + @ConfigDescription("Support querying objects under nested namespace") + public IcebergRestCatalogConfig setNestedNamespaceEnabled(boolean nestedNamespaceEnabled) + { + this.nestedNamespaceEnabled = nestedNamespaceEnabled; + return this; + } + @NotNull public Security getSecurity() { @@ -83,16 +141,72 @@ public IcebergRestCatalogConfig setSessionType(SessionType sessionType) return this; } - public Optional getWarehouse() + @NotNull + @MinDuration("0ms") + public Duration getSessionTimeout() { - return warehouse; + return sessionTimeout; } - @Config("iceberg.rest-catalog.warehouse") - @ConfigDescription("The warehouse location/identifier to use with the REST catalog server") - public IcebergRestCatalogConfig setWarehouse(String warehouse) + @Config("iceberg.rest-catalog.session-timeout") + @ConfigDescription("Duration to keep authentication session in cache") + public IcebergRestCatalogConfig setSessionTimeout(Duration sessionTimeout) { - this.warehouse = Optional.ofNullable(warehouse); + this.sessionTimeout = sessionTimeout; + return this; + } + + public boolean isVendedCredentialsEnabled() + { + return vendedCredentialsEnabled; + } + + @Config("iceberg.rest-catalog.vended-credentials-enabled") + @ConfigDescription("Use credentials provided by the REST backend for file system access") + public IcebergRestCatalogConfig setVendedCredentialsEnabled(boolean vendedCredentialsEnabled) + { + this.vendedCredentialsEnabled = vendedCredentialsEnabled; + return this; + } + + public boolean isViewEndpointsEnabled() + { + return viewEndpointsEnabled; + } + + @Config("iceberg.rest-catalog.view-endpoints-enabled") + @ConfigDescription("Enable view endpoints") + public IcebergRestCatalogConfig setViewEndpointsEnabled(boolean viewEndpointsEnabled) + { + this.viewEndpointsEnabled = viewEndpointsEnabled; + return this; + } + + public boolean isCaseInsensitiveNameMatching() + { + return caseInsensitiveNameMatching; + } + + @Config("iceberg.rest-catalog.case-insensitive-name-matching") + @ConfigDescription("Match object names case-insensitively") + public IcebergRestCatalogConfig setCaseInsensitiveNameMatching(boolean caseInsensitiveNameMatching) + { + this.caseInsensitiveNameMatching = caseInsensitiveNameMatching; + return this; + } + + @NotNull + @MinDuration("0ms") + public Duration getCaseInsensitiveNameMatchingCacheTtl() + { + return caseInsensitiveNameMatchingCacheTtl; + } + + @Config("iceberg.rest-catalog.case-insensitive-name-matching.cache-ttl") + @ConfigDescription("Duration to keep case insensitive object mapping prior to eviction") + public IcebergRestCatalogConfig setCaseInsensitiveNameMatchingCacheTtl(Duration caseInsensitiveNameMatchingCacheTtl) + { + this.caseInsensitiveNameMatchingCacheTtl = caseInsensitiveNameMatchingCacheTtl; return this; } } diff --git 
a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/rest/IcebergRestCatalogFileSystemFactory.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/rest/IcebergRestCatalogFileSystemFactory.java new file mode 100644 index 000000000000..b5e777c7ee15 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/rest/IcebergRestCatalogFileSystemFactory.java @@ -0,0 +1,71 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.iceberg.catalog.rest; + +import com.google.common.collect.ImmutableMap; +import com.google.inject.Inject; +import io.trino.filesystem.TrinoFileSystem; +import io.trino.filesystem.TrinoFileSystemFactory; +import io.trino.plugin.iceberg.IcebergFileSystemFactory; +import io.trino.spi.security.ConnectorIdentity; + +import java.util.Map; + +import static io.trino.filesystem.s3.S3FileSystemConstants.EXTRA_CREDENTIALS_ACCESS_KEY_PROPERTY; +import static io.trino.filesystem.s3.S3FileSystemConstants.EXTRA_CREDENTIALS_SECRET_KEY_PROPERTY; +import static io.trino.filesystem.s3.S3FileSystemConstants.EXTRA_CREDENTIALS_SESSION_TOKEN_PROPERTY; +import static java.util.Objects.requireNonNull; + +public class IcebergRestCatalogFileSystemFactory + implements IcebergFileSystemFactory +{ + private static final String VENDED_S3_ACCESS_KEY = "s3.access-key-id"; + private static final String VENDED_S3_SECRET_KEY = "s3.secret-access-key"; + private static final String VENDED_S3_SESSION_TOKEN = "s3.session-token"; + + private final TrinoFileSystemFactory fileSystemFactory; + private final boolean vendedCredentialsEnabled; + + @Inject + public IcebergRestCatalogFileSystemFactory(TrinoFileSystemFactory fileSystemFactory, IcebergRestCatalogConfig config) + { + this.fileSystemFactory = requireNonNull(fileSystemFactory, "fileSystemFactory is null"); + this.vendedCredentialsEnabled = config.isVendedCredentialsEnabled(); + } + + @Override + public TrinoFileSystem create(ConnectorIdentity identity, Map fileIoProperties) + { + if (vendedCredentialsEnabled && + fileIoProperties.containsKey(VENDED_S3_ACCESS_KEY) && + fileIoProperties.containsKey(VENDED_S3_SECRET_KEY) && + fileIoProperties.containsKey(VENDED_S3_SESSION_TOKEN)) { + // Do not include original credentials as they should not be used in vended mode + ConnectorIdentity identityWithExtraCredentials = ConnectorIdentity.forUser(identity.getUser()) + .withGroups(identity.getGroups()) + .withPrincipal(identity.getPrincipal()) + .withEnabledSystemRoles(identity.getEnabledSystemRoles()) + .withConnectorRole(identity.getConnectorRole()) + .withExtraCredentials(ImmutableMap.builder() + .put(EXTRA_CREDENTIALS_ACCESS_KEY_PROPERTY, fileIoProperties.get(VENDED_S3_ACCESS_KEY)) + .put(EXTRA_CREDENTIALS_SECRET_KEY_PROPERTY, fileIoProperties.get(VENDED_S3_SECRET_KEY)) + .put(EXTRA_CREDENTIALS_SESSION_TOKEN_PROPERTY, fileIoProperties.get(VENDED_S3_SESSION_TOKEN)) + .buildOrThrow()) + .build(); + return fileSystemFactory.create(identityWithExtraCredentials); + } + + return 
fileSystemFactory.create(identity); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/rest/IcebergRestCatalogModule.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/rest/IcebergRestCatalogModule.java index 8950212bbbaf..f8cdf2648123 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/rest/IcebergRestCatalogModule.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/rest/IcebergRestCatalogModule.java @@ -16,11 +16,16 @@ import com.google.inject.Binder; import com.google.inject.Scopes; import io.airlift.configuration.AbstractConfigurationAwareModule; +import io.trino.plugin.iceberg.IcebergConfig; +import io.trino.plugin.iceberg.IcebergFileSystemFactory; import io.trino.plugin.iceberg.catalog.TrinoCatalogFactory; import io.trino.plugin.iceberg.catalog.rest.IcebergRestCatalogConfig.Security; +import io.trino.spi.TrinoException; +import static com.google.inject.multibindings.OptionalBinder.newOptionalBinder; import static io.airlift.configuration.ConditionalModule.conditionalModule; import static io.airlift.configuration.ConfigBinder.configBinder; +import static io.trino.spi.StandardErrorCode.NOT_SUPPORTED; public class IcebergRestCatalogModule extends AbstractConfigurationAwareModule @@ -34,7 +39,22 @@ protected void setup(Binder binder) config -> config.getSecurity() == Security.OAUTH2, new OAuth2SecurityModule(), new NoneSecurityModule())); + install(conditionalModule( + IcebergRestCatalogConfig.class, + config -> config.getSecurity() == Security.SIGV4, + new SigV4SecurityModule())); + install(conditionalModule( + IcebergRestCatalogConfig.class, + config -> config.getSecurity() == Security.NONE, + new NoneSecurityModule())); binder.bind(TrinoCatalogFactory.class).to(TrinoIcebergRestCatalogFactory.class).in(Scopes.SINGLETON); + newOptionalBinder(binder, IcebergFileSystemFactory.class).setBinding().to(IcebergRestCatalogFileSystemFactory.class).in(Scopes.SINGLETON); + + IcebergConfig icebergConfig = buildConfigObject(IcebergConfig.class); + IcebergRestCatalogConfig restCatalogConfig = buildConfigObject(IcebergRestCatalogConfig.class); + if (restCatalogConfig.isVendedCredentialsEnabled() && icebergConfig.isRegisterTableProcedureEnabled()) { + throw new TrinoException(NOT_SUPPORTED, "Using the `register_table` procedure with vended credentials is currently not supported"); + } } } diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/rest/IcebergRestCatalogSigV4Config.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/rest/IcebergRestCatalogSigV4Config.java new file mode 100644 index 000000000000..e7c80e628762 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/rest/IcebergRestCatalogSigV4Config.java @@ -0,0 +1,38 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.iceberg.catalog.rest; + +import io.airlift.configuration.Config; +import io.airlift.configuration.ConfigDescription; +import jakarta.validation.constraints.NotNull; +import org.apache.iceberg.aws.AwsProperties; + +public class IcebergRestCatalogSigV4Config +{ + private String signingName = AwsProperties.REST_SIGNING_NAME_DEFAULT; + + @NotNull + public String getSigningName() + { + return signingName; + } + + @Config("iceberg.rest-catalog.signing-name") + @ConfigDescription("AWS SigV4 signing service name") + public IcebergRestCatalogSigV4Config setSigningName(String signingName) + { + this.signingName = signingName; + return this; + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/rest/SigV4AwsCredentialProvider.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/rest/SigV4AwsCredentialProvider.java new file mode 100644 index 000000000000..67f3f3f7adf4 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/rest/SigV4AwsCredentialProvider.java @@ -0,0 +1,130 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.iceberg.catalog.rest; + +import io.trino.spi.TrinoException; +import software.amazon.awssdk.auth.credentials.AwsBasicCredentials; +import software.amazon.awssdk.auth.credentials.AwsCredentials; +import software.amazon.awssdk.auth.credentials.AwsCredentialsProvider; +import software.amazon.awssdk.auth.credentials.StaticCredentialsProvider; +import software.amazon.awssdk.identity.spi.AwsCredentialsIdentity; +import software.amazon.awssdk.identity.spi.ResolveIdentityRequest; +import software.amazon.awssdk.regions.Region; +import software.amazon.awssdk.services.sts.StsClient; +import software.amazon.awssdk.services.sts.StsClientBuilder; +import software.amazon.awssdk.services.sts.auth.StsAssumeRoleCredentialsProvider; + +import java.net.URI; +import java.util.Map; +import java.util.Optional; +import java.util.concurrent.CompletableFuture; +import java.util.function.Consumer; + +import static io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_CATALOG_ERROR; +import static java.util.Objects.requireNonNull; + +public class SigV4AwsCredentialProvider + implements AwsCredentialsProvider +{ + static final String AWS_STS_ACCESS_KEY_ID = "aws_sts_access_key_id"; + static final String AWS_STS_SECRET_ACCESS_KEY = "aws_sts_secret_access_key"; + static final String AWS_STS_SIGNER_REGION = "aws_sts_signer_region"; + static final String AWS_STS_REGION = "aws_sts_region"; + static final String AWS_STS_ENDPOINT = "aws_sts_endpoint"; + + static final String AWS_IAM_ROLE = "aws_iam_role"; + static final String AWS_ROLE_EXTERNAL_ID = "aws_external_id"; + static final String AWS_IAM_ROLE_SESSION_NAME = "aws_iam_role_session_name"; + + private final AwsCredentialsProvider delegate; + + public SigV4AwsCredentialProvider(AwsCredentialsProvider delegate) + { + this.delegate = requireNonNull(delegate, "delegate is null"); + } + + public static 
SigV4AwsCredentialProvider create(Map properties) + { + if (properties.containsKey(AWS_IAM_ROLE)) { + String accessKey = properties.get(AWS_STS_ACCESS_KEY_ID); + String secretAccessKey = properties.get(AWS_STS_SECRET_ACCESS_KEY); + + Optional staticCredentialsProvider = createStaticCredentialsProvider(accessKey, secretAccessKey); + return new SigV4AwsCredentialProvider(StsAssumeRoleCredentialsProvider.builder() + .refreshRequest(request -> request + .roleArn(properties.get(AWS_IAM_ROLE)) + .roleSessionName(AWS_IAM_ROLE_SESSION_NAME) + .externalId(properties.get(AWS_ROLE_EXTERNAL_ID))) + .stsClient(createStsClient( + properties.get(AWS_STS_ENDPOINT), + properties.get(AWS_STS_REGION), + properties.get(AWS_STS_SIGNER_REGION), + staticCredentialsProvider)) + .asyncCredentialUpdateEnabled(true) + .build()); + } + + throw new TrinoException(ICEBERG_CATALOG_ERROR, "IAM role configs are not configured"); + } + + @Override + public CompletableFuture resolveIdentity(Consumer consumer) + { + return delegate.resolveIdentity(consumer); + } + + @Override + public CompletableFuture resolveIdentity() + { + return delegate.resolveIdentity(); + } + + @Override + public AwsCredentials resolveCredentials() + { + return delegate.resolveCredentials(); + } + + @Override + public Class identityType() + { + return delegate.identityType(); + } + + @Override + public CompletableFuture resolveIdentity(ResolveIdentityRequest request) + { + return delegate.resolveIdentity(request); + } + + private static Optional createStaticCredentialsProvider(String accessKey, String secretKey) + { + if (accessKey != null || secretKey != null) { + return Optional.of(StaticCredentialsProvider.create( + AwsBasicCredentials.create(accessKey, secretKey))); + } + return Optional.empty(); + } + + private static StsClient createStsClient(String stsEndpoint, String stsRegion, String region, Optional credentialsProvider) + { + StsClientBuilder sts = StsClient.builder(); + Optional.ofNullable(stsEndpoint).map(URI::create).ifPresent(sts::endpointOverride); + Optional.ofNullable(stsRegion) + .or(() -> Optional.ofNullable(region)) + .map(Region::of).ifPresent(sts::region); + credentialsProvider.ifPresent(sts::credentialsProvider); + return sts.build(); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/rest/SigV4AwsProperties.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/rest/SigV4AwsProperties.java new file mode 100644 index 000000000000..24bd98cca508 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/rest/SigV4AwsProperties.java @@ -0,0 +1,92 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.iceberg.catalog.rest; + +import com.google.common.collect.ImmutableMap; +import com.google.inject.Inject; +import io.trino.filesystem.s3.S3FileSystemConfig; + +import java.util.Map; +import java.util.Optional; + +import static io.trino.plugin.iceberg.catalog.rest.SigV4AwsCredentialProvider.AWS_IAM_ROLE; +import static io.trino.plugin.iceberg.catalog.rest.SigV4AwsCredentialProvider.AWS_IAM_ROLE_SESSION_NAME; +import static io.trino.plugin.iceberg.catalog.rest.SigV4AwsCredentialProvider.AWS_ROLE_EXTERNAL_ID; +import static io.trino.plugin.iceberg.catalog.rest.SigV4AwsCredentialProvider.AWS_STS_ACCESS_KEY_ID; +import static io.trino.plugin.iceberg.catalog.rest.SigV4AwsCredentialProvider.AWS_STS_ENDPOINT; +import static io.trino.plugin.iceberg.catalog.rest.SigV4AwsCredentialProvider.AWS_STS_REGION; +import static io.trino.plugin.iceberg.catalog.rest.SigV4AwsCredentialProvider.AWS_STS_SECRET_ACCESS_KEY; +import static io.trino.plugin.iceberg.catalog.rest.SigV4AwsCredentialProvider.AWS_STS_SIGNER_REGION; +import static java.util.Objects.requireNonNull; +import static org.apache.iceberg.aws.AwsClientProperties.CLIENT_CREDENTIALS_PROVIDER; +import static org.apache.iceberg.aws.AwsProperties.REST_ACCESS_KEY_ID; +import static org.apache.iceberg.aws.AwsProperties.REST_SECRET_ACCESS_KEY; +import static org.apache.iceberg.aws.AwsProperties.REST_SIGNER_REGION; +import static org.apache.iceberg.aws.AwsProperties.REST_SIGNING_NAME; + +public class SigV4AwsProperties + implements SecurityProperties +{ + // Copy of `org.apache.iceberg.aws.AwsClientProperties.CLIENT_CREDENTIAL_PROVIDER_PREFIX` https://github.com/apache/iceberg/blob/ab6fc83ec0269736355a0a89c51e44e822264da8/aws/src/main/java/org/apache/iceberg/aws/AwsClientProperties.java#L69 + private static final String CLIENT_CREDENTIAL_PROVIDER_PREFIX = "client.credentials-provider."; + + private static final String CLIENT_CREDENTIAL_AWS_ACCESS_KEY_ID = CLIENT_CREDENTIAL_PROVIDER_PREFIX + AWS_STS_ACCESS_KEY_ID; + private static final String CLIENT_CREDENTIAL_AWS_SECRET_ACCESS_KEY = CLIENT_CREDENTIAL_PROVIDER_PREFIX + AWS_STS_SECRET_ACCESS_KEY; + private static final String CLIENT_CREDENTIAL_AWS_SIGNER_REGION = CLIENT_CREDENTIAL_PROVIDER_PREFIX + AWS_STS_SIGNER_REGION; + + private static final String CLIENT_CREDENTIAL_AWS_STS_REGION = CLIENT_CREDENTIAL_PROVIDER_PREFIX + AWS_STS_REGION; + private static final String CLIENT_CREDENTIAL_AWS_STS_ENDPOINT = CLIENT_CREDENTIAL_PROVIDER_PREFIX + AWS_STS_ENDPOINT; + private static final String CLIENT_CREDENTIAL_AWS_IAM_ROLE = CLIENT_CREDENTIAL_PROVIDER_PREFIX + AWS_IAM_ROLE; + private static final String CLIENT_CREDENTIAL_AWS_ROLE_EXTERNAL_ID = CLIENT_CREDENTIAL_PROVIDER_PREFIX + AWS_ROLE_EXTERNAL_ID; + private static final String CLIENT_CREDENTIAL_AWS_IAM_ROLE_SESSION_NAME = CLIENT_CREDENTIAL_PROVIDER_PREFIX + AWS_IAM_ROLE_SESSION_NAME; + + private final Map properties; + + @Inject + public SigV4AwsProperties(IcebergRestCatalogSigV4Config sigV4Config, S3FileSystemConfig s3Config) + { + ImmutableMap.Builder builder = ImmutableMap.builder() + .put("rest.auth.type", "sigv4") + .put(REST_SIGNING_NAME, sigV4Config.getSigningName()) + .put(REST_SIGNER_REGION, requireNonNull(s3Config.getRegion(), "s3.region is null")) + .put("rest-metrics-reporting-enabled", "false"); + + if (s3Config.getIamRole() != null) { + builder + .put(CLIENT_CREDENTIALS_PROVIDER, SigV4AwsCredentialProvider.class.getName()) + .put(CLIENT_CREDENTIAL_AWS_IAM_ROLE, s3Config.getIamRole()) + 
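+                    // Descriptive note: these client.credentials-provider.* keys are picked up by
+                    // SigV4AwsCredentialProvider.create(...); the session name is fixed and the
+                    // configured s3.region doubles as the STS signer region.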
.put(CLIENT_CREDENTIAL_AWS_IAM_ROLE_SESSION_NAME, "trino-iceberg-rest-catalog") + .put(CLIENT_CREDENTIAL_AWS_SIGNER_REGION, s3Config.getRegion()); + Optional.ofNullable(s3Config.getExternalId()).ifPresent(externalId -> builder.put(CLIENT_CREDENTIAL_AWS_ROLE_EXTERNAL_ID, externalId)); + + Optional.ofNullable(s3Config.getStsRegion()).ifPresent(stsRegion -> builder.put(CLIENT_CREDENTIAL_AWS_STS_REGION, stsRegion)); + Optional.ofNullable(s3Config.getAwsAccessKey()).ifPresent(accessKey -> builder.put(CLIENT_CREDENTIAL_AWS_ACCESS_KEY_ID, accessKey)); + Optional.ofNullable(s3Config.getAwsSecretKey()).ifPresent(secretAccessKey -> builder.put(CLIENT_CREDENTIAL_AWS_SECRET_ACCESS_KEY, secretAccessKey)); + Optional.ofNullable(s3Config.getStsEndpoint()).ifPresent(endpoint -> builder.put(CLIENT_CREDENTIAL_AWS_STS_ENDPOINT, endpoint)); + } + else { + builder + .put(REST_ACCESS_KEY_ID, requireNonNull(s3Config.getAwsAccessKey(), "s3.aws-access-key is null")) + .put(REST_SECRET_ACCESS_KEY, requireNonNull(s3Config.getAwsSecretKey(), "s3.aws-secret-key is null")); + } + + properties = builder.buildOrThrow(); + } + + @Override + public Map get() + { + return properties; + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/rest/SigV4SecurityModule.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/rest/SigV4SecurityModule.java new file mode 100644 index 000000000000..5fb685092b6d --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/rest/SigV4SecurityModule.java @@ -0,0 +1,30 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.iceberg.catalog.rest; + +import com.google.inject.Binder; +import io.airlift.configuration.AbstractConfigurationAwareModule; + +import static io.airlift.configuration.ConfigBinder.configBinder; + +public class SigV4SecurityModule + extends AbstractConfigurationAwareModule +{ + @Override + protected void setup(Binder binder) + { + configBinder(binder).bindConfig(IcebergRestCatalogSigV4Config.class); + binder.bind(SecurityProperties.class).to(SigV4AwsProperties.class); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/rest/TrinoIcebergRestCatalogFactory.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/rest/TrinoIcebergRestCatalogFactory.java index e883cc3beb4b..c0a3d258d4fc 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/rest/TrinoIcebergRestCatalogFactory.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/rest/TrinoIcebergRestCatalogFactory.java @@ -13,49 +13,73 @@ */ package io.trino.plugin.iceberg.catalog.rest; +import com.google.common.cache.Cache; import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Maps; import com.google.errorprone.annotations.concurrent.GuardedBy; import com.google.inject.Inject; -import io.trino.filesystem.TrinoFileSystemFactory; +import io.airlift.units.Duration; +import io.trino.cache.EvictableCacheBuilder; import io.trino.plugin.base.CatalogName; import io.trino.plugin.hive.NodeVersion; import io.trino.plugin.iceberg.IcebergConfig; +import io.trino.plugin.iceberg.IcebergFileSystemFactory; import io.trino.plugin.iceberg.catalog.TrinoCatalog; import io.trino.plugin.iceberg.catalog.TrinoCatalogFactory; import io.trino.plugin.iceberg.catalog.rest.IcebergRestCatalogConfig.SessionType; import io.trino.plugin.iceberg.fileio.ForwardingFileIo; import io.trino.spi.security.ConnectorIdentity; +import io.trino.spi.type.TypeManager; import org.apache.iceberg.CatalogProperties; +import org.apache.iceberg.catalog.Namespace; +import org.apache.iceberg.catalog.TableIdentifier; import org.apache.iceberg.rest.HTTPClient; import org.apache.iceberg.rest.RESTSessionCatalog; +import org.apache.iceberg.rest.RESTUtil; import java.net.URI; +import java.util.Map; import java.util.Optional; +import java.util.Set; import static java.util.Objects.requireNonNull; +import static java.util.concurrent.TimeUnit.MILLISECONDS; +import static org.apache.iceberg.CatalogProperties.AUTH_SESSION_TIMEOUT_MS; +import static org.apache.iceberg.rest.auth.OAuth2Properties.CREDENTIAL; +import static org.apache.iceberg.rest.auth.OAuth2Properties.TOKEN; public class TrinoIcebergRestCatalogFactory implements TrinoCatalogFactory { - private final TrinoFileSystemFactory fileSystemFactory; + private final IcebergFileSystemFactory fileSystemFactory; private final CatalogName catalogName; private final String trinoVersion; private final URI serverUri; + private final Optional prefix; private final Optional warehouse; + private final boolean nestedNamespaceEnabled; private final SessionType sessionType; + private final Duration sessionTimeout; + private final boolean vendedCredentialsEnabled; + private final boolean viewEndpointsEnabled; private final SecurityProperties securityProperties; private final boolean uniqueTableLocation; + private final TypeManager typeManager; + private final boolean caseInsensitiveNameMatching; + private final Cache remoteNamespaceMappingCache; + private final Cache remoteTableMappingCache; @GuardedBy("this") 
private RESTSessionCatalog icebergCatalog; @Inject public TrinoIcebergRestCatalogFactory( - TrinoFileSystemFactory fileSystemFactory, + IcebergFileSystemFactory fileSystemFactory, CatalogName catalogName, IcebergRestCatalogConfig restConfig, SecurityProperties securityProperties, IcebergConfig icebergConfig, + TypeManager typeManager, NodeVersion nodeVersion) { this.fileSystemFactory = requireNonNull(fileSystemFactory, "fileSystemFactory is null"); @@ -63,11 +87,26 @@ public TrinoIcebergRestCatalogFactory( this.trinoVersion = requireNonNull(nodeVersion, "nodeVersion is null").toString(); requireNonNull(restConfig, "restConfig is null"); this.serverUri = restConfig.getBaseUri(); + this.prefix = restConfig.getPrefix(); this.warehouse = restConfig.getWarehouse(); + this.nestedNamespaceEnabled = restConfig.isNestedNamespaceEnabled(); this.sessionType = restConfig.getSessionType(); + this.sessionTimeout = restConfig.getSessionTimeout(); + this.vendedCredentialsEnabled = restConfig.isVendedCredentialsEnabled(); + this.viewEndpointsEnabled = restConfig.isViewEndpointsEnabled(); this.securityProperties = requireNonNull(securityProperties, "securityProperties is null"); requireNonNull(icebergConfig, "icebergConfig is null"); this.uniqueTableLocation = icebergConfig.isUniqueTableLocation(); + this.typeManager = requireNonNull(typeManager, "typeManager is null"); + this.caseInsensitiveNameMatching = restConfig.isCaseInsensitiveNameMatching(); + this.remoteNamespaceMappingCache = EvictableCacheBuilder.newBuilder() + .expireAfterWrite(restConfig.getCaseInsensitiveNameMatchingCacheTtl().toMillis(), MILLISECONDS) + .shareNothingWhenDisabled() + .build(); + this.remoteTableMappingCache = EvictableCacheBuilder.newBuilder() + .expireAfterWrite(restConfig.getCaseInsensitiveNameMatchingCacheTtl().toMillis(), MILLISECONDS) + .shareNothingWhenDisabled() + .build(); } @Override @@ -79,21 +118,48 @@ public synchronized TrinoCatalog create(ConnectorIdentity identity) ImmutableMap.Builder properties = ImmutableMap.builder(); properties.put(CatalogProperties.URI, serverUri.toString()); warehouse.ifPresent(location -> properties.put(CatalogProperties.WAREHOUSE_LOCATION, location)); + prefix.ifPresent(prefix -> properties.put("prefix", prefix)); + properties.put("view-endpoints-supported", Boolean.toString(viewEndpointsEnabled)); properties.put("trino-version", trinoVersion); + properties.put(AUTH_SESSION_TIMEOUT_MS, String.valueOf(sessionTimeout.toMillis())); properties.putAll(securityProperties.get()); + + if (vendedCredentialsEnabled) { + properties.put("header.X-Iceberg-Access-Delegation", "vended-credentials"); + } + RESTSessionCatalog icebergCatalogInstance = new RESTSessionCatalog( - config -> HTTPClient.builder(config).uri(config.get(CatalogProperties.URI)).build(), + config -> HTTPClient.builder(config) + .uri(config.get(CatalogProperties.URI)) + .withHeaders(RESTUtil.configHeaders(config)) + .build(), (context, config) -> { ConnectorIdentity currentIdentity = (context.wrappedIdentity() != null) ? 
((ConnectorIdentity) context.wrappedIdentity()) : ConnectorIdentity.ofUser("fake"); - return new ForwardingFileIo(fileSystemFactory.create(currentIdentity)); + return new ForwardingFileIo(fileSystemFactory.create(currentIdentity, config)); }); icebergCatalogInstance.initialize(catalogName.toString(), properties.buildOrThrow()); icebergCatalog = icebergCatalogInstance; } - return new TrinoRestCatalog(icebergCatalog, catalogName, sessionType, trinoVersion, uniqueTableLocation); + // `OAuth2Properties.SCOPE` is not set as scope passed through credentials is unused in + // https://github.com/apache/iceberg/blob/229d8f6fcd109e6c8943ea7cbb41dab746c6d0ed/core/src/main/java/org/apache/iceberg/rest/auth/OAuth2Util.java#L714-L721 + Map credentials = Maps.filterKeys(securityProperties.get(), key -> Set.of(TOKEN, CREDENTIAL).contains(key)); + + return new TrinoRestCatalog( + icebergCatalog, + catalogName, + sessionType, + credentials, + nestedNamespaceEnabled, + trinoVersion, + typeManager, + uniqueTableLocation, + caseInsensitiveNameMatching, + remoteNamespaceMappingCache, + remoteTableMappingCache, + viewEndpointsEnabled); } } diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/rest/TrinoRestCatalog.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/rest/TrinoRestCatalog.java index 1b474bb5c989..d8ba1cfe6e04 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/rest/TrinoRestCatalog.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/catalog/rest/TrinoRestCatalog.java @@ -13,14 +13,21 @@ */ package io.trino.plugin.iceberg.catalog.rest; +import com.google.common.base.Joiner; +import com.google.common.base.Splitter; +import com.google.common.cache.Cache; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Maps; +import com.google.common.util.concurrent.UncheckedExecutionException; +import io.airlift.log.Logger; import io.jsonwebtoken.impl.DefaultJwtBuilder; import io.jsonwebtoken.jackson.io.JacksonSerializer; +import io.trino.cache.EvictableCacheBuilder; import io.trino.plugin.base.CatalogName; import io.trino.plugin.iceberg.ColumnIdentity; import io.trino.plugin.iceberg.IcebergSchemaProperties; +import io.trino.plugin.iceberg.IcebergUtil; import io.trino.plugin.iceberg.catalog.TrinoCatalog; import io.trino.plugin.iceberg.catalog.rest.IcebergRestCatalogConfig.SessionType; import io.trino.spi.TrinoException; @@ -34,7 +41,9 @@ import io.trino.spi.connector.SchemaNotFoundException; import io.trino.spi.connector.SchemaTableName; import io.trino.spi.connector.TableNotFoundException; +import io.trino.spi.connector.ViewNotFoundException; import io.trino.spi.security.TrinoPrincipal; +import io.trino.spi.type.TypeManager; import org.apache.iceberg.BaseTable; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; @@ -42,85 +51,170 @@ import org.apache.iceberg.Table; import org.apache.iceberg.TableMetadata; import org.apache.iceberg.Transaction; +import org.apache.iceberg.catalog.Catalog; import org.apache.iceberg.catalog.Namespace; import org.apache.iceberg.catalog.SessionCatalog; import org.apache.iceberg.catalog.SessionCatalog.SessionContext; import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.exceptions.ForbiddenException; import org.apache.iceberg.exceptions.NoSuchNamespaceException; import org.apache.iceberg.exceptions.NoSuchTableException; +import 
org.apache.iceberg.exceptions.NoSuchViewException; import org.apache.iceberg.exceptions.RESTException; import org.apache.iceberg.rest.RESTSessionCatalog; import org.apache.iceberg.rest.auth.OAuth2Properties; - +import org.apache.iceberg.view.ReplaceViewVersion; +import org.apache.iceberg.view.SQLViewRepresentation; +import org.apache.iceberg.view.UpdateViewProperties; +import org.apache.iceberg.view.View; +import org.apache.iceberg.view.ViewBuilder; +import org.apache.iceberg.view.ViewRepresentation; +import org.apache.iceberg.view.ViewVersion; + +import java.util.Arrays; import java.util.Date; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Optional; import java.util.Set; -import java.util.concurrent.ConcurrentHashMap; import java.util.function.Predicate; +import java.util.function.Supplier; import java.util.function.UnaryOperator; +import java.util.stream.Stream; -import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.collect.ImmutableList.toImmutableList; +import static io.trino.cache.CacheUtils.uncheckedCacheGet; import static io.trino.filesystem.Locations.appendPath; -import static io.trino.plugin.hive.HiveMetadata.TABLE_COMMENT; import static io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_CATALOG_ERROR; +import static io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_UNSUPPORTED_VIEW_DIALECT; import static io.trino.plugin.iceberg.IcebergUtil.quotedTableName; import static io.trino.spi.StandardErrorCode.NOT_SUPPORTED; import static java.lang.String.format; +import static java.util.Locale.ENGLISH; import static java.util.Objects.requireNonNull; import static java.util.UUID.randomUUID; +import static org.apache.iceberg.view.ViewProperties.COMMENT; public class TrinoRestCatalog implements TrinoCatalog { + public static final String TABLE_COMMENT = "comment"; + public static final String TRINO_CREATED_BY_VALUE = "Trino Iceberg connector"; + public static final String ICEBERG_VIEW_RUN_AS_OWNER = "trino.run-as-owner"; + + private static final Logger log = Logger.get(TrinoRestCatalog.class); + + private static final int PER_QUERY_CACHE_SIZE = 1000; + private static final String NAMESPACE_SEPARATOR = "."; + private final RESTSessionCatalog restSessionCatalog; private final CatalogName catalogName; + private final TypeManager typeManager; private final SessionType sessionType; + private final Map credentials; + private final boolean nestedNamespaceEnabled; private final String trinoVersion; private final boolean useUniqueTableLocation; + private final boolean caseInsensitiveNameMatching; + private final Cache remoteNamespaceMappingCache; + private final Cache remoteTableMappingCache; + private final boolean viewEndpointsEnabled; - private final Map tableCache = new ConcurrentHashMap<>(); + private final Cache tableCache = EvictableCacheBuilder.newBuilder() + .maximumSize(PER_QUERY_CACHE_SIZE) + .build(); public TrinoRestCatalog( RESTSessionCatalog restSessionCatalog, CatalogName catalogName, SessionType sessionType, + Map credentials, + boolean nestedNamespaceEnabled, String trinoVersion, - boolean useUniqueTableLocation) + TypeManager typeManager, + boolean useUniqueTableLocation, + boolean caseInsensitiveNameMatching, + Cache remoteNamespaceMappingCache, + Cache remoteTableMappingCache, + boolean viewEndpointsEnabled) { this.restSessionCatalog = requireNonNull(restSessionCatalog, "restSessionCatalog is null"); this.catalogName = requireNonNull(catalogName, "catalogName is null"); this.sessionType = 
requireNonNull(sessionType, "sessionType is null"); + this.credentials = ImmutableMap.copyOf(requireNonNull(credentials, "credentials is null")); + this.nestedNamespaceEnabled = nestedNamespaceEnabled; this.trinoVersion = requireNonNull(trinoVersion, "trinoVersion is null"); + this.typeManager = requireNonNull(typeManager, "typeManager is null"); this.useUniqueTableLocation = useUniqueTableLocation; + this.caseInsensitiveNameMatching = caseInsensitiveNameMatching; + this.remoteNamespaceMappingCache = requireNonNull(remoteNamespaceMappingCache, "remoteNamespaceMappingCache is null"); + this.remoteTableMappingCache = requireNonNull(remoteTableMappingCache, "remoteTableMappingCache is null"); + this.viewEndpointsEnabled = viewEndpointsEnabled; + } + + //@Override + public Optional getNamespaceSeparator() + { + return Optional.of(NAMESPACE_SEPARATOR); } @Override public boolean namespaceExists(ConnectorSession session, String namespace) { - return restSessionCatalog.namespaceExists(convert(session), Namespace.of(namespace)); + try { + return restSessionCatalog.namespaceExists(convert(session), toRemoteNamespace(session, toNamespace(namespace))); + } + catch (RESTException e) { + throw new TrinoException(ICEBERG_CATALOG_ERROR, "Failed to check namespace '%s'".formatted(namespace), e); + } } @Override public List listNamespaces(ConnectorSession session) { - return restSessionCatalog.listNamespaces(convert(session)).stream() - .map(Namespace::toString) - .collect(toImmutableList()); + if (nestedNamespaceEnabled) { + return collectNamespaces(session, Namespace.empty()); + } + try { + return restSessionCatalog.listNamespaces(convert(session)).stream() + .map(this::toSchemaName) + .collect(toImmutableList()); + } + catch (RESTException e) { + throw new TrinoException(ICEBERG_CATALOG_ERROR, "Failed to list namespaces", e); + } + } + + private List collectNamespaces(ConnectorSession session, Namespace parentNamespace) + { + try { + return restSessionCatalog.listNamespaces(convert(session), parentNamespace).stream() + .flatMap(childNamespace -> Stream.concat( + Stream.of(childNamespace.toString()), + collectNamespaces(session, childNamespace).stream())) + .collect(toImmutableList()); + } + catch (RESTException e) { + throw new TrinoException(ICEBERG_CATALOG_ERROR, "Failed to list namespaces", e); + } } @Override public void dropNamespace(ConnectorSession session, String namespace) { try { - restSessionCatalog.dropNamespace(convert(session), Namespace.of(namespace)); + restSessionCatalog.dropNamespace(convert(session), toRemoteNamespace(session, toNamespace(namespace))); } catch (NoSuchNamespaceException e) { throw new SchemaNotFoundException(namespace); } + catch (RESTException e) { + throw new TrinoException(ICEBERG_CATALOG_ERROR, "Failed to drop namespace '%s'".formatted(namespace), e); + } + if (caseInsensitiveNameMatching) { + remoteNamespaceMappingCache.invalidate(toNamespace(namespace)); + } } @Override @@ -128,11 +222,14 @@ public Map loadNamespaceMetadata(ConnectorSession session, Strin { try { // Return immutable metadata as direct modifications will not be reflected on the namespace - return ImmutableMap.copyOf(restSessionCatalog.loadNamespaceMetadata(convert(session), Namespace.of(namespace))); + return ImmutableMap.copyOf(restSessionCatalog.loadNamespaceMetadata(convert(session), toRemoteNamespace(session, toNamespace(namespace)))); } catch (NoSuchNamespaceException e) { throw new SchemaNotFoundException(namespace); } + catch (RESTException e) { + throw new 
TrinoException(ICEBERG_CATALOG_ERROR, "Failed to load metadata for namespace '%s'".formatted(namespace), e); + } } @Override @@ -145,15 +242,20 @@ public Optional getNamespacePrincipal(ConnectorSession session, @Override public void createNamespace(ConnectorSession session, String namespace, Map properties, TrinoPrincipal owner) { - restSessionCatalog.createNamespace( - convert(session), - Namespace.of(namespace), - Maps.transformValues(properties, property -> { - if (property instanceof String stringProperty) { - return stringProperty; - } - throw new TrinoException(NOT_SUPPORTED, "Non-string properties are not support for Iceberg REST catalog"); - })); + try { + restSessionCatalog.createNamespace( + convert(session), + toNamespace(namespace), + Maps.transformValues(properties, property -> { + if (property instanceof String stringProperty) { + return stringProperty; + } + throw new TrinoException(NOT_SUPPORTED, "Non-string properties are not support for Iceberg REST catalog"); + })); + } + catch (RESTException e) { + throw new TrinoException(ICEBERG_CATALOG_ERROR, "Failed to create namespace '%s'".formatted(namespace), e); + } } @Override @@ -170,34 +272,111 @@ public void renameNamespace(ConnectorSession session, String source, String targ @Override public List listTables(ConnectorSession session, Optional namespace) + { + return listTablesInternal(session, namespace).stream() + .map(TableInfo::tableName) + .collect(toImmutableList()); + } + + //@Override + private List listTablesInternal(ConnectorSession session, Optional namespace) { SessionContext sessionContext = convert(session); - List namespaces; + List namespaces = listNamespaces(session, namespace); - if (namespace.isPresent() && namespaceExists(session, namespace.get())) { - namespaces = ImmutableList.of(Namespace.of(namespace.get())); - } - else { - namespaces = listNamespaces(session).stream() - .map(Namespace::of) - .collect(toImmutableList()); + ImmutableList.Builder tables = ImmutableList.builder(); + for (Namespace restNamespace : namespaces) { + listTableIdentifiers(restNamespace, () -> { + try { + return restSessionCatalog.listTables(sessionContext, toRemoteNamespace(session, restNamespace)); + } + catch (RESTException e) { + throw new TrinoException(ICEBERG_CATALOG_ERROR, "Failed to list tables", e); + } + }).stream() + .map(id -> new TableInfo(SchemaTableName.schemaTableName(toSchemaName(id.namespace()), id.name()), TableInfo.ExtendedRelationType.TABLE)) + .forEach(tables::add); + if (viewEndpointsEnabled) { + listTableIdentifiers(restNamespace, () -> { + try { + return restSessionCatalog.listViews(sessionContext, toRemoteNamespace(session, restNamespace)); + } + catch (RESTException e) { + throw new TrinoException(ICEBERG_CATALOG_ERROR, "Failed to list views", e); + } + }).stream() + .map(id -> new TableInfo(SchemaTableName.schemaTableName(toSchemaName(id.namespace()), id.name()), TableInfo.ExtendedRelationType.OTHER_VIEW)) + .forEach(tables::add); + } } + return tables.build(); + } + + //@Override + public List listIcebergTables(ConnectorSession session, Optional namespace) + { + SessionContext sessionContext = convert(session); + List namespaces = listNamespaces(session, namespace); ImmutableList.Builder tables = ImmutableList.builder(); for (Namespace restNamespace : namespaces) { - try { - tables.addAll( - restSessionCatalog.listTables(sessionContext, restNamespace).stream() - .map(id -> SchemaTableName.schemaTableName(id.namespace().toString(), id.name())) - .collect(toImmutableList())); - } - catch 
(NoSuchNamespaceException e) { - // Namespace may have been deleted during listing - } + listTableIdentifiers(restNamespace, () -> { + try { + return restSessionCatalog.listTables(sessionContext, toRemoteNamespace(session, restNamespace)); + } + catch (RESTException e) { + throw new TrinoException(ICEBERG_CATALOG_ERROR, "Failed to list tables", e); + } + }).stream() + .map(id -> SchemaTableName.schemaTableName(toSchemaName(id.namespace()), id.name())) + .forEach(tables::add); } return tables.build(); } + @Override + public List listViews(ConnectorSession session, Optional namespace) + { + if (!viewEndpointsEnabled) { + return ImmutableList.of(); + } + + SessionContext sessionContext = convert(session); + List namespaces = listNamespaces(session, namespace); + + ImmutableList.Builder viewNames = ImmutableList.builder(); + for (Namespace restNamespace : namespaces) { + listTableIdentifiers(restNamespace, () -> { + try { + return restSessionCatalog.listViews(sessionContext, toRemoteNamespace(session, restNamespace)); + } + catch (RESTException e) { + throw new TrinoException(ICEBERG_CATALOG_ERROR, "Failed to list views", e); + } + }).stream() + .map(id -> SchemaTableName.schemaTableName(id.namespace().toString(), id.name())) + .forEach(viewNames::add); + } + return viewNames.build(); + } + + private static List listTableIdentifiers(Namespace restNamespace, Supplier> tableIdentifiersProvider) + { + try { + return tableIdentifiersProvider.get(); + } + catch (NoSuchNamespaceException e) { + // Namespace may have been deleted during listing + } + catch (ForbiddenException e) { + log.debug(e, "Failed to list tables from %s namespace because of insufficient permissions", restNamespace); + } + catch (RESTException e) { + throw new TrinoException(ICEBERG_CATALOG_ERROR, format("Failed to list tables from namespace: %s", restNamespace), e); + } + return ImmutableList.of(); + } + @Override public Optional> streamRelationColumns( ConnectorSession session, @@ -220,6 +399,32 @@ public Optional> streamRelationComments( @Override public Transaction newCreateTableTransaction( + ConnectorSession session, + SchemaTableName schemaTableName, + Schema schema, + PartitionSpec partitionSpec, + SortOrder sortOrder, + Optional location, + Map properties) + { + try { + Catalog.TableBuilder tableBuilder = restSessionCatalog.buildTable(convert(session), toRemoteTable(session, schemaTableName, true), schema) + .withPartitionSpec(partitionSpec) + .withSortOrder(sortOrder) + .withProperties(properties); + if (location.isEmpty()) { + // TODO Replace with createTransaction once S3 Tables supports stage-create option + return tableBuilder.create().newTransaction(); + } + return tableBuilder.withLocation(location.get()).createTransaction(); + } + catch (RESTException e) { + throw new TrinoException(ICEBERG_CATALOG_ERROR, "Failed to create transaction", e); + } + } + + //@Override + public Transaction newCreateOrReplaceTableTransaction( ConnectorSession session, SchemaTableName schemaTableName, Schema schema, @@ -228,32 +433,59 @@ public Transaction newCreateTableTransaction( String location, Map properties) { - return restSessionCatalog.buildTable(convert(session), toIdentifier(schemaTableName), schema) - .withPartitionSpec(partitionSpec) - .withSortOrder(sortOrder) - .withLocation(location) - .withProperties(properties) - .createTransaction(); + try { + return restSessionCatalog.buildTable(convert(session), toRemoteTable(session, schemaTableName, true), schema) + .withPartitionSpec(partitionSpec) + .withSortOrder(sortOrder) + 
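+                    // Unlike newCreateTableTransaction above, this replace flow always sets an explicit
+                    // location and finishes with createOrReplaceTransaction().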
.withLocation(location) + .withProperties(properties) + .createOrReplaceTransaction(); + } + catch (RESTException e) { + throw new TrinoException(ICEBERG_CATALOG_ERROR, "Failed to create transaction", e); + } } @Override public void registerTable(ConnectorSession session, SchemaTableName tableName, TableMetadata tableMetadata) { - throw new TrinoException(NOT_SUPPORTED, "registerTable is not supported for Iceberg REST catalog"); + TableIdentifier tableIdentifier = TableIdentifier.of(toRemoteNamespace(session, toNamespace(tableName.getSchemaName())), tableName.getTableName()); + try { + restSessionCatalog.registerTable(convert(session), tableIdentifier, tableMetadata.metadataFileLocation()); + } + catch (RESTException e) { + throw new TrinoException(ICEBERG_CATALOG_ERROR, "Failed to register table '%s'".formatted(tableName.getTableName()), e); + } } @Override public void unregisterTable(ConnectorSession session, SchemaTableName tableName) { - throw new TrinoException(NOT_SUPPORTED, "unregisterTable is not supported for Iceberg REST catalogs"); + try { + if (!restSessionCatalog.dropTable(convert(session), toRemoteTable(session, tableName, true))) { + throw new TableNotFoundException(tableName); + } + } + catch (RESTException e) { + throw new TrinoException(ICEBERG_CATALOG_ERROR, "Failed to unregister table '%s'".formatted(tableName.getTableName()), e); + } + invalidateTableCache(tableName); + invalidateTableMappingCache(tableName); } @Override public void dropTable(ConnectorSession session, SchemaTableName schemaTableName) { - if (!restSessionCatalog.purgeTable(convert(session), toIdentifier(schemaTableName))) { - throw new TrinoException(ICEBERG_CATALOG_ERROR, format("Failed to drop table: %s", schemaTableName)); + try { + if (!restSessionCatalog.purgeTable(convert(session), toRemoteTable(session, schemaTableName, true))) { + throw new TrinoException(ICEBERG_CATALOG_ERROR, "Failed to drop table '%s'".formatted(schemaTableName)); + } + } + catch (RESTException e) { + throw new TrinoException(ICEBERG_CATALOG_ERROR, "Failed to drop table '%s'".formatted(schemaTableName.getTableName()), e); } + invalidateTableCache(schemaTableName); + invalidateTableMappingCache(schemaTableName); } @Override @@ -268,31 +500,58 @@ public void dropCorruptedTable(ConnectorSession session, SchemaTableName schemaT public void renameTable(ConnectorSession session, SchemaTableName from, SchemaTableName to) { try { - restSessionCatalog.renameTable(convert(session), toIdentifier(from), toIdentifier(to)); + restSessionCatalog.renameTable(convert(session), toRemoteTable(session, from, true), toRemoteTable(session, to, true)); } catch (RESTException e) { throw new TrinoException(ICEBERG_CATALOG_ERROR, format("Failed to rename table %s to %s", from, to), e); } + invalidateTableCache(from); + invalidateTableMappingCache(from); } @Override - public Table loadTable(ConnectorSession session, SchemaTableName schemaTableName) + public BaseTable loadTable(ConnectorSession session, SchemaTableName schemaTableName) { + Namespace namespace = toNamespace(schemaTableName.getSchemaName()); try { - return tableCache.computeIfAbsent( - schemaTableName.toString(), - key -> { - BaseTable baseTable = (BaseTable) restSessionCatalog.loadTable(convert(session), toIdentifier(schemaTableName)); + return uncheckedCacheGet( + tableCache, + schemaTableName, + () -> { + BaseTable baseTable; + try { + baseTable = (BaseTable) restSessionCatalog.loadTable(convert(session), toRemoteObject(session, schemaTableName)); + } + catch (RESTException e) { + 
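+                        // Transport-level REST failures become catalog errors here; NoSuchTableException
+                        // escapes the cache loader and is mapped to TableNotFoundException by the outer catch.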
throw new TrinoException(ICEBERG_CATALOG_ERROR, "Failed to load table '%s'".formatted(schemaTableName.getTableName()), e); + } // Creating a new base table is necessary to adhere to Trino's expectations for quoted table names - return new BaseTable(baseTable.operations(), quotedTableName(schemaTableName)); + return new BaseTable(baseTable.operations(), quotedTableName(schemaTableName), baseTable.reporter()); }); } - catch (NoSuchTableException e) { - throw new TableNotFoundException(schemaTableName, e); + catch (UncheckedExecutionException e) { + if (e.getCause() instanceof NoSuchTableException) { + throw new TableNotFoundException(schemaTableName, e.getCause()); + } + throw new TrinoException(ICEBERG_CATALOG_ERROR, format("Failed to load table: %s in %s namespace", schemaTableName.getTableName(), namespace), e.getCause()); + } + } + + private TableIdentifier toRemoteObject(ConnectorSession session, SchemaTableName schemaTableName) + { + TableIdentifier remoteTable = toRemoteTable(session, schemaTableName, false); + if (!remoteTable.name().equals(schemaTableName.getTableName())) { + return remoteTable; + } + + TableIdentifier remoteView = toRemoteView(session, schemaTableName, false); + if (!remoteView.name().equals(schemaTableName.getTableName())) { + return remoteView; } - catch (RuntimeException e) { - throw new TrinoException(ICEBERG_CATALOG_ERROR, format("Failed to load table: %s", schemaTableName), e); + if (remoteView.name().equals(schemaTableName.getTableName()) && remoteTable.name().equals(schemaTableName.getTableName())) { + return remoteTable; } + throw new RuntimeException("Unable to find remote object"); } @Override @@ -304,13 +563,20 @@ public Map> tryGetColumnMetadata(Connector @Override public void updateTableComment(ConnectorSession session, SchemaTableName schemaTableName, Optional comment) { - Table icebergTable = restSessionCatalog.loadTable(convert(session), toIdentifier(schemaTableName)); + Table icebergTable; + try { + icebergTable = restSessionCatalog.loadTable(convert(session), toRemoteTable(session, schemaTableName, true)); + } + catch (RESTException e) { + throw new TrinoException(ICEBERG_CATALOG_ERROR, "Failed to load table '%s'".formatted(schemaTableName.getTableName()), e); + } if (comment.isEmpty()) { icebergTable.updateProperties().remove(TABLE_COMMENT).commit(); } else { icebergTable.updateProperties().set(TABLE_COMMENT, comment.get()).commit(); } + invalidateTableCache(schemaTableName); } @Override @@ -320,11 +586,12 @@ public String defaultTableLocation(ConnectorSession session, SchemaTableName sch Map properties = loadNamespaceMetadata(session, schemaTableName.getSchemaName()); String databaseLocation = (String) properties.get(IcebergSchemaProperties.LOCATION_PROPERTY); - checkArgument(databaseLocation != null, "location must be set for %s", schemaTableName.getSchemaName()); - - if (databaseLocation.endsWith("/")) { - return databaseLocation + tableName; + if (databaseLocation == null) { + // Iceberg REST catalog doesn't require location property. + // S3 Tables doesn't return the property. 
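+            // A null result lets table creation proceed without an explicit location, so the REST
+            // catalog (for example S3 Tables) can assign one itself.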
+ return null; } + return appendPath(databaseLocation, tableName); } @@ -346,13 +613,40 @@ public void setTablePrincipal(ConnectorSession session, SchemaTableName schemaTa @Override public void createView(ConnectorSession session, SchemaTableName schemaViewName, ConnectorViewDefinition definition, boolean replace) { - throw new TrinoException(NOT_SUPPORTED, "createView is not supported for Iceberg REST catalog"); + ImmutableMap.Builder properties = ImmutableMap.builder(); + definition.getOwner().ifPresent(owner -> properties.put(ICEBERG_VIEW_RUN_AS_OWNER, owner)); + definition.getComment().ifPresent(comment -> properties.put(COMMENT, comment)); + Schema schema = IcebergUtil.schemaFromViewColumns(typeManager, definition.getColumns()); + ViewBuilder viewBuilder = restSessionCatalog.buildView(convert(session), toRemoteView(session, schemaViewName, true)); + viewBuilder = viewBuilder.withSchema(schema) + .withQuery("trino", definition.getOriginalSql()) + .withDefaultNamespace(toRemoteNamespace(session, toNamespace(schemaViewName.getSchemaName()))) + .withDefaultCatalog(definition.getCatalog().orElse(null)) + .withProperties(properties.buildOrThrow()) + .withLocation(defaultTableLocation(session, schemaViewName)); + try { + if (replace) { + viewBuilder.createOrReplace(); + } + else { + viewBuilder.create(); + } + } + catch (RESTException e) { + throw new TrinoException(ICEBERG_CATALOG_ERROR, "Failed to create view '%s'".formatted(schemaViewName.getTableName()), e); + } } @Override public void renameView(ConnectorSession session, SchemaTableName source, SchemaTableName target) { - throw new TrinoException(NOT_SUPPORTED, "renameView is not supported for Iceberg REST catalog"); + try { + restSessionCatalog.renameView(convert(session), toRemoteView(session, source, true), toRemoteView(session, target, true)); + } + catch (RESTException e) { + throw new TrinoException(ICEBERG_CATALOG_ERROR, "Failed to rename view '%s' to '%s'".formatted(source, target), e); + } + invalidateTableMappingCache(source); } @Override @@ -364,25 +658,67 @@ public void setViewPrincipal(ConnectorSession session, SchemaTableName schemaVie @Override public void dropView(ConnectorSession session, SchemaTableName schemaViewName) { - throw new TrinoException(NOT_SUPPORTED, "dropView is not supported for Iceberg REST catalog"); - } - - @Override - public List listViews(ConnectorSession session, Optional namespace) - { - return ImmutableList.of(); + try { + restSessionCatalog.dropView(convert(session), toRemoteView(session, schemaViewName, true)); + } + catch (RESTException e) { + throw new TrinoException(ICEBERG_CATALOG_ERROR, "Failed to drop view '%s'".formatted(schemaViewName.getTableName()), e); + } + invalidateTableMappingCache(schemaViewName); } @Override public Map getViews(ConnectorSession session, Optional namespace) { - return ImmutableMap.of(); + SessionContext sessionContext = convert(session); + ImmutableMap.Builder views = ImmutableMap.builder(); + for (Namespace restNamespace : listNamespaces(session, namespace)) { + List restViews; + try { + restViews = restSessionCatalog.listViews(sessionContext, toRemoteNamespace(session, restNamespace)); + } + catch (RESTException e) { + throw new TrinoException(ICEBERG_CATALOG_ERROR, "Failed to list views", e); + } + for (TableIdentifier restView : restViews) { + SchemaTableName schemaTableName = SchemaTableName.schemaTableName(restView.namespace().toString(), restView.name()); + try { + getView(session, schemaTableName).ifPresent(view -> views.put(schemaTableName, view)); + } + 
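+                // The catch below skips views written in a SQL dialect Trino cannot read; other errors propagate.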
catch (TrinoException e) { + if (e.getErrorCode().equals(ICEBERG_UNSUPPORTED_VIEW_DIALECT.toErrorCode())) { + log.debug(e, "Skip unsupported view dialect: %s", schemaTableName); + continue; + } + throw e; + } + } + } + + return views.buildOrThrow(); } @Override public Optional getView(ConnectorSession session, SchemaTableName viewName) { - return Optional.empty(); + return getIcebergView(session, viewName, false).flatMap(view -> { + SQLViewRepresentation sqlView = view.sqlFor("trino"); + if (!sqlView.dialect().equalsIgnoreCase("trino")) { + throw new TrinoException(ICEBERG_UNSUPPORTED_VIEW_DIALECT, "Cannot read unsupported dialect '%s' for view '%s'".formatted(sqlView.dialect(), viewName)); + } + + Optional comment = Optional.ofNullable(view.properties().get(COMMENT)); + List viewColumns = IcebergUtil.viewColumnsFromSchema(typeManager, view.schema()); + ViewVersion currentVersion = view.currentVersion(); + Optional catalog = Optional.ofNullable(currentVersion.defaultCatalog()); + Optional schema = Optional.empty(); + if (catalog.isPresent() && !currentVersion.defaultNamespace().isEmpty()) { + schema = Optional.of(currentVersion.defaultNamespace().toString()); + } + + Optional owner = Optional.ofNullable(view.properties().get(ICEBERG_VIEW_RUN_AS_OWNER)); + return Optional.of(new ConnectorViewDefinition(sqlView.sql(), catalog, schema, viewColumns, comment, owner, owner.isEmpty())); + }); } @Override @@ -391,12 +727,35 @@ public List listMaterializedViews(ConnectorSession session, Opt return ImmutableList.of(); } + private Optional getIcebergView(ConnectorSession session, SchemaTableName viewName, boolean getCached) + { + if (!viewEndpointsEnabled) { + return Optional.empty(); + } + + try { + return Optional.of(restSessionCatalog.loadView(convert(session), toRemoteView(session, viewName, getCached))); + } + catch (NoSuchViewException e) { + return Optional.empty(); + } + catch (RESTException e) { + throw new TrinoException(ICEBERG_CATALOG_ERROR, "Failed to load view '%s'".formatted(viewName.getTableName()), e); + } + } + @Override public void createMaterializedView(ConnectorSession session, SchemaTableName viewName, ConnectorMaterializedViewDefinition definition, boolean replace, boolean ignoreExisting) { throw new TrinoException(NOT_SUPPORTED, "createMaterializedView is not supported for Iceberg REST catalog"); } + @Override + public void updateMaterializedViewColumnComment(ConnectorSession session, SchemaTableName schemaViewName, String columnName, Optional comment) + { + throw new TrinoException(NOT_SUPPORTED, "updateMaterializedViewColumnComment is not supported for Iceberg REST catalog"); + } + @Override public void dropMaterializedView(ConnectorSession session, SchemaTableName viewName) { @@ -409,6 +768,18 @@ public Optional getMaterializedView(Connect return Optional.empty(); } + //@Override + public Map getMaterializedViewProperties(ConnectorSession session, SchemaTableName viewName, ConnectorMaterializedViewDefinition definition) + { + throw new TrinoException(NOT_SUPPORTED, "The Iceberg REST catalog does not support materialized views"); + } + + //@Override + public Optional getMaterializedViewStorageTable(ConnectorSession session, SchemaTableName viewName) + { + throw new TrinoException(NOT_SUPPORTED, "The Iceberg REST catalog does not support materialized views"); + } + @Override public void renameMaterializedView(ConnectorSession session, SchemaTableName source, SchemaTableName target) { @@ -432,25 +803,39 @@ public Optional redirectTable(ConnectorSession session, @Override 
public void updateViewComment(ConnectorSession session, SchemaTableName schemaViewName, Optional comment) { - throw new TrinoException(NOT_SUPPORTED, "updateViewComment is not supported for Iceberg REST catalog"); + View view = getIcebergView(session, schemaViewName, true).orElseThrow(() -> new ViewNotFoundException(schemaViewName)); + UpdateViewProperties updateViewProperties = view.updateProperties(); + comment.ifPresentOrElse( + value -> updateViewProperties.set(COMMENT, value), + () -> updateViewProperties.remove(COMMENT)); + updateViewProperties.commit(); } @Override public void updateViewColumnComment(ConnectorSession session, SchemaTableName schemaViewName, String columnName, Optional comment) { - throw new TrinoException(NOT_SUPPORTED, "updateViewColumnComment is not supported for Iceberg REST catalog"); - } + View view = getIcebergView(session, schemaViewName, true) + .orElseThrow(() -> new ViewNotFoundException(schemaViewName)); + + ViewVersion current = view.currentVersion(); + Schema updatedSchema = IcebergUtil.updateColumnComment(view.schema(), columnName, comment.orElse(null)); + ReplaceViewVersion replaceViewVersion = view.replaceVersion() + .withSchema(updatedSchema) + .withDefaultCatalog(current.defaultCatalog()) + .withDefaultNamespace(current.defaultNamespace()); + for (ViewRepresentation representation : view.currentVersion().representations()) { + if (representation instanceof SQLViewRepresentation sqlViewRepresentation) { + replaceViewVersion.withQuery(sqlViewRepresentation.dialect(), sqlViewRepresentation.sql()); + } + } - @Override - public void updateMaterializedViewColumnComment(ConnectorSession session, SchemaTableName schemaViewName, String columnName, Optional comment) - { - throw new TrinoException(NOT_SUPPORTED, "updateMaterializedViewColumnComment is not supported for Iceberg REST catalog"); + replaceViewVersion.commit(); } private SessionCatalog.SessionContext convert(ConnectorSession session) { return switch (sessionType) { - case NONE -> new SessionContext(randomUUID().toString(), null, null, ImmutableMap.of(), session.getIdentity()); + case NONE -> new SessionContext(randomUUID().toString(), null, credentials, ImmutableMap.of(), session.getIdentity()); case USER -> { String sessionId = format("%s-%s", session.getUser(), session.getSource().orElse("default")); @@ -465,11 +850,11 @@ private SessionCatalog.SessionContext convert(ConnectorSession session) .buildOrThrow(); String subjectJwt = new DefaultJwtBuilder() - .setSubject(session.getUser()) - .setIssuer(trinoVersion) - .setIssuedAt(new Date()) - .addClaims(claims) - .serializeToJsonWith(new JacksonSerializer<>()) + .subject(session.getUser()) + .issuer(trinoVersion) + .issuedAt(new Date()) + .claims(claims) + .json(new JacksonSerializer<>()) .compact(); Map credentials = ImmutableMap.builder() @@ -482,8 +867,156 @@ private SessionCatalog.SessionContext convert(ConnectorSession session) }; } - private static TableIdentifier toIdentifier(SchemaTableName schemaTableName) + private void invalidateTableCache(SchemaTableName schemaTableName) + { + tableCache.invalidate(schemaTableName); + } + + private void invalidateTableMappingCache(SchemaTableName schemaTableName) + { + if (caseInsensitiveNameMatching) { + remoteTableMappingCache.invalidate(toIdentifier(schemaTableName)); + } + } + + private Namespace toNamespace(String schemaName) + { + if (!nestedNamespaceEnabled && schemaName.contains(NAMESPACE_SEPARATOR)) { + throw new TrinoException(NOT_SUPPORTED, "Nested namespace is not enabled for this catalog"); 
+ } + return Namespace.of(Splitter.on(NAMESPACE_SEPARATOR).omitEmptyStrings().trimResults().splitToList(schemaName).toArray(new String[0])); + } + + private String toSchemaName(Namespace namespace) + { + if (!nestedNamespaceEnabled && namespace.length() != 1) { + throw new TrinoException(NOT_SUPPORTED, "Nested namespace is not enabled for this catalog"); + } + return String.join(NAMESPACE_SEPARATOR, namespace.levels()); + } + + private TableIdentifier toIdentifier(SchemaTableName schemaTableName) + { + return TableIdentifier.of(toNamespace(schemaTableName.getSchemaName()), schemaTableName.getTableName()); + } + + private List listNamespaces(ConnectorSession session, Optional namespace) + { + if (namespace.isEmpty()) { + return listNamespaces(session).stream() + .map(this::toNamespace) + .collect(toImmutableList()); + } + + return ImmutableList.of(toNamespace(namespace.get())); + } + + private TableIdentifier toRemoteTable(ConnectorSession session, SchemaTableName schemaTableName, boolean getCached) + { + TableIdentifier tableIdentifier = toIdentifier(schemaTableName); + return toRemoteObject(tableIdentifier, () -> findRemoteTable(session, tableIdentifier), getCached); + } + + private TableIdentifier findRemoteTable(ConnectorSession session, TableIdentifier tableIdentifier) + { + Namespace remoteNamespace = toRemoteNamespace(session, tableIdentifier.namespace()); + List tableIdentifiers; + try { + tableIdentifiers = restSessionCatalog.listTables(convert(session), remoteNamespace); + } + catch (RESTException e) { + throw new TrinoException(ICEBERG_CATALOG_ERROR, "Failed to list tables", e); + } + TableIdentifier matchingTable = null; + for (TableIdentifier identifier : tableIdentifiers) { + if (identifier.name().equalsIgnoreCase(tableIdentifier.name())) { + if (matchingTable != null) { + throw new TrinoException(NOT_SUPPORTED, "Duplicate table names are not supported with Iceberg REST catalog: " + + Joiner.on(", ").join(matchingTable, identifier.name())); + } + matchingTable = identifier; + } + } + return matchingTable == null ? TableIdentifier.of(remoteNamespace, tableIdentifier.name()) : matchingTable; + } + + private TableIdentifier toRemoteView(ConnectorSession session, SchemaTableName schemaViewName, boolean getCached) + { + TableIdentifier tableIdentifier = toIdentifier(schemaViewName); + return toRemoteObject(tableIdentifier, () -> findRemoteView(session, tableIdentifier), getCached); + } + + private TableIdentifier findRemoteView(ConnectorSession session, TableIdentifier tableIdentifier) + { + if (!viewEndpointsEnabled) { + return tableIdentifier; + } + + Namespace remoteNamespace = toRemoteNamespace(session, tableIdentifier.namespace()); + List tableIdentifiers; + try { + tableIdentifiers = restSessionCatalog.listViews(convert(session), remoteNamespace); + } + catch (RESTException e) { + throw new TrinoException(ICEBERG_CATALOG_ERROR, "Failed to list views", e); + } + TableIdentifier matchingView = null; + for (TableIdentifier identifier : tableIdentifiers) { + if (identifier.name().equalsIgnoreCase(tableIdentifier.name())) { + if (matchingView != null) { + throw new TrinoException(NOT_SUPPORTED, "Duplicate view names are not supported with Iceberg REST catalog: " + + Joiner.on(", ").join(matchingView.name(), identifier.name())); + } + matchingView = identifier; + } + } + return matchingView == null ? 
TableIdentifier.of(remoteNamespace, tableIdentifier.name()) : matchingView; + } + + private TableIdentifier toRemoteObject(TableIdentifier tableIdentifier, Supplier remoteObjectProvider, boolean getCached) + { + if (caseInsensitiveNameMatching) { + if (getCached) { + return uncheckedCacheGet(remoteTableMappingCache, tableIdentifier, remoteObjectProvider); + } + return remoteObjectProvider.get(); + } + return tableIdentifier; + } + + private Namespace toRemoteNamespace(ConnectorSession session, Namespace trinoNamespace) + { + if (caseInsensitiveNameMatching) { + return uncheckedCacheGet(remoteNamespaceMappingCache, trinoNamespace, () -> findRemoteNamespace(session, trinoNamespace)); + } + return trinoNamespace; + } + + private Namespace findRemoteNamespace(ConnectorSession session, Namespace trinoNamespace) + { + List matchingRemoteNamespaces = listNamespaces(session, Namespace.empty()).stream() + .filter(ns -> toTrinoNamespace(ns).equals(trinoNamespace)) + .collect(toImmutableList()); + if (matchingRemoteNamespaces.size() > 1) { + throw new TrinoException(NOT_SUPPORTED, "Duplicate namespace names are not supported with Iceberg REST catalog: " + matchingRemoteNamespaces); + } + return matchingRemoteNamespaces.isEmpty() ? trinoNamespace : matchingRemoteNamespaces.get(0); + } + + private List listNamespaces(ConnectorSession session, Namespace parentNamespace) + { + List childNamespaces; + try { + childNamespaces = restSessionCatalog.listNamespaces(convert(session), parentNamespace); + } + catch (RESTException e) { + throw new TrinoException(ICEBERG_CATALOG_ERROR, "Failed to list namespaces", e); + } + return childNamespaces.stream().flatMap(childNamespace -> Stream.concat(Stream.of(childNamespace), listNamespaces(session, childNamespace).stream())).toList(); + } + + private static Namespace toTrinoNamespace(Namespace namespace) { - return TableIdentifier.of(schemaTableName.getSchemaName(), schemaTableName.getTableName()); + return Namespace.of(Arrays.stream(namespace.levels()).map(level -> level.toLowerCase(ENGLISH)).toArray(String[]::new)); } } diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/delete/DeleteFile.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/delete/DeleteFile.java index 17fdcad61e10..b74bd34d0cbc 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/delete/DeleteFile.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/delete/DeleteFile.java @@ -13,135 +13,70 @@ */ package io.trino.plugin.iceberg.delete; -import com.fasterxml.jackson.annotation.JsonCreator; -import com.fasterxml.jackson.annotation.JsonProperty; import com.google.common.collect.ImmutableList; -import com.google.common.collect.ImmutableMap; -import io.airlift.slice.SizeOf; import org.apache.iceberg.FileContent; import org.apache.iceberg.FileFormat; +import org.apache.iceberg.types.Conversions; -import java.nio.ByteBuffer; import java.util.List; -import java.util.Map; import java.util.Optional; -import static com.google.common.base.MoreObjects.firstNonNull; import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.collect.ImmutableMap.toImmutableMap; import static io.airlift.slice.SizeOf.SIZE_OF_INT; import static io.airlift.slice.SizeOf.estimatedSizeOf; import static io.airlift.slice.SizeOf.instanceSize; -import static io.trino.plugin.base.io.ByteBuffers.getWrappedBytes; import static java.util.Objects.requireNonNull; - -public final class DeleteFile +import static 
org.apache.iceberg.MetadataColumns.DELETE_FILE_POS; + +public record DeleteFile( + FileContent content, + String path, + FileFormat format, + long recordCount, + long fileSizeInBytes, + List equalityFieldIds, + Optional rowPositionLowerBound, + Optional rowPositionUpperBound, + long dataSequenceNumber) { private static final long INSTANCE_SIZE = instanceSize(DeleteFile.class); - private final FileContent content; - private final String path; - private final FileFormat format; - private final long recordCount; - private final long fileSizeInBytes; - private final List equalityFieldIds; - private final Map lowerBounds; - private final Map upperBounds; - public static DeleteFile fromIceberg(org.apache.iceberg.DeleteFile deleteFile) { - Map lowerBounds = firstNonNull(deleteFile.lowerBounds(), ImmutableMap.of()) - .entrySet().stream().collect(toImmutableMap(Map.Entry::getKey, entry -> getWrappedBytes(entry.getValue()).clone())); - Map upperBounds = firstNonNull(deleteFile.upperBounds(), ImmutableMap.of()) - .entrySet().stream().collect(toImmutableMap(Map.Entry::getKey, entry -> getWrappedBytes(entry.getValue()).clone())); + Optional rowPositionLowerBound = Optional.ofNullable(deleteFile.lowerBounds()) + .map(bounds -> bounds.get(DELETE_FILE_POS.fieldId())) + .map(bytes -> Conversions.fromByteBuffer(DELETE_FILE_POS.type(), bytes)); + Optional rowPositionUpperBound = Optional.ofNullable(deleteFile.upperBounds()) + .map(bounds -> bounds.get(DELETE_FILE_POS.fieldId())) + .map(bytes -> Conversions.fromByteBuffer(DELETE_FILE_POS.type(), bytes)); return new DeleteFile( deleteFile.content(), - deleteFile.path().toString(), + deleteFile.location(), deleteFile.format(), deleteFile.recordCount(), deleteFile.fileSizeInBytes(), Optional.ofNullable(deleteFile.equalityFieldIds()).orElseGet(ImmutableList::of), - lowerBounds, - upperBounds); - } - - @JsonCreator - public DeleteFile( - FileContent content, - String path, - FileFormat format, - long recordCount, - long fileSizeInBytes, - List equalityFieldIds, - Map lowerBounds, - Map upperBounds) - { - this.content = requireNonNull(content, "content is null"); - this.path = requireNonNull(path, "path is null"); - this.format = requireNonNull(format, "format is null"); - this.recordCount = recordCount; - this.fileSizeInBytes = fileSizeInBytes; - this.equalityFieldIds = ImmutableList.copyOf(requireNonNull(equalityFieldIds, "equalityFieldIds is null")); - this.lowerBounds = ImmutableMap.copyOf(requireNonNull(lowerBounds, "lowerBounds is null")); - this.upperBounds = ImmutableMap.copyOf(requireNonNull(upperBounds, "upperBounds is null")); - } - - @JsonProperty - public FileContent content() - { - return content; - } - - @JsonProperty - public String path() - { - return path; - } - - @JsonProperty - public FileFormat format() - { - return format; - } - - @JsonProperty - public long recordCount() - { - return recordCount; - } - - @JsonProperty - public long fileSizeInBytes() - { - return fileSizeInBytes; - } - - @JsonProperty - public List equalityFieldIds() - { - return equalityFieldIds; - } - - @JsonProperty - public Map getLowerBounds() - { - return lowerBounds; + rowPositionLowerBound, + rowPositionUpperBound, + deleteFile.dataSequenceNumber()); } - @JsonProperty - public Map getUpperBounds() + public DeleteFile { - return upperBounds; + requireNonNull(content, "content is null"); + requireNonNull(path, "path is null"); + requireNonNull(format, "format is null"); + equalityFieldIds = ImmutableList.copyOf(requireNonNull(equalityFieldIds, "equalityFieldIds is 
null")); + requireNonNull(rowPositionLowerBound, "rowPositionLowerBound is null"); + requireNonNull(rowPositionUpperBound, "rowPositionUpperBound is null"); } - public long getRetainedSizeInBytes() + public long retainedSizeInBytes() { return INSTANCE_SIZE + estimatedSizeOf(path) - + estimatedSizeOf(equalityFieldIds, ignored -> SIZE_OF_INT) - + estimatedSizeOf(lowerBounds, entry -> SIZE_OF_INT, SizeOf::sizeOf) - + estimatedSizeOf(upperBounds, entry -> SIZE_OF_INT, SizeOf::sizeOf); + + estimatedSizeOf(equalityFieldIds, ignore -> SIZE_OF_INT); } @Override diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/delete/DeleteFilter.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/delete/DeleteFilter.java index b061ec276efc..68ba48f49848 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/delete/DeleteFilter.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/delete/DeleteFilter.java @@ -19,5 +19,5 @@ public interface DeleteFilter { - RowPredicate createPredicate(List columns); + RowPredicate createPredicate(List columns, long dataSequenceNumber); } diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/delete/DeleteManager.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/delete/DeleteManager.java new file mode 100644 index 000000000000..5dc29b8482eb --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/delete/DeleteManager.java @@ -0,0 +1,214 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.iceberg.delete; + +import com.google.common.base.VerifyException; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.util.concurrent.Futures; +import com.google.common.util.concurrent.ListenableFuture; +import io.airlift.slice.Slice; +import io.trino.plugin.iceberg.IcebergColumnHandle; +import io.trino.plugin.iceberg.IcebergPageSourceProvider.ReaderPageSourceWithRowPositions; +import io.trino.plugin.iceberg.delete.EqualityDeleteFilter.EqualityDeleteFilterBuilder; +import io.trino.spi.TrinoException; +import io.trino.spi.connector.ConnectorPageSource; +import io.trino.spi.predicate.Domain; +import io.trino.spi.predicate.NullableValue; +import io.trino.spi.predicate.Range; +import io.trino.spi.predicate.TupleDomain; +import io.trino.spi.predicate.ValueSet; +import io.trino.spi.type.TypeManager; +import org.apache.iceberg.Schema; +import org.roaringbitmap.longlong.LongBitmapDataProvider; +import org.roaringbitmap.longlong.Roaring64Bitmap; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ExecutionException; + +import static com.google.common.base.Verify.verify; +import static com.google.common.collect.ImmutableList.toImmutableList; +import static io.airlift.slice.Slices.utf8Slice; +import static io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_BAD_DATA; +import static io.trino.plugin.iceberg.IcebergUtil.getColumnHandle; +import static io.trino.plugin.iceberg.IcebergUtil.schemaFromHandles; +import static io.trino.plugin.iceberg.delete.PositionDeleteFilter.readPositionDeletes; +import static io.trino.spi.type.VarcharType.VARCHAR; +import static java.util.Objects.requireNonNull; +import static org.apache.iceberg.MetadataColumns.DELETE_FILE_PATH; +import static org.apache.iceberg.MetadataColumns.DELETE_FILE_POS; + +public class DeleteManager +{ + private final TypeManager typeManager; + private final Map, EqualityDeleteFilterBuilder> equalityDeleteFiltersBySchema = new ConcurrentHashMap<>(); + + public DeleteManager(TypeManager typeManager) + { + this.typeManager = requireNonNull(typeManager, "typeManager is null"); + } + + public Optional getDeletePredicate( + String dataFilePath, + long dataSequenceNumber, + List deleteFiles, + List readColumns, + Schema tableSchema, + ReaderPageSourceWithRowPositions readerPageSourceWithRowPositions, + DeletePageSourceProvider deletePageSourceProvider) + { + if (deleteFiles.isEmpty()) { + return Optional.empty(); + } + + List positionDeleteFiles = new ArrayList<>(); + List equalityDeleteFiles = new ArrayList<>(); + for (DeleteFile deleteFile : deleteFiles) { + switch (deleteFile.content()) { + case POSITION_DELETES -> positionDeleteFiles.add(deleteFile); + case EQUALITY_DELETES -> equalityDeleteFiles.add(deleteFile); + case DATA -> throw new VerifyException("DATA is not delete file type"); + } + } + + Optional positionDeletes = createPositionDeleteFilter(dataFilePath, positionDeleteFiles, readerPageSourceWithRowPositions, deletePageSourceProvider) + .map(filter -> filter.createPredicate(readColumns, dataSequenceNumber)); + Optional equalityDeletes = createEqualityDeleteFilter(equalityDeleteFiles, tableSchema, deletePageSourceProvider).stream() + .map(filter -> filter.createPredicate(readColumns, 
dataSequenceNumber)) + .reduce(RowPredicate::and); + + if (positionDeletes.isEmpty()) { + return equalityDeletes; + } + return equalityDeletes + .map(rowPredicate -> positionDeletes.get().and(rowPredicate)) + .or(() -> positionDeletes); + } + + public interface DeletePageSourceProvider + { + ConnectorPageSource openDeletes( + DeleteFile delete, + List deleteColumns, + TupleDomain tupleDomain); + } + + private Optional createPositionDeleteFilter( + String dataFilePath, + List positionDeleteFiles, + ReaderPageSourceWithRowPositions readerPageSourceWithRowPositions, + DeletePageSourceProvider deletePageSourceProvider) + { + if (positionDeleteFiles.isEmpty()) { + return Optional.empty(); + } + + Slice targetPath = utf8Slice(dataFilePath); + + Optional startRowPosition = readerPageSourceWithRowPositions.startRowPosition(); + Optional endRowPosition = readerPageSourceWithRowPositions.endRowPosition(); + verify(startRowPosition.isPresent() == endRowPosition.isPresent(), "startRowPosition and endRowPosition must be specified together"); + IcebergColumnHandle deleteFilePath = getColumnHandle(DELETE_FILE_PATH, typeManager); + IcebergColumnHandle deleteFilePos = getColumnHandle(DELETE_FILE_POS, typeManager); + List deleteColumns = ImmutableList.of(deleteFilePath, deleteFilePos); + TupleDomain deleteDomain = TupleDomain.fromFixedValues(ImmutableMap.of(deleteFilePath, NullableValue.of(VARCHAR, targetPath))); + if (startRowPosition.isPresent()) { + Range positionRange = Range.range(deleteFilePos.getType(), startRowPosition.get(), true, endRowPosition.get(), true); + TupleDomain positionDomain = TupleDomain.withColumnDomains(ImmutableMap.of(deleteFilePos, Domain.create(ValueSet.ofRanges(positionRange), false))); + deleteDomain = deleteDomain.intersect(positionDomain); + } + + LongBitmapDataProvider deletedRows = new Roaring64Bitmap(); + for (DeleteFile deleteFile : positionDeleteFiles) { + if (shouldLoadPositionDeleteFile(deleteFile, startRowPosition, endRowPosition)) { + try (ConnectorPageSource pageSource = deletePageSourceProvider.openDeletes(deleteFile, deleteColumns, deleteDomain)) { + readPositionDeletes(pageSource, targetPath, deletedRows); + } + catch (IOException e) { + throw new UncheckedIOException(e); + } + } + } + + if (deletedRows.isEmpty()) { + return Optional.empty(); + } + return Optional.of(new PositionDeleteFilter(deletedRows)); + } + + private static boolean shouldLoadPositionDeleteFile(DeleteFile deleteFile, Optional startRowPosition, Optional endRowPosition) + { + if (startRowPosition.isEmpty()) { + return true; + } + + Optional positionLowerBound = deleteFile.rowPositionLowerBound(); + Optional positionUpperBound = deleteFile.rowPositionUpperBound(); + return (positionLowerBound.isEmpty() || positionLowerBound.get() <= endRowPosition.orElseThrow()) && + (positionUpperBound.isEmpty() || positionUpperBound.get() >= startRowPosition.get()); + } + + private List createEqualityDeleteFilter(List equalityDeleteFiles, Schema schema, DeletePageSourceProvider deletePageSourceProvider) + { + if (equalityDeleteFiles.isEmpty()) { + return List.of(); + } + + // The equality delete files can be loaded in parallel. There may be multiple split threads attempting to load the + // same files. The current thread will only load a file if it is not already being loaded by another thread. 
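The comment above describes how equality delete files are shared across split threads: each file is loaded at most once, and any thread that needs a file already being loaded simply waits on its future. A minimal standalone sketch of that pattern, using only java.util.concurrent (the class and the readFile stand-in are illustrative, not part of this patch):

import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.FutureTask;

// Illustrative only: loads each delete file at most once, even when many
// split threads request it concurrently. Mirrors the computeIfAbsent +
// FutureTask pattern used by EqualityDeleteFilterBuilder later in this change.
class LoadOnceCache
{
    private final Map<String, FutureTask<Set<String>>> loading = new ConcurrentHashMap<>();

    public Set<String> load(String deleteFilePath)
            throws ExecutionException, InterruptedException
    {
        FutureTask<Set<String>> task = loading.computeIfAbsent(
                deleteFilePath,
                path -> new FutureTask<>(() -> readFile(path)));
        // run() is a no-op if another thread already started or finished this task
        task.run();
        return task.get();
    }

    private Set<String> readFile(String path)
    {
        // stand-in for reading deleted keys out of the file
        return Set.of(path + ":key-1", path + ":key-2");
    }
}

The patch uses Guava's ListenableFutureTask rather than a plain FutureTask so callers can collect the futures that are still pending and wait on all of them at once with Futures.allAsList, as the surrounding method does.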
+ List> pendingLoads = new ArrayList<>(); + Set deleteFilters = new HashSet<>(); + for (DeleteFile deleteFile : equalityDeleteFiles) { + List fieldIds = deleteFile.equalityFieldIds(); + verify(!fieldIds.isEmpty(), "equality field IDs are missing"); + List deleteColumns = fieldIds.stream() + .map(id -> getColumnHandle(schema.findField(id), typeManager)) + .collect(toImmutableList()); + + // each file can have a different set of columns for the equality delete, so we need to create a new builder for each set of columns + EqualityDeleteFilterBuilder builder = equalityDeleteFiltersBySchema.computeIfAbsent(fieldIds, ignore -> EqualityDeleteFilter.builder(schemaFromHandles(deleteColumns))); + deleteFilters.add(builder); + + ListenableFuture loadFuture = builder.readEqualityDeletes(deleteFile, deleteColumns, deletePageSourceProvider); + if (!loadFuture.isDone()) { + pendingLoads.add(loadFuture); + } + } + + // Wait loads happening in other threads + try { + Futures.allAsList(pendingLoads).get(); + } + catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new RuntimeException(e); + } + catch (ExecutionException e) { + // Since execution can happen on another thread, it is not safe to unwrap the exception + throw new TrinoException(ICEBERG_BAD_DATA, "Failed to load equality deletes", e); + } + + return deleteFilters.stream() + .map(EqualityDeleteFilterBuilder::build) + .toList(); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/delete/EqualityDeleteFilter.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/delete/EqualityDeleteFilter.java index a41fbc679479..c027fbf43fc9 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/delete/EqualityDeleteFilter.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/delete/EqualityDeleteFilter.java @@ -13,68 +13,139 @@ */ package io.trino.plugin.iceberg.delete; +import com.google.common.util.concurrent.Futures; +import com.google.common.util.concurrent.ListenableFuture; +import com.google.common.util.concurrent.ListenableFutureTask; +import com.google.errorprone.annotations.ThreadSafe; import io.trino.plugin.iceberg.IcebergColumnHandle; +import io.trino.plugin.iceberg.delete.DeleteManager.DeletePageSourceProvider; import io.trino.spi.Page; +import io.trino.spi.TrinoException; import io.trino.spi.connector.ConnectorPageSource; +import io.trino.spi.predicate.TupleDomain; import io.trino.spi.type.Type; import org.apache.iceberg.Schema; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.util.StructLikeSet; +import org.apache.iceberg.types.Types.StructType; +import org.apache.iceberg.util.StructLikeWrapper; import org.apache.iceberg.util.StructProjection; +import java.io.IOException; +import java.io.UncheckedIOException; import java.util.List; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; -import static io.trino.plugin.iceberg.IcebergUtil.schemaFromHandles; +import static com.google.common.base.Verify.verify; +import static io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_CANNOT_OPEN_SPLIT; +import static io.trino.plugin.iceberg.IcebergUtil.structTypeFromHandles; import static java.util.Objects.requireNonNull; public final class EqualityDeleteFilter implements DeleteFilter { - private final Schema schema; - private final StructLikeSet deleteSet; + private final Schema deleteSchema; + private final Map deletedRows; - private EqualityDeleteFilter(Schema schema, StructLikeSet deleteSet) + private EqualityDeleteFilter(Schema 
deleteSchema, Map deletedRows) { - this.schema = requireNonNull(schema, "schema is null"); - this.deleteSet = requireNonNull(deleteSet, "deleteSet is null"); + this.deleteSchema = requireNonNull(deleteSchema, "deleteSchema is null"); + this.deletedRows = requireNonNull(deletedRows, "deletedRows is null"); } @Override - public RowPredicate createPredicate(List columns) + public RowPredicate createPredicate(List columns, long splitDataSequenceNumber) { + StructType fileStructType = structTypeFromHandles(columns); + StructType deleteStructType = deleteSchema.asStruct(); + if (deleteSchema.columns().stream().anyMatch(column -> fileStructType.field(column.fieldId()) == null)) { + throw new TrinoException(ICEBERG_CANNOT_OPEN_SPLIT, "columns list doesn't contain all equality delete columns"); + } + + StructLikeWrapper structLikeWrapper = StructLikeWrapper.forType(deleteStructType); + StructProjection projection = StructProjection.create(fileStructType, deleteStructType); Type[] types = columns.stream() .map(IcebergColumnHandle::getType) .toArray(Type[]::new); - Schema fileSchema = schemaFromHandles(columns); - StructProjection projection = StructProjection.create(fileSchema, schema); - return (page, position) -> { - StructLike row = new LazyTrinoRow(types, page, position); - return !deleteSet.contains(projection.wrap(row)); + StructProjection row = projection.wrap(new LazyTrinoRow(types, page, position)); + DataSequenceNumber maxDeleteVersion = deletedRows.get(structLikeWrapper.set(row)); + // clear reference to avoid memory leak + structLikeWrapper.set(null); + return maxDeleteVersion == null || maxDeleteVersion.dataSequenceNumber() <= splitDataSequenceNumber; }; } - public static DeleteFilter readEqualityDeletes(ConnectorPageSource pageSource, List columns, Schema tableSchema) + public static EqualityDeleteFilterBuilder builder(Schema deleteSchema) { - Type[] types = columns.stream() - .map(IcebergColumnHandle::getType) - .toArray(Type[]::new); + return new EqualityDeleteFilterBuilder(deleteSchema); + } - Schema deleteSchema = schemaFromHandles(columns); - StructLikeSet deleteSet = StructLikeSet.create(deleteSchema.asStruct()); + @ThreadSafe + public static class EqualityDeleteFilterBuilder + { + private final Schema deleteSchema; + private final Map deletedRows; + private final Map> loadingFiles = new ConcurrentHashMap<>(); - while (!pageSource.isFinished()) { - Page page = pageSource.getNextPage(); - if (page == null) { - continue; - } + private EqualityDeleteFilterBuilder(Schema deleteSchema) + { + this.deleteSchema = requireNonNull(deleteSchema, "deleteSchema is null"); + this.deletedRows = new ConcurrentHashMap<>(); + } + + public ListenableFuture readEqualityDeletes(DeleteFile deleteFile, List deleteColumns, DeletePageSourceProvider deletePageSourceProvider) + { + verify(deleteColumns.size() == deleteSchema.columns().size(), "delete columns size doesn't match delete schema size"); + + // ensure only one thread loads the file + ListenableFutureTask futureTask = loadingFiles.computeIfAbsent( + deleteFile.path(), + key -> ListenableFutureTask.create(() -> readEqualityDeletesInternal(deleteFile, deleteColumns, deletePageSourceProvider), null)); + futureTask.run(); + return Futures.nonCancellationPropagating(futureTask); + } + + private void readEqualityDeletesInternal(DeleteFile deleteFile, List deleteColumns, DeletePageSourceProvider deletePageSourceProvider) + { + DataSequenceNumber sequenceNumber = new DataSequenceNumber(deleteFile.dataSequenceNumber()); + try (ConnectorPageSource 
pageSource = deletePageSourceProvider.openDeletes(deleteFile, deleteColumns, TupleDomain.all())) { + Type[] types = deleteColumns.stream() + .map(IcebergColumnHandle::getType) + .toArray(Type[]::new); - for (int position = 0; position < page.getPositionCount(); position++) { - deleteSet.add(new TrinoRow(types, page, position)); + StructLikeWrapper wrapper = StructLikeWrapper.forType(deleteSchema.asStruct()); + while (!pageSource.isFinished()) { + Page page = pageSource.getNextPage(); + if (page == null) { + continue; + } + + for (int position = 0; position < page.getPositionCount(); position++) { + TrinoRow row = new TrinoRow(types, page, position); + deletedRows.merge(wrapper.copyFor(row), sequenceNumber, (existing, newValue) -> { + if (existing.dataSequenceNumber() > newValue.dataSequenceNumber()) { + return existing; + } + return newValue; + }); + } + } + } + catch (IOException e) { + throw new UncheckedIOException(e); } } - return new EqualityDeleteFilter(deleteSchema, deleteSet); + /** + * Builds the EqualityDeleteFilter. + * After building the EqualityDeleteFilter, additional rows can be added to this builder, and the filter can be rebuilt. + */ + public EqualityDeleteFilter build() + { + return new EqualityDeleteFilter(deleteSchema, deletedRows); + } } + + private record DataSequenceNumber(long dataSequenceNumber) {} } diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/delete/IcebergPositionDeletePageSink.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/delete/PositionDeleteWriter.java similarity index 57% rename from plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/delete/IcebergPositionDeletePageSink.java rename to plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/delete/PositionDeleteWriter.java index 0eb7b80318f5..1f6bc8f71543 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/delete/IcebergPositionDeletePageSink.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/delete/PositionDeleteWriter.java @@ -24,34 +24,33 @@ import io.trino.plugin.iceberg.MetricsWrapper; import io.trino.plugin.iceberg.PartitionData; import io.trino.spi.Page; +import io.trino.spi.PageBuilder; import io.trino.spi.block.Block; import io.trino.spi.block.RunLengthEncodedBlock; -import io.trino.spi.connector.ConnectorPageSink; import io.trino.spi.connector.ConnectorSession; import org.apache.iceberg.FileContent; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.PartitionSpecParser; import org.apache.iceberg.io.LocationProvider; +import org.roaringbitmap.longlong.ImmutableLongBitmapDataProvider; -import java.util.ArrayList; import java.util.Collection; +import java.util.List; import java.util.Map; import java.util.Optional; -import java.util.concurrent.CompletableFuture; -import static com.google.common.base.Preconditions.checkArgument; import static io.airlift.slice.Slices.utf8Slice; import static io.airlift.slice.Slices.wrappedBuffer; import static io.trino.spi.predicate.Utils.nativeValueToBlock; +import static io.trino.spi.type.BigintType.BIGINT; import static io.trino.spi.type.VarcharType.VARCHAR; import static java.util.Objects.requireNonNull; import static java.util.UUID.randomUUID; -import static java.util.concurrent.CompletableFuture.completedFuture; -public class IcebergPositionDeletePageSink - implements ConnectorPageSink +public class PositionDeleteWriter { private final String dataFilePath; + private final Block dataFilePathBlock; private final PartitionSpec partitionSpec; private final 
Optional partition; private final String outputPath; @@ -59,10 +58,7 @@ public class IcebergPositionDeletePageSink private final IcebergFileWriter writer; private final IcebergFileFormat fileFormat; - private long validationCpuNanos; - private boolean writtenData; - - public IcebergPositionDeletePageSink( + public PositionDeleteWriter( String dataFilePath, PartitionSpec partitionSpec, Optional partition, @@ -75,12 +71,14 @@ public IcebergPositionDeletePageSink( Map storageProperties) { this.dataFilePath = requireNonNull(dataFilePath, "dataFilePath is null"); + this.dataFilePathBlock = nativeValueToBlock(VARCHAR, utf8Slice(dataFilePath)); this.jsonCodec = requireNonNull(jsonCodec, "jsonCodec is null"); this.partitionSpec = requireNonNull(partitionSpec, "partitionSpec is null"); this.partition = requireNonNull(partition, "partition is null"); this.fileFormat = requireNonNull(fileFormat, "fileFormat is null"); - // prepend query id to a file name so we can determine which files were written by which query. This is needed for opportunistic cleanup of extra files - // which may be present for successfully completing query in presence of failure recovery mechanisms. + // Prepend query ID to the file name, allowing us to determine the files written by a query. + // This is necessary for opportunistic cleanup of extra files, which may be present for + // successfully completed queries in the presence of failure recovery mechanisms. String fileName = fileFormat.toIceberg().addExtension(session.getQueryId() + "-" + randomUUID()); this.outputPath = partition .map(partitionData -> locationProvider.newDataLocation(partitionSpec, partitionData, fileName)) @@ -88,69 +86,52 @@ public IcebergPositionDeletePageSink( this.writer = fileWriterFactory.createPositionDeleteWriter(fileSystem, Location.of(outputPath), session, fileFormat, storageProperties); } - @Override - public long getCompletedBytes() + public Collection write(ImmutableLongBitmapDataProvider rowsToDelete) { - return writer.getWrittenBytes(); - } + writeDeletes(rowsToDelete); + writer.commit(); - @Override - public long getMemoryUsage() - { - return writer.getMemoryUsage(); - } + CommitTaskData task = new CommitTaskData( + outputPath, + fileFormat, + writer.getWrittenBytes(), + new MetricsWrapper(writer.getFileMetrics().metrics()), + PartitionSpecParser.toJson(partitionSpec), + partition.map(PartitionData::toJson), + FileContent.POSITION_DELETES, + Optional.of(dataFilePath), + writer.getFileMetrics().splitOffsets()); - @Override - public long getValidationCpuNanos() - { - return validationCpuNanos; + return List.of(wrappedBuffer(jsonCodec.toJsonBytes(task))); } - @Override - public CompletableFuture appendPage(Page page) + public void abort() { - checkArgument(page.getChannelCount() == 1, "IcebergPositionDeletePageSink expected a Page with only one channel, but got " + page.getChannelCount()); - - Block[] blocks = new Block[2]; - blocks[0] = RunLengthEncodedBlock.create(nativeValueToBlock(VARCHAR, utf8Slice(dataFilePath)), page.getPositionCount()); - blocks[1] = page.getBlock(0); - writer.appendRows(new Page(blocks)); - - writtenData = true; - return NOT_BLOCKED; + writer.rollback(); } - @Override - public CompletableFuture> finish() + private void writeDeletes(ImmutableLongBitmapDataProvider rowsToDelete) { - Collection commitTasks = new ArrayList<>(); - if (writtenData) { - writer.commit(); - CommitTaskData task = new CommitTaskData( - outputPath, - fileFormat, - writer.getWrittenBytes(), - new MetricsWrapper(writer.getMetrics()), - 
PartitionSpecParser.toJson(partitionSpec), - partition.map(PartitionData::toJson), - FileContent.POSITION_DELETES, - Optional.of(dataFilePath)); - Long recordCount = task.getMetrics().recordCount(); - if (recordCount != null && recordCount > 0) { - commitTasks.add(wrappedBuffer(jsonCodec.toJsonBytes(task))); + PageBuilder pageBuilder = new PageBuilder(List.of(BIGINT)); + + rowsToDelete.forEach(rowPosition -> { + pageBuilder.declarePosition(); + BIGINT.writeLong(pageBuilder.getBlockBuilder(0), rowPosition); + if (pageBuilder.isFull()) { + writePage(pageBuilder.build()); + pageBuilder.reset(); } - validationCpuNanos = writer.getValidationCpuNanos(); - } - else { - // clean up the empty delete file - writer.rollback(); + }); + + if (!pageBuilder.isEmpty()) { + writePage(pageBuilder.build()); } - return completedFuture(commitTasks); } - @Override - public void abort() + private void writePage(Page page) { - writer.rollback(); + writer.appendRows(new Page( + RunLengthEncodedBlock.create(dataFilePathBlock, page.getPositionCount()), + page.getBlock(0))); } } diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/delete/RowPredicate.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/delete/RowPredicate.java index e02b4834bd4e..cd6b9a7ad4e9 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/delete/RowPredicate.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/delete/RowPredicate.java @@ -13,10 +13,12 @@ */ package io.trino.plugin.iceberg.delete; +import com.google.errorprone.annotations.ThreadSafe; import io.trino.spi.Page; import static java.util.Objects.requireNonNull; +@ThreadSafe public interface RowPredicate { boolean test(Page page, int position); @@ -27,7 +29,7 @@ default RowPredicate and(RowPredicate other) return (page, position) -> test(page, position) && other.test(page, position); } - default Page filterPage(Page page) + default void applyFilter(Page page) { int positionCount = page.getPositionCount(); int[] retained = new int[positionCount]; @@ -38,9 +40,8 @@ default Page filterPage(Page page) retainedCount++; } } - if (retainedCount == positionCount) { - return page; + if (retainedCount != positionCount) { + page.getPositions(retained, 0, retainedCount); } - return page.getPositions(retained, 0, retainedCount); } } diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/fileio/ForwardingFileIo.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/fileio/ForwardingFileIo.java index 03dcb9d109d9..81e58b6b9c2d 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/fileio/ForwardingFileIo.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/fileio/ForwardingFileIo.java @@ -13,9 +13,16 @@ */ package io.trino.plugin.iceberg.fileio; +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableMap; import com.google.common.collect.Iterables; +import com.google.common.collect.Streams; import io.trino.filesystem.Location; import io.trino.filesystem.TrinoFileSystem; +import io.trino.spi.TrinoException; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.ManifestFile; import org.apache.iceberg.io.BulkDeletionFailureException; import org.apache.iceberg.io.InputFile; import org.apache.iceberg.io.OutputFile; @@ -24,8 +31,16 @@ import java.io.IOException; import java.io.UncheckedIOException; import java.util.List; +import java.util.Map; +import java.util.concurrent.Callable; 
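The deleteFiles override that follows in this hunk splits the incoming paths into fixed-size batches and runs each batch as a Callable on the injected delete executor. A rough standalone equivalent using a plain ExecutorService (the 1000-path batch size and the BatchDelete callback are illustrative; the patch delegates to Trino's processWithAdditionalThreads helper rather than invokeAll):

import com.google.common.collect.Iterables;
import com.google.common.collect.Streams;

import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;

// Illustrative sketch: delete paths in batches, submitting each batch to the
// supplied executor and surfacing the first failure.
final class BatchedDeleter
{
    private static final int DELETE_BATCH_SIZE = 1000;

    private BatchedDeleter() {}

    static void deleteAll(Iterable<String> paths, ExecutorService executor, BatchDelete delegate)
            throws InterruptedException, ExecutionException
    {
        List<Callable<Void>> tasks = Streams.stream(Iterables.partition(paths, DELETE_BATCH_SIZE))
                .map(batch -> (Callable<Void>) () -> {
                    delegate.deleteBatch(batch);
                    return null;
                })
                .toList();
        for (Future<Void> future : executor.invokeAll(tasks)) {
            future.get(); // rethrows any batch failure as ExecutionException
        }
    }

    interface BatchDelete
    {
        void deleteBatch(List<String> batch);
    }
}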
+import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; import java.util.stream.Stream; +import static com.google.common.collect.ImmutableList.toImmutableList; +import static com.google.common.util.concurrent.MoreExecutors.newDirectExecutorService; +import static io.trino.plugin.base.util.ExecutorUtil.processWithAdditionalThreads; +import static io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_FILESYSTEM_ERROR; import static java.util.Objects.requireNonNull; import static java.util.stream.Collectors.joining; @@ -36,10 +51,22 @@ public class ForwardingFileIo private static final int BATCH_DELETE_PATHS_MESSAGE_LIMIT = 5; private final TrinoFileSystem fileSystem; + private final Map properties; + private final boolean useFileSizeFromMetadata; + private final ExecutorService deleteExecutor; - public ForwardingFileIo(TrinoFileSystem fileSystem) + @VisibleForTesting + public ForwardingFileIo(TrinoFileSystem fileSystem, boolean useFileSizeFromMetadata) + { + this(fileSystem, ImmutableMap.of(), useFileSizeFromMetadata, newDirectExecutorService()); + } + + public ForwardingFileIo(TrinoFileSystem fileSystem, Map properties, boolean useFileSizeFromMetadata, ExecutorService deleteExecutor) { this.fileSystem = requireNonNull(fileSystem, "fileSystem is null"); + this.deleteExecutor = requireNonNull(deleteExecutor, "executorService is null"); + this.properties = ImmutableMap.copyOf(requireNonNull(properties, "properties is null")); + this.useFileSizeFromMetadata = useFileSizeFromMetadata; } @Override @@ -51,13 +78,17 @@ public InputFile newInputFile(String path) @Override public InputFile newInputFile(String path, long length) { + if (!useFileSizeFromMetadata) { + return new ForwardingInputFile(fileSystem.newInputFile(Location.of(path))); + } + return new ForwardingInputFile(fileSystem.newInputFile(Location.of(path), length)); } @Override public OutputFile newOutputFile(String path) { - return new ForwardingOutputFile(fileSystem, path); + return new ForwardingOutputFile(fileSystem, Location.of(path)); } @Override @@ -71,12 +102,51 @@ public void deleteFile(String path) } } + @Override + public void deleteFile(InputFile file) + { + SupportsBulkOperations.super.deleteFile(file); + } + + @Override + public void deleteFile(OutputFile file) + { + SupportsBulkOperations.super.deleteFile(file); + } + @Override public void deleteFiles(Iterable pathsToDelete) throws BulkDeletionFailureException { - Iterable> partitions = Iterables.partition(pathsToDelete, DELETE_BATCH_SIZE); - partitions.forEach(this::deleteBatch); + List> tasks = Streams.stream(Iterables.partition(pathsToDelete, DELETE_BATCH_SIZE)) + .map(batch -> (Callable) () -> { + deleteBatch(batch); + return null; + }).collect(toImmutableList()); + try { + processWithAdditionalThreads(tasks, deleteExecutor); + } + catch (ExecutionException e) { + throw new TrinoException(ICEBERG_FILESYSTEM_ERROR, "Failed to delete files", e.getCause()); + } + } + + @Override + public InputFile newInputFile(ManifestFile manifest) + { + return SupportsBulkOperations.super.newInputFile(manifest); + } + + @Override + public InputFile newInputFile(DataFile file) + { + return SupportsBulkOperations.super.newInputFile(file); + } + + @Override + public InputFile newInputFile(DeleteFile file) + { + return SupportsBulkOperations.super.newInputFile(file); } private void deleteBatch(List filesToDelete) @@ -95,4 +165,19 @@ private void deleteBatch(List filesToDelete) e); } } + + @Override + public Map properties() + { + return properties; + } + + 
@Override + public void initialize(Map properties) + { + throw new UnsupportedOperationException("ForwardingFileIO does not support initialization by properties"); + } + + @Override + public void close() {} } diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/fileio/ForwardingFileIoFactory.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/fileio/ForwardingFileIoFactory.java new file mode 100644 index 000000000000..0f5a558446c5 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/fileio/ForwardingFileIoFactory.java @@ -0,0 +1,56 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.iceberg.fileio; + +import com.google.common.collect.ImmutableMap; +import com.google.inject.Inject; +import io.trino.filesystem.TrinoFileSystem; +import io.trino.plugin.iceberg.ForIcebergFileDelete; +import org.apache.iceberg.io.FileIO; + +import java.util.Map; +import java.util.concurrent.ExecutorService; + +import static java.util.Objects.requireNonNull; + +public class ForwardingFileIoFactory +{ + private final ExecutorService deleteExecutor; + + @Inject + public ForwardingFileIoFactory(@ForIcebergFileDelete ExecutorService deleteExecutor) + { + this.deleteExecutor = requireNonNull(deleteExecutor, "deleteExecutor is null"); + } + + public FileIO create(TrinoFileSystem fileSystem) + { + return create(fileSystem, true, ImmutableMap.of()); + } + + public FileIO create(TrinoFileSystem fileSystem, boolean useFileSizeFromMetadata) + { + return create(fileSystem, useFileSizeFromMetadata, ImmutableMap.of()); + } + + public FileIO create(TrinoFileSystem fileSystem, Map properties) + { + return create(fileSystem, true, properties); + } + + public FileIO create(TrinoFileSystem fileSystem, boolean useFileSizeFromMetadata, Map properties) + { + return new ForwardingFileIo(fileSystem, properties, useFileSizeFromMetadata, deleteExecutor); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/fileio/ForwardingOutputFile.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/fileio/ForwardingOutputFile.java index 40a65d5b7a36..0084f3a233d6 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/fileio/ForwardingOutputFile.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/fileio/ForwardingOutputFile.java @@ -33,10 +33,10 @@ public class ForwardingOutputFile private final TrinoFileSystem fileSystem; private final TrinoOutputFile outputFile; - public ForwardingOutputFile(TrinoFileSystem fileSystem, String path) + public ForwardingOutputFile(TrinoFileSystem fileSystem, Location location) { this.fileSystem = requireNonNull(fileSystem, "fileSystem is null"); - this.outputFile = fileSystem.newOutputFile(Location.of(path)); + this.outputFile = fileSystem.newOutputFile(location); } @Override @@ -54,13 +54,8 @@ public PositionOutputStream create() @Override public PositionOutputStream createOrOverwrite() { - try { - // Callers of this method don't have access to memory 
context, so we skip tracking memory here - return new CountingPositionOutputStream(outputFile.createOrOverwrite()); - } - catch (IOException e) { - throw new UncheckedIOException("Failed to create file: " + location(), e); - } + // Iceberg never overwrites existing files. All callers use unique names. + return create(); } @Override diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/functions/IcebergFunctionProvider.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/functions/IcebergFunctionProvider.java new file mode 100644 index 000000000000..a8e3ceb04685 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/functions/IcebergFunctionProvider.java @@ -0,0 +1,270 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.iceberg.functions; + +import com.google.common.collect.ImmutableList; +import com.google.inject.Inject; +import io.airlift.slice.Slice; +import io.trino.plugin.iceberg.functions.tablechanges.TableChangesFunctionHandle; +import io.trino.spi.TrinoException; +import io.trino.spi.function.BoundSignature; +import io.trino.spi.function.FunctionDependencies; +import io.trino.spi.function.FunctionId; +import io.trino.spi.function.FunctionMetadata; +import io.trino.spi.function.FunctionProvider; +import io.trino.spi.function.InvocationConvention; +import io.trino.spi.function.ScalarFunctionAdapter; +import io.trino.spi.function.ScalarFunctionImplementation; +import io.trino.spi.function.Signature; +import io.trino.spi.function.table.ConnectorTableFunctionHandle; +import io.trino.spi.type.BigintType; +import io.trino.spi.type.DateType; +import io.trino.spi.type.DecimalType; +import io.trino.spi.type.Int128; +import io.trino.spi.type.IntegerType; +import io.trino.spi.type.LongTimestamp; +import io.trino.spi.type.LongTimestampWithTimeZone; +import io.trino.spi.type.SmallintType; +import io.trino.spi.type.TimestampType; +import io.trino.spi.type.TimestampWithTimeZoneType; +import io.trino.spi.type.TinyintType; +import io.trino.spi.type.Type; +import io.trino.spi.type.TypeSignature; +import io.trino.spi.type.VarbinaryType; +import io.trino.spi.type.VarcharType; +import org.apache.iceberg.transforms.Transforms; +import org.apache.iceberg.types.Types; + +import java.lang.invoke.MethodHandle; +import java.math.BigDecimal; +import java.util.List; + +import static com.google.common.base.Preconditions.checkArgument; +import static io.trino.plugin.iceberg.util.Timestamps.timestampTzToMicros; +import static io.trino.spi.StandardErrorCode.INVALID_FUNCTION_ARGUMENT; +import static io.trino.spi.function.InvocationConvention.InvocationArgumentConvention.NEVER_NULL; +import static io.trino.spi.function.InvocationConvention.InvocationReturnConvention.FAIL_ON_NULL; +import static io.trino.spi.type.DateTimeEncoding.unpackMillisUtc; +import static io.trino.spi.type.IntegerType.INTEGER; +import static io.trino.spi.type.Timestamps.MICROSECONDS_PER_MILLISECOND; +import static java.lang.invoke.MethodHandles.lookup; 
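The bucket* methods later in this class all follow the same shape: bind Iceberg's bucket transform to the Iceberg type corresponding to the Trino argument type, then apply it to the native value. A minimal example of the Transforms API in isolation (bucket counts and values are arbitrary):

import org.apache.iceberg.transforms.Transforms;
import org.apache.iceberg.types.Types;

public class BucketTransformExample
{
    public static void main(String[] args)
    {
        // Hash a long value into one of 16 buckets, as Iceberg bucket partitioning would
        int longBucket = Transforms.bucket(16)
                .bind(Types.LongType.get())
                .apply(42L);
        System.out.println("bucket(42, 16) = " + longBucket);

        // The same transform bound to string data
        int stringBucket = Transforms.bucket(16)
                .bind(Types.StringType.get())
                .apply("some-value");
        System.out.println("bucket('some-value', 16) = " + stringBucket);
    }
}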
+import static java.lang.invoke.MethodType.methodType; +import static java.util.Collections.nCopies; +import static java.util.Objects.requireNonNull; + +public class IcebergFunctionProvider + implements FunctionProvider +{ + public static final List FUNCTIONS = ImmutableList.builder() + .add(FunctionMetadata.scalarBuilder() + .functionId(new FunctionId("bucket")) + .description("Perform Iceberg bucket transform") + .signature(Signature.builder() + .typeVariable("T") + .returnType(INTEGER.getTypeSignature()) + .argumentTypes(ImmutableList.of(new TypeSignature("T"), INTEGER.getTypeSignature())) + .build()) + .nullable() + .build()) + .build(); + + private static final MethodHandle BUCKET_INTEGER; + private static final MethodHandle BUCKET_SHORT_DECIMAL; + private static final MethodHandle BUCKET_LONG_DECIMAL; + private static final MethodHandle BUCKET_VARCHAR; + private static final MethodHandle BUCKET_VARBINARY; + private static final MethodHandle BUCKET_DATE; + private static final MethodHandle BUCKET_SHORT_TIMESTAMP; + private static final MethodHandle BUCKET_LONG_TIMESTAMP; + private static final MethodHandle BUCKET_SHORT_TIMESTAMP_WITH_TIME_ZONE; + private static final MethodHandle BUCKET_LONG_TIMESTAMP_WITH_TIME_ZONE; + + static { + try { + BUCKET_INTEGER = lookup().findVirtual(IcebergFunctionProvider.class, "bucketInteger", methodType(long.class, long.class, long.class)); + BUCKET_SHORT_DECIMAL = lookup().findVirtual(IcebergFunctionProvider.class, "bucketShortDecimal", methodType(long.class, DecimalType.class, long.class, long.class)); + BUCKET_LONG_DECIMAL = lookup().findVirtual(IcebergFunctionProvider.class, "bucketLongDecimal", methodType(long.class, DecimalType.class, Int128.class, long.class)); + BUCKET_VARCHAR = lookup().findVirtual(IcebergFunctionProvider.class, "bucketVarchar", methodType(long.class, Slice.class, long.class)); + BUCKET_VARBINARY = lookup().findVirtual(IcebergFunctionProvider.class, "bucketVarbinary", methodType(long.class, Slice.class, long.class)); + BUCKET_DATE = lookup().findVirtual(IcebergFunctionProvider.class, "bucketDate", methodType(long.class, long.class, long.class)); + BUCKET_SHORT_TIMESTAMP = lookup().findVirtual(IcebergFunctionProvider.class, "bucketShortTimestamp", methodType(long.class, long.class, long.class)); + BUCKET_LONG_TIMESTAMP = lookup().findVirtual(IcebergFunctionProvider.class, "bucketLongTimestamp", methodType(long.class, LongTimestamp.class, long.class)); + BUCKET_SHORT_TIMESTAMP_WITH_TIME_ZONE = lookup().findVirtual(IcebergFunctionProvider.class, "bucketShortTimestampWithTimeZone", methodType(long.class, long.class, long.class)); + BUCKET_LONG_TIMESTAMP_WITH_TIME_ZONE = lookup().findVirtual(IcebergFunctionProvider.class, "bucketLongTimestampWithTimeZone", methodType(long.class, LongTimestampWithTimeZone.class, long.class)); + } + catch (ReflectiveOperationException e) { + throw new AssertionError(e); + } + } + + private final TableChangesFunctionProcessorProviderFactory tableChangesFunctionProcessorProviderFactory; + + @Inject + public IcebergFunctionProvider(TableChangesFunctionProcessorProviderFactory tableChangesFunctionProcessorProviderFactory) + { + this.tableChangesFunctionProcessorProviderFactory = requireNonNull(tableChangesFunctionProcessorProviderFactory, "tableChangesFunctionProcessorProviderFactory is null"); + } + + @Override + public ScalarFunctionImplementation getScalarFunctionImplementation( + FunctionId functionId, + BoundSignature boundSignature, + FunctionDependencies functionDependencies, + InvocationConvention 
invocationConvention) + { + List argumentTypes = boundSignature.getArgumentTypes(); + checkArgument(argumentTypes.size() == 2, "Expected two arguments, but got %s", argumentTypes.size()); + checkArgument(argumentTypes.get(argumentTypes.size() - 1) == INTEGER, "The 2nd argument must be integer type, but got %s", argumentTypes.get(argumentTypes.size() - 1)); + Type type = argumentTypes.get(0); + + MethodHandle handle; + if (type instanceof TinyintType || type instanceof SmallintType || type instanceof IntegerType || type instanceof BigintType) { + handle = BUCKET_INTEGER; + } + else if (type instanceof DecimalType decimalType) { + handle = decimalType.isShort() ? BUCKET_SHORT_DECIMAL : BUCKET_LONG_DECIMAL; + } + else if (type instanceof VarcharType) { + handle = BUCKET_VARCHAR; + } + else if (type instanceof VarbinaryType) { + handle = BUCKET_VARBINARY; + } + else if (type instanceof DateType) { + handle = BUCKET_DATE; + } + else if (type instanceof TimestampType timestampType) { + handle = timestampType.isShort() ? BUCKET_SHORT_TIMESTAMP : BUCKET_LONG_TIMESTAMP; + } + else if (type instanceof TimestampWithTimeZoneType timestampWithTimeZoneType) { + handle = timestampWithTimeZoneType.isShort() ? BUCKET_SHORT_TIMESTAMP_WITH_TIME_ZONE : BUCKET_LONG_TIMESTAMP_WITH_TIME_ZONE; + } + else { + throw new TrinoException(INVALID_FUNCTION_ARGUMENT, "Unsupported type: " + type); + } + + handle = handle.bindTo(this); + + if (type instanceof DecimalType decimalType) { + handle = handle.bindTo(decimalType); + } + + InvocationConvention actualConvention = new InvocationConvention( + nCopies(boundSignature.getArity(), NEVER_NULL), + FAIL_ON_NULL, + false, + false); + + handle = ScalarFunctionAdapter.adapt( + handle, + boundSignature.getReturnType(), + boundSignature.getArgumentTypes(), + actualConvention, + invocationConvention); + + return ScalarFunctionImplementation.builder() + .methodHandle(handle) + .build(); + } + + @SuppressWarnings("MethodMayBeStatic") + public long bucketInteger(long value, long numberOfBuckets) + { + return Transforms.bucket((int) numberOfBuckets) + .bind(Types.LongType.get()) + .apply(value); + } + + @SuppressWarnings("MethodMayBeStatic") + public long bucketShortDecimal(DecimalType decimalType, long value, long numberOfBuckets) + { + return Transforms.bucket((int) numberOfBuckets) + .bind(Types.DecimalType.of(decimalType.getPrecision(), decimalType.getScale())) + .apply(BigDecimal.valueOf(value)); + } + + @SuppressWarnings("MethodMayBeStatic") + public long bucketLongDecimal(DecimalType decimalType, Int128 value, long numberOfBuckets) + { + return Transforms.bucket((int) numberOfBuckets) + .bind(Types.DecimalType.of(decimalType.getPrecision(), decimalType.getScale())) + .apply(new BigDecimal(value.toBigInteger())); + } + + @SuppressWarnings("MethodMayBeStatic") + public long bucketVarchar(Slice value, long numberOfBuckets) + { + return (long) Transforms.bucket((int) numberOfBuckets) + .bind(Types.StringType.get()) + .apply(value.toStringUtf8()); + } + + @SuppressWarnings("MethodMayBeStatic") + public long bucketVarbinary(Slice value, long numberOfBuckets) + { + return (long) Transforms.bucket((int) numberOfBuckets) + .bind(Types.BinaryType.get()) + .apply(value.toByteBuffer()); + } + + @SuppressWarnings("MethodMayBeStatic") + public long bucketDate(long value, long numberOfBuckets) + { + return Transforms.bucket((int) numberOfBuckets) + .bind(Types.DateType.get()) + .apply((int) value); + } + + @SuppressWarnings("MethodMayBeStatic") + public long bucketShortTimestamp(long value, 
long numberOfBuckets) + { + return Transforms.bucket((int) numberOfBuckets) + .bind(Types.TimestampType.withoutZone()) + .apply(value); + } + + @SuppressWarnings("MethodMayBeStatic") + public long bucketLongTimestamp(LongTimestamp value, long numberOfBuckets) + { + return Transforms.bucket((int) numberOfBuckets) + .bind(Types.TimestampType.withoutZone()) + .apply(value.getEpochMicros()); + } + + @SuppressWarnings("MethodMayBeStatic") + public long bucketShortTimestampWithTimeZone(long value, long numberOfBuckets) + { + return Transforms.bucket((int) numberOfBuckets) + .bind(Types.TimestampType.withZone()) + .apply(unpackMillisUtc(value) * MICROSECONDS_PER_MILLISECOND); + } + + @SuppressWarnings("MethodMayBeStatic") + public long bucketLongTimestampWithTimeZone(LongTimestampWithTimeZone value, long numberOfBuckets) + { + return Transforms.bucket((int) numberOfBuckets) + .bind(Types.TimestampType.withZone()) + .apply(timestampTzToMicros(value)); + } + + @Override + public TableFunctionProcessorProviderFactory getTableFunctionProcessorProviderFactory(ConnectorTableFunctionHandle functionHandle) + { + if (functionHandle instanceof TableChangesFunctionHandle) { + return new ClassLoaderSafeTableFunctionProcessorProviderFactory(tableChangesFunctionProcessorProviderFactory, getClass().getClassLoader()); + } + + throw new UnsupportedOperationException("Unsupported function: " + functionHandle); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/functions/tablechanges/TableChangesFunction.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/functions/tablechanges/TableChangesFunction.java new file mode 100644 index 000000000000..408f6d4d1933 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/functions/tablechanges/TableChangesFunction.java @@ -0,0 +1,206 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.iceberg.functions.tablechanges; + +import com.google.common.collect.ImmutableList; +import com.google.inject.Inject; +import io.airlift.slice.Slice; +import io.trino.plugin.iceberg.ColumnIdentity; +import io.trino.plugin.iceberg.IcebergColumnHandle; +import io.trino.plugin.iceberg.IcebergUtil; +import io.trino.plugin.iceberg.catalog.TrinoCatalogFactory; +import io.trino.spi.TrinoException; +import io.trino.spi.connector.ConnectorAccessControl; +import io.trino.spi.connector.ConnectorSession; +import io.trino.spi.connector.ConnectorTransactionHandle; +import io.trino.spi.connector.SchemaTableName; +import io.trino.spi.function.table.AbstractConnectorTableFunction; +import io.trino.spi.function.table.Argument; +import io.trino.spi.function.table.Descriptor; +import io.trino.spi.function.table.ScalarArgument; +import io.trino.spi.function.table.ScalarArgumentSpecification; +import io.trino.spi.function.table.TableFunctionAnalysis; +import io.trino.spi.type.TypeManager; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SchemaParser; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.util.SnapshotUtil; + +import java.util.List; +import java.util.Map; +import java.util.Optional; + +import static com.google.common.collect.ImmutableSet.toImmutableSet; +import static io.trino.plugin.iceberg.ColumnIdentity.TypeCategory.PRIMITIVE; +import static io.trino.plugin.iceberg.IcebergColumnHandle.DATA_CHANGE_ORDINAL_ID; +import static io.trino.plugin.iceberg.IcebergColumnHandle.DATA_CHANGE_ORDINAL_NAME; +import static io.trino.plugin.iceberg.IcebergColumnHandle.DATA_CHANGE_TIMESTAMP_ID; +import static io.trino.plugin.iceberg.IcebergColumnHandle.DATA_CHANGE_TIMESTAMP_NAME; +import static io.trino.plugin.iceberg.IcebergColumnHandle.DATA_CHANGE_TYPE_ID; +import static io.trino.plugin.iceberg.IcebergColumnHandle.DATA_CHANGE_TYPE_NAME; +import static io.trino.plugin.iceberg.IcebergColumnHandle.DATA_CHANGE_VERSION_ID; +import static io.trino.plugin.iceberg.IcebergColumnHandle.DATA_CHANGE_VERSION_NAME; +import static io.trino.plugin.iceberg.TypeConverter.toTrinoType; +import static io.trino.spi.StandardErrorCode.INVALID_FUNCTION_ARGUMENT; +import static io.trino.spi.function.table.ReturnTypeSpecification.GenericTable.GENERIC_TABLE; +import static io.trino.spi.type.BigintType.BIGINT; +import static io.trino.spi.type.IntegerType.INTEGER; +import static io.trino.spi.type.TimestampWithTimeZoneType.TIMESTAMP_TZ_MILLIS; +import static io.trino.spi.type.VarcharType.VARCHAR; +import static java.util.Objects.requireNonNull; + +public class TableChangesFunction + extends AbstractConnectorTableFunction +{ + private static final String FUNCTION_NAME = "table_changes"; + private static final String SCHEMA_NAME_VAR_NAME = "SCHEMA_NAME"; + private static final String TABLE_NAME_VAR_NAME = "TABLE_NAME"; + private static final String START_SNAPSHOT_VAR_NAME = "START_SNAPSHOT_ID"; + private static final String END_SNAPSHOT_VAR_NAME = "END_SNAPSHOT_ID"; + + private final TrinoCatalogFactory trinoCatalogFactory; + private final TypeManager typeManager; + + @Inject + public TableChangesFunction(TrinoCatalogFactory trinoCatalogFactory, TypeManager typeManager) + { + super( + "system", + FUNCTION_NAME, + ImmutableList.of( + ScalarArgumentSpecification.builder() + .name(SCHEMA_NAME_VAR_NAME) + .type(VARCHAR) + .build(), + ScalarArgumentSpecification.builder() + .name(TABLE_NAME_VAR_NAME) + .type(VARCHAR) + .build(), + 
ScalarArgumentSpecification.builder() + .name(START_SNAPSHOT_VAR_NAME) + .type(BIGINT) + .build(), + ScalarArgumentSpecification.builder() + .name(END_SNAPSHOT_VAR_NAME) + .type(BIGINT) + .build()), + GENERIC_TABLE); + + this.trinoCatalogFactory = requireNonNull(trinoCatalogFactory, "trinoCatalogFactory is null"); + this.typeManager = requireNonNull(typeManager, "typeManager is null"); + } + + @Override + public TableFunctionAnalysis analyze(ConnectorSession session, ConnectorTransactionHandle transaction, Map arguments, ConnectorAccessControl accessControl) + { + String schema = getSchemaName(arguments); + String table = getTableName(arguments); + + long startSnapshotId = (long) checkNonNull(((ScalarArgument) arguments.get(START_SNAPSHOT_VAR_NAME)).getValue()); + long endSnapshotId = (long) checkNonNull(((ScalarArgument) arguments.get(END_SNAPSHOT_VAR_NAME)).getValue()); + + SchemaTableName schemaTableName = new SchemaTableName(schema, table); + Table icebergTable = trinoCatalogFactory.create(session.getIdentity()) + .loadTable(session, schemaTableName); + + checkSnapshotExists(icebergTable, startSnapshotId); + checkSnapshotExists(icebergTable, endSnapshotId); + if (!SnapshotUtil.isParentAncestorOf(icebergTable, endSnapshotId, startSnapshotId)) { + throw new TrinoException(INVALID_FUNCTION_ARGUMENT, "Starting snapshot (exclusive) %s is not a parent ancestor of end snapshot %s".formatted(startSnapshotId, endSnapshotId)); + } + + ImmutableList.Builder columns = ImmutableList.builder(); + Schema tableSchema = icebergTable.schemas().get(icebergTable.snapshot(endSnapshotId).schemaId()); + tableSchema.columns().stream() + .map(column -> new Descriptor.Field(column.name(), Optional.of(toTrinoType(column.type(), typeManager)))) + .forEach(columns::add); + + columns.add(new Descriptor.Field(DATA_CHANGE_TYPE_NAME, Optional.of(VARCHAR))); + columns.add(new Descriptor.Field(DATA_CHANGE_VERSION_NAME, Optional.of(BIGINT))); + columns.add(new Descriptor.Field(DATA_CHANGE_TIMESTAMP_NAME, Optional.of(TIMESTAMP_TZ_MILLIS))); + columns.add(new Descriptor.Field(DATA_CHANGE_ORDINAL_NAME, Optional.of(INTEGER))); + + ImmutableList.Builder columnHandlesBuilder = ImmutableList.builder(); + IcebergUtil.getTopLevelColumns(tableSchema, typeManager).forEach(columnHandlesBuilder::add); + columnHandlesBuilder.add(IcebergColumnHandle.required(new ColumnIdentity(DATA_CHANGE_TYPE_ID, DATA_CHANGE_TYPE_NAME, PRIMITIVE, ImmutableList.of())) + .columnType(VARCHAR) + .build()); + columnHandlesBuilder.add(IcebergColumnHandle.required(new ColumnIdentity(DATA_CHANGE_VERSION_ID, DATA_CHANGE_VERSION_NAME, PRIMITIVE, ImmutableList.of())) + .columnType(BIGINT) + .build()); + columnHandlesBuilder.add(IcebergColumnHandle.required(new ColumnIdentity(DATA_CHANGE_TIMESTAMP_ID, DATA_CHANGE_TIMESTAMP_NAME, PRIMITIVE, ImmutableList.of())) + .columnType(TIMESTAMP_TZ_MILLIS) + .build()); + columnHandlesBuilder.add(IcebergColumnHandle.required(new ColumnIdentity(DATA_CHANGE_ORDINAL_ID, DATA_CHANGE_ORDINAL_NAME, PRIMITIVE, ImmutableList.of())) + .columnType(INTEGER) + .build()); + List columnHandles = columnHandlesBuilder.build(); + + accessControl.checkCanSelectFromColumns(null, schemaTableName, columnHandles.stream() + .map(IcebergColumnHandle::getName) + .collect(toImmutableSet())); + + return TableFunctionAnalysis.builder() + .returnedType(new Descriptor(columns.build())) + .handle(new TableChangesFunctionHandle( + schemaTableName, + SchemaParser.toJson(tableSchema), + columnHandles, + 
Optional.ofNullable(icebergTable.properties().get(TableProperties.DEFAULT_NAME_MAPPING)), + startSnapshotId, + endSnapshotId)) + .build(); + } + + private static String getSchemaName(Map arguments) + { + if (argumentExists(arguments, SCHEMA_NAME_VAR_NAME)) { + return ((Slice) checkNonNull(((ScalarArgument) arguments.get(SCHEMA_NAME_VAR_NAME)).getValue())).toStringUtf8(); + } + throw new TrinoException(INVALID_FUNCTION_ARGUMENT, SCHEMA_NAME_VAR_NAME + " argument not found"); + } + + private static String getTableName(Map arguments) + { + if (argumentExists(arguments, TABLE_NAME_VAR_NAME)) { + return ((Slice) checkNonNull(((ScalarArgument) arguments.get(TABLE_NAME_VAR_NAME)).getValue())).toStringUtf8(); + } + throw new TrinoException(INVALID_FUNCTION_ARGUMENT, TABLE_NAME_VAR_NAME + " argument not found"); + } + + private static boolean argumentExists(Map arguments, String key) + { + Argument argument = arguments.get(key); + if (argument instanceof ScalarArgument scalarArgument) { + return !scalarArgument.getNullableValue().isNull(); + } + throw new IllegalArgumentException("Unsupported argument type: " + argument); + } + + private static Object checkNonNull(Object argumentValue) + { + if (argumentValue == null) { + throw new TrinoException(INVALID_FUNCTION_ARGUMENT, FUNCTION_NAME + " arguments may not be null"); + } + return argumentValue; + } + + private static void checkSnapshotExists(Table icebergTable, long snapshotId) + { + if (icebergTable.snapshot(snapshotId) == null) { + throw new TrinoException(INVALID_FUNCTION_ARGUMENT, "Snapshot not found in Iceberg table history: " + snapshotId); + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/functions/tablechanges/TableChangesFunctionHandle.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/functions/tablechanges/TableChangesFunctionHandle.java new file mode 100644 index 000000000000..97354093476c --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/functions/tablechanges/TableChangesFunctionHandle.java @@ -0,0 +1,41 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.iceberg.functions.tablechanges; + +import com.google.common.collect.ImmutableList; +import io.trino.plugin.iceberg.IcebergColumnHandle; +import io.trino.spi.connector.SchemaTableName; +import io.trino.spi.function.table.ConnectorTableFunctionHandle; + +import java.util.List; +import java.util.Optional; + +import static java.util.Objects.requireNonNull; + +public record TableChangesFunctionHandle( + SchemaTableName schemaTableName, + String tableSchemaJson, + List columns, + Optional nameMappingJson, + long startSnapshotId, + long endSnapshotId) implements ConnectorTableFunctionHandle +{ + public TableChangesFunctionHandle + { + requireNonNull(schemaTableName, "schemaTableName is null"); + requireNonNull(tableSchemaJson, "tableSchemaJson is null"); + columns = ImmutableList.copyOf(requireNonNull(columns, "columns is null")); + requireNonNull(nameMappingJson, "nameMappingJson is null"); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/functions/tablechanges/TableChangesFunctionProvider.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/functions/tablechanges/TableChangesFunctionProvider.java new file mode 100644 index 000000000000..18d0d4a748f3 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/functions/tablechanges/TableChangesFunctionProvider.java @@ -0,0 +1,45 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.iceberg.functions.tablechanges; + +import com.google.inject.Inject; +import com.google.inject.Provider; +import io.trino.plugin.base.classloader.ClassLoaderSafeConnectorTableFunction; +import io.trino.plugin.iceberg.catalog.TrinoCatalogFactory; +import io.trino.spi.function.table.ConnectorTableFunction; +import io.trino.spi.type.TypeManager; + +import static java.util.Objects.requireNonNull; + +public class TableChangesFunctionProvider + implements Provider<ConnectorTableFunction> +{ + private final TrinoCatalogFactory trinoCatalogFactory; + private final TypeManager typeManager; + + @Inject + public TableChangesFunctionProvider(TrinoCatalogFactory trinoCatalogFactory, TypeManager typeManager) + { + this.trinoCatalogFactory = requireNonNull(trinoCatalogFactory, "trinoCatalogFactory is null"); + this.typeManager = requireNonNull(typeManager, "typeManager is null"); + } + + @Override + public ConnectorTableFunction get() + { + return new ClassLoaderSafeConnectorTableFunction( + new TableChangesFunction(trinoCatalogFactory, typeManager), + getClass().getClassLoader()); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/functions/tablechanges/TableChangesSplit.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/functions/tablechanges/TableChangesSplit.java new file mode 100644 index 000000000000..66b7e2e20cbc --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/functions/tablechanges/TableChangesSplit.java @@ -0,0 +1,122 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package io.trino.plugin.iceberg.functions.tablechanges; + +import com.google.common.collect.ImmutableMap; +import io.airlift.slice.SizeOf; +import io.trino.plugin.iceberg.IcebergFileFormat; +import io.trino.spi.HostAddress; +import io.trino.spi.SplitWeight; +import io.trino.spi.connector.ConnectorSplit; + +import java.util.List; +import java.util.Map; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static io.airlift.slice.SizeOf.estimatedSizeOf; +import static java.util.Objects.requireNonNull; + +public record TableChangesSplit( + ChangeType changeType, + long snapshotId, + long snapshotTimestamp, + int changeOrdinal, + String path, + long start, + long length, + long fileSize, + long fileRecordCount, + IcebergFileFormat fileFormat, + String partitionSpecJson, + String partitionDataJson, + SplitWeight splitWeight, + Map<String, String> fileIoProperties) implements ConnectorSplit +{ + private static final int INSTANCE_SIZE = SizeOf.instanceSize(TableChangesSplit.class); + + public TableChangesSplit + { + requireNonNull(changeType, "changeType is null"); + requireNonNull(path, "path is null"); + requireNonNull(fileFormat, "fileFormat is null"); + requireNonNull(partitionSpecJson, "partitionSpecJson is null"); + requireNonNull(partitionDataJson, "partitionDataJson is null"); + requireNonNull(splitWeight, "splitWeight is null"); + fileIoProperties = ImmutableMap.copyOf(requireNonNull(fileIoProperties, "fileIoProperties is null")); + } + + @Override + public SplitWeight getSplitWeight() + { + return splitWeight; + } + + @Override + public long getRetainedSizeInBytes() + { + return INSTANCE_SIZE + + estimatedSizeOf(path) + + estimatedSizeOf(partitionSpecJson) + + estimatedSizeOf(partitionDataJson) + + splitWeight.getRetainedSizeInBytes() + + estimatedSizeOf(fileIoProperties, SizeOf::estimatedSizeOf, SizeOf::estimatedSizeOf); + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(path) + .add("start", start) + .add("length", length) + .add("records", fileRecordCount) + .toString(); + } + + public enum ChangeType { + ADDED_FILE("insert"), + DELETED_FILE("delete"), + POSITIONAL_DELETE("delete"); + + private final String tableValue; + + ChangeType(String tableValue) + { + this.tableValue = tableValue; + } + + public String getTableValue() + { + return tableValue; + } + } + + @Override + public boolean isRemotelyAccessible() + { + return true; + } + + @Override + public List<HostAddress> getAddresses() + { + return List.of(); + } + + @Override + public Object getInfo() + { + throw new UnsupportedOperationException("Unimplemented method 'getInfo'"); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/functions/tablechanges/TableChangesSplitSource.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/functions/tablechanges/TableChangesSplitSource.java new file mode 100644 index 000000000000..0ff3415d5273 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/functions/tablechanges/TableChangesSplitSource.java @@ -0,0 +1,178 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.iceberg.functions.tablechanges; + +import com.google.common.io.Closer; +import io.trino.plugin.iceberg.IcebergFileFormat; +import io.trino.plugin.iceberg.PartitionData; +import io.trino.spi.SplitWeight; +import io.trino.spi.TrinoException; +import io.trino.spi.connector.ConnectorSplit; +import io.trino.spi.connector.ConnectorSplitSource; +import io.trino.spi.type.DateTimeEncoding; +import org.apache.iceberg.AddedRowsScanTask; +import org.apache.iceberg.ChangelogScanTask; +import org.apache.iceberg.DeletedDataFileScanTask; +import org.apache.iceberg.IncrementalChangelogScan; +import org.apache.iceberg.PartitionSpecParser; +import org.apache.iceberg.SplittableScanTask; +import org.apache.iceberg.Table; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.CloseableIterator; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.concurrent.CompletableFuture; + +import static com.google.common.collect.Iterators.singletonIterator; +import static io.trino.spi.StandardErrorCode.NOT_SUPPORTED; +import static io.trino.spi.type.TimeZoneKey.UTC_KEY; +import static java.util.Collections.emptyIterator; +import static java.util.Objects.requireNonNull; +import static java.util.concurrent.CompletableFuture.completedFuture; + +public class TableChangesSplitSource + implements ConnectorSplitSource +{ + private final Table icebergTable; + private final IncrementalChangelogScan tableScan; + private final long targetSplitSize; + private final Closer closer = Closer.create(); + + private CloseableIterable<ChangelogScanTask> changelogScanIterable; + private CloseableIterator<ChangelogScanTask> changelogScanIterator; + private Iterator<ChangelogScanTask> fileTasksIterator = emptyIterator(); + + public TableChangesSplitSource( + Table icebergTable, + IncrementalChangelogScan tableScan) + { + this.icebergTable = requireNonNull(icebergTable, "table is null"); + this.tableScan = requireNonNull(tableScan, "tableScan is null"); + this.targetSplitSize = tableScan.targetSplitSize(); + } + + @Override + public CompletableFuture<ConnectorSplitBatch> getNextBatch(int maxSize) + { + if (changelogScanIterable == null) { + try { + this.changelogScanIterable = closer.register(tableScan.planFiles()); + this.changelogScanIterator = closer.register(changelogScanIterable.iterator()); + } + catch (UnsupportedOperationException e) { + throw new TrinoException(NOT_SUPPORTED, "Table uses features which are not yet supported by the table_changes function", e); + } + } + + List<ConnectorSplit> splits = new ArrayList<>(maxSize); + while (splits.size() < maxSize && (fileTasksIterator.hasNext() || changelogScanIterator.hasNext())) { + if (!fileTasksIterator.hasNext()) { + ChangelogScanTask wholeFileTask = changelogScanIterator.next(); + fileTasksIterator = splitIfPossible(wholeFileTask, targetSplitSize); + continue; + } + + ChangelogScanTask next = fileTasksIterator.next(); + splits.add(toIcebergSplit(next)); + } + return completedFuture(new ConnectorSplitBatch(splits, isFinished())); + } + + @Override + public boolean isFinished() + { + return changelogScanIterator != null && !changelogScanIterator.hasNext() && !fileTasksIterator.hasNext(); + } + + @Override + public void close() + { + try { + closer.close(); + } + catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + @SuppressWarnings("unchecked") + private static Iterator<ChangelogScanTask>
splitIfPossible(ChangelogScanTask wholeFileScan, long targetSplitSize) + { + if (wholeFileScan instanceof AddedRowsScanTask) { + return ((SplittableScanTask) wholeFileScan).split(targetSplitSize).iterator(); + } + + if (wholeFileScan instanceof DeletedDataFileScanTask) { + return ((SplittableScanTask) wholeFileScan).split(targetSplitSize).iterator(); + } + + return singletonIterator(wholeFileScan); + } + + private ConnectorSplit toIcebergSplit(ChangelogScanTask task) + { + // TODO: Support DeletedRowsScanTask (requires https://github.com/apache/iceberg/pull/6182) + if (task instanceof AddedRowsScanTask addedRowsScanTask) { + return toSplit(addedRowsScanTask); + } + else if (task instanceof DeletedDataFileScanTask deletedDataFileScanTask) { + return toSplit(deletedDataFileScanTask); + } + else { + throw new TrinoException(NOT_SUPPORTED, "ChangelogScanTask type is not supported:" + task); + } + } + + private TableChangesSplit toSplit(AddedRowsScanTask task) + { + return new TableChangesSplit( + TableChangesSplit.ChangeType.ADDED_FILE, + task.commitSnapshotId(), + DateTimeEncoding.packDateTimeWithZone(icebergTable.snapshot(task.commitSnapshotId()).timestampMillis(), UTC_KEY), + task.changeOrdinal(), + task.file().location(), + task.start(), + task.length(), + task.file().fileSizeInBytes(), + task.file().recordCount(), + IcebergFileFormat.fromIceberg(task.file().format()), + PartitionSpecParser.toJson(task.spec()), + PartitionData.toJson(task.file().partition()), + SplitWeight.standard(), + icebergTable.io().properties()); + } + + private TableChangesSplit toSplit(DeletedDataFileScanTask task) + { + return new TableChangesSplit( + TableChangesSplit.ChangeType.DELETED_FILE, + task.commitSnapshotId(), + DateTimeEncoding.packDateTimeWithZone(icebergTable.snapshot(task.commitSnapshotId()).timestampMillis(), UTC_KEY), + task.changeOrdinal(), + task.file().location(), + task.start(), + task.length(), + task.file().fileSizeInBytes(), + task.file().recordCount(), + IcebergFileFormat.fromIceberg(task.file().format()), + PartitionSpecParser.toJson(task.spec()), + PartitionData.toJson(task.file().partition()), + SplitWeight.standard(), + icebergTable.io().properties()); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/AddFilesTableFromTableProcedure.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/AddFilesTableFromTableProcedure.java new file mode 100644 index 000000000000..b96f2dc42f00 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/AddFilesTableFromTableProcedure.java @@ -0,0 +1,79 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.iceberg.procedure; + +import com.google.common.collect.ImmutableList; +import com.google.inject.Inject; +import com.google.inject.Provider; +import io.trino.plugin.iceberg.procedure.MigrationUtils.RecursiveDirectory; +import io.trino.spi.connector.TableProcedureMetadata; +import io.trino.spi.session.PropertyMetadata; +import io.trino.spi.type.MapType; +import io.trino.spi.type.TypeManager; + +import java.util.Map; + +import static io.trino.plugin.iceberg.procedure.IcebergTableProcedureId.ADD_FILES_FROM_TABLE; +import static io.trino.spi.connector.TableProcedureExecutionMode.coordinatorOnly; +import static io.trino.spi.session.PropertyMetadata.enumProperty; +import static io.trino.spi.session.PropertyMetadata.stringProperty; +import static io.trino.spi.type.VarcharType.VARCHAR; +import static java.util.Objects.requireNonNull; + +public class AddFilesTableFromTableProcedure + implements Provider<TableProcedureMetadata> +{ + private final TypeManager typeManager; + + @Inject + public AddFilesTableFromTableProcedure(TypeManager typeManager) + { + this.typeManager = requireNonNull(typeManager, "typeManager is null"); + } + + @Override + public TableProcedureMetadata get() + { + return new TableProcedureMetadata( + ADD_FILES_FROM_TABLE.name(), + coordinatorOnly(), + ImmutableList.<PropertyMetadata<?>>builder() + .add(stringProperty( + "schema_name", + "Source schema name", + null, + false)) + .add(stringProperty( + "table_name", + "Source table name", + null, + false)) + .add(new PropertyMetadata<>( + "partition_filter", + "Partition filter", + new MapType(VARCHAR, VARCHAR, typeManager.getTypeOperators()), + Map.class, + null, + false, + object -> (Map) object, + Object::toString)) + .add(enumProperty( + "recursive_directory", + "Recursive directory", + RecursiveDirectory.class, + RecursiveDirectory.FAIL, + false)) + .build()); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/AddFilesTableProcedure.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/AddFilesTableProcedure.java new file mode 100644 index 000000000000..3319c63e8dd7 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/AddFilesTableProcedure.java @@ -0,0 +1,62 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package io.trino.plugin.iceberg.procedure; + +import com.google.common.collect.ImmutableList; +import com.google.inject.Provider; +import io.trino.plugin.hive.HiveStorageFormat; +import io.trino.plugin.iceberg.procedure.MigrationUtils.RecursiveDirectory; +import io.trino.spi.connector.TableProcedureMetadata; +import io.trino.spi.session.PropertyMetadata; + +import static io.trino.plugin.base.util.Procedures.checkProcedureArgument; +import static io.trino.plugin.hive.HiveStorageFormat.AVRO; +import static io.trino.plugin.hive.HiveStorageFormat.ORC; +import static io.trino.plugin.hive.HiveStorageFormat.PARQUET; +import static io.trino.plugin.iceberg.procedure.IcebergTableProcedureId.ADD_FILES; +import static io.trino.spi.connector.TableProcedureExecutionMode.coordinatorOnly; +import static io.trino.spi.session.PropertyMetadata.enumProperty; +import static io.trino.spi.session.PropertyMetadata.stringProperty; + +public class AddFilesTableProcedure + implements Provider<TableProcedureMetadata> +{ + @Override + public TableProcedureMetadata get() + { + return new TableProcedureMetadata( + ADD_FILES.name(), + coordinatorOnly(), + ImmutableList.<PropertyMetadata<?>>builder() + .add(stringProperty( + "location", + "location", + null, + false)) + .add(enumProperty( + "format", + "File format", + HiveStorageFormat.class, + null, + value -> checkProcedureArgument(value == ORC || value == PARQUET || value == AVRO, "The procedure does not support storage format: %s", value), + false)) + .add(enumProperty( + "recursive_directory", + "Recursive directory", + RecursiveDirectory.class, + RecursiveDirectory.FAIL, + false)) + .build()); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/IcebergAddFilesFromTableHandle.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/IcebergAddFilesFromTableHandle.java new file mode 100644 index 000000000000..67f455a28c38 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/IcebergAddFilesFromTableHandle.java @@ -0,0 +1,34 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package io.trino.plugin.iceberg.procedure; + +import io.trino.plugin.iceberg.procedure.MigrationUtils.RecursiveDirectory; +import jakarta.annotation.Nullable; + +import java.util.Map; + +import static java.util.Objects.requireNonNull; + +public record IcebergAddFilesFromTableHandle( + io.trino.plugin.hive.metastore.Table table, + @Nullable Map partitionFilter, + RecursiveDirectory recursiveDirectory) + implements IcebergProcedureHandle +{ + public IcebergAddFilesFromTableHandle + { + requireNonNull(table, "table is null"); + requireNonNull(recursiveDirectory, "recursiveDirectory is null"); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/IcebergAddFilesHandle.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/IcebergAddFilesHandle.java new file mode 100644 index 000000000000..6fd365a54089 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/IcebergAddFilesHandle.java @@ -0,0 +1,30 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.iceberg.procedure; + +import io.trino.plugin.hive.HiveStorageFormat; +import io.trino.plugin.iceberg.procedure.MigrationUtils.RecursiveDirectory; + +import static java.util.Objects.requireNonNull; + +public record IcebergAddFilesHandle(String location, HiveStorageFormat format, RecursiveDirectory recursiveDirectory) + implements IcebergProcedureHandle +{ + public IcebergAddFilesHandle + { + requireNonNull(location, "location is null"); + requireNonNull(format, "format is null"); + requireNonNull(recursiveDirectory, "recursiveDirectory is null"); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/IcebergDropExtendedStatsHandle.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/IcebergDropExtendedStatsHandle.java index 55bf7e092a7d..580192c8c9d7 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/IcebergDropExtendedStatsHandle.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/IcebergDropExtendedStatsHandle.java @@ -13,15 +13,7 @@ */ package io.trino.plugin.iceberg.procedure; -import static com.google.common.base.MoreObjects.toStringHelper; - -public class IcebergDropExtendedStatsHandle - extends IcebergProcedureHandle +public record IcebergDropExtendedStatsHandle() + implements IcebergProcedureHandle { - @Override - public String toString() - { - return toStringHelper(this) - .toString(); - } } diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/IcebergExpireSnapshotsHandle.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/IcebergExpireSnapshotsHandle.java index 02687389b233..0757a82d66e3 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/IcebergExpireSnapshotsHandle.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/IcebergExpireSnapshotsHandle.java @@ -13,35 +13,15 @@ */ package 
io.trino.plugin.iceberg.procedure; -import com.fasterxml.jackson.annotation.JsonCreator; -import com.fasterxml.jackson.annotation.JsonProperty; import io.airlift.units.Duration; -import static com.google.common.base.MoreObjects.toStringHelper; import static java.util.Objects.requireNonNull; -public class IcebergExpireSnapshotsHandle - extends IcebergProcedureHandle +public record IcebergExpireSnapshotsHandle(Duration retentionThreshold) + implements IcebergProcedureHandle { - private final Duration retentionThreshold; - - @JsonCreator - public IcebergExpireSnapshotsHandle(Duration retentionThreshold) - { - this.retentionThreshold = requireNonNull(retentionThreshold, "retentionThreshold is null"); - } - - @JsonProperty - public Duration getRetentionThreshold() - { - return retentionThreshold; - } - - @Override - public String toString() + public IcebergExpireSnapshotsHandle { - return toStringHelper(this) - .add("retentionThreshold", retentionThreshold) - .toString(); + requireNonNull(retentionThreshold, "retentionThreshold is null"); } } diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/IcebergOptimizeHandle.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/IcebergOptimizeHandle.java index b64238376536..715764100230 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/IcebergOptimizeHandle.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/IcebergOptimizeHandle.java @@ -13,8 +13,6 @@ */ package io.trino.plugin.iceberg.procedure; -import com.fasterxml.jackson.annotation.JsonCreator; -import com.fasterxml.jackson.annotation.JsonProperty; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import io.airlift.units.DataSize; @@ -26,111 +24,29 @@ import java.util.Map; import java.util.Optional; -import static com.google.common.base.MoreObjects.toStringHelper; import static java.util.Objects.requireNonNull; -public class IcebergOptimizeHandle - extends IcebergProcedureHandle +public record IcebergOptimizeHandle( + Optional snapshotId, + String schemaAsJson, + String partitionSpecAsJson, + List tableColumns, + List sortOrder, + IcebergFileFormat fileFormat, + Map tableStorageProperties, + DataSize maxScannedFileSize, + boolean retriesEnabled) + implements IcebergProcedureHandle { - private final Optional snapshotId; - private final String schemaAsJson; - private final String partitionSpecAsJson; - private final List tableColumns; - private final List sortOrder; - private final IcebergFileFormat fileFormat; - private final Map tableStorageProperties; - private final DataSize maxScannedFileSize; - private final boolean retriesEnabled; - - @JsonCreator - public IcebergOptimizeHandle( - Optional snapshotId, - String schemaAsJson, - String partitionSpecAsJson, - List tableColumns, - List sortOrder, - IcebergFileFormat fileFormat, - Map tableStorageProperties, - DataSize maxScannedFileSize, - boolean retriesEnabled) - { - this.snapshotId = snapshotId; - this.schemaAsJson = requireNonNull(schemaAsJson, "schemaAsJson is null"); - this.partitionSpecAsJson = requireNonNull(partitionSpecAsJson, "partitionSpecAsJson is null"); - this.tableColumns = ImmutableList.copyOf(requireNonNull(tableColumns, "tableColumns is null")); - this.sortOrder = ImmutableList.copyOf(requireNonNull(sortOrder, "sortOrder is null")); - this.fileFormat = requireNonNull(fileFormat, "fileFormat is null"); - this.tableStorageProperties = 
ImmutableMap.copyOf(requireNonNull(tableStorageProperties, "tableStorageProperties is null")); - this.maxScannedFileSize = requireNonNull(maxScannedFileSize, "maxScannedFileSize is null"); - this.retriesEnabled = retriesEnabled; - } - - @JsonProperty - public Optional getSnapshotId() - { - return snapshotId; - } - - @JsonProperty - public String getSchemaAsJson() - { - return schemaAsJson; - } - - @JsonProperty - public String getPartitionSpecAsJson() - { - return partitionSpecAsJson; - } - - @JsonProperty - public List getTableColumns() - { - return tableColumns; - } - - @JsonProperty - public List getSortOrder() - { - return sortOrder; - } - - @JsonProperty - public IcebergFileFormat getFileFormat() - { - return fileFormat; - } - - @JsonProperty - public Map getTableStorageProperties() - { - return tableStorageProperties; - } - - @JsonProperty - public DataSize getMaxScannedFileSize() - { - return maxScannedFileSize; - } - - @JsonProperty - public boolean isRetriesEnabled() - { - return retriesEnabled; - } - - @Override - public String toString() - { - return toStringHelper(this) - .add("snapshotId", snapshotId) - .add("schemaAsJson", schemaAsJson) - .add("partitionSpecAsJson", partitionSpecAsJson) - .add("tableColumns", tableColumns) - .add("fileFormat", fileFormat) - .add("tableStorageProperties", tableStorageProperties) - .add("maxScannedFileSize", maxScannedFileSize) - .add("retriesEnabled", retriesEnabled) - .toString(); + public IcebergOptimizeHandle + { + requireNonNull(snapshotId, "snapshotId is null"); + requireNonNull(schemaAsJson, "schemaAsJson is null"); + requireNonNull(partitionSpecAsJson, "partitionSpecAsJson is null"); + tableColumns = ImmutableList.copyOf(requireNonNull(tableColumns, "tableColumns is null")); + sortOrder = ImmutableList.copyOf(requireNonNull(sortOrder, "sortOrder is null")); + requireNonNull(fileFormat, "fileFormat is null"); + tableStorageProperties = ImmutableMap.copyOf(requireNonNull(tableStorageProperties, "tableStorageProperties is null")); + requireNonNull(maxScannedFileSize, "maxScannedFileSize is null"); } } diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/IcebergOptimizeManifestsHandle.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/IcebergOptimizeManifestsHandle.java new file mode 100644 index 000000000000..cc1a44174ec5 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/IcebergOptimizeManifestsHandle.java @@ -0,0 +1,17 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.iceberg.procedure; + +public record IcebergOptimizeManifestsHandle() + implements IcebergProcedureHandle {} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/IcebergProcedureHandle.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/IcebergProcedureHandle.java index e9ce4199eeac..bd2c64e0778c 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/IcebergProcedureHandle.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/IcebergProcedureHandle.java @@ -20,9 +20,13 @@ use = JsonTypeInfo.Id.NAME, property = "@type") @JsonSubTypes({ - @JsonSubTypes.Type(value = IcebergOptimizeHandle.class, name = "optimize"), @JsonSubTypes.Type(value = IcebergDropExtendedStatsHandle.class, name = "drop_extended_stats"), + @JsonSubTypes.Type(value = IcebergRollbackToSnapshotHandle.class, name = "rollback_to_snapshot"), @JsonSubTypes.Type(value = IcebergExpireSnapshotsHandle.class, name = "expire_snapshots"), + @JsonSubTypes.Type(value = IcebergOptimizeHandle.class, name = "optimize"), + @JsonSubTypes.Type(value = IcebergOptimizeManifestsHandle.class, name = "optimize_manifests"), @JsonSubTypes.Type(value = IcebergRemoveOrphanFilesHandle.class, name = "remove_orphan_files"), + @JsonSubTypes.Type(value = IcebergAddFilesHandle.class, name = "add_files"), + @JsonSubTypes.Type(value = IcebergAddFilesFromTableHandle.class, name = "add_files_from_table"), }) -public abstract class IcebergProcedureHandle {} +public interface IcebergProcedureHandle {} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/IcebergRemoveOrphanFilesHandle.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/IcebergRemoveOrphanFilesHandle.java index e4ac5d9f5890..450109a6cee1 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/IcebergRemoveOrphanFilesHandle.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/IcebergRemoveOrphanFilesHandle.java @@ -13,35 +13,15 @@ */ package io.trino.plugin.iceberg.procedure; -import com.fasterxml.jackson.annotation.JsonCreator; -import com.fasterxml.jackson.annotation.JsonProperty; import io.airlift.units.Duration; -import static com.google.common.base.MoreObjects.toStringHelper; import static java.util.Objects.requireNonNull; -public class IcebergRemoveOrphanFilesHandle - extends IcebergProcedureHandle +public record IcebergRemoveOrphanFilesHandle(Duration retentionThreshold) + implements IcebergProcedureHandle { - private final Duration retentionThreshold; - - @JsonCreator - public IcebergRemoveOrphanFilesHandle(Duration retentionThreshold) - { - this.retentionThreshold = requireNonNull(retentionThreshold, "retentionThreshold is null"); - } - - @JsonProperty - public Duration getRetentionThreshold() - { - return retentionThreshold; - } - - @Override - public String toString() + public IcebergRemoveOrphanFilesHandle { - return toStringHelper(this) - .add("retentionThreshold", retentionThreshold) - .toString(); + requireNonNull(retentionThreshold, "retentionThreshold is null"); } } diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/IcebergRollbackToSnapshotHandle.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/IcebergRollbackToSnapshotHandle.java new file mode 100644 index 000000000000..aad1ba3218b1 --- /dev/null +++ 
b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/IcebergRollbackToSnapshotHandle.java @@ -0,0 +1,17 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.iceberg.procedure; + +public record IcebergRollbackToSnapshotHandle(long snapshotId) + implements IcebergProcedureHandle {} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/IcebergTableExecuteHandle.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/IcebergTableExecuteHandle.java index 1ffbf7e14d3f..ecf0fad84117 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/IcebergTableExecuteHandle.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/IcebergTableExecuteHandle.java @@ -13,74 +13,35 @@ */ package io.trino.plugin.iceberg.procedure; -import com.fasterxml.jackson.annotation.JsonCreator; -import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.collect.ImmutableMap; import io.trino.spi.connector.ConnectorTableExecuteHandle; import io.trino.spi.connector.SchemaTableName; +import java.util.Map; + import static java.util.Objects.requireNonNull; -public class IcebergTableExecuteHandle +public record IcebergTableExecuteHandle( + SchemaTableName schemaTableName, + IcebergTableProcedureId procedureId, + IcebergProcedureHandle procedureHandle, + String tableLocation, + Map fileIoProperties) implements ConnectorTableExecuteHandle { - private final SchemaTableName schemaTableName; - private final IcebergTableProcedureId procedureId; - private final IcebergProcedureHandle procedureHandle; - private final String tableLocation; - - @JsonCreator - public IcebergTableExecuteHandle( - SchemaTableName schemaTableName, - IcebergTableProcedureId procedureId, - IcebergProcedureHandle procedureHandle, - String tableLocation) - { - this.schemaTableName = requireNonNull(schemaTableName, "schemaTableName is null"); - this.procedureId = requireNonNull(procedureId, "procedureId is null"); - this.procedureHandle = requireNonNull(procedureHandle, "procedureHandle is null"); - this.tableLocation = requireNonNull(tableLocation, "tableLocation is null"); - } - - @JsonProperty - public SchemaTableName getSchemaTableName() - { - return schemaTableName; - } - - @JsonProperty - public IcebergTableProcedureId getProcedureId() - { - return procedureId; - } - - @JsonProperty - public IcebergProcedureHandle getProcedureHandle() - { - return procedureHandle; - } - - @JsonProperty - public String getTableLocation() - { - return tableLocation; - } - - public IcebergTableExecuteHandle withProcedureHandle(IcebergProcedureHandle procedureHandle) + public IcebergTableExecuteHandle { - return new IcebergTableExecuteHandle( - schemaTableName, - procedureId, - procedureHandle, - tableLocation); + requireNonNull(schemaTableName, "schemaTableName is null"); + requireNonNull(procedureId, "procedureId is null"); + requireNonNull(procedureHandle, "procedureHandle is null"); + 
requireNonNull(tableLocation, "tableLocation is null"); + fileIoProperties = ImmutableMap.copyOf(requireNonNull(fileIoProperties, "fileIoProperties is null")); } @Override public String toString() { - return new StringBuilder() - .append("schemaTableName").append(":").append(schemaTableName) - .append(", procedureId").append(":").append(procedureId) - .append(", procedureHandle").append(":{").append(procedureHandle).append("}") - .toString(); + return "schemaTableName:%s, procedureId:%s, procedureHandle:{%s}".formatted( + schemaTableName, procedureId, procedureHandle); } } diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/IcebergTableProcedureId.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/IcebergTableProcedureId.java index 8b1c68fb23ed..6230f8b779b6 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/IcebergTableProcedureId.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/IcebergTableProcedureId.java @@ -16,7 +16,11 @@ public enum IcebergTableProcedureId { OPTIMIZE, + OPTIMIZE_MANIFESTS, DROP_EXTENDED_STATS, + ROLLBACK_TO_SNAPSHOT, EXPIRE_SNAPSHOTS, REMOVE_ORPHAN_FILES, + ADD_FILES, + ADD_FILES_FROM_TABLE, } diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/MigrateProcedure.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/MigrateProcedure.java index 6c57853c9e4b..efa17803224d 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/MigrateProcedure.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/MigrateProcedure.java @@ -19,12 +19,8 @@ import com.google.inject.Inject; import com.google.inject.Provider; import io.airlift.log.Logger; -import io.trino.filesystem.FileEntry; -import io.trino.filesystem.FileIterator; -import io.trino.filesystem.Location; import io.trino.filesystem.TrinoFileSystem; import io.trino.filesystem.TrinoFileSystemFactory; -import io.trino.filesystem.TrinoInputFile; import io.trino.plugin.hive.HiveStorageFormat; import io.trino.plugin.hive.metastore.Column; import io.trino.plugin.hive.metastore.HiveMetastore; @@ -33,13 +29,12 @@ import io.trino.plugin.hive.metastore.PrincipalPrivileges; import io.trino.plugin.hive.metastore.RawHiveMetastoreFactory; import io.trino.plugin.hive.metastore.Storage; +import io.trino.plugin.hive.security.UsingSystemSecurity; import io.trino.plugin.iceberg.IcebergConfig; import io.trino.plugin.iceberg.IcebergFileFormat; -import io.trino.plugin.iceberg.IcebergSecurityConfig; -import io.trino.plugin.iceberg.PartitionData; import io.trino.plugin.iceberg.catalog.TrinoCatalog; import io.trino.plugin.iceberg.catalog.TrinoCatalogFactory; -import io.trino.plugin.iceberg.fileio.ForwardingInputFile; +import io.trino.plugin.iceberg.procedure.MigrationUtils.RecursiveDirectory; import io.trino.spi.TrinoException; import io.trino.spi.classloader.ThreadContextClassLoader; import io.trino.spi.connector.ConnectorSession; @@ -50,55 +45,52 @@ import io.trino.spi.type.ArrayType; import io.trino.spi.type.MapType; import io.trino.spi.type.RowType; +import io.trino.spi.type.SmallintType; +import io.trino.spi.type.TimestampType; +import io.trino.spi.type.TinyintType; import io.trino.spi.type.Type; import io.trino.spi.type.TypeManager; import org.apache.iceberg.AppendFiles; import org.apache.iceberg.DataFile; import org.apache.iceberg.DataFiles; -import org.apache.iceberg.Metrics; -import org.apache.iceberg.MetricsConfig; import 
org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; import org.apache.iceberg.StructLike; import org.apache.iceberg.Table; import org.apache.iceberg.Transaction; -import org.apache.iceberg.avro.Avro; -import org.apache.iceberg.io.InputFile; import org.apache.iceberg.mapping.MappingUtil; import org.apache.iceberg.mapping.NameMapping; -import org.apache.iceberg.orc.OrcMetrics; -import org.apache.iceberg.parquet.ParquetUtil; import org.apache.iceberg.types.TypeUtil; import org.apache.iceberg.types.Types; -import java.io.IOException; import java.lang.invoke.MethodHandle; import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Optional; +import java.util.Set; import java.util.concurrent.atomic.AtomicInteger; -import static com.google.common.base.Verify.verify; import static com.google.common.collect.ImmutableList.toImmutableList; import static com.google.common.collect.Streams.concat; import static io.airlift.slice.Slices.utf8Slice; import static io.trino.plugin.hive.HiveMetadata.TRANSACTIONAL; import static io.trino.plugin.hive.HiveMetadata.extractHiveStorageFormat; +import static io.trino.plugin.hive.HiveTimestampPrecision.MILLISECONDS; import static io.trino.plugin.hive.metastore.MetastoreUtil.buildInitialPrivilegeSet; import static io.trino.plugin.hive.metastore.PrincipalPrivileges.NO_PRIVILEGES; +import static io.trino.plugin.hive.util.HiveTypeUtil.getTypeSignature; import static io.trino.plugin.hive.util.HiveUtil.isDeltaLakeTable; import static io.trino.plugin.hive.util.HiveUtil.isHudiTable; import static io.trino.plugin.hive.util.HiveUtil.isIcebergTable; import static io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_COMMIT_ERROR; -import static io.trino.plugin.iceberg.IcebergSecurityConfig.IcebergSecurity.SYSTEM; -import static io.trino.plugin.iceberg.PartitionFields.parsePartitionFields; import static io.trino.plugin.iceberg.TypeConverter.toIcebergTypeForNewColumn; +import static io.trino.plugin.iceberg.procedure.MigrationUtils.buildDataFiles; +import static io.trino.spi.StandardErrorCode.DUPLICATE_COLUMN_NAME; import static io.trino.spi.StandardErrorCode.INVALID_PROCEDURE_ARGUMENT; import static io.trino.spi.StandardErrorCode.NOT_SUPPORTED; -import static io.trino.spi.type.SmallintType.SMALLINT; -import static io.trino.spi.type.TinyintType.TINYINT; import static io.trino.spi.type.VarcharType.VARCHAR; import static java.lang.Boolean.parseBoolean; import static java.lang.invoke.MethodHandles.lookup; @@ -120,7 +112,6 @@ public class MigrateProcedure public static final String PROVIDER_PROPERTY_KEY = "provider"; public static final String PROVIDER_PROPERTY_VALUE = "iceberg"; - private static final MetricsConfig METRICS_CONFIG = MetricsConfig.getDefault(); private final TrinoCatalogFactory catalogFactory; private final HiveMetastoreFactory metastoreFactory; @@ -129,14 +120,6 @@ public class MigrateProcedure private final int formatVersion; private final boolean isUsingSystemSecurity; - private enum RecursiveDirectory - { - TRUE, - FALSE, - FAIL, - /**/ - } - private static final MethodHandle MIGRATE; static { @@ -155,14 +138,14 @@ public MigrateProcedure( TrinoFileSystemFactory fileSystemFactory, TypeManager typeManager, IcebergConfig icebergConfig, - IcebergSecurityConfig securityConfig) + @UsingSystemSecurity boolean usingSystemSecurity) { this.catalogFactory = requireNonNull(catalogFactory, "catalogFactory is null"); this.metastoreFactory = requireNonNull(metastoreFactory, 
"metastoreFactory is null"); this.fileSystemFactory = requireNonNull(fileSystemFactory, "fileSystemFactory is null"); this.typeManager = requireNonNull(typeManager, "typeManager is null"); this.formatVersion = icebergConfig.getFormatVersion(); - this.isUsingSystemSecurity = securityConfig.getSecuritySystem() == SYSTEM; + this.isUsingSystemSecurity = usingSystemSecurity; } @Override @@ -183,7 +166,7 @@ public void migrate(ConnectorSession session, String schemaName, String tableNam // this line guarantees that classLoader that we stored in the field will be used inside try/catch // as we captured reference to PluginClassLoader during initialization of this class // we can use it now to correctly execute the procedure - try (ThreadContextClassLoader ignored = new ThreadContextClassLoader(getClass().getClassLoader())) { + try (ThreadContextClassLoader ignore = new ThreadContextClassLoader(getClass().getClassLoader())) { doMigrate(session, schemaName, tableName, recursiveDirectory); } } @@ -214,18 +197,19 @@ public void doMigrate(ConnectorSession session, String schemaName, String tableN throw new TrinoException(NOT_SUPPORTED, "The table is already an Iceberg table"); } - Schema schema = toIcebergSchema(concat(hiveTable.getDataColumns().stream(), hiveTable.getPartitionColumns().stream()).toList()); - NameMapping nameMapping = MappingUtil.create(schema); HiveStorageFormat storageFormat = extractHiveStorageFormat(hiveTable.getStorage().getStorageFormat()); + Schema schema = toIcebergSchema(concat(hiveTable.getDataColumns().stream(), hiveTable.getPartitionColumns().stream()).toList(), toIcebergFileFormat(storageFormat)); + NameMapping nameMapping = MappingUtil.create(schema); String location = hiveTable.getStorage().getLocation(); Map properties = icebergTableProperties(location, hiveTable.getParameters(), nameMapping, toIcebergFileFormat(storageFormat)); - PartitionSpec partitionSpec = parsePartitionFields(schema, getPartitionColumnNames(hiveTable)); + PartitionSpec partitionSpec = parsePartitionFields(schema, hiveTable); try { + TrinoFileSystem fileSystem = fileSystemFactory.create(session); ImmutableList.Builder dataFilesBuilder = ImmutableList.builder(); if (hiveTable.getPartitionColumns().isEmpty()) { log.debug("Building data files from %s", location); - dataFilesBuilder.addAll(buildDataFiles(session, recursive, storageFormat, location, partitionSpec, new PartitionData(new Object[]{}), nameMapping)); + dataFilesBuilder.addAll(buildDataFiles(fileSystem, recursive, storageFormat, location, partitionSpec, Optional.empty(), schema)); } else { Map> partitions = listAllPartitions(metastore, hiveTable); @@ -235,7 +219,7 @@ public void doMigrate(ConnectorSession session, String schemaName, String tableN log.debug("Building data files from '%s' for partition %d of %d", storage.getLocation(), fileCount++, partitions.size()); HiveStorageFormat partitionStorageFormat = extractHiveStorageFormat(storage.getStorageFormat()); StructLike partitionData = DataFiles.data(partitionSpec, partition.getKey()); - dataFilesBuilder.addAll(buildDataFiles(session, recursive, partitionStorageFormat, storage.getLocation(), partitionSpec, partitionData, nameMapping)); + dataFilesBuilder.addAll(buildDataFiles(fileSystem, recursive, partitionStorageFormat, storage.getLocation(), partitionSpec, Optional.of(partitionData), schema)); } } @@ -244,9 +228,9 @@ public void doMigrate(ConnectorSession session, String schemaName, String tableN session, sourceTableName, schema, - parsePartitionFields(schema, 
toPartitionFields(hiveTable)), + partitionSpec, unsorted(), - location, + Optional.of(location), properties); List dataFiles = dataFilesBuilder.build(); @@ -293,78 +277,97 @@ private Map icebergTableProperties(String location, Map columns) + private Schema toIcebergSchema(List columns, IcebergFileFormat storageFormat) { AtomicInteger nextFieldId = new AtomicInteger(1); List icebergColumns = new ArrayList<>(); for (Column column : columns) { int index = icebergColumns.size(); - org.apache.iceberg.types.Type type = toIcebergType(typeManager.getType(column.getType().getTypeSignature()), nextFieldId); - Types.NestedField field = Types.NestedField.of(index, false, column.getName(), type, column.getComment().orElse(null)); + org.apache.iceberg.types.Type type = toIcebergType(typeManager.getType(getTypeSignature(column.getType(), MILLISECONDS)), nextFieldId, storageFormat); + Types.NestedField field = Types.NestedField.optional(index, column.getName(), type, column.getComment().orElse(null)); icebergColumns.add(field); } + org.apache.iceberg.types.Type icebergSchema = Types.StructType.of(icebergColumns); - icebergSchema = TypeUtil.assignFreshIds(icebergSchema, nextFieldId::getAndIncrement); + // Assign column id start from 1 + icebergSchema = TypeUtil.assignFreshIds(icebergSchema, new AtomicInteger(1)::getAndIncrement); return new Schema(icebergSchema.asStructType().fields()); } - private static org.apache.iceberg.types.Type toIcebergType(Type type, AtomicInteger nextFieldId) + private static org.apache.iceberg.types.Type toIcebergType(Type type, AtomicInteger nextFieldId, IcebergFileFormat storageFormat) { - if (type instanceof ArrayType || type instanceof MapType || type instanceof RowType) { - // TODO https://github.com/trinodb/trino/issues/17583 Add support for these complex types - throw new TrinoException(NOT_SUPPORTED, "Migrating %s type is not supported".formatted(type)); - } - if (type.equals(TINYINT) || type.equals(SMALLINT)) { + if (type instanceof TinyintType || type instanceof SmallintType) { return Types.IntegerType.get(); } - return toIcebergTypeForNewColumn(type, nextFieldId); + else if (type instanceof TimestampType) { + switch (storageFormat) { + case ORC: + return Types.TimestampType.withoutZone(); + case PARQUET: + return Types.TimestampType.withZone(); + case AVRO: + // TODO https://github.com/trinodb/trino/issues/20481 + throw new TrinoException(NOT_SUPPORTED, "Migrating timestamp type with Avro format is not supported."); + default: + throw new TrinoException(NOT_SUPPORTED, "Unsupported storage format for timestamp type: " + storageFormat); + } + } + else if (type instanceof RowType rowType) { + return fromRow(rowType, nextFieldId, storageFormat); + } + else if (type instanceof ArrayType arrayType) { + return fromArray(arrayType, nextFieldId, storageFormat); + } + else if (type instanceof MapType mapType) { + return fromMap(mapType, nextFieldId, storageFormat); + } + else { + return toIcebergTypeForNewColumn(type, nextFieldId); + } } - public Map> listAllPartitions(HiveMetastore metastore, io.trino.plugin.hive.metastore.Table table) + private static org.apache.iceberg.types.Type fromRow(RowType type, AtomicInteger nextFieldId, IcebergFileFormat storageFormat) { - List partitionNames = table.getPartitionColumns().stream().map(Column::getName).collect(toImmutableList()); - Optional> partitions = metastore.getPartitionNamesByFilter(table.getDatabaseName(), table.getTableName(), partitionNames, TupleDomain.all()); - if (partitions.isEmpty()) { - return ImmutableMap.of(); + Set 
fieldNames = new HashSet<>(); + List fields = new ArrayList<>(); + for (int i = 0; i < type.getFields().size(); i++) { + int id = nextFieldId.getAndIncrement(); + RowType.Field field = type.getFields().get(i); + String name = field.getName().orElseThrow(() -> new TrinoException(NOT_SUPPORTED, "Row type field does not have a name: " + type.getDisplayName())); + if (!fieldNames.add(name.toLowerCase(ENGLISH))) { + throw new TrinoException(DUPLICATE_COLUMN_NAME, "Field name '%s' specified more than once".formatted(name.toLowerCase(ENGLISH))); + } + org.apache.iceberg.types.Type icebergTypeInternal = toIcebergType(field.getType(), nextFieldId, storageFormat); + fields.add(Types.NestedField.optional(id, name, icebergTypeInternal)); } - return metastore.getPartitionsByNames(table, partitions.get()); + return Types.StructType.of(fields); } - private List buildDataFiles(ConnectorSession session, RecursiveDirectory recursive, HiveStorageFormat format, String location, PartitionSpec partitionSpec, StructLike partition, NameMapping nameMapping) - throws IOException + private static org.apache.iceberg.types.Type fromArray(ArrayType type, AtomicInteger nextFieldId, IcebergFileFormat storageFormat) { - // TODO: Introduce parallelism - TrinoFileSystem fileSystem = fileSystemFactory.create(session); - FileIterator files = fileSystem.listFiles(Location.of(location)); - ImmutableList.Builder dataFilesBuilder = ImmutableList.builder(); - while (files.hasNext()) { - FileEntry file = files.next(); - String fileLocation = file.location().toString(); - String relativePath = fileLocation.substring(location.length()); - if (relativePath.contains("/_") || relativePath.contains("/.")) { - continue; - } - if (recursive == RecursiveDirectory.FALSE && isRecursive(location, fileLocation)) { - continue; - } - if (recursive == RecursiveDirectory.FAIL && isRecursive(location, fileLocation)) { - throw new TrinoException(NOT_SUPPORTED, "Recursive directory must not exist when recursive_directory argument is 'fail': " + file.location()); - } + int id = nextFieldId.getAndIncrement(); + return Types.ListType.ofOptional(id, toIcebergType(type.getElementType(), nextFieldId, storageFormat)); + } - Metrics metrics = loadMetrics(fileSystem.newInputFile(file.location()), format, nameMapping); - DataFile dataFile = buildDataFile(file, partition, partitionSpec, format.name(), metrics); - dataFilesBuilder.add(dataFile); - } - List dataFiles = dataFilesBuilder.build(); - log.debug("Found %d files in '%s'", dataFiles.size(), location); - return dataFiles; + private static org.apache.iceberg.types.Type fromMap(MapType type, AtomicInteger nextFieldId, IcebergFileFormat storageFormat) + { + int keyId = nextFieldId.getAndIncrement(); + int valueId = nextFieldId.getAndIncrement(); + return Types.MapType.ofOptional( + keyId, + valueId, + toIcebergType(type.getKeyType(), nextFieldId, storageFormat), + toIcebergType(type.getValueType(), nextFieldId, storageFormat)); } - private static boolean isRecursive(String baseLocation, String location) + public Map> listAllPartitions(HiveMetastore metastore, io.trino.plugin.hive.metastore.Table table) { - verify(location.startsWith(baseLocation), "%s should start with %s", location, baseLocation); - String suffix = location.substring(baseLocation.length() + 1).replaceFirst("^/+", ""); - return suffix.contains("/"); + List partitionNames = table.getPartitionColumns().stream().map(Column::getName).collect(toImmutableList()); + Optional> partitions = metastore.getPartitionNamesByFilter(table.getDatabaseName(), 
table.getTableName(), partitionNames, TupleDomain.all()); + if (partitions.isEmpty()) { + return ImmutableMap.of(); + } + return metastore.getPartitionsByNames(table, partitions.get()); } private static IcebergFileFormat toIcebergFileFormat(HiveStorageFormat storageFormat) @@ -377,22 +380,12 @@ private static IcebergFileFormat toIcebergFileFormat(HiveStorageFormat storageFo }; } - private static Metrics loadMetrics(TrinoInputFile file, HiveStorageFormat storageFormat, NameMapping nameMapping) - { - InputFile inputFile = new ForwardingInputFile(file); - return switch (storageFormat) { - case ORC -> OrcMetrics.fromInputFile(inputFile, METRICS_CONFIG, nameMapping); - case PARQUET -> ParquetUtil.fileMetrics(inputFile, METRICS_CONFIG, nameMapping); - case AVRO -> new Metrics(Avro.rowCount(inputFile), null, null, null, null); - default -> throw new TrinoException(NOT_SUPPORTED, "Unsupported storage format: " + storageFormat); - }; - } - - private static List toPartitionFields(io.trino.plugin.hive.metastore.Table table) + private static PartitionSpec parsePartitionFields(Schema schema, io.trino.plugin.hive.metastore.Table table) { - ImmutableList.Builder fields = ImmutableList.builder(); - fields.addAll(getPartitionColumnNames(table)); - return fields.build(); + PartitionSpec.Builder builder = PartitionSpec.builderFor(schema); + List partitionColumnNames = getPartitionColumnNames(table); + partitionColumnNames.forEach(builder::identity); + return builder.build(); } private static List getPartitionColumnNames(io.trino.plugin.hive.metastore.Table table) @@ -401,15 +394,4 @@ private static List getPartitionColumnNames(io.trino.plugin.hive.metasto .map(Column::getName) .collect(toImmutableList()); } - - private static DataFile buildDataFile(FileEntry file, StructLike partition, PartitionSpec spec, String format, Metrics metrics) - { - return DataFiles.builder(spec) - .withPath(file.location().toString()) - .withFormat(format) - .withFileSizeInBytes(file.length()) - .withMetrics(metrics) - .withPartition(partition) - .build(); - } } diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/MigrationUtils.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/MigrationUtils.java new file mode 100644 index 000000000000..ae28b22bffd9 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/MigrationUtils.java @@ -0,0 +1,333 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package io.trino.plugin.iceberg.procedure;
+
+import com.google.common.base.Joiner;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableSet;
+import io.airlift.log.Logger;
+import io.trino.filesystem.FileEntry;
+import io.trino.filesystem.FileIterator;
+import io.trino.filesystem.Location;
+import io.trino.filesystem.TrinoFileSystem;
+import io.trino.filesystem.TrinoInputFile;
+import io.trino.parquet.ParquetDataSource;
+import io.trino.parquet.ParquetReaderOptions;
+import io.trino.parquet.metadata.ParquetMetadata;
+import io.trino.parquet.reader.MetadataReader;
+import io.trino.plugin.hive.FileFormatDataSourceStats;
+import io.trino.plugin.hive.HiveStorageFormat;
+import io.trino.plugin.hive.metastore.HiveMetastore;
+import io.trino.plugin.hive.metastore.HiveMetastoreFactory;
+import io.trino.plugin.hive.metastore.Partition;
+import io.trino.plugin.hive.metastore.Storage;
+import io.trino.plugin.hive.parquet.TrinoParquetDataSource;
+import io.trino.plugin.iceberg.catalog.TrinoCatalog;
+import io.trino.plugin.iceberg.fileio.ForwardingInputFile;
+import io.trino.plugin.iceberg.util.OrcMetrics;
+import io.trino.plugin.iceberg.util.ParquetUtil;
+import io.trino.spi.TrinoException;
+import io.trino.spi.connector.ConnectorSession;
+import io.trino.spi.connector.SchemaTableName;
+import org.apache.iceberg.AppendFiles;
+import org.apache.iceberg.DataFile;
+import org.apache.iceberg.DataFiles;
+import org.apache.iceberg.FileScanTask;
+import org.apache.iceberg.Metrics;
+import org.apache.iceberg.MetricsConfig;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.StructLike;
+import org.apache.iceberg.Table;
+import org.apache.iceberg.Transaction;
+import org.apache.iceberg.avro.Avro;
+import org.apache.iceberg.io.CloseableIterable;
+import org.apache.iceberg.mapping.MappingUtil;
+import org.apache.iceberg.mapping.NameMapping;
+import org.apache.iceberg.types.Types;
+
+import java.io.IOException;
+import java.io.UncheckedIOException;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Set;
+import java.util.concurrent.ExecutorService;
+import java.util.stream.Stream;
+
+import static com.google.common.base.MoreObjects.firstNonNull;
+import static com.google.common.base.Verify.verify;
+import static com.google.common.collect.ImmutableSet.toImmutableSet;
+import static io.trino.plugin.base.util.Procedures.checkProcedureArgument;
+import static io.trino.plugin.hive.HiveMetadata.extractHiveStorageFormat;
+import static io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_COMMIT_ERROR;
+import static io.trino.plugin.iceberg.IcebergSessionProperties.isMergeManifestsOnWrite;
+import static io.trino.spi.StandardErrorCode.ALREADY_EXISTS;
+import static io.trino.spi.StandardErrorCode.CONSTRAINT_VIOLATION;
+import static io.trino.spi.StandardErrorCode.NOT_FOUND;
+import static io.trino.spi.StandardErrorCode.NOT_SUPPORTED;
+import static org.apache.iceberg.TableProperties.DEFAULT_NAME_MAPPING;
+import static org.apache.iceberg.mapping.NameMappingParser.toJson;
+
+public final class MigrationUtils
+{
+    private static final Logger log = Logger.get(MigrationUtils.class);
+    private static final Joiner.MapJoiner PARTITION_JOINER = Joiner.on("/").withKeyValueSeparator("=");
+
+    private static final MetricsConfig METRICS_CONFIG = MetricsConfig.getDefault();
+
+    public enum RecursiveDirectory
+    {
+        TRUE,
+        FALSE,
+        FAIL,
+        /**/
+    }
+
+    private MigrationUtils() {}
+
+    public static List<DataFile> buildDataFiles(
+            TrinoFileSystem fileSystem,
+            RecursiveDirectory recursive,
+            HiveStorageFormat format,
+            String location,
+            PartitionSpec partitionSpec,
+            Optional<StructLike> partition,
+            Schema schema)
+            throws IOException
+    {
+        // TODO: Introduce parallelism
+        FileIterator files = fileSystem.listFiles(Location.of(location));
+        ImmutableList.Builder<DataFile> dataFilesBuilder = ImmutableList.builder();
+        while (files.hasNext()) {
+            FileEntry file = files.next();
+            String fileLocation = file.location().toString();
+            String relativePath = fileLocation.substring(location.length());
+            if (relativePath.contains("/_") || relativePath.contains("/.")) {
+                continue;
+            }
+            if (recursive == RecursiveDirectory.FALSE && isRecursive(location, fileLocation)) {
+                continue;
+            }
+            if (recursive == RecursiveDirectory.FAIL && isRecursive(location, fileLocation)) {
+                throw new TrinoException(NOT_SUPPORTED, "Recursive directory must not exist when recursive_directory argument is 'fail': " + file.location());
+            }
+
+            Metrics metrics = loadMetrics(fileSystem.newInputFile(file.location(), file.length()), format, schema);
+            DataFile dataFile = buildDataFile(fileLocation, file.length(), partition, partitionSpec, format.name(), metrics);
+            dataFilesBuilder.add(dataFile);
+        }
+        List<DataFile> dataFiles = dataFilesBuilder.build();
+        log.debug("Found %d files in '%s'", dataFiles.size(), location);
+        return dataFiles;
+    }
+
+    private static boolean isRecursive(String baseLocation, String location)
+    {
+        verify(location.startsWith(baseLocation), "%s should start with %s", location, baseLocation);
+        String suffix = location.substring(baseLocation.length() + 1).replaceFirst("^/+", "");
+        return suffix.contains("/");
+    }
+
+    public static Metrics loadMetrics(TrinoInputFile file, HiveStorageFormat storageFormat, Schema schema)
+    {
+        return switch (storageFormat) {
+            case ORC -> OrcMetrics.fileMetrics(file, METRICS_CONFIG, schema);
+            case PARQUET -> parquetMetrics(file, METRICS_CONFIG, MappingUtil.create(schema));
+            case AVRO -> new Metrics(Avro.rowCount(new ForwardingInputFile(file)), null, null, null, null);
+            default -> throw new TrinoException(NOT_SUPPORTED, "Unsupported storage format: " + storageFormat);
+        };
+    }
+
+    private static Metrics parquetMetrics(TrinoInputFile file, MetricsConfig metricsConfig, NameMapping nameMapping)
+    {
+        ParquetReaderOptions options = ParquetReaderOptions.defaultOptions();
+        try (ParquetDataSource dataSource = new TrinoParquetDataSource(file, ParquetReaderOptions.defaultOptions(), new FileFormatDataSourceStats())) {
+            ParquetMetadata metadata = MetadataReader.readFooter(dataSource, options.getMaxFooterReadSize());
+            return ParquetUtil.footerMetrics(metadata, Stream.empty(), metricsConfig, nameMapping);
+        }
+        catch (IOException e) {
+            throw new UncheckedIOException("Failed to read file footer: " + file.location(), e);
+        }
+    }
+
+    public static void addFiles(
+            ConnectorSession session,
+            TrinoFileSystem fileSystem,
+            TrinoCatalog catalog,
+            SchemaTableName targetName,
+            String location,
+            HiveStorageFormat format,
+            RecursiveDirectory recursiveDirectory,
+            ExecutorService icebergScanExecutor)
+    {
+        Table table = catalog.loadTable(session, targetName);
+        PartitionSpec partitionSpec = table.spec();
+
+        checkProcedureArgument(partitionSpec.isUnpartitioned(), "The procedure does not support partitioned tables");
+
+        try {
+            List<DataFile> dataFiles = buildDataFilesFromLocation(fileSystem, recursiveDirectory, format, location, partitionSpec, Optional.empty(), table.schema());
+            addFiles(session, table, dataFiles, icebergScanExecutor);
+        }
+        catch (Exception e) {
+            throw new TrinoException(ICEBERG_COMMIT_ERROR, "Failed to add files: " + firstNonNull(e.getMessage(), e), e);
+        }
+    }
+
+    private static List<DataFile> buildDataFilesFromLocation(
+            TrinoFileSystem fileSystem,
+            RecursiveDirectory recursive,
+            HiveStorageFormat format,
+            String location,
+            PartitionSpec partitionSpec,
+            Optional<StructLike> partition,
+            Schema schema)
+            throws IOException
+    {
+        if (fileSystem.directoryExists(Location.of(location)).orElse(false)) {
+            return MigrationUtils.buildDataFiles(fileSystem, recursive, format, location, partitionSpec, partition, schema);
+        }
+
+        TrinoInputFile file = fileSystem.newInputFile(Location.of(location));
+        if (file.exists()) {
+            Metrics metrics = loadMetrics(file, format, schema);
+            return ImmutableList.of(buildDataFile(file.location().toString(), file.length(), partition, partitionSpec, format.name(), metrics));
+        }
+
+        throw new TrinoException(NOT_FOUND, "Location not found: " + location);
+    }
+
+    public static void addFilesFromTable(
+            ConnectorSession session,
+            TrinoFileSystem fileSystem,
+            HiveMetastoreFactory metastoreFactory,
+            Table targetTable,
+            io.trino.plugin.hive.metastore.Table sourceTable,
+            Map<String, String> partitionFilter,
+            RecursiveDirectory recursiveDirectory,
+            ExecutorService icebergScanExecutor)
+    {
+        HiveMetastore metastore = metastoreFactory.createMetastore(Optional.of(session.getIdentity()));
+
+        PartitionSpec partitionSpec = targetTable.spec();
+        Schema schema = targetTable.schema();
+        NameMapping nameMapping = MappingUtil.create(schema);
+
+        HiveStorageFormat storageFormat = extractHiveStorageFormat(sourceTable.getStorage().getStorageFormat());
+        String location = sourceTable.getStorage().getLocation();
+
+        try {
+            ImmutableList.Builder<DataFile> dataFilesBuilder = ImmutableList.builder();
+            if (partitionSpec.isUnpartitioned()) {
+                log.debug("Building data files from %s", location);
+                dataFilesBuilder.addAll(buildDataFiles(fileSystem, recursiveDirectory, storageFormat, location, partitionSpec, Optional.empty(), schema));
+            }
+            else {
+                List<String> partitionNames = partitionFilter == null ? ImmutableList.of() : ImmutableList.of(PARTITION_JOINER.join(partitionFilter));
+                Map<String, Optional<Partition>> partitions = metastore.getPartitionsByNames(sourceTable, partitionNames);
+                for (Map.Entry<String, Optional<Partition>> partition : partitions.entrySet()) {
+                    Storage storage = partition.getValue().orElseThrow(() -> new IllegalArgumentException("Invalid partition: " + partition.getKey())).getStorage();
+                    log.debug("Building data files from partition: %s", partition);
+                    HiveStorageFormat partitionStorageFormat = extractHiveStorageFormat(storage.getStorageFormat());
+                    StructLike partitionData = DataFiles.data(partitionSpec, partition.getKey());
+                    dataFilesBuilder.addAll(buildDataFiles(fileSystem, recursiveDirectory, partitionStorageFormat, storage.getLocation(), partitionSpec, Optional.of(partitionData), schema));
+                }
+            }
+
+            log.debug("Start new transaction");
+            Transaction transaction = targetTable.newTransaction();
+            if (!targetTable.properties().containsKey(DEFAULT_NAME_MAPPING)) {
+                log.debug("Update default name mapping property");
+                transaction.updateProperties()
+                        .set(DEFAULT_NAME_MAPPING, toJson(nameMapping))
+                        .commit();
+            }
+            addFiles(session, targetTable, dataFilesBuilder.build(), icebergScanExecutor);
+        }
+        catch (Exception e) {
+            throw new TrinoException(ICEBERG_COMMIT_ERROR, "Failed to add files: " + firstNonNull(e.getMessage(), e), e);
+        }
+    }
+
+    public static DataFile buildDataFile(String path, long length, Optional<StructLike> partition, PartitionSpec spec, String format, Metrics metrics)
+    {
+        DataFiles.Builder dataFile = DataFiles.builder(spec)
+                .withPath(path)
+                .withFormat(format)
+                .withFileSizeInBytes(length)
+                .withMetrics(metrics);
+        partition.ifPresent(dataFile::withPartition);
+        return dataFile.build();
+    }
+
+    public static void addFiles(ConnectorSession session, Table table, List<DataFile> dataFiles, ExecutorService icebergScanExecutor)
+    {
+        Schema schema = table.schema();
+        Set<Integer> requiredFields = schema.columns().stream()
+                .filter(Types.NestedField::isRequired)
+                .map(Types.NestedField::fieldId)
+                .collect(toImmutableSet());
+
+        ImmutableSet.Builder<String> existingFilesBuilder = ImmutableSet.builder();
+        try (CloseableIterable<FileScanTask> iterator = table.newScan().planFiles()) {
+            for (FileScanTask fileScanTask : iterator) {
+                DataFile dataFile = fileScanTask.file();
+                existingFilesBuilder.add(dataFile.location());
+            }
+        }
+        catch (IOException e) {
+            throw new UncheckedIOException(e);
+        }
+        Set<String> existingFiles = existingFilesBuilder.build();
+
+        if (!requiredFields.isEmpty()) {
+            for (DataFile dataFile : dataFiles) {
+                Map<Integer, Long> nullValueCounts = firstNonNull(dataFile.nullValueCounts(), Map.of());
+                for (Integer field : requiredFields) {
+                    Long nullCount = nullValueCounts.get(field);
+                    if (nullCount == null || nullCount > 0) {
+                        throw new TrinoException(CONSTRAINT_VIOLATION, "NULL value not allowed for NOT NULL column: " + schema.findField(field).name());
+                    }
+                }
+            }
+        }
+
+        try {
+            log.debug("Start new transaction");
+            Transaction transaction = table.newTransaction();
+            if (!table.properties().containsKey(DEFAULT_NAME_MAPPING)) {
+                log.debug("Update default name mapping property");
+                transaction.updateProperties()
+                        .set(DEFAULT_NAME_MAPPING, toJson(MappingUtil.create(schema)))
+                        .commit();
+            }
+            log.debug("Append data %d data files", dataFiles.size());
+            AppendFiles appendFiles = isMergeManifestsOnWrite(session) ?
transaction.newAppend() : transaction.newFastAppend(); + for (DataFile dataFile : dataFiles) { + if (existingFiles.contains(dataFile.location())) { + throw new TrinoException(ALREADY_EXISTS, "File already exists: " + dataFile.location()); + } + appendFiles.appendFile(dataFile); + } + appendFiles.scanManifestsWith(icebergScanExecutor); + appendFiles.commit(); + transaction.commitTransaction(); + log.debug("Successfully added files to %s table", table.name()); + } + catch (Exception e) { + throw new TrinoException(ICEBERG_COMMIT_ERROR, "Failed to add files: " + firstNonNull(e.getMessage(), e), e); + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/OptimizeManifestsTableProcedure.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/OptimizeManifestsTableProcedure.java new file mode 100644 index 000000000000..62604a27fa80 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/OptimizeManifestsTableProcedure.java @@ -0,0 +1,34 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.iceberg.procedure; + +import com.google.common.collect.ImmutableList; +import com.google.inject.Provider; +import io.trino.spi.connector.TableProcedureMetadata; + +import static io.trino.plugin.iceberg.procedure.IcebergTableProcedureId.OPTIMIZE_MANIFESTS; +import static io.trino.spi.connector.TableProcedureExecutionMode.coordinatorOnly; + +public class OptimizeManifestsTableProcedure + implements Provider +{ + @Override + public TableProcedureMetadata get() + { + return new TableProcedureMetadata( + OPTIMIZE_MANIFESTS.name(), + coordinatorOnly(), + ImmutableList.of()); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/RegisterTableProcedure.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/RegisterTableProcedure.java index 8c8984247c2a..92e6bf0b301c 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/RegisterTableProcedure.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/RegisterTableProcedure.java @@ -14,19 +14,19 @@ package io.trino.plugin.iceberg.procedure; import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; import com.google.inject.Inject; import com.google.inject.Provider; -import io.trino.filesystem.FileEntry; -import io.trino.filesystem.FileIterator; import io.trino.filesystem.Location; import io.trino.filesystem.TrinoFileSystem; import io.trino.filesystem.TrinoFileSystemFactory; import io.trino.plugin.iceberg.IcebergConfig; import io.trino.plugin.iceberg.catalog.TrinoCatalog; import io.trino.plugin.iceberg.catalog.TrinoCatalogFactory; -import io.trino.plugin.iceberg.fileio.ForwardingFileIo; +import io.trino.plugin.iceberg.fileio.ForwardingFileIoFactory; import io.trino.spi.TrinoException; import io.trino.spi.classloader.ThreadContextClassLoader; +import io.trino.spi.connector.ConnectorAccessControl; import 
io.trino.spi.connector.ConnectorSession; import io.trino.spi.connector.SchemaTableName; import io.trino.spi.procedure.Procedure; @@ -34,18 +34,16 @@ import org.apache.iceberg.TableMetadataParser; import java.io.IOException; +import java.io.UncheckedIOException; import java.lang.invoke.MethodHandle; -import java.util.ArrayList; -import java.util.List; import java.util.Optional; -import static com.google.common.collect.Iterables.getOnlyElement; import static io.trino.plugin.base.util.Procedures.checkProcedureArgument; import static io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_FILESYSTEM_ERROR; import static io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_INVALID_METADATA; -import static io.trino.plugin.iceberg.IcebergUtil.METADATA_FILE_EXTENSION; +import static io.trino.plugin.iceberg.IcebergSessionProperties.isUseFileSizeFromMetadata; import static io.trino.plugin.iceberg.IcebergUtil.METADATA_FOLDER_NAME; -import static io.trino.plugin.iceberg.IcebergUtil.parseVersion; +import static io.trino.plugin.iceberg.IcebergUtil.getLatestMetadataLocation; import static io.trino.spi.StandardErrorCode.INVALID_PROCEDURE_ARGUMENT; import static io.trino.spi.StandardErrorCode.PERMISSION_DENIED; import static io.trino.spi.StandardErrorCode.SCHEMA_NOT_FOUND; @@ -70,7 +68,7 @@ public class RegisterTableProcedure static { try { - REGISTER_TABLE = lookup().unreflect(RegisterTableProcedure.class.getMethod("registerTable", ConnectorSession.class, String.class, String.class, String.class, String.class)); + REGISTER_TABLE = lookup().unreflect(RegisterTableProcedure.class.getMethod("registerTable", ConnectorAccessControl.class, ConnectorSession.class, String.class, String.class, String.class, String.class)); } catch (ReflectiveOperationException e) { throw new AssertionError(e); @@ -79,13 +77,19 @@ public class RegisterTableProcedure private final TrinoCatalogFactory catalogFactory; private final TrinoFileSystemFactory fileSystemFactory; + private final ForwardingFileIoFactory fileIoFactory; private final boolean registerTableProcedureEnabled; @Inject - public RegisterTableProcedure(TrinoCatalogFactory catalogFactory, TrinoFileSystemFactory fileSystemFactory, IcebergConfig icebergConfig) + public RegisterTableProcedure( + TrinoCatalogFactory catalogFactory, + TrinoFileSystemFactory fileSystemFactory, + ForwardingFileIoFactory fileIoFactory, + IcebergConfig icebergConfig) { this.catalogFactory = requireNonNull(catalogFactory, "catalogFactory is null"); this.fileSystemFactory = requireNonNull(fileSystemFactory, "fileSystemFactory is null"); + this.fileIoFactory = requireNonNull(fileIoFactory, "fileIoFactory is null"); this.registerTableProcedureEnabled = requireNonNull(icebergConfig, "icebergConfig is null").isRegisterTableProcedureEnabled(); } @@ -104,14 +108,16 @@ public Procedure get() } public void registerTable( + ConnectorAccessControl accessControl, ConnectorSession clientSession, String schemaName, String tableName, String tableLocation, String metadataFileName) { - try (ThreadContextClassLoader ignored = new ThreadContextClassLoader(getClass().getClassLoader())) { + try (ThreadContextClassLoader ignore = new ThreadContextClassLoader(getClass().getClassLoader())) { doRegisterTable( + accessControl, clientSession, schemaName, tableName, @@ -121,6 +127,7 @@ public void registerTable( } private void doRegisterTable( + ConnectorAccessControl accessControl, ConnectorSession clientSession, String schemaName, String tableName, @@ -136,6 +143,7 @@ private void doRegisterTable( 
metadataFileName.ifPresent(RegisterTableProcedure::validateMetadataFileName); SchemaTableName schemaTableName = new SchemaTableName(schemaName, tableName); + accessControl.checkCanCreateTable(null, schemaTableName, ImmutableMap.of()); TrinoCatalog catalog = catalogFactory.create(clientSession.getIdentity()); if (!catalog.namespaceExists(clientSession, schemaTableName.getSchemaName())) { throw new TrinoException(SCHEMA_NOT_FOUND, format("Schema '%s' does not exist", schemaTableName.getSchemaName())); @@ -147,16 +155,18 @@ private void doRegisterTable( TableMetadata tableMetadata; try { // Try to read the metadata file. Invalid metadata file will throw the exception. - tableMetadata = TableMetadataParser.read(new ForwardingFileIo(fileSystem), metadataLocation); + tableMetadata = TableMetadataParser.read(fileIoFactory.create(fileSystem, isUseFileSizeFromMetadata(clientSession)), metadataLocation); } catch (RuntimeException e) { throw new TrinoException(ICEBERG_INVALID_METADATA, "Invalid metadata file: " + metadataLocation, e); } - if (!tableMetadata.location().equals(tableLocation)) { - throw new TrinoException(ICEBERG_INVALID_METADATA, """ - Table metadata file [%s] declares table location as [%s] which is differs from location provided [%s]. \ - Iceberg table can only be registered with the same location it was created with.""".formatted(metadataLocation, tableMetadata.location(), tableLocation)); + if (!locationEquivalent(tableLocation, tableMetadata.location())) { + throw new TrinoException( + ICEBERG_INVALID_METADATA, + """ + Table metadata file [%s] declares table location as [%s] which is differs from location provided [%s]. \ + Iceberg table can only be registered with the same location it was created with.""".formatted(metadataLocation, tableMetadata.location(), tableLocation)); } catalog.registerTable(clientSession, schemaTableName, tableMetadata); @@ -180,45 +190,6 @@ private static String getMetadataLocation(TrinoFileSystem fileSystem, String loc .orElseGet(() -> getLatestMetadataLocation(fileSystem, location)); } - public static String getLatestMetadataLocation(TrinoFileSystem fileSystem, String location) - { - List latestMetadataLocations = new ArrayList<>(); - String metadataDirectoryLocation = format("%s/%s", stripTrailingSlash(location), METADATA_FOLDER_NAME); - try { - int latestMetadataVersion = -1; - FileIterator fileIterator = fileSystem.listFiles(Location.of(metadataDirectoryLocation)); - while (fileIterator.hasNext()) { - FileEntry fileEntry = fileIterator.next(); - Location fileLocation = fileEntry.location(); - String fileName = fileLocation.fileName(); - if (fileName.endsWith(METADATA_FILE_EXTENSION)) { - int versionNumber = parseVersion(fileName); - if (versionNumber > latestMetadataVersion) { - latestMetadataVersion = versionNumber; - latestMetadataLocations.clear(); - latestMetadataLocations.add(fileLocation); - } - else if (versionNumber == latestMetadataVersion) { - latestMetadataLocations.add(fileLocation); - } - } - } - if (latestMetadataLocations.isEmpty()) { - throw new TrinoException(ICEBERG_INVALID_METADATA, "No versioned metadata file exists at location: " + metadataDirectoryLocation); - } - if (latestMetadataLocations.size() > 1) { - throw new TrinoException(ICEBERG_INVALID_METADATA, format( - "More than one latest metadata file found at location: %s, latest metadata files are %s", - metadataDirectoryLocation, - latestMetadataLocations)); - } - } - catch (IOException e) { - throw new TrinoException(ICEBERG_FILESYSTEM_ERROR, "Failed checking table 
location: " + location, e); - } - return getOnlyElement(latestMetadataLocations).toString(); - } - private static void validateMetadataLocation(TrinoFileSystem fileSystem, Location location) { try { @@ -226,8 +197,22 @@ private static void validateMetadataLocation(TrinoFileSystem fileSystem, Locatio throw new TrinoException(INVALID_PROCEDURE_ARGUMENT, "Metadata file does not exist: " + location); } } - catch (IOException e) { + catch (IOException | UncheckedIOException e) { throw new TrinoException(ICEBERG_FILESYSTEM_ERROR, "Invalid metadata file location: " + location, e); } } + + private static boolean locationEquivalent(String a, String b) + { + return normalizeS3Uri(a).equals(normalizeS3Uri(b)); + } + + private static String normalizeS3Uri(String tableLocation) + { + // Normalize e.g. s3a to s3, so that table can be registered using s3:// location + // even if internally it uses s3a:// paths. + String normalizedSchema = tableLocation.replaceFirst("^s3[an]://", "s3://"); + // Remove trailing slashes so that test_dir is equal to test_dir/ + return stripTrailingSlash(normalizedSchema); + } } diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/RollbackToSnapshotProcedure.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/RollbackToSnapshotProcedure.java new file mode 100644 index 000000000000..69aac1dc7c3c --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/RollbackToSnapshotProcedure.java @@ -0,0 +1,82 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package io.trino.plugin.iceberg.procedure;
+
+import com.google.common.collect.ImmutableList;
+import com.google.inject.Inject;
+import com.google.inject.Provider;
+import io.trino.plugin.iceberg.catalog.TrinoCatalogFactory;
+import io.trino.spi.classloader.ThreadContextClassLoader;
+import io.trino.spi.connector.ConnectorSession;
+import io.trino.spi.connector.SchemaTableName;
+import io.trino.spi.procedure.Procedure;
+import org.apache.iceberg.Table;
+
+import java.lang.invoke.MethodHandle;
+
+import static io.trino.plugin.base.util.Procedures.checkProcedureArgument;
+import static io.trino.spi.type.BigintType.BIGINT;
+import static io.trino.spi.type.VarcharType.VARCHAR;
+import static java.lang.invoke.MethodHandles.lookup;
+import static java.util.Objects.requireNonNull;
+
+@Deprecated
+public class RollbackToSnapshotProcedure
+        implements Provider<Procedure>
+{
+    private static final MethodHandle ROLLBACK_TO_SNAPSHOT;
+
+    static {
+        try {
+            ROLLBACK_TO_SNAPSHOT = lookup().unreflect(RollbackToSnapshotProcedure.class.getMethod("rollbackToSnapshot", ConnectorSession.class, String.class, String.class, Long.class));
+        }
+        catch (ReflectiveOperationException e) {
+            throw new AssertionError(e);
+        }
+    }
+
+    private final TrinoCatalogFactory catalogFactory;
+
+    @Inject
+    public RollbackToSnapshotProcedure(TrinoCatalogFactory catalogFactory)
+    {
+        this.catalogFactory = requireNonNull(catalogFactory, "catalogFactory is null");
+    }
+
+    @Override
+    public Procedure get()
+    {
+        return new Procedure(
+                "system",
+                "rollback_to_snapshot",
+                ImmutableList.of(
+                        new Procedure.Argument("SCHEMA", VARCHAR),
+                        new Procedure.Argument("TABLE", VARCHAR),
+                        new Procedure.Argument("SNAPSHOT_ID", BIGINT)),
+                ROLLBACK_TO_SNAPSHOT.bindTo(this));
+    }
+
+    public void rollbackToSnapshot(ConnectorSession clientSession, String schema, String table, Long snapshotId)
+    {
+        checkProcedureArgument(schema != null, "schema cannot be null");
+        checkProcedureArgument(table != null, "table cannot be null");
+        checkProcedureArgument(snapshotId != null, "snapshot_id cannot be null");
+
+        try (ThreadContextClassLoader ignore = new ThreadContextClassLoader(getClass().getClassLoader())) {
+            SchemaTableName schemaTableName = new SchemaTableName(schema, table);
+            Table icebergTable = catalogFactory.create(clientSession.getIdentity()).loadTable(clientSession, schemaTableName);
+            icebergTable.manageSnapshots().setCurrentSnapshot(snapshotId).commit();
+        }
+    }
+}
diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/RollbackToSnapshotTableProcedure.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/RollbackToSnapshotTableProcedure.java
new file mode 100644
index 000000000000..2baf59823169
--- /dev/null
+++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/RollbackToSnapshotTableProcedure.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.trino.plugin.iceberg.procedure;
+
+import com.google.common.collect.ImmutableList;
+import com.google.inject.Provider;
+import io.trino.spi.connector.TableProcedureMetadata;
+import io.trino.spi.session.PropertyMetadata;
+
+import static io.trino.plugin.iceberg.procedure.IcebergTableProcedureId.ROLLBACK_TO_SNAPSHOT;
+import static io.trino.spi.connector.TableProcedureExecutionMode.coordinatorOnly;
+import static io.trino.spi.session.PropertyMetadata.longProperty;
+
+public class RollbackToSnapshotTableProcedure
+        implements Provider<TableProcedureMetadata>
+{
+    @Override
+    public TableProcedureMetadata get()
+    {
+        return new TableProcedureMetadata(
+                ROLLBACK_TO_SNAPSHOT.name(),
+                coordinatorOnly(),
+                ImmutableList.<PropertyMetadata<?>>builder()
+                        .add(longProperty(
+                                "snapshot_id",
+                                "Snapshot ID",
+                                null,
+                                false))
+                        .build());
+    }
+}
diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/UnregisterTableProcedure.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/UnregisterTableProcedure.java
index f12ad559fcb5..f13aff8974bc 100644
--- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/UnregisterTableProcedure.java
+++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/procedure/UnregisterTableProcedure.java
@@ -75,7 +75,7 @@ public Procedure get()
 
     public void unregisterTable(ConnectorAccessControl accessControl, ConnectorSession session, String schemaName, String tableName)
     {
-        try (ThreadContextClassLoader ignored = new ThreadContextClassLoader(getClass().getClassLoader())) {
+        try (ThreadContextClassLoader ignore = new ThreadContextClassLoader(getClass().getClassLoader())) {
             doUnregisterTable(accessControl, session, schemaName, tableName);
         }
     }
diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/AllManifestsTable.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/AllManifestsTable.java
new file mode 100644
index 000000000000..fb42860e918e
--- /dev/null
+++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/AllManifestsTable.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +package io.trino.plugin.iceberg.system; + +import com.google.common.collect.ImmutableList; +import io.trino.plugin.iceberg.util.PageListBuilder; +import io.trino.spi.block.ArrayBlockBuilder; +import io.trino.spi.block.RowBlockBuilder; +import io.trino.spi.connector.ColumnMetadata; +import io.trino.spi.connector.ConnectorTableMetadata; +import io.trino.spi.connector.SchemaTableName; +import io.trino.spi.type.ArrayType; +import io.trino.spi.type.RowType; +import io.trino.spi.type.TimeZoneKey; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.Table; + +import java.util.List; +import java.util.concurrent.ExecutorService; + +import static io.trino.spi.type.BigintType.BIGINT; +import static io.trino.spi.type.BooleanType.BOOLEAN; +import static io.trino.spi.type.IntegerType.INTEGER; +import static io.trino.spi.type.VarcharType.VARCHAR; +import static java.util.Objects.requireNonNull; +import static org.apache.iceberg.MetadataTableType.ALL_MANIFESTS; + +public class AllManifestsTable + extends BaseSystemTable +{ + public AllManifestsTable(SchemaTableName tableName, Table icebergTable, ExecutorService executor) + { + super(requireNonNull(icebergTable, "icebergTable is null"), + new ConnectorTableMetadata(requireNonNull(tableName, "tableName is null"), ImmutableList.builder() + .add(new ColumnMetadata("path", VARCHAR)) + .add(new ColumnMetadata("length", BIGINT)) + .add(new ColumnMetadata("partition_spec_id", INTEGER)) + .add(new ColumnMetadata("added_snapshot_id", BIGINT)) + .add(new ColumnMetadata("added_data_files_count", INTEGER)) + .add(new ColumnMetadata("existing_data_files_count", INTEGER)) + .add(new ColumnMetadata("deleted_data_files_count", INTEGER)) + .add(new ColumnMetadata("added_delete_files_count", INTEGER)) + .add(new ColumnMetadata("existing_delete_files_count", INTEGER)) + .add(new ColumnMetadata("deleted_delete_files_count", INTEGER)) + .add(new ColumnMetadata("partition_summaries", new ArrayType(RowType.rowType( + RowType.field("contains_null", BOOLEAN), + RowType.field("contains_nan", BOOLEAN), + RowType.field("lower_bound", VARCHAR), + RowType.field("upper_bound", VARCHAR))))) + .add(new ColumnMetadata("reference_snapshot_id", BIGINT)) + .build()), + ALL_MANIFESTS, + executor); + } + + @Override + protected void addRow(PageListBuilder pagesBuilder, Row row, TimeZoneKey timeZoneKey) + { + pagesBuilder.beginRow(); + pagesBuilder.appendVarchar(row.get("path", String.class)); + pagesBuilder.appendBigint(row.get("length", Long.class)); + pagesBuilder.appendInteger(row.get("partition_spec_id", Integer.class)); + pagesBuilder.appendBigint(row.get("added_snapshot_id", Long.class)); + pagesBuilder.appendInteger(row.get("added_data_files_count", Integer.class)); + pagesBuilder.appendInteger(row.get("existing_data_files_count", Integer.class)); + pagesBuilder.appendInteger(row.get("deleted_data_files_count", Integer.class)); + pagesBuilder.appendInteger(row.get("added_delete_files_count", Integer.class)); + pagesBuilder.appendInteger(row.get("existing_delete_files_count", Integer.class)); + pagesBuilder.appendInteger(row.get("deleted_delete_files_count", Integer.class)); + //noinspection unchecked + appendPartitionSummaries((ArrayBlockBuilder) pagesBuilder.nextColumn(), row.get("partition_summaries", List.class)); + pagesBuilder.appendBigint(row.get("reference_snapshot_id", Long.class)); + pagesBuilder.endRow(); + } + + private static void appendPartitionSummaries(ArrayBlockBuilder arrayBuilder, List partitionSummaries) + { + arrayBuilder.buildEntry(elementBuilder 
-> { + for (StructLike partitionSummary : partitionSummaries) { + ((RowBlockBuilder) elementBuilder).buildEntry(fieldBuilders -> { + BOOLEAN.writeBoolean(fieldBuilders.get(0), partitionSummary.get(0, Boolean.class)); // required contains_null + Boolean containsNan = partitionSummary.get(1, Boolean.class); + if (containsNan == null) { + // This usually occurs when reading from V1 table, where contains_nan is not populated. + fieldBuilders.get(1).appendNull(); + } + else { + BOOLEAN.writeBoolean(fieldBuilders.get(1), containsNan); + } + VARCHAR.writeString(fieldBuilders.get(2), partitionSummary.get(2, String.class)); // optional lower_bound (human-readable) + VARCHAR.writeString(fieldBuilders.get(3), partitionSummary.get(3, String.class)); // optional upper_bound (human-readable) + }); + } + }); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/BaseSystemTable.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/BaseSystemTable.java new file mode 100644 index 000000000000..42a001a5bdee --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/BaseSystemTable.java @@ -0,0 +1,127 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.iceberg.system; + +import com.google.common.collect.ImmutableMap; +import io.trino.plugin.iceberg.util.PageListBuilder; +import io.trino.spi.Page; +import io.trino.spi.connector.ConnectorPageSource; +import io.trino.spi.connector.ConnectorSession; +import io.trino.spi.connector.ConnectorTableMetadata; +import io.trino.spi.connector.ConnectorTransactionHandle; +import io.trino.spi.connector.FixedPageSource; +import io.trino.spi.connector.SystemTable; +import io.trino.spi.predicate.TupleDomain; +import io.trino.spi.type.TimeZoneKey; +import org.apache.iceberg.DataTask; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.MetadataTableType; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableScan; +import org.apache.iceberg.io.CloseableIterable; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ExecutorService; + +import static com.google.common.collect.ImmutableMap.toImmutableMap; +import static com.google.common.collect.Maps.immutableEntry; +import static com.google.common.collect.Streams.mapWithIndex; +import static java.util.Objects.requireNonNull; +import static org.apache.iceberg.MetadataTableUtils.createMetadataTableInstance; + +public abstract class BaseSystemTable + implements SystemTable +{ + private final Table icebergTable; + private final ConnectorTableMetadata tableMetadata; + private final MetadataTableType metadataTableType; + private final ExecutorService executor; + + BaseSystemTable(Table icebergTable, ConnectorTableMetadata tableMetadata, MetadataTableType metadataTableType, ExecutorService executor) + { + this.icebergTable = requireNonNull(icebergTable, "icebergTable is 
null"); + this.tableMetadata = requireNonNull(tableMetadata, "tableMetadata is null"); + this.metadataTableType = requireNonNull(metadataTableType, "metadataTableType is null"); + this.executor = requireNonNull(executor, "executor is null"); + } + + @Override + public Distribution getDistribution() + { + return Distribution.SINGLE_COORDINATOR; + } + + @Override + public ConnectorTableMetadata getTableMetadata() + { + return tableMetadata; + } + + @Override + public ConnectorPageSource pageSource(ConnectorTransactionHandle transactionHandle, ConnectorSession session, TupleDomain constraint) + { + return new FixedPageSource(buildPages(tableMetadata, session, icebergTable, metadataTableType)); + } + + private List buildPages(ConnectorTableMetadata tableMetadata, ConnectorSession session, Table icebergTable, MetadataTableType metadataTableType) + { + PageListBuilder pagesBuilder = PageListBuilder.forTable(tableMetadata); + + TableScan tableScan = createMetadataTableInstance(icebergTable, metadataTableType).newScan().planWith(executor); + TimeZoneKey timeZoneKey = session.getTimeZoneKey(); + + Map columnNameToPosition = mapWithIndex(tableScan.schema().columns().stream(), + (column, position) -> immutableEntry(column.name(), Long.valueOf(position).intValue())) + .collect(toImmutableMap(Map.Entry::getKey, Map.Entry::getValue)); + + try (CloseableIterable fileScanTasks = tableScan.planFiles()) { + fileScanTasks.forEach(fileScanTask -> addRows((DataTask) fileScanTask, pagesBuilder, timeZoneKey, columnNameToPosition)); + } + catch (IOException e) { + throw new UncheckedIOException(e); + } + + return pagesBuilder.build(); + } + + private void addRows(DataTask dataTask, PageListBuilder pagesBuilder, TimeZoneKey timeZoneKey, Map columnNameToPositionInSchema) + { + try (CloseableIterable dataRows = dataTask.rows()) { + dataRows.forEach(dataTaskRow -> addRow(pagesBuilder, new Row(dataTaskRow, columnNameToPositionInSchema), timeZoneKey)); + } + catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + protected abstract void addRow(PageListBuilder pagesBuilder, Row row, TimeZoneKey timeZoneKey); + + public record Row(StructLike structLike, Map columnNameToPositionInSchema) + { + public Row + { + requireNonNull(structLike, "structLike is null"); + columnNameToPositionInSchema = ImmutableMap.copyOf(columnNameToPositionInSchema); + } + + public T get(String columnName, Class javaClass) + { + return structLike.get(columnNameToPositionInSchema.get(columnName), javaClass); + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/EntriesTable.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/EntriesTable.java new file mode 100644 index 000000000000..129b7422400b --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/EntriesTable.java @@ -0,0 +1,359 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.iceberg.system; + +import com.google.common.collect.ImmutableList; +import io.trino.plugin.iceberg.IcebergUtil; +import io.trino.plugin.iceberg.util.PageListBuilder; +import io.trino.spi.block.ArrayBlockBuilder; +import io.trino.spi.block.MapBlockBuilder; +import io.trino.spi.block.RowBlockBuilder; +import io.trino.spi.connector.ColumnMetadata; +import io.trino.spi.connector.ConnectorTableMetadata; +import io.trino.spi.connector.SchemaTableName; +import io.trino.spi.type.ArrayType; +import io.trino.spi.type.RowType; +import io.trino.spi.type.TimeZoneKey; +import io.trino.spi.type.TypeManager; +import io.trino.spi.type.TypeSignature; +import jakarta.annotation.Nullable; +import org.apache.iceberg.MetadataTableType; +import org.apache.iceberg.MetricsUtil.ReadableMetricsStruct; +import org.apache.iceberg.PartitionField; +import org.apache.iceberg.Table; +import org.apache.iceberg.transforms.Transforms; +import org.apache.iceberg.types.Conversions; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Type.PrimitiveType; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.types.Types.NestedField; +import org.apache.iceberg.util.StructProjection; + +import java.nio.ByteBuffer; +import java.util.Comparator; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.concurrent.ExecutorService; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.collect.ImmutableList.toImmutableList; +import static io.airlift.slice.Slices.wrappedBuffer; +import static io.trino.plugin.iceberg.IcebergTypes.convertIcebergValueToTrino; +import static io.trino.plugin.iceberg.IcebergUtil.primitiveFieldTypes; +import static io.trino.plugin.iceberg.util.SystemTableUtil.getAllPartitionFields; +import static io.trino.plugin.iceberg.util.SystemTableUtil.getPartitionColumnType; +import static io.trino.plugin.iceberg.util.SystemTableUtil.partitionTypes; +import static io.trino.plugin.iceberg.util.SystemTableUtil.readableMetricsToJson; +import static io.trino.spi.type.BigintType.BIGINT; +import static io.trino.spi.type.IntegerType.INTEGER; +import static io.trino.spi.type.StandardTypes.JSON; +import static io.trino.spi.type.TypeSignature.mapType; +import static io.trino.spi.type.TypeUtils.writeNativeValue; +import static io.trino.spi.type.VarbinaryType.VARBINARY; +import static io.trino.spi.type.VarcharType.VARCHAR; +import static java.util.Objects.requireNonNull; +import static org.apache.iceberg.MetadataColumns.DELETE_FILE_PATH; +import static org.apache.iceberg.MetadataColumns.DELETE_FILE_POS; +import static org.apache.iceberg.MetadataTableType.ALL_ENTRIES; +import static org.apache.iceberg.MetadataTableType.ENTRIES; + +// https://iceberg.apache.org/docs/latest/spark-queries/#all-entries +// https://iceberg.apache.org/docs/latest/spark-queries/#entries +public class EntriesTable + extends BaseSystemTable +{ + private final Map idToTypeMapping; + private final List primitiveFields; + private final Optional partitionColumn; + private final List partitionTypes; + + public EntriesTable(TypeManager typeManager, SchemaTableName tableName, Table icebergTable, MetadataTableType metadataTableType, ExecutorService executor) + { + super( + requireNonNull(icebergTable, "icebergTable is null"), + new ConnectorTableMetadata( + requireNonNull(tableName, "tableName is null"), + columns(requireNonNull(typeManager, "typeManager is null"), icebergTable)), + metadataTableType, + executor); + 
checkArgument(metadataTableType == ALL_ENTRIES || metadataTableType == ENTRIES, "Unexpected metadata table type: %s", metadataTableType); + idToTypeMapping = primitiveFieldTypes(icebergTable.schema()); + primitiveFields = IcebergUtil.primitiveFields(icebergTable.schema()).stream() + .sorted(Comparator.comparing(NestedField::name)) + .collect(toImmutableList()); + List partitionFields = getAllPartitionFields(icebergTable); + partitionColumn = getPartitionColumnType(typeManager, partitionFields, icebergTable.schema()); + partitionTypes = partitionTypes(partitionFields, idToTypeMapping); + } + + private static List columns(TypeManager typeManager, Table icebergTable) + { + return ImmutableList.builder() + .add(new ColumnMetadata("status", INTEGER)) + .add(new ColumnMetadata("snapshot_id", BIGINT)) + .add(new ColumnMetadata("sequence_number", BIGINT)) + .add(new ColumnMetadata("file_sequence_number", BIGINT)) + .add(new ColumnMetadata("data_file", RowType.from(dataFileFieldMetadata(typeManager, icebergTable)))) + .add(new ColumnMetadata("readable_metrics", typeManager.getType(new TypeSignature(JSON)))) + .build(); + } + + private static List dataFileFieldMetadata(TypeManager typeManager, Table icebergTable) + { + List partitionFields = getAllPartitionFields(icebergTable); + Optional partitionColumnType = getPartitionColumnType(typeManager, partitionFields, icebergTable.schema()); + + ImmutableList.Builder fields = ImmutableList.builder(); + fields.add(new RowType.Field(Optional.of("content"), INTEGER)); + fields.add(new RowType.Field(Optional.of("file_path"), VARCHAR)); + fields.add(new RowType.Field(Optional.of("file_format"), VARCHAR)); + fields.add(new RowType.Field(Optional.of("spec_id"), INTEGER)); + partitionColumnType.ifPresent(type -> fields.add(new RowType.Field(Optional.of("partition"), type.rowType()))); + fields.add(new RowType.Field(Optional.of("record_count"), BIGINT)); + fields.add(new RowType.Field(Optional.of("file_size_in_bytes"), BIGINT)); + fields.add(new RowType.Field(Optional.of("column_sizes"), typeManager.getType(mapType(INTEGER.getTypeSignature(), BIGINT.getTypeSignature())))); + fields.add(new RowType.Field(Optional.of("value_counts"), typeManager.getType(mapType(INTEGER.getTypeSignature(), BIGINT.getTypeSignature())))); + fields.add(new RowType.Field(Optional.of("null_value_counts"), typeManager.getType(mapType(INTEGER.getTypeSignature(), BIGINT.getTypeSignature())))); + fields.add(new RowType.Field(Optional.of("nan_value_counts"), typeManager.getType(mapType(INTEGER.getTypeSignature(), BIGINT.getTypeSignature())))); + fields.add(new RowType.Field(Optional.of("lower_bounds"), typeManager.getType(mapType(INTEGER.getTypeSignature(), VARCHAR.getTypeSignature())))); + fields.add(new RowType.Field(Optional.of("upper_bounds"), typeManager.getType(mapType(INTEGER.getTypeSignature(), VARCHAR.getTypeSignature())))); + fields.add(new RowType.Field(Optional.of("key_metadata"), VARBINARY)); + fields.add(new RowType.Field(Optional.of("split_offsets"), new ArrayType(BIGINT))); + fields.add(new RowType.Field(Optional.of("equality_ids"), new ArrayType(INTEGER))); + fields.add(new RowType.Field(Optional.of("sort_order_id"), INTEGER)); + return fields.build(); + } + + @Override + protected void addRow(PageListBuilder pagesBuilder, Row row, TimeZoneKey timeZoneKey) + { + pagesBuilder.beginRow(); + pagesBuilder.appendInteger(row.get("status", Integer.class)); + pagesBuilder.appendBigint(row.get("snapshot_id", Long.class)); + pagesBuilder.appendBigint(row.get("sequence_number", 
Long.class)); + pagesBuilder.appendBigint(row.get("file_sequence_number", Long.class)); + StructProjection dataFile = row.get("data_file", StructProjection.class); + appendDataFile((RowBlockBuilder) pagesBuilder.nextColumn(), dataFile); + ReadableMetricsStruct readableMetrics = row.get("readable_metrics", ReadableMetricsStruct.class); + String readableMetricsJson = readableMetricsToJson(readableMetrics, primitiveFields); + pagesBuilder.appendVarchar(readableMetricsJson); + pagesBuilder.endRow(); + } + + private void appendDataFile(RowBlockBuilder blockBuilder, StructProjection dataFile) + { + blockBuilder.buildEntry(fieldBuilders -> { + Integer content = dataFile.get(0, Integer.class); + INTEGER.writeLong(fieldBuilders.get(0), content); + + String filePath = dataFile.get(1, String.class); + VARCHAR.writeString(fieldBuilders.get(1), filePath); + + String fileFormat = dataFile.get(2, String.class); + VARCHAR.writeString(fieldBuilders.get(2), fileFormat); + + Integer specId = dataFile.get(3, Integer.class); + INTEGER.writeLong(fieldBuilders.get(3), Long.valueOf(specId)); + + partitionColumn.ifPresent(type -> { + StructProjection partition = dataFile.get(4, StructProjection.class); + RowBlockBuilder partitionBlockBuilder = (RowBlockBuilder) fieldBuilders.get(4); + partitionBlockBuilder.buildEntry(partitionBuilder -> { + for (int i = 0; i < type.rowType().getFields().size(); i++) { + Type icebergType = partitionTypes.get(i); + io.trino.spi.type.Type trinoType = type.rowType().getFields().get(i).getType(); + Object value = null; + Integer fieldId = type.fieldIds().get(i); + if (fieldId != null) { + value = convertIcebergValueToTrino(icebergType, partition.get(i, icebergType.typeId().javaClass())); + } + writeNativeValue(trinoType, partitionBuilder.get(i), value); + } + }); + }); + + int position = partitionColumn.isEmpty() ? 
4 : 5; + Long recordCount = dataFile.get(position, Long.class); + BIGINT.writeLong(fieldBuilders.get(position), recordCount); + + Long fileSizeInBytes = dataFile.get(++position, Long.class); + BIGINT.writeLong(fieldBuilders.get(position), fileSizeInBytes); + + //noinspection unchecked + Map columnSizes = dataFile.get(++position, Map.class); + appendIntegerBigintMap((MapBlockBuilder) fieldBuilders.get(position), columnSizes); + + //noinspection unchecked + Map valueCounts = dataFile.get(++position, Map.class); + appendIntegerBigintMap((MapBlockBuilder) fieldBuilders.get(position), valueCounts); + + //noinspection unchecked + Map nullValueCounts = dataFile.get(++position, Map.class); + appendIntegerBigintMap((MapBlockBuilder) fieldBuilders.get(position), nullValueCounts); + + //noinspection unchecked + Map nanValueCounts = dataFile.get(++position, Map.class); + appendIntegerBigintMap((MapBlockBuilder) fieldBuilders.get(position), nanValueCounts); + + switch (ContentType.of(content)) { + case DATA, EQUALITY_DELETE -> { + //noinspection unchecked + Map lowerBounds = dataFile.get(++position, Map.class); + appendIntegerVarcharMap((MapBlockBuilder) fieldBuilders.get(position), lowerBounds); + + //noinspection unchecked + Map upperBounds = dataFile.get(++position, Map.class); + appendIntegerVarcharMap((MapBlockBuilder) fieldBuilders.get(position), upperBounds); + } + case POSITION_DELETE -> { + //noinspection unchecked + Map lowerBounds = dataFile.get(++position, Map.class); + appendBoundsForPositionDelete((MapBlockBuilder) fieldBuilders.get(position), lowerBounds); + + //noinspection unchecked + Map upperBounds = dataFile.get(++position, Map.class); + appendBoundsForPositionDelete((MapBlockBuilder) fieldBuilders.get(position), upperBounds); + } + } + + ByteBuffer keyMetadata = dataFile.get(++position, ByteBuffer.class); + if (keyMetadata == null) { + fieldBuilders.get(position).appendNull(); + } + else { + VARBINARY.writeSlice(fieldBuilders.get(position), wrappedBuffer(keyMetadata)); + } + + //noinspection unchecked + List splitOffsets = dataFile.get(++position, List.class); + appendBigintArray((ArrayBlockBuilder) fieldBuilders.get(position), splitOffsets); + + switch (ContentType.of(content)) { + case DATA -> { + // data files don't have equality ids + fieldBuilders.get(++position).appendNull(); + + Integer sortOrderId = dataFile.get(++position, Integer.class); + INTEGER.writeLong(fieldBuilders.get(position), Long.valueOf(sortOrderId)); + } + case POSITION_DELETE -> { + // position delete files don't have equality ids + fieldBuilders.get(++position).appendNull(); + + // position delete files don't have sort order id + fieldBuilders.get(++position).appendNull(); + } + case EQUALITY_DELETE -> { + //noinspection unchecked + List equalityIds = dataFile.get(++position, List.class); + appendIntegerArray((ArrayBlockBuilder) fieldBuilders.get(position), equalityIds); + + Integer sortOrderId = dataFile.get(++position, Integer.class); + INTEGER.writeLong(fieldBuilders.get(position), Long.valueOf(sortOrderId)); + } + } + }); + } + + public static void appendBigintArray(ArrayBlockBuilder blockBuilder, @Nullable List values) + { + if (values == null) { + blockBuilder.appendNull(); + return; + } + blockBuilder.buildEntry(elementBuilder -> { + for (Long value : values) { + BIGINT.writeLong(elementBuilder, value); + } + }); + } + + public static void appendIntegerArray(ArrayBlockBuilder blockBuilder, @Nullable List values) + { + if (values == null) { + blockBuilder.appendNull(); + return; + } + 
blockBuilder.buildEntry(elementBuilder -> { + for (Integer value : values) { + INTEGER.writeLong(elementBuilder, value); + } + }); + } + + private static void appendIntegerBigintMap(MapBlockBuilder blockBuilder, @Nullable Map values) + { + if (values == null) { + blockBuilder.appendNull(); + return; + } + blockBuilder.buildEntry((keyBuilder, valueBuilder) -> values.forEach((key, value) -> { + INTEGER.writeLong(keyBuilder, key); + BIGINT.writeLong(valueBuilder, value); + })); + } + + private void appendIntegerVarcharMap(MapBlockBuilder blockBuilder, @Nullable Map values) + { + if (values == null) { + blockBuilder.appendNull(); + return; + } + blockBuilder.buildEntry((keyBuilder, valueBuilder) -> values.forEach((key, value) -> { + Type type = idToTypeMapping.get(key); + INTEGER.writeLong(keyBuilder, key); + VARCHAR.writeString(valueBuilder, Transforms.identity().toHumanString(type, Conversions.fromByteBuffer(type, value))); + })); + } + + private static void appendBoundsForPositionDelete(MapBlockBuilder blockBuilder, @Nullable Map values) + { + if (values == null) { + blockBuilder.appendNull(); + return; + } + + blockBuilder.buildEntry((keyBuilder, valueBuilder) -> { + INTEGER.writeLong(keyBuilder, DELETE_FILE_POS.fieldId()); + ByteBuffer pos = values.get(DELETE_FILE_POS.fieldId()); + checkArgument(pos != null, "delete file pos is null"); + VARCHAR.writeString(valueBuilder, Transforms.identity().toHumanString(Types.LongType.get(), Conversions.fromByteBuffer(Types.LongType.get(), pos))); + + INTEGER.writeLong(keyBuilder, DELETE_FILE_PATH.fieldId()); + ByteBuffer path = values.get(DELETE_FILE_PATH.fieldId()); + checkArgument(path != null, "delete file path is null"); + VARCHAR.writeString(valueBuilder, Transforms.identity().toHumanString(Types.StringType.get(), Conversions.fromByteBuffer(Types.StringType.get(), path))); + }); + } + + private enum ContentType + { + DATA, + POSITION_DELETE, + EQUALITY_DELETE; + + static ContentType of(int content) + { + checkArgument(content >= 0 && content <= 2, "Unexpected content type: %s", content); + if (content == 0) { + return DATA; + } + if (content == 1) { + return POSITION_DELETE; + } + return EQUALITY_DELETE; + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/FilesTable.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/FilesTable.java new file mode 100644 index 000000000000..29694805e4e0 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/FilesTable.java @@ -0,0 +1,173 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.iceberg.system; + +import com.google.common.collect.ImmutableList; +import io.trino.plugin.iceberg.system.files.FilesTableSplitSource; +import io.trino.spi.connector.ColumnHandle; +import io.trino.spi.connector.ColumnMetadata; +import io.trino.spi.connector.ConnectorSession; +import io.trino.spi.connector.ConnectorSplitSource; +import io.trino.spi.connector.ConnectorTableMetadata; +import io.trino.spi.connector.SchemaTableName; +import io.trino.spi.connector.SystemTable; +import io.trino.spi.predicate.TupleDomain; +import io.trino.spi.type.ArrayType; +import io.trino.spi.type.Type; +import io.trino.spi.type.TypeManager; +import io.trino.spi.type.TypeSignature; +import org.apache.iceberg.MetadataTableType; +import org.apache.iceberg.MetadataTableUtils; +import org.apache.iceberg.PartitionField; +import org.apache.iceberg.PartitionSpecParser; +import org.apache.iceberg.SchemaParser; +import org.apache.iceberg.Table; +import org.apache.iceberg.io.FileIO; + +import java.util.List; +import java.util.Map; +import java.util.Optional; + +import static com.google.common.collect.ImmutableMap.toImmutableMap; +import static io.trino.plugin.iceberg.util.SystemTableUtil.getAllPartitionFields; +import static io.trino.plugin.iceberg.util.SystemTableUtil.getPartitionColumnType; +import static io.trino.spi.type.BigintType.BIGINT; +import static io.trino.spi.type.IntegerType.INTEGER; +import static io.trino.spi.type.StandardTypes.JSON; +import static io.trino.spi.type.TypeSignature.mapType; +import static io.trino.spi.type.VarbinaryType.VARBINARY; +import static io.trino.spi.type.VarcharType.VARCHAR; +import static java.util.Objects.requireNonNull; + +public final class FilesTable + implements SystemTable +{ + public static final String CONTENT_COLUMN_NAME = "content"; + public static final String FILE_PATH_COLUMN_NAME = "file_path"; + public static final String FILE_FORMAT_COLUMN_NAME = "file_format"; + public static final String SPEC_ID_COLUMN_NAME = "spec_id"; + public static final String PARTITION_COLUMN_NAME = "partition"; + public static final String RECORD_COUNT_COLUMN_NAME = "record_count"; + public static final String FILE_SIZE_IN_BYTES_COLUMN_NAME = "file_size_in_bytes"; + public static final String COLUMN_SIZES_COLUMN_NAME = "column_sizes"; + public static final String VALUE_COUNTS_COLUMN_NAME = "value_counts"; + public static final String NULL_VALUE_COUNTS_COLUMN_NAME = "null_value_counts"; + public static final String NAN_VALUE_COUNTS_COLUMN_NAME = "nan_value_counts"; + public static final String LOWER_BOUNDS_COLUMN_NAME = "lower_bounds"; + public static final String UPPER_BOUNDS_COLUMN_NAME = "upper_bounds"; + public static final String KEY_METADATA_COLUMN_NAME = "key_metadata"; + public static final String SPLIT_OFFSETS_COLUMN_NAME = "split_offsets"; + public static final String EQUALITY_IDS_COLUMN_NAME = "equality_ids"; + public static final String SORT_ORDER_ID_COLUMN_NAME = "sort_order_id"; + public static final String READABLE_METRICS_COLUMN_NAME = "readable_metrics"; + + private static final List COLUMN_NAMES = ImmutableList.of( + CONTENT_COLUMN_NAME, + FILE_PATH_COLUMN_NAME, + FILE_FORMAT_COLUMN_NAME, + SPEC_ID_COLUMN_NAME, + PARTITION_COLUMN_NAME, + RECORD_COUNT_COLUMN_NAME, + FILE_SIZE_IN_BYTES_COLUMN_NAME, + COLUMN_SIZES_COLUMN_NAME, + VALUE_COUNTS_COLUMN_NAME, + NULL_VALUE_COUNTS_COLUMN_NAME, + NAN_VALUE_COUNTS_COLUMN_NAME, + LOWER_BOUNDS_COLUMN_NAME, + UPPER_BOUNDS_COLUMN_NAME, + KEY_METADATA_COLUMN_NAME, + SPLIT_OFFSETS_COLUMN_NAME, + 
EQUALITY_IDS_COLUMN_NAME, + SORT_ORDER_ID_COLUMN_NAME, + READABLE_METRICS_COLUMN_NAME); + + private final ConnectorTableMetadata tableMetadata; + private final Table icebergTable; + private final Optional<Long> snapshotId; + private final Optional<Type> partitionColumnType; + + public FilesTable(SchemaTableName tableName, TypeManager typeManager, Table icebergTable, Optional<Long> snapshotId) + { + this.icebergTable = requireNonNull(icebergTable, "icebergTable is null"); + this.snapshotId = requireNonNull(snapshotId, "snapshotId is null"); + + List<PartitionField> partitionFields = getAllPartitionFields(icebergTable); + this.partitionColumnType = getPartitionColumnType(typeManager, partitionFields, icebergTable.schema()) + .map(IcebergPartitionColumn::rowType); + + ImmutableList.Builder<ColumnMetadata> columns = ImmutableList.builder(); + for (String columnName : COLUMN_NAMES) { + if (columnName.equals(PARTITION_COLUMN_NAME)) { + partitionColumnType.ifPresent(type -> columns.add(new ColumnMetadata(columnName, type))); + } + else { + columns.add(new ColumnMetadata(columnName, getColumnType(columnName, typeManager))); + } + } + this.tableMetadata = new ConnectorTableMetadata(tableName, columns.build()); + } + + @Override + public Distribution getDistribution() + { + return Distribution.ALL_NODES; + } + + @Override + public ConnectorTableMetadata getTableMetadata() + { + return tableMetadata; + } + + @Override + public Optional<ConnectorSplitSource> splitSource(ConnectorSession connectorSession, TupleDomain<ColumnHandle> constraint) + { + try (FileIO fileIO = icebergTable.io()) { + return Optional.of(new FilesTableSplitSource( + icebergTable, + snapshotId, + SchemaParser.toJson(icebergTable.schema()), + SchemaParser.toJson(MetadataTableUtils.createMetadataTableInstance(icebergTable, MetadataTableType.FILES).schema()), + icebergTable.specs().entrySet().stream().collect(toImmutableMap( + Map.Entry::getKey, + partitionSpec -> PartitionSpecParser.toJson(partitionSpec.getValue()))), + partitionColumnType, + fileIO.properties())); + } + } + + public static Type getColumnType(String columnName, TypeManager typeManager) + { + return switch (columnName) { + case CONTENT_COLUMN_NAME, + SORT_ORDER_ID_COLUMN_NAME, + SPEC_ID_COLUMN_NAME -> INTEGER; + case FILE_PATH_COLUMN_NAME, + FILE_FORMAT_COLUMN_NAME -> VARCHAR; + case RECORD_COUNT_COLUMN_NAME, + FILE_SIZE_IN_BYTES_COLUMN_NAME -> BIGINT; + case COLUMN_SIZES_COLUMN_NAME, + NULL_VALUE_COUNTS_COLUMN_NAME, + VALUE_COUNTS_COLUMN_NAME, + NAN_VALUE_COUNTS_COLUMN_NAME -> typeManager.getType(mapType(INTEGER.getTypeSignature(), BIGINT.getTypeSignature())); + case LOWER_BOUNDS_COLUMN_NAME, + UPPER_BOUNDS_COLUMN_NAME -> typeManager.getType(mapType(INTEGER.getTypeSignature(), VARCHAR.getTypeSignature())); + case KEY_METADATA_COLUMN_NAME -> VARBINARY; + case SPLIT_OFFSETS_COLUMN_NAME -> new ArrayType(BIGINT); + case EQUALITY_IDS_COLUMN_NAME -> new ArrayType(INTEGER); + case READABLE_METRICS_COLUMN_NAME -> typeManager.getType(new TypeSignature(JSON)); + default -> throw new IllegalArgumentException("Unexpected value: " + columnName); + }; + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/HistoryTable.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/HistoryTable.java new file mode 100644 index 000000000000..7a741833cd4c --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/HistoryTable.java @@ -0,0 +1,91 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.iceberg.system; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSet; +import io.trino.spi.connector.ColumnMetadata; +import io.trino.spi.connector.ConnectorSession; +import io.trino.spi.connector.ConnectorTableMetadata; +import io.trino.spi.connector.ConnectorTransactionHandle; +import io.trino.spi.connector.InMemoryRecordSet; +import io.trino.spi.connector.RecordCursor; +import io.trino.spi.connector.SchemaTableName; +import io.trino.spi.connector.SystemTable; +import io.trino.spi.predicate.TupleDomain; +import io.trino.spi.type.TimeZoneKey; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.Table; +import org.apache.iceberg.util.SnapshotUtil; + +import java.util.List; +import java.util.Set; + +import static io.trino.spi.type.BigintType.BIGINT; +import static io.trino.spi.type.BooleanType.BOOLEAN; +import static io.trino.spi.type.DateTimeEncoding.packDateTimeWithZone; +import static io.trino.spi.type.TimestampWithTimeZoneType.TIMESTAMP_TZ_MILLIS; +import static java.util.Objects.requireNonNull; + +public class HistoryTable + implements SystemTable +{ + private final ConnectorTableMetadata tableMetadata; + private final Table icebergTable; + + private static final List COLUMNS = ImmutableList.builder() + .add(new ColumnMetadata("made_current_at", TIMESTAMP_TZ_MILLIS)) + .add(new ColumnMetadata("snapshot_id", BIGINT)) + .add(new ColumnMetadata("parent_id", BIGINT)) + .add(new ColumnMetadata("is_current_ancestor", BOOLEAN)) + .build(); + + public HistoryTable(SchemaTableName tableName, Table icebergTable) + { + tableMetadata = new ConnectorTableMetadata(requireNonNull(tableName, "tableName is null"), COLUMNS); + this.icebergTable = requireNonNull(icebergTable, "icebergTable is null"); + } + + @Override + public Distribution getDistribution() + { + return Distribution.SINGLE_COORDINATOR; + } + + @Override + public ConnectorTableMetadata getTableMetadata() + { + return tableMetadata; + } + + @Override + public RecordCursor cursor(ConnectorTransactionHandle transactionHandle, ConnectorSession session, TupleDomain constraint) + { + InMemoryRecordSet.Builder table = InMemoryRecordSet.builder(COLUMNS); + + Set ancestorIds = ImmutableSet.copyOf(SnapshotUtil.currentAncestorIds(icebergTable)); + TimeZoneKey timeZoneKey = session.getTimeZoneKey(); + for (Snapshot snapshot : icebergTable.snapshots()) { + long snapshotId = snapshot.snapshotId(); + + table.addRow( + packDateTimeWithZone(snapshot.timestampMillis(), timeZoneKey), + snapshotId, + snapshot.parentId(), + ancestorIds.contains(snapshotId)); + } + + return table.build().cursor(); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/IcebergPartitionColumn.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/IcebergPartitionColumn.java new file mode 100644 index 000000000000..b28c732a42d4 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/IcebergPartitionColumn.java @@ -0,0 +1,30 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * 
you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.iceberg.system; + +import com.google.common.collect.ImmutableList; +import io.trino.spi.type.RowType; + +import java.util.List; + +import static java.util.Objects.requireNonNull; + +public record IcebergPartitionColumn(RowType rowType, List fieldIds) +{ + public IcebergPartitionColumn + { + requireNonNull(rowType, "rowType is null"); + fieldIds = ImmutableList.copyOf(requireNonNull(fieldIds, "fieldIds is null")); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/IcebergTablesSystemTable.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/IcebergTablesSystemTable.java new file mode 100644 index 000000000000..a30ca23a9beb --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/IcebergTablesSystemTable.java @@ -0,0 +1,110 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.iceberg.system; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSet; +import com.google.inject.Inject; +import io.airlift.slice.Slice; +import io.trino.plugin.iceberg.catalog.TrinoCatalog; +import io.trino.plugin.iceberg.catalog.TrinoCatalogFactory; +import io.trino.spi.classloader.ThreadContextClassLoader; +import io.trino.spi.connector.ColumnMetadata; +import io.trino.spi.connector.ConnectorAccessControl; +import io.trino.spi.connector.ConnectorSession; +import io.trino.spi.connector.ConnectorSplit; +import io.trino.spi.connector.ConnectorTableMetadata; +import io.trino.spi.connector.ConnectorTransactionHandle; +import io.trino.spi.connector.InMemoryRecordSet; +import io.trino.spi.connector.InMemoryRecordSet.Builder; +import io.trino.spi.connector.RecordCursor; +import io.trino.spi.connector.SchemaTableName; +import io.trino.spi.connector.SystemTable; +import io.trino.spi.predicate.Domain; +import io.trino.spi.predicate.TupleDomain; + +import java.util.List; +import java.util.Optional; +import java.util.Set; + +import static io.trino.spi.type.VarcharType.VARCHAR; +import static java.util.Objects.requireNonNull; + +public class IcebergTablesSystemTable + implements SystemTable +{ + private static final SchemaTableName NAME = new SchemaTableName("system", "iceberg_tables"); + + private static final ConnectorTableMetadata METADATA = new ConnectorTableMetadata( + NAME, + ImmutableList.<ColumnMetadata>builder() + .add(new ColumnMetadata("table_schema", VARCHAR)) + .add(new ColumnMetadata("table_name", VARCHAR)) + .build()); + + private final TrinoCatalogFactory catalogFactory; + + @Inject + public IcebergTablesSystemTable(TrinoCatalogFactory catalogFactory) + { + this.catalogFactory = requireNonNull(catalogFactory, "catalogFactory is null"); + } + + @Override + public Distribution getDistribution() + { + return Distribution.SINGLE_COORDINATOR; + } + + @Override + public ConnectorTableMetadata getTableMetadata() + { + return METADATA; + } + + @Override + public RecordCursor cursor( + ConnectorTransactionHandle transactionHandle, + ConnectorSession connectorSession, + TupleDomain<Integer> constraint, + Set<Integer> requiredColumns, + ConnectorSplit split, + ConnectorAccessControl accessControl) + { + Builder result = InMemoryRecordSet.builder(METADATA); + + Domain schemaDomain = constraint.getDomain(0, VARCHAR); + + Optional<String> schemaFilter = tryGetSingleVarcharValue(schemaDomain); + + try (ThreadContextClassLoader ignore = new ThreadContextClassLoader(getClass().getClassLoader())) { + TrinoCatalog catalog = catalogFactory.create(connectorSession.getIdentity()); + List<SchemaTableName> icebergTables = catalog.listIcebergTables(connectorSession, schemaFilter); + Set<SchemaTableName> accessibleIcebergTables = accessControl.filterTables(null, ImmutableSet.copyOf(icebergTables)); + for (SchemaTableName table : accessibleIcebergTables) { + result.addRow(table.getSchemaName(), table.getTableName()); + } + return result.build().cursor(); + } + } + + private static Optional<String> tryGetSingleVarcharValue(Domain domain) + { + if (!domain.isSingleValue()) { + return Optional.empty(); + } + Object value = domain.getSingleValue(); + return Optional.of(((Slice) value).toStringUtf8()); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/ManifestsTable.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/ManifestsTable.java new file mode 100644 index 000000000000..e58ce9f476e6 --- /dev/null +++
b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/ManifestsTable.java @@ -0,0 +1,163 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.iceberg.system; + +import com.google.common.collect.ImmutableList; +import io.trino.plugin.iceberg.util.PageListBuilder; +import io.trino.spi.Page; +import io.trino.spi.TrinoException; +import io.trino.spi.block.ArrayBlockBuilder; +import io.trino.spi.block.BlockBuilder; +import io.trino.spi.block.RowBlockBuilder; +import io.trino.spi.connector.ColumnMetadata; +import io.trino.spi.connector.ConnectorPageSource; +import io.trino.spi.connector.ConnectorSession; +import io.trino.spi.connector.ConnectorTableMetadata; +import io.trino.spi.connector.ConnectorTransactionHandle; +import io.trino.spi.connector.FixedPageSource; +import io.trino.spi.connector.SchemaTableName; +import io.trino.spi.connector.SystemTable; +import io.trino.spi.predicate.TupleDomain; +import io.trino.spi.type.ArrayType; +import io.trino.spi.type.RowType; +import org.apache.iceberg.ManifestFile.PartitionFieldSummary; +import org.apache.iceberg.PartitionField; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.Table; +import org.apache.iceberg.types.Conversions; +import org.apache.iceberg.types.Type; + +import java.util.List; +import java.util.Map; +import java.util.Optional; + +import static io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_INVALID_METADATA; +import static io.trino.spi.type.BigintType.BIGINT; +import static io.trino.spi.type.BooleanType.BOOLEAN; +import static io.trino.spi.type.IntegerType.INTEGER; +import static io.trino.spi.type.VarcharType.VARCHAR; +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; + +public class ManifestsTable + implements SystemTable +{ + private final ConnectorTableMetadata tableMetadata; + private final Table icebergTable; + private final Optional snapshotId; + + public ManifestsTable(SchemaTableName tableName, Table icebergTable, Optional snapshotId) + { + this.icebergTable = requireNonNull(icebergTable, "icebergTable is null"); + + tableMetadata = new ConnectorTableMetadata( + tableName, + ImmutableList.builder() + .add(new ColumnMetadata("path", VARCHAR)) + .add(new ColumnMetadata("length", BIGINT)) + .add(new ColumnMetadata("partition_spec_id", INTEGER)) + .add(new ColumnMetadata("added_snapshot_id", BIGINT)) + .add(new ColumnMetadata("added_data_files_count", INTEGER)) + .add(new ColumnMetadata("added_rows_count", BIGINT)) + .add(new ColumnMetadata("existing_data_files_count", INTEGER)) + .add(new ColumnMetadata("existing_rows_count", BIGINT)) + .add(new ColumnMetadata("deleted_data_files_count", INTEGER)) + .add(new ColumnMetadata("deleted_rows_count", BIGINT)) + .add(new ColumnMetadata("partition_summaries", new ArrayType(RowType.rowType( + RowType.field("contains_null", BOOLEAN), + RowType.field("contains_nan", BOOLEAN), + RowType.field("lower_bound", VARCHAR), + RowType.field("upper_bound", 
VARCHAR))))) + .build()); + this.snapshotId = requireNonNull(snapshotId, "snapshotId is null"); + } + + @Override + public Distribution getDistribution() + { + return Distribution.SINGLE_COORDINATOR; + } + + @Override + public ConnectorTableMetadata getTableMetadata() + { + return tableMetadata; + } + + @Override + public ConnectorPageSource pageSource(ConnectorTransactionHandle transactionHandle, ConnectorSession session, TupleDomain constraint) + { + if (snapshotId.isEmpty()) { + return new FixedPageSource(ImmutableList.of()); + } + return new FixedPageSource(buildPages(tableMetadata, icebergTable, snapshotId.get())); + } + + private static List buildPages(ConnectorTableMetadata tableMetadata, Table icebergTable, long snapshotId) + { + PageListBuilder pagesBuilder = PageListBuilder.forTable(tableMetadata); + + Snapshot snapshot = icebergTable.snapshot(snapshotId); + if (snapshot == null) { + throw new TrinoException(ICEBERG_INVALID_METADATA, format("Snapshot ID [%s] does not exist for table: %s", snapshotId, icebergTable)); + } + + Map partitionSpecsById = icebergTable.specs(); + + snapshot.allManifests(icebergTable.io()).forEach(file -> { + pagesBuilder.beginRow(); + pagesBuilder.appendVarchar(file.path()); + pagesBuilder.appendBigint(file.length()); + pagesBuilder.appendInteger(file.partitionSpecId()); + pagesBuilder.appendBigint(file.snapshotId()); + pagesBuilder.appendInteger(file.addedFilesCount()); + pagesBuilder.appendBigint(file.addedRowsCount()); + pagesBuilder.appendInteger(file.existingFilesCount()); + pagesBuilder.appendBigint(file.existingRowsCount()); + pagesBuilder.appendInteger(file.deletedFilesCount()); + pagesBuilder.appendBigint(file.deletedRowsCount()); + writePartitionSummaries(pagesBuilder.nextColumn(), file.partitions(), partitionSpecsById.get(file.partitionSpecId())); + pagesBuilder.endRow(); + }); + + return pagesBuilder.build(); + } + + private static void writePartitionSummaries(BlockBuilder arrayBlockBuilder, List summaries, PartitionSpec partitionSpec) + { + ((ArrayBlockBuilder) arrayBlockBuilder).buildEntry(elementBuilder -> { + for (int i = 0; i < summaries.size(); i++) { + PartitionFieldSummary summary = summaries.get(i); + PartitionField field = partitionSpec.fields().get(i); + Type nestedType = partitionSpec.partitionType().fields().get(i).type(); + + ((RowBlockBuilder) elementBuilder).buildEntry(fieldBuilders -> { + BOOLEAN.writeBoolean(fieldBuilders.get(0), summary.containsNull()); + Boolean containsNan = summary.containsNaN(); + if (containsNan == null) { + fieldBuilders.get(1).appendNull(); + } + else { + BOOLEAN.writeBoolean(fieldBuilders.get(1), containsNan); + } + VARCHAR.writeString(fieldBuilders.get(2), field.transform().toHumanString( + nestedType, Conversions.fromByteBuffer(nestedType, summary.lowerBound()))); + VARCHAR.writeString(fieldBuilders.get(3), field.transform().toHumanString( + nestedType, Conversions.fromByteBuffer(nestedType, summary.upperBound()))); + }); + } + }); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/MetadataLogEntriesTable.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/MetadataLogEntriesTable.java new file mode 100644 index 000000000000..23888799e48e --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/MetadataLogEntriesTable.java @@ -0,0 +1,76 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.iceberg.system; + +import com.google.common.collect.ImmutableList; +import io.trino.plugin.iceberg.util.PageListBuilder; +import io.trino.spi.connector.ColumnMetadata; +import io.trino.spi.connector.ConnectorTableMetadata; +import io.trino.spi.connector.SchemaTableName; +import io.trino.spi.type.TimeZoneKey; +import org.apache.iceberg.Table; + +import java.util.concurrent.ExecutorService; + +import static io.trino.spi.type.BigintType.BIGINT; +import static io.trino.spi.type.IntegerType.INTEGER; +import static io.trino.spi.type.TimestampWithTimeZoneType.TIMESTAMP_TZ_MILLIS; +import static io.trino.spi.type.Timestamps.MICROSECONDS_PER_MILLISECOND; +import static io.trino.spi.type.VarcharType.VARCHAR; +import static java.util.Objects.requireNonNull; +import static org.apache.iceberg.MetadataTableType.METADATA_LOG_ENTRIES; + +public class MetadataLogEntriesTable + extends BaseSystemTable +{ + private static final String TIMESTAMP_COLUMN_NAME = "timestamp"; + private static final String FILE_COLUMN_NAME = "file"; + private static final String LATEST_SNAPSHOT_ID_COLUMN_NAME = "latest_snapshot_id"; + private static final String LATEST_SCHEMA_ID_COLUMN_NAME = "latest_schema_id"; + private static final String LATEST_SEQUENCE_NUMBER_COLUMN_NAME = "latest_sequence_number"; + + public MetadataLogEntriesTable(SchemaTableName tableName, Table icebergTable, ExecutorService executor) + { + super( + requireNonNull(icebergTable, "icebergTable is null"), + createConnectorTableMetadata(requireNonNull(tableName, "tableName is null")), + METADATA_LOG_ENTRIES, + executor); + } + + private static ConnectorTableMetadata createConnectorTableMetadata(SchemaTableName tableName) + { + return new ConnectorTableMetadata( + tableName, + ImmutableList.builder() + .add(new ColumnMetadata(TIMESTAMP_COLUMN_NAME, TIMESTAMP_TZ_MILLIS)) + .add(new ColumnMetadata(FILE_COLUMN_NAME, VARCHAR)) + .add(new ColumnMetadata(LATEST_SNAPSHOT_ID_COLUMN_NAME, BIGINT)) + .add(new ColumnMetadata(LATEST_SCHEMA_ID_COLUMN_NAME, INTEGER)) + .add(new ColumnMetadata(LATEST_SEQUENCE_NUMBER_COLUMN_NAME, BIGINT)) + .build()); + } + + @Override + protected void addRow(PageListBuilder pagesBuilder, Row row, TimeZoneKey timeZoneKey) + { + pagesBuilder.beginRow(); + pagesBuilder.appendTimestampTzMillis(row.get(TIMESTAMP_COLUMN_NAME, Long.class) / MICROSECONDS_PER_MILLISECOND, timeZoneKey); + pagesBuilder.appendVarchar(row.get(FILE_COLUMN_NAME, String.class)); + pagesBuilder.appendBigint(row.get(LATEST_SNAPSHOT_ID_COLUMN_NAME, Long.class)); + pagesBuilder.appendInteger(row.get(LATEST_SCHEMA_ID_COLUMN_NAME, Integer.class)); + pagesBuilder.appendBigint(row.get(LATEST_SEQUENCE_NUMBER_COLUMN_NAME, Long.class)); + pagesBuilder.endRow(); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/PartitionTable.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/PartitionsTable.java similarity index 55% rename from plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/PartitionTable.java rename to 
plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/PartitionsTable.java index 22108bd9348f..e2d64db420fc 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/PartitionTable.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/PartitionsTable.java @@ -11,11 +11,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package io.trino.plugin.iceberg; +package io.trino.plugin.iceberg.system; -import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableList; -import com.google.common.collect.ImmutableMap; +import io.trino.plugin.iceberg.IcebergStatistics; +import io.trino.plugin.iceberg.StructLikeWrapperWithFieldIdToIndex; import io.trino.spi.block.Block; import io.trino.spi.connector.ColumnMetadata; import io.trino.spi.connector.ConnectorSession; @@ -31,27 +31,21 @@ import org.apache.iceberg.DataFile; import org.apache.iceberg.FileScanTask; import org.apache.iceberg.PartitionField; -import org.apache.iceberg.Schema; -import org.apache.iceberg.StructLike; import org.apache.iceberg.Table; import org.apache.iceberg.TableScan; import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; import org.apache.iceberg.types.Types.NestedField; -import org.apache.iceberg.util.StructLikeWrapper; import java.io.IOException; import java.io.UncheckedIOException; import java.util.ArrayList; import java.util.HashMap; -import java.util.HashSet; import java.util.List; import java.util.Map; -import java.util.Objects; import java.util.Optional; import java.util.Set; -import java.util.stream.IntStream; +import java.util.concurrent.ExecutorService; import java.util.stream.Stream; import static com.google.common.collect.ImmutableList.toImmutableList; @@ -59,15 +53,18 @@ import static io.trino.plugin.iceberg.IcebergTypes.convertIcebergValueToTrino; import static io.trino.plugin.iceberg.IcebergUtil.getIdentityPartitions; import static io.trino.plugin.iceberg.IcebergUtil.primitiveFieldTypes; +import static io.trino.plugin.iceberg.StructLikeWrapperWithFieldIdToIndex.createStructLikeWrapper; import static io.trino.plugin.iceberg.TypeConverter.toTrinoType; +import static io.trino.plugin.iceberg.util.SystemTableUtil.getAllPartitionFields; +import static io.trino.plugin.iceberg.util.SystemTableUtil.getPartitionColumnType; +import static io.trino.plugin.iceberg.util.SystemTableUtil.partitionTypes; import static io.trino.spi.block.RowValueBuilder.buildRowValue; import static io.trino.spi.type.BigintType.BIGINT; import static io.trino.spi.type.TypeUtils.writeNativeValue; import static java.util.Objects.requireNonNull; import static java.util.stream.Collectors.toSet; -import static java.util.stream.Collectors.toUnmodifiableSet; -public class PartitionTable +public class PartitionsTable implements SystemTable { private final TypeManager typeManager; @@ -81,8 +78,9 @@ public class PartitionTable private final List columnMetricTypes; private final List resultTypes; private final ConnectorTableMetadata connectorTableMetadata; + private final ExecutorService executor; - public PartitionTable(SchemaTableName tableName, TypeManager typeManager, Table icebergTable, Optional snapshotId) + public PartitionsTable(SchemaTableName tableName, TypeManager typeManager, Table icebergTable, Optional snapshotId, ExecutorService executor) { this.typeManager = requireNonNull(typeManager, "typeManager is null"); this.icebergTable = 
requireNonNull(icebergTable, "icebergTable is null"); @@ -94,9 +92,9 @@ public PartitionTable(SchemaTableName tableName, TypeManager typeManager, Table ImmutableList.Builder columnMetadataBuilder = ImmutableList.builder(); - this.partitionColumnType = getPartitionColumnType(partitionFields, icebergTable.schema()); + this.partitionColumnType = getPartitionColumnType(typeManager, partitionFields, icebergTable.schema()); partitionColumnType.ifPresent(icebergPartitionColumn -> - columnMetadataBuilder.add(new ColumnMetadata("partition", icebergPartitionColumn.rowType))); + columnMetadataBuilder.add(new ColumnMetadata("partition", icebergPartitionColumn.rowType()))); Stream.of("record_count", "file_count", "total_size") .forEach(metric -> columnMetadataBuilder.add(new ColumnMetadata(metric, BIGINT))); @@ -109,7 +107,7 @@ public PartitionTable(SchemaTableName tableName, TypeManager typeManager, Table .filter(column -> !identityPartitionIds.contains(column.fieldId()) && column.type().isPrimitiveType()) .collect(toImmutableList()); - this.dataColumnType = getMetricsColumnType(this.nonPartitionPrimitiveColumns); + this.dataColumnType = getMetricsColumnType(typeManager, this.nonPartitionPrimitiveColumns); if (dataColumnType.isPresent()) { columnMetadataBuilder.add(new ColumnMetadata("data", dataColumnType.get())); this.columnMetricTypes = dataColumnType.get().getFields().stream() @@ -121,11 +119,12 @@ public PartitionTable(SchemaTableName tableName, TypeManager typeManager, Table this.columnMetricTypes = ImmutableList.of(); } - ImmutableList columnMetadata = columnMetadataBuilder.build(); + List columnMetadata = columnMetadataBuilder.build(); this.resultTypes = columnMetadata.stream() .map(ColumnMetadata::getType) .collect(toImmutableList()); this.connectorTableMetadata = new ConnectorTableMetadata(tableName, columnMetadata); + this.executor = requireNonNull(executor, "executor is null"); } @Override @@ -140,53 +139,7 @@ public ConnectorTableMetadata getTableMetadata() return connectorTableMetadata; } - private static List getAllPartitionFields(Table icebergTable) - { - Set existingColumnsIds = icebergTable.schema() - .columns().stream() - .map(NestedField::fieldId) - .collect(toUnmodifiableSet()); - - List visiblePartitionFields = icebergTable.specs() - .values().stream() - .flatMap(partitionSpec -> partitionSpec.fields().stream()) - // skip columns that were dropped - .filter(partitionField -> existingColumnsIds.contains(partitionField.sourceId())) - .collect(toImmutableList()); - - return filterOutDuplicates(visiblePartitionFields); - } - - private static List filterOutDuplicates(List visiblePartitionFields) - { - Set alreadyExistingFieldIds = new HashSet<>(); - List result = new ArrayList<>(); - for (PartitionField partitionField : visiblePartitionFields) { - if (!alreadyExistingFieldIds.contains(partitionField.fieldId())) { - alreadyExistingFieldIds.add(partitionField.fieldId()); - result.add(partitionField); - } - } - return result; - } - - private Optional getPartitionColumnType(List fields, Schema schema) - { - if (fields.isEmpty()) { - return Optional.empty(); - } - List partitionFields = fields.stream() - .map(field -> RowType.field( - field.name(), - toTrinoType(field.transform().getResultType(schema.findType(field.sourceId())), typeManager))) - .collect(toImmutableList()); - List fieldIds = fields.stream() - .map(PartitionField::fieldId) - .collect(toImmutableList()); - return Optional.of(new IcebergPartitionColumn(RowType.from(partitionFields), fieldIds)); - } - - private Optional 
getMetricsColumnType(List columns) + private static Optional getMetricsColumnType(TypeManager typeManager, List columns) { List metricColumns = columns.stream() .map(column -> RowType.field( @@ -211,7 +164,8 @@ public RecordCursor cursor(ConnectorTransactionHandle transactionHandle, Connect } TableScan tableScan = icebergTable.newScan() .useSnapshot(snapshotId.get()) - .includeColumnStats(); + .includeColumnStats() + .planWith(executor); // TODO make the cursor lazy return buildRecordCursor(getStatisticsByPartition(tableScan)); } @@ -222,14 +176,11 @@ private Map getStatistic Map partitions = new HashMap<>(); for (FileScanTask fileScanTask : fileScanTasks) { DataFile dataFile = fileScanTask.file(); - Types.StructType structType = fileScanTask.spec().partitionType(); - StructLike partitionStruct = dataFile.partition(); - StructLikeWrapper partitionWrapper = StructLikeWrapper.forType(structType).set(partitionStruct); - StructLikeWrapperWithFieldIdToIndex structLikeWrapperWithFieldIdToIndex = new StructLikeWrapperWithFieldIdToIndex(partitionWrapper, structType); + StructLikeWrapperWithFieldIdToIndex structLikeWrapperWithFieldIdToIndex = createStructLikeWrapper(fileScanTask); partitions.computeIfAbsent( structLikeWrapperWithFieldIdToIndex, - ignored -> new IcebergStatistics.Builder(icebergTable.schema().columns(), typeManager)) + ignore -> new IcebergStatistics.Builder(icebergTable.schema().columns(), typeManager)) .acceptDataFile(dataFile, fileScanTask.spec()); } @@ -243,7 +194,7 @@ private Map getStatistic private RecordCursor buildRecordCursor(Map partitionStatistics) { - List partitionTypes = partitionTypes(); + List partitionTypes = partitionTypes(partitionFields, idToTypeMapping); List> partitionColumnClass = partitionTypes.stream() .map(type -> type.typeId().javaClass()) .collect(toImmutableList()); @@ -257,18 +208,19 @@ private RecordCursor buildRecordCursor(Map { - row.add(buildRowValue(partitionColumnType.rowType, fields -> { - List partitionColumnTypes = partitionColumnType.rowType.getFields().stream() + row.add(buildRowValue(partitionColumnType.rowType(), fields -> { + List partitionColumnTypes = partitionColumnType.rowType().getFields().stream() .map(RowType.Field::getType) .collect(toImmutableList()); for (int i = 0; i < partitionColumnTypes.size(); i++) { - io.trino.spi.type.Type trinoType = partitionColumnType.rowType.getFields().get(i).getType(); + io.trino.spi.type.Type trinoType = partitionColumnType.rowType().getFields().get(i).getType(); Object value = null; - Integer fieldId = partitionColumnType.fieldIds.get(i); - if (partitionStruct.fieldIdToIndex.containsKey(fieldId)) { + Integer fieldId = partitionColumnType.fieldIds().get(i); + if (partitionStruct.getFieldIdToIndex().containsKey(fieldId)) { + Class cls = partitionColumnClass.get(i); value = convertIcebergValueToTrino( partitionTypes.get(i), - partitionStruct.structLikeWrapper.get().get(partitionStruct.fieldIdToIndex.get(fieldId), partitionColumnClass.get(i))); + partitionStruct.getStructLikeWrapper().get().get(partitionStruct.getFieldIdToIndex().get(fieldId), cls)); } writeNativeValue(trinoType, fields.get(i), value); } @@ -276,32 +228,28 @@ private RecordCursor buildRecordCursor(Map { - try { - row.add(buildRowValue(dataColumnType, fields -> { - for (int i = 0; i < columnMetricTypes.size(); i++) { - Integer fieldId = nonPartitionPrimitiveColumns.get(i).fieldId(); - Object min = icebergStatistics.getMinValues().get(fieldId); - Object max = icebergStatistics.getMaxValues().get(fieldId); - Long nullCount = 
icebergStatistics.getNullCounts().get(fieldId); - Long nanCount = icebergStatistics.getNanCounts().get(fieldId); - if (min == null && max == null && nullCount == null) { - throw new MissingColumnMetricsException(); - } - - RowType columnMetricType = columnMetricTypes.get(i); + row.add(buildRowValue(dataColumnType, fields -> { + for (int i = 0; i < columnMetricTypes.size(); i++) { + Integer fieldId = nonPartitionPrimitiveColumns.get(i).fieldId(); + Object min = icebergStatistics.minValues().get(fieldId); + Object max = icebergStatistics.maxValues().get(fieldId); + Long nullCount = icebergStatistics.nullCounts().get(fieldId); + Long nanCount = icebergStatistics.nanCounts().get(fieldId); + RowType columnMetricType = columnMetricTypes.get(i); + if (min == null && max == null && nullCount == null) { + fields.get(i).appendNull(); + } + else { columnMetricType.writeObject(fields.get(i), getColumnMetricBlock(columnMetricType, min, max, nullCount, nanCount)); } - })); - } - catch (MissingColumnMetricsException ignored) { - row.add(null); - } + } + })); }); records.add(row); @@ -310,21 +258,6 @@ private RecordCursor buildRecordCursor(Map partitionTypes() - { - ImmutableList.Builder partitionTypeBuilder = ImmutableList.builder(); - for (PartitionField partitionField : partitionFields) { - Type.PrimitiveType sourceType = idToTypeMapping.get(partitionField.sourceId()); - Type type = partitionField.transform().getResultType(sourceType); - partitionTypeBuilder.add(type); - } - return partitionTypeBuilder.build(); - } - private static Block getColumnMetricBlock(RowType columnMetricType, Object min, Object max, Long nullCount, Long nanCount) { return buildRowValue(columnMetricType, fieldBuilders -> { @@ -335,72 +268,4 @@ private static Block getColumnMetricBlock(RowType columnMetricType, Object min, writeNativeValue(fields.get(3).getType(), fieldBuilders.get(3), nanCount); }); } - - @VisibleForTesting - static class StructLikeWrapperWithFieldIdToIndex - { - private final StructLikeWrapper structLikeWrapper; - private final Map fieldIdToIndex; - - public StructLikeWrapperWithFieldIdToIndex(StructLikeWrapper structLikeWrapper, Types.StructType structType) - { - this.structLikeWrapper = structLikeWrapper; - ImmutableMap.Builder fieldIdToIndex = ImmutableMap.builder(); - List fields = structType.fields(); - IntStream.range(0, fields.size()) - .forEach(i -> fieldIdToIndex.put(fields.get(i).fieldId(), i)); - this.fieldIdToIndex = fieldIdToIndex.buildOrThrow(); - } - - @Override - public boolean equals(Object o) - { - if (this == o) { - return true; - } - if (o == null || getClass() != o.getClass()) { - return false; - } - StructLikeWrapperWithFieldIdToIndex that = (StructLikeWrapperWithFieldIdToIndex) o; - // Due to bogus implementation of equals in StructLikeWrapper https://github.com/apache/iceberg/issues/5064 order here matters. 
- return Objects.equals(fieldIdToIndex, that.fieldIdToIndex) && Objects.equals(structLikeWrapper, that.structLikeWrapper); - } - - @Override - public int hashCode() - { - return Objects.hash(fieldIdToIndex, structLikeWrapper); - } - } - - private static class IcebergPartitionColumn - { - private final RowType rowType; - private final List fieldIds; - - public IcebergPartitionColumn(RowType rowType, List fieldIds) - { - this.rowType = rowType; - this.fieldIds = fieldIds; - } - - @Override - public boolean equals(Object o) - { - if (this == o) { - return true; - } - if (o == null || getClass() != o.getClass()) { - return false; - } - IcebergPartitionColumn that = (IcebergPartitionColumn) o; - return Objects.equals(rowType, that.rowType) && Objects.equals(fieldIds, that.fieldIds); - } - - @Override - public int hashCode() - { - return Objects.hash(rowType, fieldIds); - } - } } diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/PropertiesTable.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/PropertiesTable.java new file mode 100644 index 000000000000..4154ea4d020b --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/PropertiesTable.java @@ -0,0 +1,115 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.iceberg.system; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import io.trino.plugin.iceberg.util.PageListBuilder; +import io.trino.spi.Page; +import io.trino.spi.connector.ColumnMetadata; +import io.trino.spi.connector.ConnectorPageSource; +import io.trino.spi.connector.ConnectorSession; +import io.trino.spi.connector.ConnectorTableMetadata; +import io.trino.spi.connector.ConnectorTransactionHandle; +import io.trino.spi.connector.FixedPageSource; +import io.trino.spi.connector.SchemaTableName; +import io.trino.spi.connector.SystemTable; +import io.trino.spi.predicate.TupleDomain; +import org.apache.iceberg.BaseTable; +import org.apache.iceberg.SortOrder; + +import java.util.List; +import java.util.Set; + +import static io.trino.plugin.iceberg.SortFieldUtils.toSortFields; +import static io.trino.spi.type.VarcharType.VARCHAR; +import static java.util.Objects.requireNonNull; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT_DEFAULT; +import static org.apache.iceberg.TableUtil.formatVersion; + +public class PropertiesTable + implements SystemTable +{ + private static final Set RESERVED_PROPERTIES = ImmutableSet.builder() + .add("provider") + .add("format") + .add("current-snapshot-id") + .add("location") + .add("format-version") + .build(); + + private final ConnectorTableMetadata tableMetadata; + private final BaseTable icebergTable; + + public PropertiesTable(SchemaTableName tableName, BaseTable icebergTable) + { + this.icebergTable = requireNonNull(icebergTable, "icebergTable is null"); + this.tableMetadata = new ConnectorTableMetadata(requireNonNull(tableName, "tableName is null"), + ImmutableList.builder() + .add(new ColumnMetadata("key", VARCHAR)) + .add(new ColumnMetadata("value", VARCHAR)) + .build()); + } + + @Override + public Distribution getDistribution() + { + return Distribution.SINGLE_COORDINATOR; + } + + @Override + public ConnectorTableMetadata getTableMetadata() + { + return tableMetadata; + } + + @Override + public ConnectorPageSource pageSource(ConnectorTransactionHandle transactionHandle, ConnectorSession session, TupleDomain constraint) + { + return new FixedPageSource(buildPages(tableMetadata, icebergTable)); + } + + private static List buildPages(ConnectorTableMetadata tableMetadata, BaseTable icebergTable) + { + ImmutableMap.Builder properties = ImmutableMap.builder(); + String currentSnapshotId = icebergTable.currentSnapshot() != null ? 
String.valueOf(icebergTable.currentSnapshot().snapshotId()) : "none"; + String fileFormat = icebergTable.properties().getOrDefault(DEFAULT_FILE_FORMAT, DEFAULT_FILE_FORMAT_DEFAULT); + properties.put("format", "iceberg/" + fileFormat); + properties.put("provider", "iceberg"); + properties.put("current-snapshot-id", currentSnapshotId); + properties.put("location", icebergTable.location()); + properties.put("format-version", String.valueOf(formatVersion(icebergTable))); + // TODO: Support sort column transforms (https://github.com/trinodb/trino/issues/15088) + SortOrder sortOrder = icebergTable.sortOrder(); + if (!sortOrder.isUnsorted() && sortOrder.fields().stream().allMatch(sortField -> sortField.transform().isIdentity())) { + List sortColumnNames = toSortFields(sortOrder); + properties.put("sort-order", String.join(", ", sortColumnNames)); + } + icebergTable.properties().entrySet().stream() + .filter(entry -> !RESERVED_PROPERTIES.contains(entry.getKey())) + .forEach(properties::put); + + PageListBuilder pagesBuilder = PageListBuilder.forTable(tableMetadata); + properties.buildOrThrow().entrySet().forEach(prop -> { + pagesBuilder.beginRow(); + pagesBuilder.appendVarchar(prop.getKey()); + pagesBuilder.appendVarchar(prop.getValue()); + pagesBuilder.endRow(); + }); + + return pagesBuilder.build(); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/RefsTable.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/RefsTable.java new file mode 100644 index 000000000000..d67b13a2a36f --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/RefsTable.java @@ -0,0 +1,66 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.iceberg.system; + +import com.google.common.collect.ImmutableList; +import io.trino.plugin.iceberg.util.PageListBuilder; +import io.trino.spi.connector.ColumnMetadata; +import io.trino.spi.connector.ConnectorTableMetadata; +import io.trino.spi.connector.SchemaTableName; +import io.trino.spi.type.TimeZoneKey; +import org.apache.iceberg.Table; + +import java.util.List; +import java.util.concurrent.ExecutorService; + +import static io.trino.spi.type.BigintType.BIGINT; +import static io.trino.spi.type.IntegerType.INTEGER; +import static io.trino.spi.type.VarcharType.VARCHAR; +import static java.util.Objects.requireNonNull; +import static org.apache.iceberg.MetadataTableType.REFS; + +public class RefsTable + extends BaseSystemTable +{ + private static final List COLUMNS = ImmutableList.builder() + .add(new ColumnMetadata("name", VARCHAR)) + .add(new ColumnMetadata("type", VARCHAR)) + .add(new ColumnMetadata("snapshot_id", BIGINT)) + .add(new ColumnMetadata("max_reference_age_in_ms", BIGINT)) + .add(new ColumnMetadata("min_snapshots_to_keep", INTEGER)) + .add(new ColumnMetadata("max_snapshot_age_in_ms", BIGINT)) + .build(); + + public RefsTable(SchemaTableName tableName, Table icebergTable, ExecutorService executor) + { + super( + requireNonNull(icebergTable, "icebergTable is null"), + new ConnectorTableMetadata(requireNonNull(tableName, "tableName is null"), COLUMNS), + REFS, + executor); + } + + @Override + protected void addRow(PageListBuilder pagesBuilder, Row row, TimeZoneKey timeZoneKey) + { + pagesBuilder.beginRow(); + pagesBuilder.appendVarchar(row.get("name", String.class)); + pagesBuilder.appendVarchar(row.get("type", String.class)); + pagesBuilder.appendBigint(row.get("snapshot_id", Long.class)); + pagesBuilder.appendBigint(row.get("max_reference_age_in_ms", Long.class)); + pagesBuilder.appendInteger(row.get("min_snapshots_to_keep", Integer.class)); + pagesBuilder.appendBigint(row.get("max_snapshot_age_in_ms", Long.class)); + pagesBuilder.endRow(); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/SnapshotsTable.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/SnapshotsTable.java new file mode 100644 index 000000000000..38797ccae994 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/SnapshotsTable.java @@ -0,0 +1,84 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.iceberg.system; + +import com.google.common.collect.ImmutableList; +import io.trino.plugin.iceberg.util.PageListBuilder; +import io.trino.spi.connector.ColumnMetadata; +import io.trino.spi.connector.ConnectorTableMetadata; +import io.trino.spi.connector.SchemaTableName; +import io.trino.spi.type.TimeZoneKey; +import io.trino.spi.type.TypeManager; +import io.trino.spi.type.TypeSignature; +import org.apache.iceberg.Table; + +import java.util.Map; +import java.util.concurrent.ExecutorService; + +import static io.trino.spi.type.BigintType.BIGINT; +import static io.trino.spi.type.TimestampWithTimeZoneType.TIMESTAMP_TZ_MILLIS; +import static io.trino.spi.type.Timestamps.MICROSECONDS_PER_MILLISECOND; +import static io.trino.spi.type.VarcharType.VARCHAR; +import static java.util.Objects.requireNonNull; +import static org.apache.iceberg.MetadataTableType.SNAPSHOTS; + +public class SnapshotsTable + extends BaseSystemTable +{ + private static final String COMMITTED_AT_COLUMN_NAME = "committed_at"; + private static final String SNAPSHOT_ID_COLUMN_NAME = "snapshot_id"; + private static final String PARENT_ID_COLUMN_NAME = "parent_id"; + private static final String OPERATION_COLUMN_NAME = "operation"; + private static final String MANIFEST_LIST_COLUMN_NAME = "manifest_list"; + private static final String SUMMARY_COLUMN_NAME = "summary"; + + public SnapshotsTable(SchemaTableName tableName, TypeManager typeManager, Table icebergTable, ExecutorService executor) + { + super( + requireNonNull(icebergTable, "icebergTable is null"), + createConnectorTableMetadata( + requireNonNull(tableName, "tableName is null"), + requireNonNull(typeManager, "typeManager is null")), + SNAPSHOTS, + executor); + } + + private static ConnectorTableMetadata createConnectorTableMetadata(SchemaTableName tableName, TypeManager typeManager) + { + return new ConnectorTableMetadata( + tableName, + ImmutableList.builder() + .add(new ColumnMetadata(COMMITTED_AT_COLUMN_NAME, TIMESTAMP_TZ_MILLIS)) + .add(new ColumnMetadata(SNAPSHOT_ID_COLUMN_NAME, BIGINT)) + .add(new ColumnMetadata(PARENT_ID_COLUMN_NAME, BIGINT)) + .add(new ColumnMetadata(OPERATION_COLUMN_NAME, VARCHAR)) + .add(new ColumnMetadata(MANIFEST_LIST_COLUMN_NAME, VARCHAR)) + .add(new ColumnMetadata(SUMMARY_COLUMN_NAME, typeManager.getType(TypeSignature.mapType(VARCHAR.getTypeSignature(), VARCHAR.getTypeSignature())))) + .build()); + } + + @Override + protected void addRow(PageListBuilder pagesBuilder, Row row, TimeZoneKey timeZoneKey) + { + pagesBuilder.beginRow(); + pagesBuilder.appendTimestampTzMillis(row.get(COMMITTED_AT_COLUMN_NAME, Long.class) / MICROSECONDS_PER_MILLISECOND, timeZoneKey); + pagesBuilder.appendBigint(row.get(SNAPSHOT_ID_COLUMN_NAME, Long.class)); + pagesBuilder.appendBigint(row.get(PARENT_ID_COLUMN_NAME, Long.class)); + pagesBuilder.appendVarchar(row.get(OPERATION_COLUMN_NAME, String.class)); + pagesBuilder.appendVarchar(row.get(MANIFEST_LIST_COLUMN_NAME, String.class)); + //noinspection unchecked + pagesBuilder.appendVarcharVarcharMap(row.get(SUMMARY_COLUMN_NAME, Map.class)); + pagesBuilder.endRow(); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/files/FilesTablePageSource.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/files/FilesTablePageSource.java new file mode 100644 index 000000000000..cba77dfab181 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/files/FilesTablePageSource.java @@ -0,0 +1,349 @@ +/* + * Licensed under the 
Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.iceberg.system.files; + +import com.google.common.io.Closer; +import io.airlift.slice.Slices; +import io.trino.filesystem.TrinoFileSystem; +import io.trino.plugin.iceberg.IcebergUtil; +import io.trino.plugin.iceberg.fileio.ForwardingFileIoFactory; +import io.trino.plugin.iceberg.system.FilesTable; +import io.trino.plugin.iceberg.system.IcebergPartitionColumn; +import io.trino.spi.Page; +import io.trino.spi.PageBuilder; +import io.trino.spi.block.ArrayBlockBuilder; +import io.trino.spi.block.BlockBuilder; +import io.trino.spi.block.MapBlockBuilder; +import io.trino.spi.block.RowBlockBuilder; +import io.trino.spi.connector.ConnectorPageSource; +import io.trino.spi.connector.SourcePage; +import io.trino.spi.type.RowType; +import io.trino.spi.type.TypeManager; +import org.apache.iceberg.ContentFile; +import org.apache.iceberg.ManifestReader; +import org.apache.iceberg.MetricsUtil; +import org.apache.iceberg.PartitionField; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.PartitionSpecParser; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SchemaParser; +import org.apache.iceberg.transforms.Transforms; +import org.apache.iceberg.types.Conversions; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Type.PrimitiveType; +import org.apache.iceberg.types.Types; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.nio.ByteBuffer; +import java.util.Comparator; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.OptionalLong; +import java.util.function.BiConsumer; +import java.util.function.Supplier; + +import static com.google.common.collect.ImmutableList.toImmutableList; +import static com.google.common.collect.ImmutableMap.toImmutableMap; +import static com.google.common.collect.Maps.immutableEntry; +import static com.google.common.collect.Streams.mapWithIndex; +import static io.trino.plugin.iceberg.IcebergTypes.convertIcebergValueToTrino; +import static io.trino.plugin.iceberg.IcebergUtil.primitiveFieldTypes; +import static io.trino.plugin.iceberg.IcebergUtil.readerForManifest; +import static io.trino.plugin.iceberg.system.FilesTable.COLUMN_SIZES_COLUMN_NAME; +import static io.trino.plugin.iceberg.system.FilesTable.CONTENT_COLUMN_NAME; +import static io.trino.plugin.iceberg.system.FilesTable.EQUALITY_IDS_COLUMN_NAME; +import static io.trino.plugin.iceberg.system.FilesTable.FILE_FORMAT_COLUMN_NAME; +import static io.trino.plugin.iceberg.system.FilesTable.FILE_PATH_COLUMN_NAME; +import static io.trino.plugin.iceberg.system.FilesTable.FILE_SIZE_IN_BYTES_COLUMN_NAME; +import static io.trino.plugin.iceberg.system.FilesTable.KEY_METADATA_COLUMN_NAME; +import static io.trino.plugin.iceberg.system.FilesTable.LOWER_BOUNDS_COLUMN_NAME; +import static io.trino.plugin.iceberg.system.FilesTable.NAN_VALUE_COUNTS_COLUMN_NAME; +import static 
io.trino.plugin.iceberg.system.FilesTable.NULL_VALUE_COUNTS_COLUMN_NAME; +import static io.trino.plugin.iceberg.system.FilesTable.PARTITION_COLUMN_NAME; +import static io.trino.plugin.iceberg.system.FilesTable.READABLE_METRICS_COLUMN_NAME; +import static io.trino.plugin.iceberg.system.FilesTable.RECORD_COUNT_COLUMN_NAME; +import static io.trino.plugin.iceberg.system.FilesTable.SORT_ORDER_ID_COLUMN_NAME; +import static io.trino.plugin.iceberg.system.FilesTable.SPEC_ID_COLUMN_NAME; +import static io.trino.plugin.iceberg.system.FilesTable.SPLIT_OFFSETS_COLUMN_NAME; +import static io.trino.plugin.iceberg.system.FilesTable.UPPER_BOUNDS_COLUMN_NAME; +import static io.trino.plugin.iceberg.system.FilesTable.VALUE_COUNTS_COLUMN_NAME; +import static io.trino.plugin.iceberg.system.FilesTable.getColumnType; +import static io.trino.plugin.iceberg.util.SystemTableUtil.getAllPartitionFields; +import static io.trino.plugin.iceberg.util.SystemTableUtil.getPartitionColumnType; +import static io.trino.plugin.iceberg.util.SystemTableUtil.partitionTypes; +import static io.trino.plugin.iceberg.util.SystemTableUtil.readableMetricsToJson; +import static io.trino.spi.type.BigintType.BIGINT; +import static io.trino.spi.type.IntegerType.INTEGER; +import static io.trino.spi.type.TypeUtils.writeNativeValue; +import static io.trino.spi.type.VarbinaryType.VARBINARY; +import static io.trino.spi.type.VarcharType.VARCHAR; +import static java.util.Objects.requireNonNull; +import static org.apache.iceberg.MetricsUtil.readableMetricsStruct; + +public final class FilesTablePageSource + implements ConnectorPageSource +{ + private final Closer closer; + private final Schema schema; + private final Schema metadataSchema; + private final Map<Integer, PrimitiveType> idToTypeMapping; + private final List<PartitionField> partitionFields; + private final Optional<IcebergPartitionColumn> partitionColumnType; + private final List<Types.NestedField> primitiveFields; + private final Iterator<? extends ContentFile<?>> contentIterator; + private final Map<String, Integer> columnNameToIndex; + private final PageBuilder pageBuilder; + private final long completedBytes; + private long completedPositions; + private long readTimeNanos; + private boolean closed; + + public FilesTablePageSource( + TypeManager typeManager, + TrinoFileSystem trinoFileSystem, + ForwardingFileIoFactory fileIoFactory, + List<String> requiredColumns, + FilesTableSplit split) + { + this.closer = Closer.create(); + this.schema = SchemaParser.fromJson(requireNonNull(split.schemaJson(), "schema is null")); + this.metadataSchema = SchemaParser.fromJson(requireNonNull(split.metadataTableJson(), "metadataSchema is null")); + this.idToTypeMapping = primitiveFieldTypes(schema); + Map<Integer, PartitionSpec> specs = split.partitionSpecsByIdJson().entrySet().stream().collect(toImmutableMap( + Map.Entry::getKey, + entry -> PartitionSpecParser.fromJson(SchemaParser.fromJson(split.schemaJson()), entry.getValue()))); + this.partitionFields = getAllPartitionFields(schema, specs); + this.partitionColumnType = getPartitionColumnType(typeManager, partitionFields, schema); + this.primitiveFields = IcebergUtil.primitiveFields(schema).stream() + .sorted(Comparator.comparing(Types.NestedField::name)) + .collect(toImmutableList()); + ManifestReader<? extends ContentFile<?>> manifestReader = closer.register(readerForManifest(split.manifestFile(), fileIoFactory.create(trinoFileSystem), specs)); + // TODO figure out why selecting the specific column causes null to be returned for offset_splits + this.contentIterator = closer.register(requireNonNull(manifestReader, "manifestReader is null").iterator()); + this.pageBuilder = new PageBuilder(requiredColumns.stream().map(column
-> { + if (column.equals(PARTITION_COLUMN_NAME)) { + return split.partitionColumnType().orElseThrow(); + } + return getColumnType(column, typeManager); + }).collect(toImmutableList())); + this.columnNameToIndex = mapWithIndex(requiredColumns.stream(), + (columnName, position) -> immutableEntry(columnName, Long.valueOf(position).intValue())) + .collect(toImmutableMap(Map.Entry::getKey, Map.Entry::getValue)); + this.completedBytes = split.manifestFile().length(); + this.completedPositions = 0L; + this.readTimeNanos = 0L; + this.closed = false; + } + + @Override + public long getCompletedBytes() + { + return completedBytes; + } + + @Override + public OptionalLong getCompletedPositions() + { + return OptionalLong.of(completedPositions); + } + + @Override + public long getReadTimeNanos() + { + return readTimeNanos; + } + + @Override + public boolean isFinished() + { + return closed; + } + + @Override + public SourcePage getNextSourcePage() + { + if (closed) { + return null; + } + + while (contentIterator.hasNext() && !pageBuilder.isFull()) { + pageBuilder.declarePosition(); + long start = System.nanoTime(); + ContentFile contentFile = contentIterator.next(); + + // content + writeValueOrNull(pageBuilder, CONTENT_COLUMN_NAME, () -> contentFile.content().id(), INTEGER::writeInt); + // file_path + writeValueOrNull(pageBuilder, FILE_PATH_COLUMN_NAME, contentFile::location, VARCHAR::writeString); + // file_format + writeValueOrNull(pageBuilder, FILE_FORMAT_COLUMN_NAME, () -> contentFile.format().toString(), VARCHAR::writeString); + // spec_id + writeValueOrNull(pageBuilder, SPEC_ID_COLUMN_NAME, contentFile::specId, INTEGER::writeInt); + // partitions + if (partitionColumnType.isPresent() && columnNameToIndex.containsKey(FilesTable.PARTITION_COLUMN_NAME)) { + List partitionTypes = partitionTypes(partitionFields, idToTypeMapping); + List partitionColumnTypes = partitionColumnType.orElseThrow().rowType().getFields().stream() + .map(RowType.Field::getType) + .collect(toImmutableList()); + + if (pageBuilder.getBlockBuilder(columnNameToIndex.get(FilesTable.PARTITION_COLUMN_NAME)) instanceof RowBlockBuilder rowBlockBuilder) { + rowBlockBuilder.buildEntry(fields -> { + for (int i = 0; i < partitionColumnTypes.size(); i++) { + Type type = partitionTypes.get(i); + io.trino.spi.type.Type trinoType = partitionColumnType.get().rowType().getFields().get(i).getType(); + Object value = null; + Integer fieldId = partitionColumnType.get().fieldIds().get(i); + if (fieldId != null) { + value = convertIcebergValueToTrino(type, contentFile.partition().get(i, type.typeId().javaClass())); + } + writeNativeValue(trinoType, fields.get(i), value); + } + }); + } + } + // record_count + writeValueOrNull(pageBuilder, RECORD_COUNT_COLUMN_NAME, contentFile::recordCount, BIGINT::writeLong); + // file_size_in_bytes + writeValueOrNull(pageBuilder, FILE_SIZE_IN_BYTES_COLUMN_NAME, contentFile::fileSizeInBytes, BIGINT::writeLong); + // column_sizes + writeValueOrNull(pageBuilder, COLUMN_SIZES_COLUMN_NAME, contentFile::columnSizes, + FilesTablePageSource::writeIntegerBigintInMap); + // value_counts + writeValueOrNull(pageBuilder, VALUE_COUNTS_COLUMN_NAME, contentFile::valueCounts, + FilesTablePageSource::writeIntegerBigintInMap); + // null_value_counts + writeValueOrNull(pageBuilder, NULL_VALUE_COUNTS_COLUMN_NAME, contentFile::nullValueCounts, + FilesTablePageSource::writeIntegerBigintInMap); + // nan_value_counts + writeValueOrNull(pageBuilder, NAN_VALUE_COUNTS_COLUMN_NAME, contentFile::nanValueCounts, + 
FilesTablePageSource::writeIntegerBigintInMap); + // lower_bounds + writeValueOrNull(pageBuilder, LOWER_BOUNDS_COLUMN_NAME, contentFile::lowerBounds, + this::writeIntegerVarcharInMap); + // upper_bounds + writeValueOrNull(pageBuilder, UPPER_BOUNDS_COLUMN_NAME, contentFile::upperBounds, + this::writeIntegerVarcharInMap); + // key_metadata + writeValueOrNull(pageBuilder, KEY_METADATA_COLUMN_NAME, contentFile::keyMetadata, + (blkBldr, value) -> VARBINARY.writeSlice(blkBldr, Slices.wrappedHeapBuffer(value))); + // split_offset + writeValueOrNull(pageBuilder, SPLIT_OFFSETS_COLUMN_NAME, contentFile::splitOffsets, + FilesTablePageSource::writeLongInArray); + // equality_ids + writeValueOrNull(pageBuilder, EQUALITY_IDS_COLUMN_NAME, contentFile::equalityFieldIds, + FilesTablePageSource::writeIntegerInArray); + // sort_order_id + writeValueOrNull(pageBuilder, SORT_ORDER_ID_COLUMN_NAME, contentFile::sortOrderId, + (blkBldr, value) -> INTEGER.writeLong(blkBldr, value)); + // readable_metrics + writeValueOrNull(pageBuilder, READABLE_METRICS_COLUMN_NAME, () -> metadataSchema.findField(MetricsUtil.READABLE_METRICS), + (blkBldr, value) -> VARCHAR.writeString(blkBldr, readableMetricsToJson(readableMetricsStruct(schema, contentFile, value.type().asStructType()), primitiveFields))); + readTimeNanos += System.nanoTime() - start; + } + + if (!pageBuilder.isEmpty()) { + Page page = pageBuilder.build(); + completedPositions += page.getPositionCount(); + pageBuilder.reset(); + return SourcePage.create(page); + } + + close(); + return null; + } + + @Override + public long getMemoryUsage() + { + return pageBuilder.getRetainedSizeInBytes(); + } + + @Override + public void close() + { + if (closed) { + return; + } + closed = true; + + try { + closer.close(); + } + catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + private void writeValueOrNull(PageBuilder pageBuilder, String columnName, Supplier valueSupplier, BiConsumer valueWriter) + { + Integer channel = columnNameToIndex.get(columnName); + if (channel == null) { + return; + } + + BlockBuilder blockBuilder = pageBuilder.getBlockBuilder(channel); + T value = valueSupplier.get(); + if (value == null) { + blockBuilder.appendNull(); + } + else { + valueWriter.accept(blockBuilder, value); + } + } + + private static void writeLongInArray(BlockBuilder blockBuilder, List values) + { + if (blockBuilder instanceof ArrayBlockBuilder arrayBlockBuilder) { + arrayBlockBuilder.buildEntry(builder -> + values.forEach(value -> BIGINT.writeLong(builder, value))); + } + } + + private static void writeIntegerInArray(BlockBuilder blockBuilder, List values) + { + if (blockBuilder instanceof ArrayBlockBuilder arrayBlockBuilder) { + arrayBlockBuilder.buildEntry(builder -> + values.forEach(value -> INTEGER.writeInt(builder, value))); + } + } + + private static void writeIntegerBigintInMap(BlockBuilder blockBuilder, Map values) + { + if (blockBuilder instanceof MapBlockBuilder mapBlockBuilder) { + mapBlockBuilder.buildEntry((keyBuilder, valueBuilder) -> values.forEach((key, value) -> { + INTEGER.writeInt(keyBuilder, key); + BIGINT.writeLong(valueBuilder, value); + })); + } + } + + private void writeIntegerVarcharInMap(BlockBuilder blockBuilder, Map values) + { + if (blockBuilder instanceof MapBlockBuilder mapBlockBuilder) { + mapBlockBuilder.buildEntry((keyBuilder, valueBuilder) -> { + values.forEach((key, value) -> { + if (idToTypeMapping.containsKey(key)) { + INTEGER.writeInt(keyBuilder, key); + VARCHAR.writeString(valueBuilder, Transforms.identity().toHumanString( + 
idToTypeMapping.get(key), + Conversions.fromByteBuffer(idToTypeMapping.get(key), value))); + } + }); + }); + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/files/FilesTableSplit.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/files/FilesTableSplit.java new file mode 100644 index 000000000000..41d0a1c28f9e --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/files/FilesTableSplit.java @@ -0,0 +1,68 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.iceberg.system.files; + +import io.airlift.slice.SizeOf; +import io.trino.spi.HostAddress; +import io.trino.spi.connector.ConnectorSplit; +import io.trino.spi.type.Type; + +import java.util.List; +import java.util.Map; +import java.util.Optional; + +import static io.airlift.slice.SizeOf.estimatedSizeOf; +import static io.airlift.slice.SizeOf.instanceSize; + +public record FilesTableSplit( + TrinoManifestFile manifestFile, + String schemaJson, + String metadataTableJson, + Map partitionSpecsByIdJson, + Optional partitionColumnType, + Map fileIoProperties) + implements ConnectorSplit +{ + private static final int INSTANCE_SIZE = instanceSize(FilesTableSplit.class); + + @Override + public long getRetainedSizeInBytes() + { + // partitionColumnType is not accounted for as Type instances are cached (by TypeRegistry) and shared + return INSTANCE_SIZE + + manifestFile.getRetainedSizeInBytes() + + estimatedSizeOf(schemaJson) + + estimatedSizeOf(metadataTableJson) + + estimatedSizeOf(partitionSpecsByIdJson, SizeOf::sizeOf, SizeOf::estimatedSizeOf) + + estimatedSizeOf(fileIoProperties, SizeOf::estimatedSizeOf, SizeOf::estimatedSizeOf); + } + + @Override + public boolean isRemotelyAccessible() + { + return true; + } + + @Override + public List getAddresses() + { + return List.of(); + } + + @Override + public Object getInfo() + { + throw new UnsupportedOperationException("Unimplemented method 'getInfo'"); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/files/FilesTableSplitSource.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/files/FilesTableSplitSource.java new file mode 100644 index 000000000000..adb1244d7e96 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/files/FilesTableSplitSource.java @@ -0,0 +1,98 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
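A quick aside on the memory-accounting convention used by FilesTableSplit above: airlift's SizeOf helpers estimate the retained heap of each field, and the split sums them on top of the shallow instance size. A minimal, self-contained sketch of the same pattern follows; the record name and its fields are hypothetical, not part of this change.

import io.airlift.slice.SizeOf;

import java.util.Map;

import static io.airlift.slice.SizeOf.estimatedSizeOf;
import static io.airlift.slice.SizeOf.instanceSize;

// Hypothetical record illustrating airlift SizeOf-based retained-size accounting
public record ExampleSplit(String schemaJson, Map<Integer, String> specsByIdJson)
{
    private static final int INSTANCE_SIZE = instanceSize(ExampleSplit.class);

    public long getRetainedSizeInBytes()
    {
        // shallow object size plus the estimated deep size of every retained field
        return INSTANCE_SIZE
                + estimatedSizeOf(schemaJson)
                + estimatedSizeOf(specsByIdJson, SizeOf::sizeOf, SizeOf::estimatedSizeOf);
    }
}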
+ */ +package io.trino.plugin.iceberg.system.files; + +import io.trino.spi.connector.ConnectorSplit; +import io.trino.spi.connector.ConnectorSplitSource; +import io.trino.spi.type.Type; +import org.apache.iceberg.ManifestFile; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableScan; +import org.apache.iceberg.io.FileIO; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.concurrent.CompletableFuture; + +import static java.util.Objects.requireNonNull; +import static java.util.concurrent.CompletableFuture.completedFuture; + +public final class FilesTableSplitSource + implements ConnectorSplitSource +{ + private final Table icebergTable; + private final Optional snapshotId; + private final String schemaJson; + private final String metadataSchemaJson; + private final Map partitionSpecsByIdJson; + private final Optional partitionColumnType; + private final Map fileIoProperties; + private boolean finished; + + public FilesTableSplitSource( + Table icebergTable, + Optional snapshotId, + String schemaJson, + String metadataSchemaJson, + Map partitionSpecsByIdJson, + Optional partitionColumnType, + Map fileIoProperties) + { + this.icebergTable = requireNonNull(icebergTable, "icebergTable is null"); + this.snapshotId = requireNonNull(snapshotId, "snapshotId is null"); + this.schemaJson = requireNonNull(schemaJson, "schemaJson is null"); + this.metadataSchemaJson = requireNonNull(metadataSchemaJson, "metadataSchemaJson is null"); + this.partitionSpecsByIdJson = requireNonNull(partitionSpecsByIdJson, "partitionSpecsByIdJson is null"); + this.partitionColumnType = requireNonNull(partitionColumnType, "partitionColumnType is null"); + this.fileIoProperties = requireNonNull(fileIoProperties, "fileIoProperties is null"); + this.finished = false; + } + + @Override + public CompletableFuture getNextBatch(int maxSize) + { + TableScan scan = icebergTable.newScan(); + snapshotId.ifPresent(scan::useSnapshot); + List splits = new ArrayList<>(); + + try (FileIO fileIO = icebergTable.io()) { + for (ManifestFile manifestFile : scan.snapshot().allManifests(fileIO)) { + splits.add(new FilesTableSplit( + TrinoManifestFile.from(manifestFile), + schemaJson, + metadataSchemaJson, + partitionSpecsByIdJson, + partitionColumnType, + fileIoProperties)); + } + } + + finished = true; + return completedFuture(new ConnectorSplitBatch(splits, true)); + } + + @Override + public void close() + { + // do nothing + } + + @Override + public boolean isFinished() + { + return finished; + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/files/TrinoManifestFile.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/files/TrinoManifestFile.java new file mode 100644 index 000000000000..cbfd71215d32 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/system/files/TrinoManifestFile.java @@ -0,0 +1,177 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
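One caveat worth noting about the Iceberg scan API used in FilesTableSplitSource.getNextBatch above: TableScan is immutable, and refinements such as useSnapshot return a new scan rather than mutating the receiver, so the refined instance has to be captured and used afterwards. A minimal sketch of pinning a snapshot; the table and snapshotId parameters are placeholders.

import org.apache.iceberg.Table;
import org.apache.iceberg.TableScan;

import java.util.Optional;

public final class ScanSnapshotExample
{
    private ScanSnapshotExample() {}

    public static TableScan scanAtSnapshot(Table table, Optional<Long> snapshotId)
    {
        TableScan scan = table.newScan();
        if (snapshotId.isPresent()) {
            // useSnapshot returns a new immutable scan; reassign to keep the refinement
            scan = scan.useSnapshot(snapshotId.get());
        }
        return scan;
    }
}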
+ */ +package io.trino.plugin.iceberg.system.files; + +import org.apache.iceberg.ManifestContent; +import org.apache.iceberg.ManifestFile; + +import java.util.List; + +import static io.airlift.slice.SizeOf.estimatedSizeOf; +import static io.airlift.slice.SizeOf.instanceSize; +import static io.airlift.slice.SizeOf.sizeOf; + +public record TrinoManifestFile( + String path, + long length, + int partitionSpecId, + ManifestContent content, + long sequenceNumber, + long minSequenceNumber, + Long snapshotId, + Integer addedFilesCount, + Integer existingFilesCount, + Integer deletedFilesCount, + Long addedRowsCount, + Long existingRowsCount, + Long deletedRowsCount, + Long firstRowId) + implements ManifestFile +{ + private static final long INSTANCE_SIZE = instanceSize(TrinoManifestFile.class); + + @Override + public String path() + { + return path; + } + + @Override + public long length() + { + return length; + } + + @Override + public int partitionSpecId() + { + return partitionSpecId; + } + + @Override + public ManifestContent content() + { + return content; + } + + @Override + public long sequenceNumber() + { + return sequenceNumber; + } + + @Override + public long minSequenceNumber() + { + return minSequenceNumber; + } + + @Override + public Long snapshotId() + { + return snapshotId; + } + + @Override + public Integer addedFilesCount() + { + return addedFilesCount; + } + + @Override + public Long addedRowsCount() + { + return addedRowsCount; + } + + @Override + public Integer existingFilesCount() + { + return existingFilesCount; + } + + @Override + public Long existingRowsCount() + { + return existingRowsCount; + } + + @Override + public Integer deletedFilesCount() + { + return deletedFilesCount; + } + + @Override + public Long deletedRowsCount() + { + return deletedRowsCount; + } + + @Override + public List partitions() + { + throw new UnsupportedOperationException("Not implemented"); + } + + @Override + public Long firstRowId() + { + return firstRowId; + } + + @Override + public ManifestFile copy() + { + throw new UnsupportedOperationException("Cannot copy"); + } + + public long getRetainedSizeInBytes() + { + return INSTANCE_SIZE + + estimatedSizeOf(path) + + sizeOf(length) + + sizeOf(partitionSpecId) + + sizeOf(content.id()) + + sizeOf(sequenceNumber) + + sizeOf(minSequenceNumber) + + sizeOf(snapshotId) + + sizeOf(addedFilesCount) + + sizeOf(existingFilesCount) + + sizeOf(deletedFilesCount) + + sizeOf(addedRowsCount) + + sizeOf(existingRowsCount) + + sizeOf(deletedRowsCount) + + sizeOf(firstRowId); + } + + public static TrinoManifestFile from(ManifestFile manifestFile) + { + return new TrinoManifestFile( + manifestFile.path(), + manifestFile.length(), + manifestFile.partitionSpecId(), + manifestFile.content(), + manifestFile.sequenceNumber(), + manifestFile.minSequenceNumber(), + manifestFile.snapshotId(), + manifestFile.addedFilesCount(), + manifestFile.existingFilesCount(), + manifestFile.deletedFilesCount(), + manifestFile.addedRowsCount(), + manifestFile.existingRowsCount(), + manifestFile.deletedRowsCount(), + manifestFile.firstRowId()); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/util/DataFileWithDeleteFiles.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/util/DataFileWithDeleteFiles.java index 5046f4df9cf4..f5e1edc15633 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/util/DataFileWithDeleteFiles.java +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/util/DataFileWithDeleteFiles.java @@ -21,24 
+21,11 @@ import static java.util.Objects.requireNonNull; -public class DataFileWithDeleteFiles +public record DataFileWithDeleteFiles(DataFile dataFile, List deleteFiles) { - private final DataFile dataFile; - private final List deleteFiles; - - public DataFileWithDeleteFiles(DataFile dataFile, List deleteFiles) - { - this.dataFile = requireNonNull(dataFile, "dataFile is null"); - this.deleteFiles = ImmutableList.copyOf(requireNonNull(deleteFiles, "deleteFiles is null")); - } - - public DataFile getDataFile() - { - return dataFile; - } - - public List getDeleteFiles() + public DataFileWithDeleteFiles { - return deleteFiles; + requireNonNull(dataFile, "dataFile is null"); + deleteFiles = ImmutableList.copyOf(requireNonNull(deleteFiles, "deleteFiles is null")); } } diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/util/DefaultLocationProvider.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/util/DefaultLocationProvider.java new file mode 100644 index 000000000000..a63b3ca54066 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/util/DefaultLocationProvider.java @@ -0,0 +1,61 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.iceberg.util; + +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.io.LocationProvider; + +import java.util.Map; + +import static java.lang.String.format; +import static org.apache.iceberg.util.LocationUtil.stripTrailingSlash; + +// based on org.apache.iceberg.LocationProviders.DefaultLocationProvider +public class DefaultLocationProvider + implements LocationProvider +{ + private final String dataLocation; + + public DefaultLocationProvider(String tableLocation, Map properties) + { + this.dataLocation = stripTrailingSlash(dataLocation(properties, tableLocation)); + } + + @SuppressWarnings("deprecation") + private static String dataLocation(Map properties, String tableLocation) + { + String dataLocation = properties.get(TableProperties.WRITE_DATA_LOCATION); + if (dataLocation == null) { + dataLocation = properties.get(TableProperties.WRITE_FOLDER_STORAGE_LOCATION); + if (dataLocation == null) { + dataLocation = format("%s/data", stripTrailingSlash(tableLocation)); + } + } + return dataLocation; + } + + @Override + public String newDataLocation(PartitionSpec spec, StructLike partitionData, String filename) + { + return "%s/%s/%s".formatted(dataLocation, spec.partitionToPath(partitionData), filename); + } + + @Override + public String newDataLocation(String filename) + { + return "%s/%s".formatted(dataLocation, filename); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/util/HiveSchemaUtil.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/util/HiveSchemaUtil.java index 7ff436f7df13..59ff800fc8ae 100644 --- a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/util/HiveSchemaUtil.java +++ 
b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/util/HiveSchemaUtil.java @@ -14,10 +14,12 @@ package io.trino.plugin.iceberg.util; import io.trino.plugin.hive.type.TypeInfo; +import io.trino.spi.TrinoException; import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types.DecimalType; import static io.trino.plugin.hive.type.TypeInfoUtils.getTypeInfoFromTypeString; +import static io.trino.spi.StandardErrorCode.NOT_SUPPORTED; import static java.util.stream.Collectors.joining; // based on org.apache.iceberg.hive.HiveSchemaUtil @@ -41,8 +43,11 @@ private static String convertToTypeString(Type type) case DATE -> "date"; case TIME, STRING, UUID -> "string"; case TIMESTAMP -> "timestamp"; + case TIMESTAMP_NANO -> throw new TrinoException(NOT_SUPPORTED, "Unsupported Iceberg type: TIMESTAMP_NANO"); case FIXED, BINARY -> "binary"; case DECIMAL -> "decimal(%s,%s)".formatted(((DecimalType) type).precision(), ((DecimalType) type).scale()); + case UNKNOWN, GEOMETRY, GEOGRAPHY -> throw new TrinoException(NOT_SUPPORTED, "Unsupported Iceberg type: " + type); + case VARIANT -> throw new TrinoException(NOT_SUPPORTED, "Unsupported Iceberg type: VARIANT"); case LIST -> "array<%s>".formatted(convert(type.asListType().elementType())); case MAP -> "map<%s,%s>".formatted(convert(type.asMapType().keyType()), convert(type.asMapType().valueType())); case STRUCT -> "struct<%s>".formatted(type.asStructType().fields().stream() diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/util/ObjectStoreLocationProvider.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/util/ObjectStoreLocationProvider.java new file mode 100644 index 000000000000..fb552c381d87 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/util/ObjectStoreLocationProvider.java @@ -0,0 +1,105 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
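For the DefaultLocationProvider introduced above, the data location is resolved from write.data.path, falling back to the deprecated write.folder-storage.path and finally to "<tableLocation>/data", and new file paths are appended under it. A small usage sketch; the bucket path and file name are made-up examples.

import io.trino.plugin.iceberg.util.DefaultLocationProvider;

import java.util.Map;

public final class LocationProviderExample
{
    private LocationProviderExample() {}

    public static void main(String[] args)
    {
        // No write.data.path or write.folder-storage.path set, so files land under "<tableLocation>/data"
        DefaultLocationProvider provider = new DefaultLocationProvider("s3://bucket/warehouse/db/orders", Map.of());
        System.out.println(provider.newDataLocation("part-00000.parquet"));
        // expected: s3://bucket/warehouse/db/orders/data/part-00000.parquet
    }
}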
+ */ +package io.trino.plugin.iceberg.util; + +import com.google.common.hash.HashFunction; +import io.trino.filesystem.Location; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.io.LocationProvider; + +import java.util.Base64; +import java.util.Map; + +import static com.google.common.hash.Hashing.murmur3_32_fixed; +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.apache.iceberg.util.LocationUtil.stripTrailingSlash; + +// based on org.apache.iceberg.LocationProviders.ObjectStoreLocationProvider +public class ObjectStoreLocationProvider + implements LocationProvider +{ + private static final HashFunction HASH_FUNC = murmur3_32_fixed(); + private static final Base64.Encoder BASE64_ENCODER = Base64.getUrlEncoder().withoutPadding(); + private final String storageLocation; + private final String context; + + public ObjectStoreLocationProvider(String tableLocation, Map properties) + { + this.storageLocation = stripTrailingSlash(dataLocation(properties, tableLocation)); + // if the storage location is within the table prefix, don't add table and database name context + this.context = storageLocation.startsWith(stripTrailingSlash(tableLocation)) ? null : pathContext(tableLocation); + } + + @SuppressWarnings("deprecation") + private static String dataLocation(Map properties, String tableLocation) + { + String dataLocation = properties.get(TableProperties.WRITE_DATA_LOCATION); + if (dataLocation == null) { + dataLocation = properties.get(TableProperties.OBJECT_STORE_PATH); + if (dataLocation == null) { + dataLocation = properties.get(TableProperties.WRITE_FOLDER_STORAGE_LOCATION); + if (dataLocation == null) { + dataLocation = "%s/data".formatted(stripTrailingSlash(tableLocation)); + } + } + } + return dataLocation; + } + + @Override + public String newDataLocation(PartitionSpec spec, StructLike partitionData, String filename) + { + return newDataLocation("%s/%s".formatted(spec.partitionToPath(partitionData), filename)); + } + + @Override + public String newDataLocation(String filename) + { + String hash = computeHash(filename); + if (context != null) { + return "%s/%s/%s/%s".formatted(storageLocation, hash, context, filename); + } + return "%s/%s/%s".formatted(storageLocation, hash, filename); + } + + private static String pathContext(String tableLocation) + { + Location location; + String name; + try { + location = Location.of(stripTrailingSlash(tableLocation)); + name = location.fileName(); + } + catch (IllegalArgumentException | IllegalStateException e) { + return null; + } + + try { + String parent = stripTrailingSlash(location.parentDirectory().path()); + parent = parent.substring(parent.lastIndexOf('/') + 1); + return "%s/%s".formatted(parent, name); + } + catch (IllegalArgumentException | IllegalStateException e) { + return name; + } + } + + private static String computeHash(String fileName) + { + byte[] bytes = HASH_FUNC.hashString(fileName, UTF_8).asBytes(); + return BASE64_ENCODER.encodeToString(bytes); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/util/OrcIcebergIds.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/util/OrcIcebergIds.java new file mode 100644 index 000000000000..41ac93f92e46 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/util/OrcIcebergIds.java @@ -0,0 +1,107 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file 
except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.iceberg.util; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.graph.Traverser; +import io.trino.orc.OrcColumn; +import io.trino.orc.OrcReader; +import io.trino.orc.metadata.OrcType.OrcTypeKind; +import org.apache.iceberg.mapping.MappedField; +import org.apache.iceberg.mapping.NameMapping; + +import java.util.List; +import java.util.Map; +import java.util.Optional; + +import static com.google.common.collect.ImmutableList.toImmutableList; +import static io.trino.plugin.iceberg.util.OrcTypeConverter.ORC_ICEBERG_ID_KEY; + +public final class OrcIcebergIds +{ + private OrcIcebergIds() {} + + public static Map fileColumnsByIcebergId(OrcReader reader, Optional nameMapping) + { + List fileColumns = reader.getRootColumn().getNestedColumns(); + + if (nameMapping.isPresent() && !hasIds(reader.getRootColumn())) { + fileColumns = fileColumns.stream() + .map(orcColumn -> setMissingFieldIds(orcColumn, nameMapping.get(), ImmutableList.of(orcColumn.getColumnName()))) + .collect(toImmutableList()); + } + + return mapIdsToOrcFileColumns(fileColumns); + } + + private static boolean hasIds(OrcColumn column) + { + if (column.getAttributes().containsKey(ORC_ICEBERG_ID_KEY)) { + return true; + } + + return column.getNestedColumns().stream().anyMatch(OrcIcebergIds::hasIds); + } + + private static OrcColumn setMissingFieldIds(OrcColumn column, NameMapping nameMapping, List qualifiedPath) + { + MappedField mappedField = nameMapping.find(qualifiedPath); + + ImmutableMap.Builder attributes = ImmutableMap.builder(); + attributes.putAll(column.getAttributes()); + if ((mappedField != null) && (mappedField.id() != null)) { + attributes.put(ORC_ICEBERG_ID_KEY, String.valueOf(mappedField.id())); + } + + List orcColumns = column.getNestedColumns().stream() + .map(nestedColumn -> setMissingFieldIds(nestedColumn, nameMapping, ImmutableList.builder() + .addAll(qualifiedPath) + .add(pathName(column, nestedColumn)) + .build())) + .collect(toImmutableList()); + + return new OrcColumn( + column.getPath(), + column.getColumnId(), + column.getColumnName(), + column.getColumnType(), + column.getOrcDataSourceId(), + orcColumns, + attributes.buildOrThrow()); + } + + private static String pathName(OrcColumn column, OrcColumn nestedColumn) + { + // Trino ORC reader uses "item" for list element names, but NameMapper expects "element" + if (column.getColumnType().getOrcTypeKind() == OrcTypeKind.LIST) { + return "element"; + } + return nestedColumn.getColumnName(); + } + + private static Map mapIdsToOrcFileColumns(List columns) + { + ImmutableMap.Builder columnsById = ImmutableMap.builder(); + Traverser.forTree(OrcColumn::getNestedColumns) + .depthFirstPreOrder(columns) + .forEach(column -> { + String fieldId = column.getAttributes().get(ORC_ICEBERG_ID_KEY); + if (fieldId != null) { + columnsById.put(Integer.parseInt(fieldId), column); + } + }); + return columnsById.buildOrThrow(); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/util/OrcMetrics.java 
b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/util/OrcMetrics.java new file mode 100644 index 000000000000..35debe802afd --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/util/OrcMetrics.java @@ -0,0 +1,351 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.iceberg.util; + +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import io.airlift.slice.Slice; +import io.trino.filesystem.TrinoInputFile; +import io.trino.orc.OrcColumn; +import io.trino.orc.OrcDataSource; +import io.trino.orc.OrcReader; +import io.trino.orc.OrcReaderOptions; +import io.trino.orc.metadata.ColumnMetadata; +import io.trino.orc.metadata.Footer; +import io.trino.orc.metadata.OrcColumnId; +import io.trino.orc.metadata.OrcType; +import io.trino.orc.metadata.statistics.BooleanStatistics; +import io.trino.orc.metadata.statistics.ColumnStatistics; +import io.trino.orc.metadata.statistics.DateStatistics; +import io.trino.orc.metadata.statistics.DecimalStatistics; +import io.trino.orc.metadata.statistics.DoubleStatistics; +import io.trino.orc.metadata.statistics.IntegerStatistics; +import io.trino.orc.metadata.statistics.StringStatistics; +import io.trino.orc.metadata.statistics.TimestampStatistics; +import io.trino.plugin.base.metrics.FileFormatDataSourceStats; +import io.trino.plugin.iceberg.TrinoOrcDataSource; +import org.apache.iceberg.Metrics; +import org.apache.iceberg.MetricsConfig; +import org.apache.iceberg.MetricsModes; +import org.apache.iceberg.MetricsUtil; +import org.apache.iceberg.Schema; +import org.apache.iceberg.expressions.Literal; +import org.apache.iceberg.mapping.MappingUtil; +import org.apache.iceberg.mapping.NameMapping; +import org.apache.iceberg.types.Conversions; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Type.TypeID; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.BinaryUtil; +import org.apache.iceberg.util.UnicodeUtil; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.math.BigDecimal; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; + +import static com.google.common.base.Verify.verify; +import static com.google.common.collect.ImmutableList.toImmutableList; +import static com.google.common.collect.ImmutableMap.toImmutableMap; +import static io.trino.orc.OrcReader.createOrcReader; +import static io.trino.orc.metadata.OrcColumnId.ROOT_COLUMN; +import static io.trino.plugin.iceberg.util.OrcIcebergIds.fileColumnsByIcebergId; +import static io.trino.plugin.iceberg.util.OrcTypeConverter.ORC_ICEBERG_ID_KEY; +import static io.trino.spi.type.Timestamps.MICROSECONDS_PER_MILLISECOND; +import static java.lang.Math.toIntExact; +import static java.math.RoundingMode.UNNECESSARY; +import static java.util.function.Function.identity; + +public final class OrcMetrics +{ + private OrcMetrics() {} + + 
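Back to the ObjectStoreLocationProvider above: the prefix it injects into every data path is a murmur3_32 hash of the file name, encoded as URL-safe base64 without padding, which scatters writes across object-store key prefixes instead of piling them under one directory. A standalone sketch of that computation; the file name is an arbitrary example.

import com.google.common.hash.HashFunction;

import java.util.Base64;

import static com.google.common.hash.Hashing.murmur3_32_fixed;
import static java.nio.charset.StandardCharsets.UTF_8;

public final class ObjectStoreHashExample
{
    private static final HashFunction HASH_FUNC = murmur3_32_fixed();
    private static final Base64.Encoder BASE64_ENCODER = Base64.getUrlEncoder().withoutPadding();

    private ObjectStoreHashExample() {}

    // 4 hash bytes encode to a 6-character URL-safe prefix
    public static String hashPrefix(String fileName)
    {
        return BASE64_ENCODER.encodeToString(HASH_FUNC.hashString(fileName, UTF_8).asBytes());
    }

    public static void main(String[] args)
    {
        System.out.println(hashPrefix("part-00000.parquet"));
    }
}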
public static Metrics fileMetrics(TrinoInputFile file, MetricsConfig metricsConfig, Schema schema) + { + OrcReaderOptions options = new OrcReaderOptions(); + try (OrcDataSource dataSource = new TrinoOrcDataSource(file, options, new FileFormatDataSourceStats())) { + Optional reader = createOrcReader(dataSource, options); + if (reader.isEmpty()) { + return new Metrics(0L, null, null, null, null); + } + Footer footer = reader.get().getFooter(); + + // use name mapping to compute missing Iceberg field IDs + Optional nameMapping = Optional.of(MappingUtil.create(schema)); + Map mappedColumns = fileColumnsByIcebergId(reader.get(), nameMapping) + .values().stream() + .collect(toImmutableMap(OrcColumn::getColumnId, identity())); + + // rebuild type list with mapped columns + List mappedTypes = new ArrayList<>(); + ColumnMetadata types = footer.getTypes(); + for (int i = 0; i < types.size(); i++) { + OrcColumnId id = new OrcColumnId(i); + mappedTypes.add(Optional.ofNullable(mappedColumns.get(id)) + .map(OrcMetrics::toBasicOrcType) + .orElseGet(() -> types.get(id))); + } + + return computeMetrics(metricsConfig, schema, new ColumnMetadata<>(mappedTypes), footer.getNumberOfRows(), footer.getFileStats()); + } + catch (IOException e) { + throw new UncheckedIOException("Failed to read file footer: " + file.location(), e); + } + } + + private static OrcType toBasicOrcType(OrcColumn column) + { + return new OrcType( + column.getColumnType().getOrcTypeKind(), + column.getNestedColumns().stream() + .map(OrcColumn::getColumnId) + .collect(toImmutableList()), + null, + Optional.empty(), + Optional.empty(), + Optional.empty(), + column.getAttributes()); + } + + public static Metrics computeMetrics( + MetricsConfig metricsConfig, + Schema icebergSchema, + ColumnMetadata orcColumns, + long fileRowCount, + Optional> columnStatistics) + { + if (columnStatistics.isEmpty()) { + return new Metrics(fileRowCount, null, null, null, null, null, null); + } + // Columns that are descendants of LIST or MAP types are excluded because: + // 1. Their stats are not used by Apache Iceberg to filter out data files + // 2. Their record count can be larger than table-level row count. There's no good way to calculate nullCounts for them. 
+ // See https://github.com/apache/iceberg/pull/199#discussion_r429443627 + Set excludedColumns = getExcludedColumns(orcColumns); + + ImmutableMap.Builder valueCountsBuilder = ImmutableMap.builder(); + ImmutableMap.Builder nullCountsBuilder = ImmutableMap.builder(); + ImmutableMap.Builder nanCountsBuilder = ImmutableMap.builder(); + ImmutableMap.Builder lowerBoundsBuilder = ImmutableMap.builder(); + ImmutableMap.Builder upperBoundsBuilder = ImmutableMap.builder(); + + // OrcColumnId(0) is the root column that represents file-level schema + for (int i = 1; i < orcColumns.size(); i++) { + OrcColumnId orcColumnId = new OrcColumnId(i); + if (excludedColumns.contains(orcColumnId)) { + continue; + } + OrcType orcColumn = orcColumns.get(orcColumnId); + ColumnStatistics orcColumnStats = columnStatistics.get().get(orcColumnId); + int icebergId = getIcebergId(orcColumn); + Types.NestedField icebergField = icebergSchema.findField(icebergId); + MetricsModes.MetricsMode metricsMode = MetricsUtil.metricsMode(icebergSchema, metricsConfig, icebergId); + if (metricsMode.equals(MetricsModes.None.get())) { + continue; + } + verify(icebergField != null, "Cannot find Iceberg column with ID %s in schema %s", icebergId, icebergSchema); + valueCountsBuilder.put(icebergId, fileRowCount); + if (orcColumnStats.hasNumberOfValues()) { + nullCountsBuilder.put(icebergId, fileRowCount - orcColumnStats.getNumberOfValues()); + } + if (orcColumnStats.getNumberOfNanValues() > 0) { + nanCountsBuilder.put(icebergId, orcColumnStats.getNumberOfNanValues()); + } + + if (!metricsMode.equals(MetricsModes.Counts.get())) { + toIcebergMinMax(orcColumnStats, icebergField.type(), metricsMode).ifPresent(minMax -> { + lowerBoundsBuilder.put(icebergId, minMax.getMin()); + upperBoundsBuilder.put(icebergId, minMax.getMax()); + }); + } + } + Map valueCounts = valueCountsBuilder.buildOrThrow(); + Map nullCounts = nullCountsBuilder.buildOrThrow(); + Map nanCounts = nanCountsBuilder.buildOrThrow(); + Map lowerBounds = lowerBoundsBuilder.buildOrThrow(); + Map upperBounds = upperBoundsBuilder.buildOrThrow(); + return new Metrics( + fileRowCount, + null, // TODO: Add column size accounting to ORC column writers + valueCounts.isEmpty() ? null : valueCounts, + nullCounts.isEmpty() ? null : nullCounts, + nanCounts.isEmpty() ? null : nanCounts, + lowerBounds.isEmpty() ? null : lowerBounds, + upperBounds.isEmpty() ? 
null : upperBounds); + } + + private static Set getExcludedColumns(ColumnMetadata orcColumns) + { + ImmutableSet.Builder excludedColumns = ImmutableSet.builder(); + populateExcludedColumns(orcColumns, ROOT_COLUMN, false, excludedColumns); + return excludedColumns.build(); + } + + private static void populateExcludedColumns(ColumnMetadata orcColumns, OrcColumnId orcColumnId, boolean exclude, ImmutableSet.Builder excludedColumns) + { + if (exclude) { + excludedColumns.add(orcColumnId); + } + OrcType orcColumn = orcColumns.get(orcColumnId); + switch (orcColumn.getOrcTypeKind()) { + case LIST: + case MAP: + for (OrcColumnId child : orcColumn.getFieldTypeIndexes()) { + populateExcludedColumns(orcColumns, child, true, excludedColumns); + } + return; + case STRUCT: + for (OrcColumnId child : orcColumn.getFieldTypeIndexes()) { + populateExcludedColumns(orcColumns, child, exclude, excludedColumns); + } + return; + default: + // unexpected, TODO throw + } + } + + private static int getIcebergId(OrcType orcColumn) + { + String icebergId = orcColumn.getAttributes().get(ORC_ICEBERG_ID_KEY); + verify(icebergId != null, "ORC column %s doesn't have an associated Iceberg ID", orcColumn); + return Integer.parseInt(icebergId); + } + + private static Optional toIcebergMinMax(ColumnStatistics orcColumnStats, Type icebergType, MetricsModes.MetricsMode metricsModes) + { + BooleanStatistics booleanStatistics = orcColumnStats.getBooleanStatistics(); + if (booleanStatistics != null) { + boolean hasTrueValues = booleanStatistics.getTrueValueCount() != 0; + boolean hasFalseValues = orcColumnStats.getNumberOfValues() != booleanStatistics.getTrueValueCount(); + return Optional.of(new IcebergMinMax(icebergType, !hasFalseValues, hasTrueValues, metricsModes)); + } + + IntegerStatistics integerStatistics = orcColumnStats.getIntegerStatistics(); + if (integerStatistics != null) { + Object min = integerStatistics.getMin(); + Object max = integerStatistics.getMax(); + if (min == null || max == null) { + return Optional.empty(); + } + if (icebergType.typeId() == TypeID.INTEGER) { + min = toIntExact((Long) min); + max = toIntExact((Long) max); + } + return Optional.of(new IcebergMinMax(icebergType, min, max, metricsModes)); + } + DoubleStatistics doubleStatistics = orcColumnStats.getDoubleStatistics(); + if (doubleStatistics != null) { + Object min = doubleStatistics.getMin(); + Object max = doubleStatistics.getMax(); + if (min == null || max == null) { + return Optional.empty(); + } + if (icebergType.typeId() == TypeID.FLOAT) { + min = ((Double) min).floatValue(); + max = ((Double) max).floatValue(); + } + return Optional.of(new IcebergMinMax(icebergType, min, max, metricsModes)); + } + StringStatistics stringStatistics = orcColumnStats.getStringStatistics(); + if (stringStatistics != null) { + Slice min = stringStatistics.getMin(); + Slice max = stringStatistics.getMax(); + if (min == null || max == null) { + return Optional.empty(); + } + return Optional.of(new IcebergMinMax(icebergType, min.toStringUtf8(), max.toStringUtf8(), metricsModes)); + } + DateStatistics dateStatistics = orcColumnStats.getDateStatistics(); + if (dateStatistics != null) { + Integer min = dateStatistics.getMin(); + Integer max = dateStatistics.getMax(); + if (min == null || max == null) { + return Optional.empty(); + } + return Optional.of(new IcebergMinMax(icebergType, min, max, metricsModes)); + } + DecimalStatistics decimalStatistics = orcColumnStats.getDecimalStatistics(); + if (decimalStatistics != null) { + BigDecimal min = 
decimalStatistics.getMin(); + BigDecimal max = decimalStatistics.getMax(); + if (min == null || max == null) { + return Optional.empty(); + } + min = min.setScale(((Types.DecimalType) icebergType).scale(), UNNECESSARY); + max = max.setScale(((Types.DecimalType) icebergType).scale(), UNNECESSARY); + return Optional.of(new IcebergMinMax(icebergType, min, max, metricsModes)); + } + TimestampStatistics timestampStatistics = orcColumnStats.getTimestampStatistics(); + if (timestampStatistics != null) { + Long min = timestampStatistics.getMin(); + Long max = timestampStatistics.getMax(); + if (min == null || max == null) { + return Optional.empty(); + } + // Since ORC timestamp statistics are truncated to millisecond precision, this can cause some column values to fall outside the stats range. + // We are appending 999 microseconds to account for the fact that Trino ORC writer truncates timestamps. + return Optional.of(new IcebergMinMax(icebergType, min * MICROSECONDS_PER_MILLISECOND, (max * MICROSECONDS_PER_MILLISECOND) + (MICROSECONDS_PER_MILLISECOND - 1), metricsModes)); + } + return Optional.empty(); + } + + private static class IcebergMinMax + { + private final ByteBuffer min; + private final ByteBuffer max; + + private IcebergMinMax(Type type, Object min, Object max, MetricsModes.MetricsMode metricsMode) + { + if (metricsMode instanceof MetricsModes.Full) { + this.min = Conversions.toByteBuffer(type, min); + this.max = Conversions.toByteBuffer(type, max); + } + else if (metricsMode instanceof MetricsModes.Truncate truncateMode) { + int truncateLength = truncateMode.length(); + switch (type.typeId()) { + case STRING: + this.min = UnicodeUtil.truncateStringMin(Literal.of((CharSequence) min), truncateLength).toByteBuffer(); + this.max = UnicodeUtil.truncateStringMax(Literal.of((CharSequence) max), truncateLength).toByteBuffer(); + break; + case FIXED: + case BINARY: + this.min = BinaryUtil.truncateBinaryMin(Literal.of((ByteBuffer) min), truncateLength).toByteBuffer(); + this.max = BinaryUtil.truncateBinaryMax(Literal.of((ByteBuffer) max), truncateLength).toByteBuffer(); + break; + default: + this.min = Conversions.toByteBuffer(type, min); + this.max = Conversions.toByteBuffer(type, max); + } + } + else { + throw new UnsupportedOperationException("Unsupported metrics mode for Iceberg Max/Min Bound: " + metricsMode); + } + } + + public ByteBuffer getMin() + { + return min; + } + + public ByteBuffer getMax() + { + return max; + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/util/OrcTypeConverter.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/util/OrcTypeConverter.java new file mode 100644 index 000000000000..2b757319a68e --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/util/OrcTypeConverter.java @@ -0,0 +1,178 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
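The timestamp branch at the end of toIcebergMinMax above is worth a short illustration: ORC stores timestamp statistics at millisecond precision while Iceberg bounds are in microseconds, so the upper bound is padded by 999 microseconds to keep truncated values inside the range. A minimal sketch of that widening.

public final class OrcTimestampBoundsExample
{
    private static final long MICROSECONDS_PER_MILLISECOND = 1_000;

    private OrcTimestampBoundsExample() {}

    // A microsecond value such as 1_700_000_000_123_456 is recorded by ORC with max = 1_700_000_000_123 ms;
    // widening the upper bound by 999 microseconds keeps every value truncated into that millisecond in range
    public static long[] widenToMicros(long minMillis, long maxMillis)
    {
        long lowerMicros = minMillis * MICROSECONDS_PER_MILLISECOND;
        long upperMicros = maxMillis * MICROSECONDS_PER_MILLISECOND + (MICROSECONDS_PER_MILLISECOND - 1);
        return new long[] {lowerMicros, upperMicros};
    }
}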
+ */ +package io.trino.plugin.iceberg.util; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import io.trino.orc.metadata.ColumnMetadata; +import io.trino.orc.metadata.OrcColumnId; +import io.trino.orc.metadata.OrcType; +import io.trino.orc.metadata.OrcType.OrcTypeKind; +import io.trino.spi.TrinoException; +import org.apache.iceberg.Schema; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types.DecimalType; +import org.apache.iceberg.types.Types.ListType; +import org.apache.iceberg.types.Types.MapType; +import org.apache.iceberg.types.Types.NestedField; +import org.apache.iceberg.types.Types.StructType; +import org.apache.iceberg.types.Types.TimestampType; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Optional; + +import static io.trino.spi.StandardErrorCode.NOT_SUPPORTED; + +public final class OrcTypeConverter +{ + public static final String ORC_ICEBERG_ID_KEY = "iceberg.id"; + public static final String ORC_ICEBERG_REQUIRED_KEY = "iceberg.required"; + public static final String ICEBERG_LONG_TYPE = "iceberg.long-type"; + public static final String ICEBERG_BINARY_TYPE = "iceberg.binary-type"; + + private OrcTypeConverter() {} + + public static ColumnMetadata toOrcType(Schema schema) + { + return new ColumnMetadata<>(toOrcStructType(0, schema.asStruct(), ImmutableMap.of())); + } + + private static List toOrcType(int nextFieldTypeIndex, Type type, Map attributes) + { + return switch (type.typeId()) { + case BOOLEAN -> ImmutableList.of(new OrcType(OrcTypeKind.BOOLEAN, ImmutableList.of(), ImmutableList.of(), Optional.empty(), Optional.empty(), Optional.empty(), attributes)); + case INTEGER -> ImmutableList.of(new OrcType(OrcTypeKind.INT, ImmutableList.of(), ImmutableList.of(), Optional.empty(), Optional.empty(), Optional.empty(), attributes)); + case LONG -> ImmutableList.of(new OrcType(OrcTypeKind.LONG, ImmutableList.of(), ImmutableList.of(), Optional.empty(), Optional.empty(), Optional.empty(), attributes)); + case FLOAT -> ImmutableList.of(new OrcType(OrcTypeKind.FLOAT, ImmutableList.of(), ImmutableList.of(), Optional.empty(), Optional.empty(), Optional.empty(), attributes)); + case DOUBLE -> ImmutableList.of(new OrcType(OrcTypeKind.DOUBLE, ImmutableList.of(), ImmutableList.of(), Optional.empty(), Optional.empty(), Optional.empty(), attributes)); + case DATE -> ImmutableList.of(new OrcType(OrcTypeKind.DATE, ImmutableList.of(), ImmutableList.of(), Optional.empty(), Optional.empty(), Optional.empty(), attributes)); + case TIME -> { + attributes = ImmutableMap.builder() + .putAll(attributes) + .put(ICEBERG_LONG_TYPE, "TIME") + .buildOrThrow(); + yield ImmutableList.of(new OrcType(OrcTypeKind.LONG, ImmutableList.of(), ImmutableList.of(), Optional.empty(), Optional.empty(), Optional.empty(), attributes)); + } + case TIMESTAMP -> { + OrcTypeKind timestampKind = ((TimestampType) type).shouldAdjustToUTC() ? 
OrcTypeKind.TIMESTAMP_INSTANT : OrcTypeKind.TIMESTAMP; + yield ImmutableList.of(new OrcType(timestampKind, ImmutableList.of(), ImmutableList.of(), Optional.empty(), Optional.empty(), Optional.empty(), attributes)); + } + // TODO https://github.com/trinodb/trino/issues/19753 Support Iceberg timestamp types with nanosecond precision + case TIMESTAMP_NANO -> throw new TrinoException(NOT_SUPPORTED, "Unsupported Iceberg type: TIMESTAMP_NANO"); + case STRING -> ImmutableList.of(new OrcType(OrcTypeKind.STRING, ImmutableList.of(), ImmutableList.of(), Optional.empty(), Optional.empty(), Optional.empty(), attributes)); + case FIXED, BINARY -> ImmutableList.of(new OrcType(OrcTypeKind.BINARY, ImmutableList.of(), ImmutableList.of(), Optional.empty(), Optional.empty(), Optional.empty(), attributes)); + case DECIMAL -> { + DecimalType decimalType = (DecimalType) type; + yield ImmutableList.of(new OrcType(OrcTypeKind.DECIMAL, ImmutableList.of(), ImmutableList.of(), Optional.empty(), Optional.of(decimalType.precision()), Optional.of(decimalType.scale()), attributes)); + } + case UUID -> { + attributes = ImmutableMap.builder() + .putAll(attributes) + .put(ICEBERG_BINARY_TYPE, "UUID") + .buildOrThrow(); + yield ImmutableList.of(new OrcType(OrcTypeKind.BINARY, ImmutableList.of(), ImmutableList.of(), Optional.empty(), Optional.empty(), Optional.empty(), attributes)); + } + case VARIANT, GEOMETRY, GEOGRAPHY, UNKNOWN -> throw new TrinoException(NOT_SUPPORTED, "Unsupported Iceberg type: " + type); + case STRUCT -> toOrcStructType(nextFieldTypeIndex, (StructType) type, attributes); + case LIST -> toOrcListType(nextFieldTypeIndex, (ListType) type, attributes); + case MAP -> toOrcMapType(nextFieldTypeIndex, (MapType) type, attributes); + }; + } + + private static List toOrcStructType(int nextFieldTypeIndex, StructType structType, Map attributes) + { + nextFieldTypeIndex++; + + List fieldTypeIndexes = new ArrayList<>(); + List fieldNames = new ArrayList<>(); + List> fieldTypesList = new ArrayList<>(); + for (NestedField field : structType.fields()) { + fieldTypeIndexes.add(new OrcColumnId(nextFieldTypeIndex)); + fieldNames.add(field.name()); + Map fieldAttributes = ImmutableMap.builder() + .put(ORC_ICEBERG_ID_KEY, Integer.toString(field.fieldId())) + .put(ORC_ICEBERG_REQUIRED_KEY, Boolean.toString(field.isRequired())) + .buildOrThrow(); + List fieldOrcTypes = toOrcType(nextFieldTypeIndex, field.type(), fieldAttributes); + fieldTypesList.add(fieldOrcTypes); + nextFieldTypeIndex += fieldOrcTypes.size(); + } + + return ImmutableList.builder() + .add(new OrcType( + OrcTypeKind.STRUCT, + fieldTypeIndexes, + fieldNames, + Optional.empty(), + Optional.empty(), + Optional.empty(), + attributes)) + .addAll(fieldTypesList.stream().flatMap(List::stream).iterator()) + .build(); + } + + private static List toOrcListType(int nextFieldTypeIndex, ListType listType, Map attributes) + { + nextFieldTypeIndex++; + + Map elementAttributes = ImmutableMap.builder() + .put(ORC_ICEBERG_ID_KEY, Integer.toString(listType.elementId())) + .put(ORC_ICEBERG_REQUIRED_KEY, Boolean.toString(listType.isElementRequired())) + .buildOrThrow(); + List itemTypes = toOrcType(nextFieldTypeIndex, listType.elementType(), elementAttributes); + + return ImmutableList.builder() + .add(new OrcType( + OrcTypeKind.LIST, + ImmutableList.of(new OrcColumnId(nextFieldTypeIndex)), + ImmutableList.of("item"), + Optional.empty(), + Optional.empty(), + Optional.empty(), + attributes)) + .addAll(itemTypes) + .build(); + } + + private static List toOrcMapType(int 
nextFieldTypeIndex, MapType mapType, Map attributes) + { + nextFieldTypeIndex++; + + List keyTypes = toOrcType(nextFieldTypeIndex, mapType.keyType(), ImmutableMap.builder() + .put(ORC_ICEBERG_ID_KEY, Integer.toString(mapType.keyId())) + .put(ORC_ICEBERG_REQUIRED_KEY, Boolean.toString(true)) + .buildOrThrow()); + + Map valueAttributes = ImmutableMap.builder() + .put(ORC_ICEBERG_ID_KEY, Integer.toString(mapType.valueId())) + .put(ORC_ICEBERG_REQUIRED_KEY, Boolean.toString(mapType.isValueRequired())) + .buildOrThrow(); + List valueTypes = toOrcType(nextFieldTypeIndex + keyTypes.size(), mapType.valueType(), valueAttributes); + + return ImmutableList.builder() + .add(new OrcType( + OrcTypeKind.MAP, + ImmutableList.of(new OrcColumnId(nextFieldTypeIndex), new OrcColumnId(nextFieldTypeIndex + keyTypes.size())), + ImmutableList.of("key", "value"), + Optional.empty(), + Optional.empty(), + Optional.empty(), + attributes)) + .addAll(keyTypes) + .addAll(valueTypes) + .build(); + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/util/ParquetUtil.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/util/ParquetUtil.java new file mode 100644 index 000000000000..98f50940b419 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/util/ParquetUtil.java @@ -0,0 +1,392 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
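To see what OrcTypeConverter above actually produces, the sketch below converts a one-column Iceberg schema and reads the iceberg.id attribute back from the resulting ORC metadata; the column name is a made-up example.

import io.trino.orc.metadata.ColumnMetadata;
import io.trino.orc.metadata.OrcColumnId;
import io.trino.orc.metadata.OrcType;
import io.trino.plugin.iceberg.util.OrcTypeConverter;
import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;

import static io.trino.plugin.iceberg.util.OrcTypeConverter.ORC_ICEBERG_ID_KEY;

public final class OrcTypeConverterExample
{
    private OrcTypeConverterExample() {}

    public static void main(String[] args)
    {
        // Field ID 1 is carried into the ORC metadata as the "iceberg.id" attribute,
        // which lets readers match columns by ID after renames
        Schema schema = new Schema(Types.NestedField.required(1, "order_id", Types.LongType.get()));
        ColumnMetadata<OrcType> orcTypes = OrcTypeConverter.toOrcType(schema);

        OrcType column = orcTypes.get(new OrcColumnId(1)); // OrcColumnId(0) is the root struct
        System.out.println(column.getOrcTypeKind() + " " + column.getAttributes().get(ORC_ICEBERG_ID_KEY));
        // expected: LONG 1
    }
}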
+ */ + +package io.trino.plugin.iceberg.util; + +import com.google.common.collect.ImmutableList; +import io.trino.parquet.ParquetCorruptionException; +import io.trino.parquet.metadata.BlockMetadata; +import io.trino.parquet.metadata.ColumnChunkMetadata; +import io.trino.parquet.metadata.ParquetMetadata; +import org.apache.iceberg.FieldMetrics; +import org.apache.iceberg.Metrics; +import org.apache.iceberg.MetricsConfig; +import org.apache.iceberg.MetricsModes; +import org.apache.iceberg.MetricsModes.MetricsMode; +import org.apache.iceberg.MetricsUtil; +import org.apache.iceberg.Schema; +import org.apache.iceberg.expressions.Literal; +import org.apache.iceberg.mapping.NameMapping; +import org.apache.iceberg.parquet.ParquetSchemaUtil; +import org.apache.iceberg.types.Conversions; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.BinaryUtil; +import org.apache.iceberg.util.UnicodeUtil; +import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.hadoop.metadata.ColumnPath; +import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.LogicalTypeAnnotation.DecimalLogicalTypeAnnotation; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType; + +import java.math.BigDecimal; +import java.math.BigInteger; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.UUID; +import java.util.function.Function; +import java.util.stream.Stream; + +import static java.util.Objects.requireNonNull; +import static java.util.function.Function.identity; +import static java.util.stream.Collectors.toMap; +import static org.apache.iceberg.MetricsUtil.createNanValueCounts; +import static org.apache.iceberg.parquet.ParquetUtil.extractTimestampInt96; + +public final class ParquetUtil +{ + // based on org.apache.iceberg.parquet.ParquetUtil and on org.apache.iceberg.parquet.ParquetConversions + private ParquetUtil() {} + + public static Metrics footerMetrics(ParquetMetadata metadata, Stream> fieldMetrics, MetricsConfig metricsConfig) + throws ParquetCorruptionException + { + return footerMetrics(metadata, fieldMetrics, metricsConfig, null); + } + + public static Metrics footerMetrics( + ParquetMetadata metadata, + Stream> fieldMetrics, + MetricsConfig metricsConfig, + NameMapping nameMapping) + throws ParquetCorruptionException + { + requireNonNull(fieldMetrics, "fieldMetrics should not be null"); + + long rowCount = 0; + Map columnSizes = new HashMap<>(); + Map valueCounts = new HashMap<>(); + Map nullValueCounts = new HashMap<>(); + Map> lowerBounds = new HashMap<>(); + Map> upperBounds = new HashMap<>(); + Set missingStats = new HashSet<>(); + + // ignore metrics for fields we failed to determine reliable IDs + MessageType parquetTypeWithIds = getParquetTypeWithIds(metadata, nameMapping); + Schema fileSchema = ParquetSchemaUtil.convertAndPrune(parquetTypeWithIds); + + Map> fieldMetricsMap = fieldMetrics.collect(toMap(FieldMetrics::id, identity())); + + List blocks = metadata.getBlocks(); + for (BlockMetadata block : blocks) { + rowCount += block.rowCount(); + for (ColumnChunkMetadata column : block.columns()) { + Integer fieldId = fileSchema.aliasToId(column.getPath().toDotString()); + if (fieldId == null) { + // 
fileSchema may contain a subset of columns present in the file + // as we prune columns we could not assign ids + continue; + } + + increment(columnSizes, fieldId, column.getTotalSize()); + + MetricsMode metricsMode = MetricsUtil.metricsMode(fileSchema, metricsConfig, fieldId); + if (metricsMode == MetricsModes.None.get()) { + continue; + } + increment(valueCounts, fieldId, column.getValueCount()); + + Statistics stats = column.getStatistics(); + if (stats != null && !stats.isEmpty()) { + increment(nullValueCounts, fieldId, stats.getNumNulls()); + + // when there are metrics gathered by Iceberg for a column, we should use those instead + // of the ones from Parquet + if (metricsMode != MetricsModes.Counts.get() && !fieldMetricsMap.containsKey(fieldId)) { + Types.NestedField field = fileSchema.findField(fieldId); + if (field != null && stats.hasNonNullValue() && shouldStoreBounds(column, fileSchema)) { + Literal min = fromParquetPrimitive(field.type(), column.getPrimitiveType(), stats.genericGetMin()); + updateMin(lowerBounds, fieldId, field.type(), min, metricsMode); + Literal max = fromParquetPrimitive(field.type(), column.getPrimitiveType(), stats.genericGetMax()); + updateMax(upperBounds, fieldId, field.type(), max, metricsMode); + } + } + } + else { + missingStats.add(fieldId); + } + } + } + + // discard accumulated values if any stats were missing + for (Integer fieldId : missingStats) { + nullValueCounts.remove(fieldId); + lowerBounds.remove(fieldId); + upperBounds.remove(fieldId); + } + + updateFromFieldMetrics(fieldMetricsMap, metricsConfig, fileSchema, lowerBounds, upperBounds); + + return new Metrics( + rowCount, + columnSizes, + valueCounts, + nullValueCounts, + createNanValueCounts(fieldMetricsMap.values().stream(), metricsConfig, fileSchema), + toBufferMap(fileSchema, lowerBounds), + toBufferMap(fileSchema, upperBounds)); + } + + public static List getSplitOffsets(ParquetMetadata metadata) + throws ParquetCorruptionException + { + List blocks = metadata.getBlocks(); + List splitOffsets = new ArrayList<>(blocks.size()); + for (BlockMetadata blockMetaData : blocks) { + splitOffsets.add(blockMetaData.getStartingPos()); + } + Collections.sort(splitOffsets); + return ImmutableList.copyOf(splitOffsets); + } + + private static void updateFromFieldMetrics( + Map> idToFieldMetricsMap, + MetricsConfig metricsConfig, + Schema schema, + Map> lowerBounds, + Map> upperBounds) + { + idToFieldMetricsMap + .entrySet() + .forEach( + entry -> { + int fieldId = entry.getKey(); + FieldMetrics metrics = entry.getValue(); + MetricsMode metricsMode = MetricsUtil.metricsMode(schema, metricsConfig, fieldId); + + // only check for MetricsModes.None, since we don't truncate float/double values. 
+ if (metricsMode != MetricsModes.None.get()) { + if (!metrics.hasBounds()) { + lowerBounds.remove(fieldId); + upperBounds.remove(fieldId); + } + else if (metrics.upperBound() instanceof Float) { + lowerBounds.put(fieldId, Literal.of((Float) metrics.lowerBound())); + upperBounds.put(fieldId, Literal.of((Float) metrics.upperBound())); + } + else if (metrics.upperBound() instanceof Double) { + lowerBounds.put(fieldId, Literal.of((Double) metrics.lowerBound())); + upperBounds.put(fieldId, Literal.of((Double) metrics.upperBound())); + } + else { + throw new UnsupportedOperationException("Expected only float or double column metrics"); + } + } + }); + } + + private static MessageType getParquetTypeWithIds(ParquetMetadata metadata, NameMapping nameMapping) + { + MessageType type = metadata.getFileMetaData().getSchema(); + + if (ParquetSchemaUtil.hasIds(type)) { + return type; + } + + if (nameMapping != null) { + return ParquetSchemaUtil.applyNameMapping(type, nameMapping); + } + + return ParquetSchemaUtil.addFallbackIds(type); + } + + // we allow struct nesting, but not maps or arrays + private static boolean shouldStoreBounds(ColumnChunkMetadata column, Schema schema) + { + if (column.getPrimitiveType().getPrimitiveTypeName() == PrimitiveType.PrimitiveTypeName.INT96) { + // stats for INT96 are not reliable + return false; + } + + ColumnPath columnPath = column.getPath(); + Iterator pathIterator = columnPath.iterator(); + Type currentType = schema.asStruct(); + + while (pathIterator.hasNext()) { + if (currentType == null || !currentType.isStructType()) { + return false; + } + String fieldName = pathIterator.next(); + currentType = currentType.asStructType().fieldType(fieldName); + } + + return currentType != null && currentType.isPrimitiveType(); + } + + private static void increment(Map columns, int fieldId, long amount) + { + if (columns != null) { + if (columns.containsKey(fieldId)) { + columns.put(fieldId, columns.get(fieldId) + amount); + } + else { + columns.put(fieldId, amount); + } + } + } + + @SuppressWarnings("unchecked") + private static void updateMin( + Map> lowerBounds, + int id, + Type type, + Literal min, + MetricsMode metricsMode) + { + Literal currentMin = (Literal) lowerBounds.get(id); + if (currentMin == null || min.comparator().compare(min.value(), currentMin.value()) < 0) { + if (metricsMode == MetricsModes.Full.get()) { + lowerBounds.put(id, min); + } + else { + MetricsModes.Truncate truncateMode = (MetricsModes.Truncate) metricsMode; + int truncateLength = truncateMode.length(); + switch (type.typeId()) { + case STRING: + lowerBounds.put(id, UnicodeUtil.truncateStringMin((Literal) min, truncateLength)); + break; + case FIXED: + case BINARY: + lowerBounds.put(id, BinaryUtil.truncateBinaryMin((Literal) min, truncateLength)); + break; + default: + lowerBounds.put(id, min); + } + } + } + } + + @SuppressWarnings("unchecked") + private static void updateMax( + Map> upperBounds, + int id, + Type type, + Literal max, + MetricsMode metricsMode) + { + Literal currentMax = (Literal) upperBounds.get(id); + if (currentMax == null || max.comparator().compare(max.value(), currentMax.value()) > 0) { + if (metricsMode == MetricsModes.Full.get()) { + upperBounds.put(id, max); + } + else { + MetricsModes.Truncate truncateMode = (MetricsModes.Truncate) metricsMode; + int truncateLength = truncateMode.length(); + switch (type.typeId()) { + case STRING: + Literal truncatedMaxString = UnicodeUtil.truncateStringMax((Literal) max, truncateLength); + if (truncatedMaxString != null) { + 
upperBounds.put(id, truncatedMaxString); + } + break; + case FIXED: + case BINARY: + Literal truncatedMaxBinary = BinaryUtil.truncateBinaryMax((Literal) max, truncateLength); + if (truncatedMaxBinary != null) { + upperBounds.put(id, truncatedMaxBinary); + } + break; + default: + upperBounds.put(id, max); + } + } + } + } + + private static Map toBufferMap(Schema schema, Map> map) + { + Map bufferMap = new HashMap<>(); + for (Map.Entry> entry : map.entrySet()) { + bufferMap.put( + entry.getKey(), + Conversions.toByteBuffer(schema.findType(entry.getKey()), entry.getValue().value())); + } + return bufferMap; + } + + @SuppressWarnings("unchecked") + public static Literal fromParquetPrimitive(Type type, PrimitiveType parquetType, Object value) + { + return switch (type.typeId()) { + case BOOLEAN -> (Literal) Literal.of((Boolean) value); + case INTEGER, DATE -> (Literal) Literal.of((Integer) value); + case LONG, TIME, TIMESTAMP -> (Literal) Literal.of((Long) value); + case FLOAT -> (Literal) Literal.of((Float) value); + case DOUBLE -> (Literal) Literal.of((Double) value); + case STRING -> { + Function stringConversion = converterFromParquet(parquetType); + yield (Literal) Literal.of((CharSequence) stringConversion.apply(value)); + } + case UUID -> { + Function uuidConversion = converterFromParquet(parquetType); + yield (Literal) Literal.of((UUID) uuidConversion.apply(value)); + } + case FIXED, BINARY -> { + Function binaryConversion = converterFromParquet(parquetType); + yield (Literal) Literal.of((ByteBuffer) binaryConversion.apply(value)); + } + case DECIMAL -> { + Function decimalConversion = converterFromParquet(parquetType); + yield (Literal) Literal.of((BigDecimal) decimalConversion.apply(value)); + } + default -> throw new IllegalArgumentException("Unsupported primitive type: " + type); + }; + } + + static Function converterFromParquet(PrimitiveType type) + { + if (type.getOriginalType() != null) { + switch (type.getOriginalType()) { + case UTF8: + // decode to CharSequence to avoid copying into a new String + return binary -> StandardCharsets.UTF_8.decode(((Binary) binary).toByteBuffer()); + case DECIMAL: + DecimalLogicalTypeAnnotation decimal = (DecimalLogicalTypeAnnotation) type.getLogicalTypeAnnotation(); + int scale = decimal.getScale(); + return switch (type.getPrimitiveTypeName()) { + case INT32, INT64 -> number -> BigDecimal.valueOf(((Number) number).longValue(), scale); + case FIXED_LEN_BYTE_ARRAY, BINARY -> binary -> new BigDecimal(new BigInteger(((Binary) binary).getBytes()), scale); + default -> throw new IllegalArgumentException("Unsupported primitive type for decimal: " + type.getPrimitiveTypeName()); + }; + default: + } + } + + return switch (type.getPrimitiveTypeName()) { + case FIXED_LEN_BYTE_ARRAY, BINARY -> binary -> ByteBuffer.wrap(((Binary) binary).getBytes()); + case INT96 -> binary -> extractTimestampInt96(ByteBuffer.wrap(((Binary) binary).getBytes()).order(ByteOrder.LITTLE_ENDIAN)); + default -> obj -> obj; + }; + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/util/SystemTableUtil.java b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/util/SystemTableUtil.java new file mode 100644 index 000000000000..e28b66a0d05c --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/plugin/iceberg/util/SystemTableUtil.java @@ -0,0 +1,179 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.iceberg.util; + +import com.fasterxml.jackson.core.JsonFactory; +import com.fasterxml.jackson.core.JsonGenerator; +import com.google.common.collect.ImmutableList; +import io.trino.plugin.base.util.JsonUtils; +import io.trino.plugin.iceberg.system.IcebergPartitionColumn; +import io.trino.spi.type.RowType; +import io.trino.spi.type.TypeManager; +import org.apache.iceberg.MetricsUtil; +import org.apache.iceberg.PartitionField; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SingleValueParser; +import org.apache.iceberg.Table; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Type.PrimitiveType; +import org.apache.iceberg.types.TypeUtil; +import org.apache.iceberg.types.Types.NestedField; + +import java.io.IOException; +import java.io.StringWriter; +import java.io.UncheckedIOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; + +import static com.google.common.collect.ImmutableList.toImmutableList; +import static io.trino.plugin.iceberg.TypeConverter.toTrinoType; + +public final class SystemTableUtil +{ + private static final JsonFactory JSON_FACTORY = JsonUtils.jsonFactoryBuilder().build(); + + private SystemTableUtil() {} + + public static List getAllPartitionFields(Table icebergTable) + { + return getAllPartitionFields(icebergTable.schema(), icebergTable.specs()); + } + + public static List getAllPartitionFields(Schema schema, Map specs) + { + Set existingColumnsIds = TypeUtil.indexById(schema.asStruct()).keySet(); + + List visiblePartitionFields = specs + .values().stream() + .flatMap(partitionSpec -> partitionSpec.fields().stream()) + // skip columns that were dropped + .filter(partitionField -> existingColumnsIds.contains(partitionField.sourceId())) + .collect(toImmutableList()); + + return filterOutDuplicates(visiblePartitionFields); + } + + private static List filterOutDuplicates(List visiblePartitionFields) + { + Set alreadyExistingFieldIds = new HashSet<>(); + List result = new ArrayList<>(); + for (PartitionField partitionField : visiblePartitionFields) { + if (!alreadyExistingFieldIds.contains(partitionField.fieldId())) { + alreadyExistingFieldIds.add(partitionField.fieldId()); + result.add(partitionField); + } + } + return result; + } + + public static Optional getPartitionColumnType(TypeManager typeManager, List fields, Schema schema) + { + if (fields.isEmpty()) { + return Optional.empty(); + } + List partitionFields = fields.stream() + .map(field -> RowType.field( + field.name(), + toTrinoType(field.transform().getResultType(schema.findType(field.sourceId())), typeManager))) + .collect(toImmutableList()); + List fieldIds = fields.stream() + .map(PartitionField::fieldId) + .collect(toImmutableList()); + return Optional.of(new IcebergPartitionColumn(RowType.from(partitionFields), fieldIds)); + } + + public static List partitionTypes(List partitionFields, Map idToPrimitiveTypeMapping) + { + ImmutableList.Builder partitionTypeBuilder = 
ImmutableList.builder(); + for (PartitionField partitionField : partitionFields) { + PrimitiveType sourceType = idToPrimitiveTypeMapping.get(partitionField.sourceId()); + Type type = partitionField.transform().getResultType(sourceType); + partitionTypeBuilder.add(type); + } + return partitionTypeBuilder.build(); + } + + public static String readableMetricsToJson(MetricsUtil.ReadableMetricsStruct readableMetrics, List primitiveFields) + { + StringWriter writer = new StringWriter(); + try { + JsonGenerator generator = JSON_FACTORY.createGenerator(writer); + generator.writeStartObject(); + + for (int i = 0; i < readableMetrics.size(); i++) { + NestedField field = primitiveFields.get(i); + generator.writeFieldName(field.name()); + + generator.writeStartObject(); + MetricsUtil.ReadableColMetricsStruct columnMetrics = readableMetrics.get(i, MetricsUtil.ReadableColMetricsStruct.class); + + generator.writeFieldName("column_size"); + Long columnSize = columnMetrics.get(0, Long.class); + if (columnSize == null) { + generator.writeNull(); + } + else { + generator.writeNumber(columnSize); + } + + generator.writeFieldName("value_count"); + Long valueCount = columnMetrics.get(1, Long.class); + if (valueCount == null) { + generator.writeNull(); + } + else { + generator.writeNumber(valueCount); + } + + generator.writeFieldName("null_value_count"); + Long nullValueCount = columnMetrics.get(2, Long.class); + if (nullValueCount == null) { + generator.writeNull(); + } + else { + generator.writeNumber(nullValueCount); + } + + generator.writeFieldName("nan_value_count"); + Long nanValueCount = columnMetrics.get(3, Long.class); + if (nanValueCount == null) { + generator.writeNull(); + } + else { + generator.writeNumber(nanValueCount); + } + + generator.writeFieldName("lower_bound"); + SingleValueParser.toJson(field.type(), columnMetrics.get(4, Object.class), generator); + + generator.writeFieldName("upper_bound"); + SingleValueParser.toJson(field.type(), columnMetrics.get(5, Object.class), generator); + + generator.writeEndObject(); + } + + generator.writeEndObject(); + generator.flush(); + return writer.toString(); + } + catch (IOException e) { + throw new UncheckedIOException("JSON conversion failed for: " + readableMetrics, e); + } + } +} diff --git a/plugin/trino-iceberg/src/main/java/io/trino/spi/connector/RelationType.java b/plugin/trino-iceberg/src/main/java/io/trino/spi/connector/RelationType.java new file mode 100644 index 000000000000..d038a1de8bc3 --- /dev/null +++ b/plugin/trino-iceberg/src/main/java/io/trino/spi/connector/RelationType.java @@ -0,0 +1,21 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package io.trino.spi.connector;
+
+public enum RelationType
+{
+    TABLE,
+    VIEW,
+    MATERIALIZED_VIEW,
+}
diff --git a/plugin/trino-iceberg/src/main/java/org/apache/iceberg/IcebergManifestUtils.java b/plugin/trino-iceberg/src/main/java/org/apache/iceberg/IcebergManifestUtils.java
new file mode 100644
index 000000000000..593e9161dbcd
--- /dev/null
+++ b/plugin/trino-iceberg/src/main/java/org/apache/iceberg/IcebergManifestUtils.java
@@ -0,0 +1,31 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.iceberg;
+
+import org.apache.iceberg.io.FileIO;
+
+import java.util.List;
+
+public class IcebergManifestUtils
+{
+    private IcebergManifestUtils() {}
+
+    public static List<ManifestFile> read(FileIO fileIO, String manifestListLocation)
+    {
+        // Avoid using snapshot.allManifests() when processing multiple snapshots,
+        // as each Snapshot instance internally caches `org.apache.iceberg.BaseSnapshot.allManifests`
+        // and leads to high memory usage
+        return ManifestLists.read(fileIO.newInputFile(manifestListLocation));
+    }
+}
diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/BaseIcebergConnectorSmokeTest.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/BaseIcebergConnectorSmokeTest.java
index e2d3fec64e1b..173dcc0626a8 100644
--- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/BaseIcebergConnectorSmokeTest.java
+++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/BaseIcebergConnectorSmokeTest.java
@@ -14,12 +14,12 @@
 package io.trino.plugin.iceberg;
 
 import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Iterables;
 import com.google.common.collect.Streams;
 import io.trino.Session;
 import io.trino.filesystem.FileIterator;
 import io.trino.filesystem.Location;
 import io.trino.filesystem.TrinoFileSystem;
-import io.trino.plugin.iceberg.fileio.ForwardingFileIo;
 import io.trino.testing.BaseConnectorSmokeTest;
 import io.trino.testing.TestingConnectorBehavior;
 import io.trino.testing.sql.TestTable;
@@ -27,34 +27,53 @@
 import org.apache.iceberg.TableMetadata;
 import org.apache.iceberg.TableMetadataParser;
 import org.apache.iceberg.io.FileIO;
-import org.testng.annotations.BeforeClass;
-import org.testng.annotations.Test;
-
+import org.intellij.lang.annotations.Language;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.RepeatedTest;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.TestInstance;
+import org.junit.jupiter.api.Timeout;
+import org.junit.jupiter.api.parallel.Execution;
+import org.junit.jupiter.api.parallel.ExecutionMode;
+
+import java.io.IOException;
+import java.time.ZonedDateTime;
+import java.time.format.DateTimeFormatter;
 import java.util.List;
+import java.util.Map;
 import java.util.Optional;
+import java.util.Set;
 import java.util.concurrent.CyclicBarrier;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Future;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
import java.util.stream.IntStream; import java.util.stream.Stream; +import static com.google.common.base.Preconditions.checkState; +import static com.google.common.base.Verify.verify; import static com.google.common.collect.ImmutableList.toImmutableList; import static io.airlift.concurrent.MoreFutures.tryGetFutureValue; +import static io.trino.plugin.iceberg.IcebergTestUtils.FILE_IO_FACTORY; import static io.trino.plugin.iceberg.IcebergTestUtils.getFileSystemFactory; +import static io.trino.plugin.iceberg.IcebergTestUtils.getMetadataFileAndUpdatedMillis; import static io.trino.plugin.iceberg.IcebergTestUtils.withSmallRowGroups; import static io.trino.testing.TestingAccessControlManager.TestingPrivilegeType.DROP_TABLE; import static io.trino.testing.TestingAccessControlManager.privilege; +import static io.trino.testing.TestingConnectorBehavior.SUPPORTS_CREATE_TABLE; import static io.trino.testing.TestingConnectorSession.SESSION; import static io.trino.testing.TestingNames.randomNameSuffix; import static java.lang.String.format; +import static java.time.ZoneOffset.UTC; import static java.util.Objects.requireNonNull; import static java.util.concurrent.Executors.newFixedThreadPool; import static java.util.concurrent.TimeUnit.SECONDS; -import static java.util.stream.Collectors.joining; import static org.assertj.core.api.Assertions.assertThat; -import static org.testng.Assert.assertFalse; -import static org.testng.Assert.assertTrue; +import static org.junit.jupiter.api.TestInstance.Lifecycle.PER_CLASS; +@TestInstance(PER_CLASS) public abstract class BaseIcebergConnectorSmokeTest extends BaseConnectorSmokeTest { @@ -66,26 +85,19 @@ public BaseIcebergConnectorSmokeTest(FileFormat format) this.format = requireNonNull(format, "format is null"); } - @BeforeClass + @BeforeAll public void initFileSystem() { fileSystem = getFileSystemFactory(getDistributedQueryRunner()).create(SESSION); } - @SuppressWarnings("DuplicateBranchesInSwitch") @Override protected boolean hasBehavior(TestingConnectorBehavior connectorBehavior) { - switch (connectorBehavior) { - case SUPPORTS_TRUNCATE: - return false; - - case SUPPORTS_TOPN_PUSHDOWN: - return false; - - default: - return super.hasBehavior(connectorBehavior); - } + return switch (connectorBehavior) { + case SUPPORTS_TOPN_PUSHDOWN -> false; + default -> super.hasBehavior(connectorBehavior); + }; } @Test @@ -110,7 +122,7 @@ public void testShowCreateTable() @Test public void testHiddenPathColumn() { - try (TestTable table = new TestTable(getQueryRunner()::execute, "hidden_file_path", "(a int, b VARCHAR)", ImmutableList.of("(1, 'a')"))) { + try (TestTable table = newTrinoTable("hidden_file_path", "(a int, b VARCHAR)", ImmutableList.of("(1, 'a')"))) { String filePath = (String) computeScalar(format("SELECT file_path FROM \"%s$files\"", table.getName())); assertQuery("SELECT DISTINCT \"$path\" FROM " + table.getName(), "VALUES " + "'" + filePath + "'"); @@ -121,7 +133,9 @@ public void testHiddenPathColumn() } // Repeat test with invocationCount for better test coverage, since the tested aspect is inherently non-deterministic. 
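// A standalone sketch of the TestNG-to-JUnit 5 mapping applied in the hunk below:
// @Test(timeOut = 120_000, invocationCount = 4) becomes @RepeatedTest plus @Timeout, with
// @Execution(SAME_THREAD) keeping the repetitions from running concurrently. The class and
// method names here are illustrative only.
import org.junit.jupiter.api.RepeatedTest;
import org.junit.jupiter.api.Timeout;
import org.junit.jupiter.api.parallel.Execution;
import org.junit.jupiter.api.parallel.ExecutionMode;

class RepeatedTimeoutExample
{
    @RepeatedTest(4) // run the non-deterministic scenario four times
    @Timeout(120) // JUnit 5 timeouts default to seconds, unlike TestNG's millisecond timeOut
    @Execution(ExecutionMode.SAME_THREAD)
    void concurrentDeleteScenario()
    {
        // test body would exercise the inherently non-deterministic behavior under test
    }
}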
- @Test(timeOut = 120_000, invocationCount = 4) + @RepeatedTest(4) + @Timeout(120) + @Execution(ExecutionMode.SAME_THREAD) public void testDeleteRowsConcurrently() throws Exception { @@ -130,9 +144,10 @@ public void testDeleteRowsConcurrently() ExecutorService executor = newFixedThreadPool(threads); List rows = ImmutableList.of("(1, 0, 0, 0)", "(0, 1, 0, 0)", "(0, 0, 1, 0)", "(0, 0, 0, 1)"); - String[] expectedErrors = new String[]{"Failed to commit Iceberg update to table:", "Failed to replace table due to concurrent updates:"}; - try (TestTable table = new TestTable( - getQueryRunner()::execute, + String[] expectedErrors = new String[] {"Failed to commit the transaction during write:", + "Failed to replace table due to concurrent updates:", + "Failed to commit during write:"}; + try (TestTable table = newTrinoTable( "test_concurrent_delete", "(col0 INTEGER, col1 INTEGER, col2 INTEGER, col3 INTEGER)")) { String tableName = table.getName(); @@ -154,19 +169,64 @@ public void testDeleteRowsConcurrently() .collect(toImmutableList()); Stream> expectedRows = Streams.mapWithIndex(futures.stream(), (future, index) -> { - boolean deleteSuccessful = tryGetFutureValue(future, 10, SECONDS).orElseThrow(); + Optional value = tryGetFutureValue(future, 20, SECONDS); + checkState(value.isPresent(), "Task %s did not complete in time", index); + boolean deleteSuccessful = value.get(); return deleteSuccessful ? Optional.empty() : Optional.of(rows.get((int) index)); }); - String expectedValues = expectedRows.filter(Optional::isPresent).map(Optional::get).collect(joining(", ")); - assertThat(expectedValues).isNotEmpty().as("Expected at least one delete operation to pass"); - assertThat(query("SELECT * FROM " + tableName)).matches("VALUES " + expectedValues); + List expectedValues = expectedRows.filter(Optional::isPresent).map(Optional::get).collect(toImmutableList()); + assertThat(expectedValues).as("Expected at least one delete operation to pass").hasSizeLessThan(rows.size()); + assertThat(query("SELECT * FROM " + tableName)).matches("VALUES " + String.join(", ", expectedValues)); } finally { executor.shutdownNow(); - assertTrue(executor.awaitTermination(10, SECONDS)); + assertThat(executor.awaitTermination(10, SECONDS)).isTrue(); } } + @Test + public void testCreateOrReplaceTable() + { + try (TestTable table = newTrinoTable( + "test_create_or_replace", + " AS SELECT BIGINT '42' a, DOUBLE '-38.5' b")) { + assertThat(query("SELECT a, b FROM " + table.getName())) + .matches("VALUES (BIGINT '42', -385e-1)"); + + long v1SnapshotId = getMostRecentSnapshotId(table.getName()); + + assertUpdate("CREATE OR REPLACE TABLE " + table.getName() + " AS SELECT BIGINT '-42' a, DOUBLE '38.5' b", 1); + assertThat(query("SELECT a, b FROM " + table.getName())) + .matches("VALUES (BIGINT '-42', 385e-1)"); + + assertThat(query("SELECT COUNT(snapshot_id) FROM \"" + table.getName() + "$history\"")) + .matches("VALUES BIGINT '2'"); + + assertThat(query("SELECT a, b FROM " + table.getName() + " FOR VERSION AS OF " + v1SnapshotId)) + .matches("VALUES (BIGINT '42', -385e-1)"); + } + } + + @Test + public void testCreateOrReplaceTableChangeColumnNamesAndTypes() + { + String tableName = "test_create_or_replace_" + randomNameSuffix(); + assertUpdate("CREATE TABLE " + tableName + " AS SELECT BIGINT '42' a, DOUBLE '-38.5' b", 1); + assertThat(query("SELECT CAST(a AS bigint), b FROM " + tableName)) + .matches("VALUES (BIGINT '42', -385e-1)"); + + long v1SnapshotId = getMostRecentSnapshotId(tableName); + + assertUpdate("CREATE OR REPLACE TABLE " 
+ tableName + " AS SELECT VARCHAR 'test' c, VARCHAR 'test2' d", 1); + assertThat(query("SELECT c, d FROM " + tableName)) + .matches("VALUES (VARCHAR 'test', VARCHAR 'test2')"); + + assertThat(query("SELECT a, b FROM " + tableName + " FOR VERSION AS OF " + v1SnapshotId)) + .matches("VALUES (BIGINT '42', -385e-1)"); + + assertUpdate("DROP TABLE " + tableName); + } + @Test public void testRegisterTableWithTableLocation() { @@ -471,13 +531,12 @@ public void testCreateTableWithNonExistingSchemaVerifyLocation() public void testSortedNationTable() { Session withSmallRowGroups = withSmallRowGroups(getSession()); - try (TestTable table = new TestTable( - getQueryRunner()::execute, + try (TestTable table = newTrinoTable( "test_sorted_nation_table", "WITH (sorted_by = ARRAY['comment'], format = '" + format.name() + "') AS SELECT * FROM nation WITH NO DATA")) { assertUpdate(withSmallRowGroups, "INSERT INTO " + table.getName() + " SELECT * FROM nation", 25); for (Object filePath : computeActual("SELECT file_path from \"" + table.getName() + "$files\"").getOnlyColumnAsSet()) { - assertTrue(isFileSorted(Location.of((String) filePath), "comment")); + assertThat(isFileSorted(Location.of((String) filePath), "comment")).isTrue(); } assertQuery("SELECT * FROM " + table.getName(), "SELECT * FROM nation"); } @@ -487,9 +546,12 @@ public void testSortedNationTable() public void testFileSortingWithLargerTable() { // Using a larger table forces buffered data to be written to disk - Session withSmallRowGroups = withSmallRowGroups(getSession()); - try (TestTable table = new TestTable( - getQueryRunner()::execute, + Session withSmallRowGroups = Session.builder(getSession()) + .setCatalogSessionProperty("iceberg", "orc_writer_max_stripe_rows", "200") + .setCatalogSessionProperty("iceberg", "parquet_writer_block_size", "20kB") + .setCatalogSessionProperty("iceberg", "parquet_writer_batch_size", "200") + .build(); + try (TestTable table = newTrinoTable( "test_sorted_lineitem_table", "WITH (sorted_by = ARRAY['comment'], format = '" + format.name() + "') AS TABLE tpch.tiny.lineitem WITH NO DATA")) { assertUpdate( @@ -497,7 +559,7 @@ public void testFileSortingWithLargerTable() "INSERT INTO " + table.getName() + " TABLE tpch.tiny.lineitem", "VALUES 60175"); for (Object filePath : computeActual("SELECT file_path from \"" + table.getName() + "$files\"").getOnlyColumnAsSet()) { - assertTrue(isFileSorted(Location.of((String) filePath), "comment")); + assertThat(isFileSorted(Location.of((String) filePath), "comment")).isTrue(); } assertQuery("SELECT * FROM " + table.getName(), "SELECT * FROM lineitem"); } @@ -515,12 +577,16 @@ public void testDropTableWithMissingMetadataFile() // Delete current metadata file fileSystem.deleteFile(metadataLocation); - assertFalse(fileSystem.newInputFile(metadataLocation).exists(), "Current metadata file should not exist"); + assertThat(fileSystem.newInputFile(metadataLocation).exists()) + .describedAs("Current metadata file should not exist") + .isFalse(); // try to drop table assertUpdate("DROP TABLE " + tableName); - assertFalse(getQueryRunner().tableExists(getSession(), tableName)); - assertFalse(fileSystem.listFiles(tableLocation).hasNext(), "Table location should not exist"); + assertThat(getQueryRunner().tableExists(getSession(), tableName)).isFalse(); + assertThat(fileSystem.listFiles(tableLocation).hasNext()) + .describedAs("Table location should not exist") + .isFalse(); } @Test @@ -531,18 +597,22 @@ public void testDropTableWithMissingSnapshotFile() assertUpdate("CREATE TABLE " + 
tableName + " AS SELECT 1 x, 'INDIA' y", 1); String metadataLocation = getMetadataLocation(tableName); - TableMetadata tableMetadata = TableMetadataParser.read(new ForwardingFileIo(fileSystem), metadataLocation); + TableMetadata tableMetadata = TableMetadataParser.read(FILE_IO_FACTORY.create(fileSystem), metadataLocation); Location tableLocation = Location.of(tableMetadata.location()); Location currentSnapshotFile = Location.of(tableMetadata.currentSnapshot().manifestListLocation()); // Delete current snapshot file fileSystem.deleteFile(currentSnapshotFile); - assertFalse(fileSystem.newInputFile(currentSnapshotFile).exists(), "Current snapshot file should not exist"); + assertThat(fileSystem.newInputFile(currentSnapshotFile).exists()) + .describedAs("Current snapshot file should not exist") + .isFalse(); // try to drop table assertUpdate("DROP TABLE " + tableName); - assertFalse(getQueryRunner().tableExists(getSession(), tableName)); - assertFalse(fileSystem.listFiles(tableLocation).hasNext(), "Table location should not exist"); + assertThat(getQueryRunner().tableExists(getSession(), tableName)).isFalse(); + assertThat(fileSystem.listFiles(tableLocation).hasNext()) + .describedAs("Table location should not exist") + .isFalse(); } @Test @@ -553,19 +623,23 @@ public void testDropTableWithMissingManifestListFile() assertUpdate("CREATE TABLE " + tableName + " AS SELECT 1 x, 'INDIA' y", 1); String metadataLocation = getMetadataLocation(tableName); - FileIO fileIo = new ForwardingFileIo(fileSystem); + FileIO fileIo = FILE_IO_FACTORY.create(fileSystem); TableMetadata tableMetadata = TableMetadataParser.read(fileIo, metadataLocation); Location tableLocation = Location.of(tableMetadata.location()); Location manifestListFile = Location.of(tableMetadata.currentSnapshot().allManifests(fileIo).get(0).path()); // Delete Manifest List file fileSystem.deleteFile(manifestListFile); - assertFalse(fileSystem.newInputFile(manifestListFile).exists(), "Manifest list file should not exist"); + assertThat(fileSystem.newInputFile(manifestListFile).exists()) + .describedAs("Manifest list file should not exist") + .isFalse(); // try to drop table assertUpdate("DROP TABLE " + tableName); - assertFalse(getQueryRunner().tableExists(getSession(), tableName)); - assertFalse(fileSystem.listFiles(tableLocation).hasNext(), "Table location should not exist"); + assertThat(getQueryRunner().tableExists(getSession(), tableName)).isFalse(); + assertThat(fileSystem.listFiles(tableLocation).hasNext()) + .describedAs("Table location should not exist") + .isFalse(); } @Test @@ -579,17 +653,21 @@ public void testDropTableWithMissingDataFile() Location tableLocation = Location.of(getTableLocation(tableName)); Location tableDataPath = tableLocation.appendPath("data"); FileIterator fileIterator = fileSystem.listFiles(tableDataPath); - assertTrue(fileIterator.hasNext()); + assertThat(fileIterator.hasNext()).isTrue(); Location dataFile = fileIterator.next().location(); // Delete data file fileSystem.deleteFile(dataFile); - assertFalse(fileSystem.newInputFile(dataFile).exists(), "Data file should not exist"); + assertThat(fileSystem.newInputFile(dataFile).exists()) + .describedAs("Data file should not exist") + .isFalse(); // try to drop table assertUpdate("DROP TABLE " + tableName); - assertFalse(getQueryRunner().tableExists(getSession(), tableName)); - assertFalse(fileSystem.listFiles(tableLocation).hasNext(), "Table location should not exist"); + assertThat(getQueryRunner().tableExists(getSession(), tableName)).isFalse(); + 
assertThat(fileSystem.listFiles(tableLocation).hasNext()) + .describedAs("Table location should not exist") + .isFalse(); } @Test @@ -604,22 +682,26 @@ public void testDropTableWithNonExistentTableLocation() // Delete table location fileSystem.deleteDirectory(tableLocation); - assertFalse(fileSystem.listFiles(tableLocation).hasNext(), "Table location should not exist"); + assertThat(fileSystem.listFiles(tableLocation).hasNext()) + .describedAs("Table location should not exist") + .isFalse(); // try to drop table assertUpdate("DROP TABLE " + tableName); - assertFalse(getQueryRunner().tableExists(getSession(), tableName)); + assertThat(getQueryRunner().tableExists(getSession(), tableName)).isFalse(); } // Verify the accuracy of Trino metadata tables while retrieving Iceberg table metadata from the underlying `TrinoCatalog` implementation @Test public void testMetadataTables() { - try (TestTable table = new TestTable( - getQueryRunner()::execute, + try (TestTable table = newTrinoTable( "test_metadata_tables", - "(id int, part varchar) WITH (partitioning = ARRAY['part'])", - ImmutableList.of("1, 'p1'", "2, 'p1'", "3, 'p2'"))) { + "(id int, part varchar) WITH (partitioning = ARRAY['part'])")) { + assertUpdate("INSERT INTO " + table.getName() + " VALUES (1, 'p1')", 1); + assertUpdate("INSERT INTO " + table.getName() + " VALUES (2, 'p1')", 1); + assertUpdate("INSERT INTO " + table.getName() + " VALUES (3, 'p2')", 1); + List snapshotIds = computeActual("SELECT snapshot_id FROM \"" + table.getName() + "$snapshots\" ORDER BY committed_at DESC") .getOnlyColumn() .map(Long.class::cast) @@ -638,11 +720,230 @@ public void testMetadataTables() } } + @Test + public void testPartitionFilterRequired() + { + String tableName = "test_partition_" + randomNameSuffix(); + + Session session = Session.builder(getSession()) + .setCatalogSessionProperty("iceberg", "query_partition_filter_required", "true") + .build(); + + assertUpdate(session, "CREATE TABLE " + tableName + " (id integer, a varchar, b varchar, ds varchar) WITH (partitioning = ARRAY['ds'])"); + assertUpdate(session, "INSERT INTO " + tableName + " (id, a, ds) VALUES (1, 'a', 'a')", 1); + String query = "SELECT id FROM " + tableName + " WHERE a = 'a'"; + @Language("RegExp") String failureMessage = "Filter required for .*" + tableName + " on at least one of the partition columns: ds"; + assertQueryFails(session, query, failureMessage); + assertQueryFails(session, "EXPLAIN " + query, failureMessage); + assertUpdate(session, "DROP TABLE " + tableName); + } + protected abstract boolean isFileSorted(Location path, String sortColumnName); - private String getTableLocation(String tableName) + @Test + public void testTableChangesFunction() { - return (String) computeScalar("SELECT DISTINCT regexp_replace(\"$path\", '/[^/]*/[^/]*$', '') FROM " + tableName); + DateTimeFormatter instantMillisFormatter = DateTimeFormatter.ofPattern("uuuu-MM-dd'T'HH:mm:ss.SSSVV").withZone(UTC); + + try (TestTable table = newTrinoTable( + "test_table_changes_function_", + "AS SELECT nationkey, name FROM tpch.tiny.nation WITH NO DATA")) { + long initialSnapshot = getMostRecentSnapshotId(table.getName()); + assertUpdate("INSERT INTO " + table.getName() + " SELECT nationkey, name FROM nation", 25); + long snapshotAfterInsert = getMostRecentSnapshotId(table.getName()); + String snapshotAfterInsertTime = getSnapshotTime(table.getName(), snapshotAfterInsert).format(instantMillisFormatter); + + assertQuery( + "SELECT nationkey, name, _change_type, _change_version_id, 
to_iso8601(_change_timestamp), _change_ordinal " + + "FROM TABLE(system.table_changes(CURRENT_SCHEMA, '%s', %s, %s))".formatted(table.getName(), initialSnapshot, snapshotAfterInsert), + "SELECT nationkey, name, 'insert', %s, '%s', 0 FROM nation".formatted(snapshotAfterInsert, snapshotAfterInsertTime)); + + // Run with named arguments + assertQuery( + "SELECT nationkey, name, _change_type, _change_version_id, to_iso8601(_change_timestamp), _change_ordinal " + + "FROM TABLE(system.table_changes(schema_name => CURRENT_SCHEMA, table_name => '%s', start_snapshot_id => %s, end_snapshot_id => %s))" + .formatted(table.getName(), initialSnapshot, snapshotAfterInsert), + "SELECT nationkey, name, 'insert', %s, '%s', 0 FROM nation".formatted(snapshotAfterInsert, snapshotAfterInsertTime)); + + assertUpdate("DELETE FROM " + table.getName(), 25); + long snapshotAfterDelete = getMostRecentSnapshotId(table.getName()); + String snapshotAfterDeleteTime = getSnapshotTime(table.getName(), snapshotAfterDelete).format(instantMillisFormatter); + + assertQuery( + "SELECT nationkey, name, _change_type, _change_version_id, to_iso8601(_change_timestamp), _change_ordinal " + + "FROM TABLE(system.table_changes(CURRENT_SCHEMA, '%s', %s, %s))".formatted(table.getName(), snapshotAfterInsert, snapshotAfterDelete), + "SELECT nationkey, name, 'delete', %s, '%s', 0 FROM nation".formatted(snapshotAfterDelete, snapshotAfterDeleteTime)); + + assertQuery( + "SELECT nationkey, name, _change_type, _change_version_id, to_iso8601(_change_timestamp), _change_ordinal " + + "FROM TABLE(system.table_changes(CURRENT_SCHEMA, '%s', %s, %s))".formatted(table.getName(), initialSnapshot, snapshotAfterDelete), + "SELECT nationkey, name, 'insert', %s, '%s', 0 FROM nation UNION SELECT nationkey, name, 'delete', %s, '%s', 1 FROM nation".formatted( + snapshotAfterInsert, snapshotAfterInsertTime, snapshotAfterDelete, snapshotAfterDeleteTime)); + } + } + + @Test + public void testRowLevelDeletesWithTableChangesFunction() + { + try (TestTable table = newTrinoTable( + "test_row_level_deletes_with_table_changes_function_", + "AS SELECT nationkey, regionkey, name FROM tpch.tiny.nation WITH NO DATA")) { + assertUpdate("INSERT INTO " + table.getName() + " SELECT nationkey, regionkey, name FROM nation", 25); + long snapshotAfterInsert = getMostRecentSnapshotId(table.getName()); + + assertUpdate("DELETE FROM " + table.getName() + " WHERE regionkey = 2", 5); + long snapshotAfterDelete = getMostRecentSnapshotId(table.getName()); + + assertQueryFails( + "SELECT * FROM TABLE(system.table_changes(CURRENT_SCHEMA, '%s', %s, %s))".formatted(table.getName(), snapshotAfterInsert, snapshotAfterDelete), + "Table uses features which are not yet supported by the table_changes function"); + } + } + + @Test + public void testCreateOrReplaceWithTableChangesFunction() + { + DateTimeFormatter instantMillisFormatter = DateTimeFormatter.ofPattern("uuuu-MM-dd'T'HH:mm:ss.SSSVV").withZone(UTC); + + try (TestTable table = newTrinoTable( + "test_table_changes_function_", + "AS SELECT nationkey, name FROM tpch.tiny.nation WITH NO DATA")) { + long initialSnapshot = getMostRecentSnapshotId(table.getName()); + assertUpdate("INSERT INTO " + table.getName() + " SELECT nationkey, name FROM nation", 25); + long snapshotAfterInsert = getMostRecentSnapshotId(table.getName()); + String snapshotAfterInsertTime = getSnapshotTime(table.getName(), snapshotAfterInsert).format(instantMillisFormatter); + + assertQuery( + "SELECT nationkey, name, _change_type, _change_version_id, 
to_iso8601(_change_timestamp), _change_ordinal " + + "FROM TABLE(system.table_changes(CURRENT_SCHEMA, '%s', %s, %s))".formatted(table.getName(), initialSnapshot, snapshotAfterInsert), + "SELECT nationkey, name, 'insert', %s, '%s', 0 FROM nation".formatted(snapshotAfterInsert, snapshotAfterInsertTime)); + + assertUpdate("CREATE OR REPLACE TABLE " + table.getName() + " AS SELECT nationkey, name FROM nation LIMIT 0", 0); + long snapshotAfterCreateOrReplace = getMostRecentSnapshotId(table.getName()); + + assertQueryFails( + "SELECT * FROM TABLE(system.table_changes(CURRENT_SCHEMA, '%s', %s, %s))".formatted(table.getName(), initialSnapshot, snapshotAfterCreateOrReplace), + "Starting snapshot \\(exclusive\\) %s is not a parent ancestor of end snapshot %s".formatted(initialSnapshot, snapshotAfterCreateOrReplace)); + + assertUpdate("INSERT INTO " + table.getName() + " SELECT nationkey, name FROM nation", 25); + long snapshotAfterInsertIntoCreateOrReplace = getMostRecentSnapshotId(table.getName()); + String snapshotAfterInsertTimeIntoCreateOrReplace = getSnapshotTime(table.getName(), snapshotAfterInsertIntoCreateOrReplace).format(instantMillisFormatter); + + assertQuery( + "SELECT nationkey, name, _change_type, _change_version_id, to_iso8601(_change_timestamp), _change_ordinal " + + "FROM TABLE(system.table_changes(CURRENT_SCHEMA, '%s', %s, %s))".formatted(table.getName(), snapshotAfterCreateOrReplace, snapshotAfterInsertIntoCreateOrReplace), + "SELECT nationkey, name, 'insert', %s, '%s', 0 FROM nation".formatted(snapshotAfterInsertIntoCreateOrReplace, snapshotAfterInsertTimeIntoCreateOrReplace)); + } + } + + @Test + public void testMetadataDeleteAfterCommitEnabled() + throws IOException + { + if (!hasBehavior(SUPPORTS_CREATE_TABLE)) { + return; + } + + int metadataPreviousVersionCount = 5; + String tableName = "test_metadata_delete_after_commit_enabled" + randomNameSuffix(); + assertUpdate("CREATE TABLE " + tableName + "(_bigint BIGINT, _varchar VARCHAR)"); + assertUpdate("ALTER TABLE " + tableName + " SET PROPERTIES extra_properties = MAP(ARRAY['write.metadata.delete-after-commit.enabled'], ARRAY['true'])"); + assertUpdate("ALTER TABLE " + tableName + " SET PROPERTIES extra_properties = MAP(ARRAY['write.metadata.previous-versions-max'], ARRAY['" + metadataPreviousVersionCount + "'])"); + String tableLocation = getTableLocation(tableName); + + Map historyMetadataFiles = getMetadataFileAndUpdatedMillis(fileSystem, tableLocation); + for (int i = 0; i < 10; i++) { + assertUpdate("INSERT INTO " + tableName + " VALUES (1, 'a')", 1); + Map metadataFiles = getMetadataFileAndUpdatedMillis(fileSystem, tableLocation); + historyMetadataFiles.putAll(metadataFiles); + assertThat(metadataFiles.size()).isLessThanOrEqualTo(1 + metadataPreviousVersionCount); + Set expectMetadataFiles = historyMetadataFiles + .entrySet() + .stream() + .sorted(Map.Entry.comparingByValue().reversed()) + .limit(metadataPreviousVersionCount + 1) + .map(Map.Entry::getKey) + .collect(Collectors.toSet()); + assertThat(metadataFiles.keySet()).containsAll(expectMetadataFiles); + } + assertUpdate("DROP TABLE " + tableName); + } + + @Test + public void testIcebergTablesSystemTable() + throws Exception + { + String firstSchema = "first_schema_" + randomNameSuffix(); + String secondSchema = "second_schema_" + randomNameSuffix(); + createSchema(firstSchema); + createSchema(secondSchema); + + try (AutoCloseable _ = createTable(firstSchema, "first_schema_table1", "(id int)"); + AutoCloseable _ = createTable(firstSchema, "first_schema_table2", "(id 
int)"); + AutoCloseable _ = createTable(secondSchema, "second_schema_table", "(id int)"); + AutoCloseable _ = createSparkIcebergTable(firstSchema)) { + assertThat(query("SELECT * FROM iceberg.system.iceberg_tables WHERE table_schema = '%s'".formatted(firstSchema))) + .matches("SELECT table_schema, table_name FROM iceberg.information_schema.tables WHERE table_schema='%s'".formatted(firstSchema)); + assertThat(query("SELECT * FROM iceberg.system.iceberg_tables WHERE table_schema in ('%s', '%s')".formatted(firstSchema, secondSchema))) + .matches("SELECT table_schema, table_name FROM iceberg.information_schema.tables WHERE table_schema IN ('%s', '%s')".formatted(firstSchema, secondSchema)); + } + finally { + dropSchema(firstSchema); + dropSchema(secondSchema); + } + } + + protected void dropSchema(String schema) + throws Exception + { + assertQuerySucceeds("DROP SCHEMA " + schema); + } + + protected AutoCloseable createTable(String schema, String tableName, String tableDefinition) + throws Exception + { + Session schemaSession = Session.builder(getQueryRunner().getDefaultSession()).setSchema(schema).build(); + return new TestTable( + sql -> getQueryRunner().execute(schemaSession, sql), + tableName, + tableDefinition); + } + + protected void createSchema(String schemaName) + throws Exception + { + String defaultSchemaName = getSession().getSchema().orElseThrow(); + String schemaLocation = schemaPath().replaceAll(defaultSchemaName, schemaName); + assertQuerySucceeds("CREATE SCHEMA " + schemaName + " WITH (location = '%s')".formatted(schemaLocation)); + } + + protected AutoCloseable createSparkIcebergTable(String schema) + { + return () -> {}; + } + + private long getMostRecentSnapshotId(String tableName) + { + return (long) Iterables.getOnlyElement(getQueryRunner().execute(format("SELECT snapshot_id FROM \"%s$snapshots\" ORDER BY committed_at DESC LIMIT 1", tableName)) + .getOnlyColumnAsSet()); + } + + private ZonedDateTime getSnapshotTime(String tableName, long snapshotId) + { + return (ZonedDateTime) Iterables.getOnlyElement(getQueryRunner().execute(format("SELECT committed_at FROM \"%s$snapshots\" WHERE snapshot_id = %s", tableName, snapshotId)) + .getOnlyColumnAsSet()); + } + + protected String getTableLocation(String tableName) + { + Pattern locationPattern = Pattern.compile(".*location = '(.*?)'.*", Pattern.DOTALL); + Matcher m = locationPattern.matcher((String) computeActual("SHOW CREATE TABLE " + tableName).getOnlyValue()); + if (m.find()) { + String location = m.group(1); + verify(!m.find(), "Unexpected second match"); + return location; + } + throw new IllegalStateException("Location not found in SHOW CREATE TABLE result"); } protected abstract void dropTableFromMetastore(String tableName); diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/BaseIcebergConnectorTest.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/BaseIcebergConnectorTest.java index 9fec165a10cb..331283577b76 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/BaseIcebergConnectorTest.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/BaseIcebergConnectorTest.java @@ -19,19 +19,24 @@ import com.fasterxml.jackson.databind.node.ObjectNode; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.google.inject.Key; import io.airlift.units.DataSize; import io.airlift.units.Duration; import io.trino.Session; +import io.trino.execution.StageId; +import 
io.trino.execution.StageInfo; import io.trino.filesystem.FileIterator; import io.trino.filesystem.Location; import io.trino.filesystem.TrinoFileSystem; -import io.trino.hdfs.HdfsContext; import io.trino.metadata.Metadata; import io.trino.metadata.QualifiedObjectName; import io.trino.metadata.TableHandle; import io.trino.operator.OperatorStats; +import io.trino.plugin.hive.HiveCompressionCodec; import io.trino.plugin.hive.TestingHivePlugin; -import io.trino.plugin.iceberg.fileio.ForwardingFileIo; +import io.trino.server.DynamicFilterService; +import io.trino.server.testing.TestingTrinoServer; import io.trino.spi.QueryId; import io.trino.spi.connector.ColumnHandle; import io.trino.spi.connector.Constraint; @@ -40,16 +45,21 @@ import io.trino.spi.connector.TableNotFoundException; import io.trino.spi.predicate.Domain; import io.trino.spi.predicate.TupleDomain; +import io.trino.sql.PlannerContext; +import io.trino.sql.planner.Plan; +import io.trino.sql.planner.optimizations.PlanNodeSearcher; +import io.trino.sql.planner.plan.ExchangeNode; import io.trino.sql.planner.plan.FilterNode; import io.trino.sql.planner.plan.OutputNode; +import io.trino.sql.planner.plan.TableScanNode; +import io.trino.sql.planner.plan.TableWriterNode; import io.trino.sql.planner.plan.ValuesNode; import io.trino.testing.BaseConnectorTest; -import io.trino.testing.DataProviders; import io.trino.testing.DistributedQueryRunner; import io.trino.testing.MaterializedResult; -import io.trino.testing.MaterializedResultWithQueryId; import io.trino.testing.MaterializedRow; import io.trino.testing.QueryRunner; +import io.trino.testing.QueryRunner.MaterializedResultWithPlan; import io.trino.testing.TestingConnectorBehavior; import io.trino.testing.sql.TestTable; import org.apache.avro.Schema; @@ -58,31 +68,28 @@ import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericDatumReader; import org.apache.avro.generic.GenericDatumWriter; -import org.apache.hadoop.fs.FileSystem; import org.apache.iceberg.TableMetadata; import org.apache.iceberg.TableMetadataParser; import org.apache.iceberg.io.FileIO; import org.apache.iceberg.util.JsonUtil; import org.intellij.lang.annotations.Language; -import org.testng.SkipException; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Timeout; +import java.io.ByteArrayOutputStream; import java.io.File; import java.io.IOException; import java.io.InputStream; -import java.io.OutputStream; import java.net.URI; import java.nio.file.Files; import java.nio.file.Path; import java.time.Instant; import java.time.ZonedDateTime; import java.time.format.DateTimeFormatter; -import java.util.ArrayList; +import java.util.Arrays; import java.util.HashSet; import java.util.List; -import java.util.Locale; import java.util.Map; import java.util.NoSuchElementException; import java.util.Optional; @@ -92,11 +99,13 @@ import java.util.function.Consumer; import java.util.regex.Matcher; import java.util.regex.Pattern; +import java.util.stream.Collectors; import java.util.stream.IntStream; import java.util.stream.LongStream; import java.util.stream.Stream; import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkState; import static com.google.common.base.Verify.verify; import static com.google.common.collect.ImmutableList.toImmutableList; import static 
com.google.common.collect.ImmutableMap.toImmutableMap; @@ -104,23 +113,31 @@ import static com.google.common.collect.Iterables.getOnlyElement; import static com.google.common.collect.MoreCollectors.onlyElement; import static com.google.common.util.concurrent.Uninterruptibles.sleepUninterruptibly; +import static io.trino.SystemSessionProperties.DETERMINE_PARTITION_COUNT_FOR_WRITE_ENABLED; +import static io.trino.SystemSessionProperties.ENABLE_DYNAMIC_FILTERING; +import static io.trino.SystemSessionProperties.MAX_HASH_PARTITION_COUNT; import static io.trino.SystemSessionProperties.SCALE_WRITERS; -import static io.trino.SystemSessionProperties.TASK_PARTITIONED_WRITER_COUNT; -import static io.trino.SystemSessionProperties.TASK_WRITER_COUNT; +import static io.trino.SystemSessionProperties.TASK_SCALE_WRITERS_ENABLED; import static io.trino.SystemSessionProperties.USE_PREFERRED_WRITE_PARTITIONING; -import static io.trino.plugin.hive.HiveTestUtils.HDFS_ENVIRONMENT; -import static io.trino.plugin.hive.metastore.file.TestingFileHiveMetastore.createTestingFileHiveMetastore; +import static io.trino.hive.formats.compression.CompressionKind.ZSTD; +import static io.trino.plugin.iceberg.IcebergErrorCode.ICEBERG_INVALID_METADATA; import static io.trino.plugin.iceberg.IcebergFileFormat.AVRO; import static io.trino.plugin.iceberg.IcebergFileFormat.ORC; import static io.trino.plugin.iceberg.IcebergFileFormat.PARQUET; import static io.trino.plugin.iceberg.IcebergQueryRunner.ICEBERG_CATALOG; +import static io.trino.plugin.iceberg.IcebergSessionProperties.BUCKET_EXECUTION_ENABLED; import static io.trino.plugin.iceberg.IcebergSessionProperties.COLLECT_EXTENDED_STATISTICS_ON_WRITE; +import static io.trino.plugin.iceberg.IcebergSessionProperties.DYNAMIC_FILTERING_WAIT_TIMEOUT; import static io.trino.plugin.iceberg.IcebergSessionProperties.EXTENDED_STATISTICS_ENABLED; import static io.trino.plugin.iceberg.IcebergSplitManager.ICEBERG_DOMAIN_COMPACTION_THRESHOLD; +import static io.trino.plugin.iceberg.IcebergTableProperties.isCompressionCodecSupportedForFormat; +import static io.trino.plugin.iceberg.IcebergTestUtils.FILE_IO_FACTORY; import static io.trino.plugin.iceberg.IcebergTestUtils.getFileSystemFactory; import static io.trino.plugin.iceberg.IcebergTestUtils.withSmallRowGroups; import static io.trino.plugin.iceberg.IcebergUtil.TRINO_QUERY_ID_NAME; -import static io.trino.plugin.iceberg.procedure.RegisterTableProcedure.getLatestMetadataLocation; +import static io.trino.plugin.iceberg.IcebergUtil.TRINO_USER_NAME; +import static io.trino.plugin.iceberg.IcebergUtil.getCompressionPropertyName; +import static io.trino.plugin.iceberg.IcebergUtil.getLatestMetadataLocation; import static io.trino.spi.predicate.Domain.multipleValues; import static io.trino.spi.predicate.Domain.singleValue; import static io.trino.spi.type.BigintType.BIGINT; @@ -131,19 +148,20 @@ import static io.trino.spi.type.TimeZoneKey.getTimeZoneKey; import static io.trino.spi.type.VarcharType.VARCHAR; import static io.trino.sql.planner.assertions.PlanMatchPattern.node; +import static io.trino.sql.planner.optimizations.PlanNodeSearcher.searchFrom; import static io.trino.testing.MaterializedResult.resultBuilder; import static io.trino.testing.QueryAssertions.assertEqualsIgnoreOrder; import static io.trino.testing.TestingConnectorSession.SESSION; import static io.trino.testing.TestingNames.randomNameSuffix; import static io.trino.testing.TestingSession.testSessionBuilder; import static io.trino.testing.assertions.Assert.assertEventually; -import static 
io.trino.transaction.TransactionBuilder.transaction; import static java.lang.String.format; import static java.lang.String.join; import static java.nio.charset.StandardCharsets.UTF_8; import static java.time.ZoneOffset.UTC; import static java.time.format.DateTimeFormatter.ISO_OFFSET_DATE_TIME; import static java.util.Collections.nCopies; +import static java.util.Locale.ENGLISH; import static java.util.Objects.requireNonNull; import static java.util.UUID.randomUUID; import static java.util.concurrent.TimeUnit.MILLISECONDS; @@ -152,13 +170,14 @@ import static java.util.stream.Collectors.joining; import static java.util.stream.Collectors.toList; import static java.util.stream.IntStream.range; +import static org.apache.iceberg.TableMetadata.newTableMetadata; +import static org.apache.iceberg.TableProperties.AVRO_COMPRESSION; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION; import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; -import static org.testng.Assert.assertEquals; -import static org.testng.Assert.assertFalse; -import static org.testng.Assert.assertNotEquals; -import static org.testng.Assert.assertNull; -import static org.testng.Assert.assertTrue; +import static org.assertj.core.api.Assertions.offset; +import static org.junit.jupiter.api.Assumptions.abort; public abstract class BaseIcebergConnectorTest extends BaseConnectorTest @@ -175,6 +194,22 @@ protected BaseIcebergConnectorTest(IcebergFileFormat format) this.format = requireNonNull(format, "format is null"); } + protected TestTable newTrinoTable(String namePrefix, @Language("SQL") String tableDefinition) + { + return newTrinoTable(namePrefix, tableDefinition, ImmutableList.of()); + } + + protected TestTable newTrinoTable(String namePrefix, @Language("SQL") String tableDefinition, List rowsToInsert) + { + return new TestTable(getQueryRunner()::execute, namePrefix, tableDefinition, rowsToInsert); + } + + protected PlannerContext getPlannerContext() + { + TestingTrinoServer coordinator = ((DistributedQueryRunner) getQueryRunner()).getCoordinator(); + return coordinator.getInstance(Key.get(PlannerContext.class)); + } + @Override protected QueryRunner createQueryRunner() throws Exception @@ -188,22 +223,24 @@ protected IcebergQueryRunner.Builder createQueryRunnerBuilder() return IcebergQueryRunner.builder() .setIcebergProperties(ImmutableMap.builder() .put("iceberg.file-format", format.name()) + // Only allow some extra properties. Add "sorted_by" so that we can test that the property is disallowed by the connector explicitly. 
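// A hypothetical test sketch for the allow-list configured just below: with "sorted_by"
// allowed through iceberg.allowed-extra-properties, a method of this shape (relying on the
// assertQueryFails helper inherited by this class; the table name and message pattern are
// assumptions) could verify that the connector still rejects it as an extra property.
@Test
public void testSortedByRejectedAsExtraProperty()
{
    assertQueryFails(
            "CREATE TABLE test_sorted_by_extra_property (c integer) " +
                    "WITH (extra_properties = MAP(ARRAY['sorted_by'], ARRAY['c']))",
            ".*sorted_by.*");
}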
+ .put("iceberg.allowed-extra-properties", "extra.property.one,extra.property.two,extra.property.three,sorted_by") // Allows testing the sorting writer flushing to the file system with smaller tables .put("iceberg.writer-sort-buffer-size", "1MB") .buildOrThrow()) .setInitialTables(REQUIRED_TPCH_TABLES); } - @BeforeClass + @BeforeAll public void initFileSystem() { fileSystem = getFileSystemFactory(getDistributedQueryRunner()).create(SESSION); } - @BeforeClass + @BeforeAll public void initStorageTimePrecision() { - try (TestTable table = new TestTable(getQueryRunner()::execute, "inspect_storage_precision", "(i int)")) { + try (TestTable table = newTrinoTable("inspect_storage_precision", "(i int)")) { assertUpdate("INSERT INTO " + table.getName() + " VALUES (1)", 1); assertUpdate("INSERT INTO " + table.getName() + " VALUES (2)", 1); assertUpdate("INSERT INTO " + table.getName() + " VALUES (3)", 1); @@ -215,59 +252,32 @@ public void initStorageTimePrecision() } } - @SuppressWarnings("DuplicateBranchesInSwitch") @Override protected boolean hasBehavior(TestingConnectorBehavior connectorBehavior) { - switch (connectorBehavior) { - case SUPPORTS_TRUNCATE: - return false; - - case SUPPORTS_TOPN_PUSHDOWN: - return false; - - case SUPPORTS_DROP_SCHEMA_CASCADE: - return false; - - case SUPPORTS_RENAME_MATERIALIZED_VIEW_ACROSS_SCHEMAS: - return false; - - case SUPPORTS_ADD_COLUMN_NOT_NULL_CONSTRAINT: - return false; - - case SUPPORTS_REPORTING_WRITTEN_BYTES: - return true; - - default: - return super.hasBehavior(connectorBehavior); - } + return switch (connectorBehavior) { + case SUPPORTS_REPORTING_WRITTEN_BYTES -> true; + case SUPPORTS_ADD_COLUMN_NOT_NULL_CONSTRAINT, + SUPPORTS_RENAME_MATERIALIZED_VIEW_ACROSS_SCHEMAS, + SUPPORTS_TOPN_PUSHDOWN -> false; + default -> super.hasBehavior(connectorBehavior); + }; } @Test public void testAddRowFieldCaseInsensitivity() { - try (TestTable table = new TestTable(getQueryRunner()::execute, + try (TestTable table = newTrinoTable( "test_add_row_field_case_insensitivity_", "AS SELECT CAST(row(row(2)) AS row(\"CHILD\" row(grandchild_1 integer))) AS col")) { - assertEquals(getColumnType(table.getName(), "col"), "row(CHILD row(grandchild_1 integer))"); + assertThat(getColumnType(table.getName(), "col")).isEqualTo("row(CHILD row(grandchild_1 integer))"); assertUpdate("ALTER TABLE " + table.getName() + " ADD COLUMN col.child.grandchild_2 integer"); - assertEquals(getColumnType(table.getName(), "col"), "row(CHILD row(grandchild_1 integer, grandchild_2 integer))"); + assertThat(getColumnType(table.getName(), "col")).isEqualTo("row(CHILD row(grandchild_1 integer, grandchild_2 integer))"); assertUpdate("ALTER TABLE " + table.getName() + " ADD COLUMN col.CHILD.grandchild_3 integer"); - assertEquals(getColumnType(table.getName(), "col"), "row(CHILD row(grandchild_1 integer, grandchild_2 integer, grandchild_3 integer))"); - } - } - - @Override - public void testAddAndDropColumnName(String columnName) - { - if (columnName.equals("a.dot")) { - assertThatThrownBy(() -> super.testAddAndDropColumnName(columnName)) - .hasMessage("Failed to add column: Cannot add column with ambiguous name: a.dot, use addColumn(parent, name, type)"); - return; + assertThat(getColumnType(table.getName(), "col")).isEqualTo("row(CHILD row(grandchild_1 integer, grandchild_2 integer, grandchild_3 integer))"); } - super.testAddAndDropColumnName(columnName); } @Override @@ -278,13 +288,15 @@ protected void verifyVersionedQueryFailurePermissible(Exception e) "Unsupported type for temporal table version: .*|" 
+ "Unsupported type for table version: .*|" + "No version history table tpch.nation at or before .*|" + - "Iceberg snapshot ID does not exists: .*"); + "Iceberg snapshot ID does not exists: .*|" + + "Cannot find snapshot with reference name: .*"); } @Override protected void verifyConcurrentUpdateFailurePermissible(Exception e) { - assertThat(e).hasMessageContaining("Failed to commit Iceberg update to table"); + assertThat(e).hasMessageMatching("Failed to commit the transaction during write.*|" + + "Failed to commit during write.*"); } @Override @@ -299,16 +311,40 @@ protected void verifyConcurrentAddColumnFailurePermissible(Exception e) @Test public void testDeleteOnV1Table() { - try (TestTable table = new TestTable(getQueryRunner()::execute, "test_delete_", "WITH (format_version = 1) AS SELECT * FROM orders")) { + try (TestTable table = newTrinoTable("test_delete_", "WITH (format_version = 1) AS SELECT * FROM orders")) { assertQueryFails("DELETE FROM " + table.getName() + " WHERE custkey <= 100", "Iceberg table updates require at least format version 2"); } } + @Test + public void testDeleteOnV1TableWhenManifestFileIsNotExist() + { + try (TestTable table = newTrinoTable("test_delete_", "(a bigint, dt date) WITH (format_version = 1, partitioning = ARRAY['dt'])")) { + assertQuerySucceeds("DELETE FROM " + table.getName() + " WHERE dt = date '2025-02-17'"); + assertQueryReturnsEmptyResult("SELECT * FROM " + table.getName()); + } + } + + @Test @Override public void testCharVarcharComparison() { - assertThatThrownBy(super::testCharVarcharComparison) - .hasMessage("Type not supported for Iceberg: char(3)"); + // with char->varchar coercion on table creation, this is essentially varchar/varchar comparison + try (TestTable table = newTrinoTable( + "test_char_varchar", + "(k, v) AS VALUES" + + " (-1, CAST(NULL AS CHAR(3))), " + + " (3, CAST(' ' AS CHAR(3)))," + + " (6, CAST('x ' AS CHAR(3)))")) { + // varchar of length shorter than column's length + assertThat(query("SELECT k, v FROM " + table.getName() + " WHERE v = CAST(' ' AS varchar(2))")).returnsEmptyResult(); + // varchar of length longer than column's length + assertThat(query("SELECT k, v FROM " + table.getName() + " WHERE v = CAST(' ' AS varchar(4))")).returnsEmptyResult(); + // value that's not all-spaces + assertThat(query("SELECT k, v FROM " + table.getName() + " WHERE v = CAST('x ' AS varchar(2))")).returnsEmptyResult(); + // exact match + assertQuery("SELECT k, v FROM " + table.getName() + " WHERE v = CAST(' ' AS varchar(3))", "VALUES (3, ' ')"); + } } @Test @@ -319,7 +355,7 @@ public void testShowCreateSchema() .matches("CREATE SCHEMA iceberg.tpch\n" + "AUTHORIZATION USER user\n" + "WITH \\(\n" + - "\\s+location = '.*/iceberg_data/tpch'\n" + + "\\s+location = '.*/tpch'\n" + "\\)"); } @@ -358,7 +394,7 @@ public void testShowCreateTable() "WITH (\n" + " format = '" + format.name() + "',\n" + " format_version = 2,\n" + - " location = '\\E.*/iceberg_data/tpch/orders-.*\\Q'\n" + + " location = '\\E.*/tpch/orders-.*\\Q'\n" + ")\\E"); } @@ -371,7 +407,7 @@ public void testPartitionedByRealWithNaN() assertQuery("SELECT part FROM " + tableName, "VALUES cast('NaN' as real)"); assertQuery("SELECT id FROM " + tableName + " WHERE is_nan(part)", "VALUES 1"); - dropTable(tableName); + assertUpdate("DROP TABLE " + tableName); } @Test @@ -383,7 +419,7 @@ public void testPartitionedByDoubleWithNaN() assertQuery("SELECT part FROM " + tableName, "VALUES cast('NaN' as double)"); assertQuery("SELECT id FROM " + tableName + " WHERE is_nan(part)", "VALUES 
1"); - dropTable(tableName); + assertUpdate("DROP TABLE " + tableName); } @Test @@ -422,7 +458,7 @@ private void testDecimalWithPrecisionAndScale(int precision, int scale) assertUpdate(format("CREATE TABLE test_iceberg_decimal (x %s)", decimalType)); assertUpdate(format("INSERT INTO test_iceberg_decimal (x) VALUES (CAST('%s' AS %s))", decimalValue, decimalType), 1); assertQuery("SELECT * FROM test_iceberg_decimal", format("SELECT CAST('%s' AS %s)", decimalValue, decimalType)); - dropTable("test_iceberg_decimal"); + assertUpdate("DROP TABLE test_iceberg_decimal"); } @Test @@ -451,7 +487,7 @@ private void testSelectOrPartitionedByTime(boolean partitioned) assertQuery(format("SELECT x FROM %s WHERE x = TIME '9:00:00'", tableName), "SELECT CAST('9:00:00' AS TIME)"); assertQuery(format("SELECT x FROM %s WHERE y = 12345", tableName), "SELECT CAST('10:12:34' AS TIME)"); assertQuery(format("SELECT x FROM %s WHERE y = 67890", tableName), "SELECT CAST('9:00:00' AS TIME)"); - dropTable(tableName); + assertUpdate("DROP TABLE " + tableName); } @Test @@ -485,7 +521,7 @@ private void testSelectOrPartitionedByTimestamp(boolean partitioned) assertQuery(format("SELECT * from %s WHERE _timestamp > TIMESTAMP '2017-06-01 10:12:34' AND _timestamp < TIMESTAMP '2018-05-01 10:12:34'", tableName), select2); assertQuery(format("SELECT * from %s WHERE _timestamp = TIMESTAMP '2018-05-01 10:12:34'", tableName), select3); assertQuery(format("SELECT * from %s WHERE _timestamp > TIMESTAMP '2018-01-01 10:12:34'", tableName), select3); - dropTable(tableName); + assertUpdate("DROP TABLE " + tableName); } @Test @@ -919,8 +955,8 @@ public void testCreatePartitionedTable() " ('a_double', NULL, 1e0, 0.5e0, NULL, '1.0', '1.0'), " + " ('a_short_decimal', NULL, 1e0, 0.5e0, NULL, '1.0', '1.0'), " + " ('a_long_decimal', NULL, 1e0, 0.5e0, NULL, '11.0', '11.0'), " + - " ('a_varchar', 234e0, 1e0, 0.5e0, NULL, NULL, NULL), " + - " ('a_varbinary', 114e0, 1e0, 0.5e0, NULL, NULL, NULL), " + + " ('a_varchar', 213e0, 1e0, 0.5e0, NULL, NULL, NULL), " + + " ('a_varbinary', 103e0, 1e0, 0.5e0, NULL, NULL, NULL), " + " ('a_date', NULL, 1e0, 0.5e0, NULL, '2021-07-24', '2021-07-24'), " + " ('a_time', NULL, 1e0, 0.5e0, NULL, NULL, NULL), " + " ('a_timestamp', NULL, 1e0, 0.5e0, NULL, '2021-07-24 03:43:57.987654', '2021-07-24 03:43:57.987654'), " + @@ -929,7 +965,7 @@ public void testCreatePartitionedTable() " ('a_row', NULL, NULL, NULL, NULL, NULL, NULL), " + " ('an_array', NULL, NULL, NULL, NULL, NULL, NULL), " + " ('a_map', NULL, NULL, NULL, NULL, NULL, NULL), " + - " ('a quoted, field', 224e0, 1e0, 0.5e0, NULL, NULL, NULL), " + + " ('a quoted, field', 202e0, 1e0, 0.5e0, NULL, NULL, NULL), " + " (NULL, NULL, NULL, NULL, 2e0, NULL, NULL)"); } case AVRO -> { @@ -1040,21 +1076,198 @@ public void testCreatePartitionedTableWithNestedTypes() " partitioning = ARRAY['_date']" + ")"); - dropTable("test_partitioned_table_nested_type"); + assertUpdate("DROP TABLE test_partitioned_table_nested_type"); } @Test - public void testCreatePartitionedTableWithNestedField() + public void testCreateTableWithUnsupportedNestedFieldPartitioning() { assertQueryFails( - "CREATE TABLE test_partitioned_table_nested_field(parent ROW(child VARCHAR)) WITH (partitioning = ARRAY['\"parent.child\"'])", - "\\QPartitioning by nested field is unsupported: parent.child"); + "CREATE TABLE test_partitioned_table_nested_field_3 (grandparent ROW(parent ROW(child VARCHAR))) WITH (partitioning = ARRAY['\"grandparent.parent\"'])", + "\\QUnable to parse partitioning value: Cannot partition by 
non-primitive source field: struct<3: child: optional string>"); + assertQueryFails( + "CREATE TABLE test_partitioned_table_nested_field_inside_array (parent ARRAY(ROW(child VARCHAR))) WITH (partitioning = ARRAY['\"parent.child\"'])", + "\\QPartitioning field [parent.element.child] cannot be contained in a array"); assertQueryFails( - "CREATE TABLE test_partitioned_table_nested_field(grandparent ROW(parent ROW(child VARCHAR))) WITH (partitioning = ARRAY['\"grandparent.parent.child\"'])", - "\\QPartitioning by nested field is unsupported: grandparent.parent.child"); + "CREATE TABLE test_partitioned_table_nested_field_inside_map (parent MAP(ROW(child INTEGER), ARRAY(VARCHAR))) WITH (partitioning = ARRAY['\"parent.key.child\"'])", + "\\QPartitioning field [parent.key.child] cannot be contained in a map"); assertQueryFails( - "CREATE TABLE test_partitioned_table_nested_field(grandparent ROW(parent ROW(child VARCHAR))) WITH (partitioning = ARRAY['\"grandparent.parent\"'])", - "\\QUnable to parse partitioning value: Cannot partition by non-primitive source field: struct<3: child: optional string>"); + "CREATE TABLE test_partitioned_table_nested_field_year_transform_in_string (parent ROW(child VARCHAR)) WITH (partitioning = ARRAY['year(\"parent.child\")'])", + "\\QUnable to parse partitioning value: Invalid source type string for transform: year"); + } + + @Test + public void testNestedFieldPartitionedTable() + { + String tableName = "test_nested_field_partitioned_table_" + randomNameSuffix(); + assertQuerySucceeds("CREATE TABLE " + tableName + "(id INTEGER, name VARCHAR, parent ROW(child VARCHAR, child2 VARCHAR))" + + " WITH (partitioning = ARRAY['id', '\"parent.child\"', '\"parent.child2\"'])"); + assertUpdate("INSERT INTO " + tableName + " VALUES (1, 'presto', ROW('a', 'b'))", 1); + + assertThat(query("SELECT id, name, parent.child, parent.child2 FROM " + tableName)) + .skippingTypesCheck() + .matches("VALUES (1, 'presto', 'a', 'b')"); + + assertUpdate("UPDATE " + tableName + " SET name = 'trino' WHERE parent.child = 'a'", 1); + assertQuerySucceeds("DELETE FROM " + tableName); + assertThat(query("SELECT * FROM " + tableName)) + .returnsEmptyResult(); + assertUpdate("INSERT INTO " + tableName + " VALUES (1, 'trino', ROW('a', 'b'))", 1); + + assertThat(query("SELECT id, name, parent.child, parent.child2 FROM " + tableName)) + .skippingTypesCheck() + .matches("VALUES (1, 'trino', 'a', 'b')"); + + String newTableName = "test_nested_field_partitioned_table_" + randomNameSuffix(); + assertQuerySucceeds("ALTER TABLE " + tableName + " RENAME TO " + newTableName); + + assertQuerySucceeds(withSingleWriterPerTask(getSession()), "ALTER TABLE " + newTableName + " EXECUTE OPTIMIZE"); + assertQuerySucceeds(prepareCleanUpSession(), "ALTER TABLE " + newTableName + " EXECUTE expire_snapshots(retention_threshold => '0s')"); + + assertThat(query("SELECT id, name, parent.child, parent.child2 FROM " + newTableName)) + .skippingTypesCheck() + .matches("VALUES (1, 'trino', 'a', 'b')"); + + assertUpdate("DROP TABLE " + newTableName); + } + + @Test + public void testMultipleLevelNestedFieldPartitionedTable() + { + String tableName = "test_multiple_level_nested_field_partitioned_table_" + randomNameSuffix(); + assertQuerySucceeds("CREATE TABLE " + tableName + "(id INTEGER, gradparent ROW(parent ROW(child VARCHAR)))" + + " WITH (partitioning = ARRAY['\"gradparent.parent.child\"'])"); + assertUpdate("INSERT INTO " + tableName + " VALUES (1, ROW(ROW('trino')))", 1); + + assertThat(query("SELECT id, gradparent.parent.child 
FROM " + tableName)) + .skippingTypesCheck() + .matches("VALUES (1, 'trino')"); + + assertUpdate("UPDATE " + tableName + " SET id = 2 WHERE gradparent.parent.child = 'trino'", 1); + assertQuerySucceeds("DELETE FROM " + tableName); + assertThat(query("SELECT * FROM " + tableName)) + .returnsEmptyResult(); + assertUpdate("INSERT INTO " + tableName + " VALUES (3, ROW(ROW('trino')))", 1); + + assertThat(query("SELECT id, gradparent.parent.child FROM " + tableName)) + .skippingTypesCheck() + .matches("VALUES (3, 'trino')"); + + String newTableName = "test_multiple_level_nested_field_partitioned_table_" + randomNameSuffix(); + assertQuerySucceeds("ALTER TABLE " + tableName + " RENAME TO " + newTableName); + + assertQuerySucceeds(withSingleWriterPerTask(getSession()), "ALTER TABLE " + newTableName + " EXECUTE OPTIMIZE"); + assertQuerySucceeds(prepareCleanUpSession(), "ALTER TABLE " + newTableName + " EXECUTE expire_snapshots(retention_threshold => '0s')"); + + assertThat(query("SELECT id, gradparent.parent.child FROM " + newTableName)) + .skippingTypesCheck() + .matches("VALUES (3, 'trino')"); + + assertUpdate("DROP TABLE " + newTableName); + } + + @Test + public void testNestedFieldPartitionedTableHavingSameChildName() + { + String tableName = "test_nested_field_partitioned_table_having_same_child_name_" + randomNameSuffix(); + assertQuerySucceeds("CREATE TABLE " + tableName + "(id INTEGER, gradparent ROW(parent ROW(child VARCHAR)), parent ROW(child VARCHAR))" + + " WITH (partitioning = ARRAY['\"gradparent.parent.child\"'])"); + assertUpdate("INSERT INTO " + tableName + " VALUES (1, ROW(ROW('trino')), ROW('trinodb'))", 1); + + assertThat(query("SELECT id, gradparent.parent.child, parent.child FROM " + tableName)) + .skippingTypesCheck() + .matches("VALUES (1, 'trino', 'trinodb')"); + + assertUpdate("UPDATE " + tableName + " SET id = 2 WHERE gradparent.parent.child = 'trino'", 1); + assertQuerySucceeds("DELETE FROM " + tableName); + assertThat(query("SELECT * FROM " + tableName)) + .returnsEmptyResult(); + assertUpdate("INSERT INTO " + tableName + " VALUES (3, ROW(ROW('trino')), ROW('trinodb'))", 1); + + assertThat(query("SELECT id, gradparent.parent.child, parent.child FROM " + tableName)) + .skippingTypesCheck() + .matches("VALUES (3, 'trino', 'trinodb')"); + + String newTableName = "test_nested_field_partitioned_table_having_same_child_name_" + randomNameSuffix(); + assertQuerySucceeds("ALTER TABLE " + tableName + " RENAME TO " + newTableName); + + assertQuerySucceeds(withSingleWriterPerTask(getSession()), "ALTER TABLE " + newTableName + " EXECUTE OPTIMIZE"); + assertQuerySucceeds(prepareCleanUpSession(), "ALTER TABLE " + newTableName + " EXECUTE expire_snapshots(retention_threshold => '0s')"); + + assertThat(query("SELECT id, gradparent.parent.child, parent.child FROM " + newTableName)) + .skippingTypesCheck() + .matches("VALUES (3, 'trino', 'trinodb')"); + + assertUpdate("DROP TABLE " + newTableName); + } + + @Test + public void testMergeWithNestedFieldPartitionedTable() + { + String sourceTable = "test_merge_with_nested_field_partitioned_table_source_" + randomNameSuffix(); + String targetTable = "test_merge_with_nested_field_partitioned_table_target_" + randomNameSuffix(); + + assertUpdate("CREATE TABLE " + sourceTable + " (customer VARCHAR, purchases INT, address ROW (city VARCHAR))" + + " WITH (partitioning = ARRAY['\"address.city\"'])"); + assertUpdate( + "INSERT INTO " + sourceTable + " (customer, purchases, address)" + + " VALUES ('Aaron', 6, ROW('Arches')), ('Ed', 7, 
ROW('Etherville')), ('Carol', 9, ROW('Centreville')), ('Dave', 11, ROW('Darbyshire'))", + 4); + + assertUpdate("CREATE TABLE " + targetTable + " (customer VARCHAR, purchases INT, address ROW (city VARCHAR))" + + " WITH (partitioning = ARRAY['\"address.city\"'])"); + assertUpdate( + "INSERT INTO " + targetTable + " (customer, purchases, address) " + + " VALUES ('Aaron', 5, ROW('Antioch')), ('Bill', 7, ROW('Buena')), ('Carol', 3, ROW('Cambridge')), ('Dave', 11, ROW('Devon'))", + 4); + + String sql = "MERGE INTO " + targetTable + " t USING " + sourceTable + " s ON (t.customer = s.customer)" + + " WHEN MATCHED AND s.address.city = 'Centreville' THEN DELETE" + + " WHEN MATCHED THEN UPDATE SET purchases = s.purchases + t.purchases" + + " WHEN NOT MATCHED THEN INSERT (customer, purchases, address) VALUES (s.customer, s.purchases, s.address)"; + + assertUpdate(sql, 4); + + assertQuery( + "SELECT customer, purchases, address.city FROM " + targetTable, + "VALUES ('Aaron', 11, 'Antioch'), ('Ed', 7, 'Etherville'), ('Bill', 7, 'Buena'), ('Dave', 22, 'Devon')"); + + assertUpdate("DROP TABLE " + sourceTable); + assertUpdate("DROP TABLE " + targetTable); + } + + @Test + public void testSchemaEvolutionWithNestedFieldPartitioning() + { + String tableName = "test_schema_evolution_with_nested_field_partitioning_" + randomNameSuffix(); + assertUpdate("CREATE TABLE " + tableName + " (c1 bigint, parent1 ROW(child VARCHAR), parent2 ROW(child VARCHAR)) WITH (partitioning = ARRAY['\"parent1.child\"'])"); + assertUpdate("INSERT INTO " + tableName + " VALUES (1, ROW('BLR'), ROW('BLR'))", 1); + assertQuery("SELECT c1, parent1.child, parent2.child from " + tableName, "VALUES (1, 'BLR', 'BLR')"); + + // Drop end column + assertUpdate("ALTER TABLE " + tableName + " DROP COLUMN parent2"); + assertQuery("SELECT c1, parent1.child FROM " + tableName, "VALUES (1, 'BLR')"); + + assertUpdate("ALTER TABLE " + tableName + " ADD COLUMN parent3 ROW(child VARCHAR)"); + assertUpdate("ALTER TABLE " + tableName + " ADD COLUMN parent4 ROW(child VARCHAR)"); + assertUpdate("INSERT INTO " + tableName + " VALUES (2, ROW('DEL'), ROW('DL'), ROW('IN'))", 1); + assertQuery("SELECT c1, parent1.child, parent3.child, parent4.child FROM " + tableName, "VALUES (1, 'BLR', NULL, NULL), (2, 'DEL', 'DL', 'IN')"); + + // Drop a column (parent3) from middle of table + assertUpdate("ALTER TABLE " + tableName + " DROP COLUMN parent3"); + assertQuery("SELECT c1, parent1.child, parent4.child FROM " + tableName, "VALUES (1, 'BLR', NULL), (2, 'DEL', 'IN')"); + + // Rename nested column + assertUpdate("ALTER TABLE " + tableName + " RENAME COLUMN parent4 TO renamed_parent"); + + // Rename nested partitioned column + assertUpdate("ALTER TABLE " + tableName + " RENAME COLUMN parent1 TO renamed_partitioned_parent"); + + assertQuery("SHOW COLUMNS FROM " + tableName, "VALUES " + + "('c1', 'bigint', '', ''), " + + "('renamed_partitioned_parent', 'row(child varchar)', '', ''), " + + "('renamed_parent', 'row(child varchar)', '', '')"); + assertUpdate("DROP TABLE " + tableName); } @Test @@ -1074,68 +1287,86 @@ public void testCreatePartitionedTableAs() "FROM tpch.tiny.orders", "SELECT count(*) from orders"); - assertEquals( - computeScalar("SHOW CREATE TABLE test_create_partitioned_table_as"), - format( - "CREATE TABLE %s.%s.%s (\n" + - " \"order key\" bigint,\n" + - " ship_priority integer,\n" + - " order_status varchar\n" + - ")\n" + - "WITH (\n" + - " format = '%s',\n" + - " format_version = 2,\n" + - " location = '%s',\n" + - " partitioning = 
ARRAY['order_status','ship_priority','bucket(\"order key\", 9)']\n" + - ")", - getSession().getCatalog().orElseThrow(), - getSession().getSchema().orElseThrow(), - "test_create_partitioned_table_as", - format, - tempDirPath)); + assertThat(computeScalar("SHOW CREATE TABLE test_create_partitioned_table_as")).isEqualTo(format( + "CREATE TABLE %s.%s.%s (\n" + + " \"order key\" bigint,\n" + + " ship_priority integer,\n" + + " order_status varchar\n" + + ")\n" + + "WITH (\n" + + " format = '%s',\n" + + " format_version = 2,\n" + + " location = '%s',\n" + + " partitioning = ARRAY['order_status','ship_priority','bucket(\"order key\", 9)']\n" + + ")", + getSession().getCatalog().orElseThrow(), + getSession().getSchema().orElseThrow(), + "test_create_partitioned_table_as", + format, + tempDirPath)); assertQuery("SELECT * from test_create_partitioned_table_as", "SELECT orderkey, shippriority, orderstatus FROM orders"); - dropTable("test_create_partitioned_table_as"); - } - - @DataProvider(name = "partitionedTableWithQuotedIdentifierCasing") - public static Object[][] partitionedTableWithQuotedIdentifierCasing() - { - return new Object[][] { - {"x", "x", true}, - {"X", "x", true}, - {"\"x\"", "x", true}, - {"\"X\"", "x", true}, - {"x", "\"x\"", true}, - {"X", "\"x\"", true}, - {"\"x\"", "\"x\"", true}, - {"\"X\"", "\"x\"", true}, - {"x", "X", true}, - {"X", "X", true}, - {"\"x\"", "X", true}, - {"\"X\"", "X", true}, - {"x", "\"X\"", false}, - {"X", "\"X\"", false}, - {"\"x\"", "\"X\"", false}, - {"\"X\"", "\"X\"", false}, - }; + assertUpdate("DROP TABLE test_create_partitioned_table_as"); + } + + @Test + public void testCreatePartitionedTableWithQuotedIdentifierCasing() + { + testCreatePartitionedTableWithQuotedIdentifierCasing("x", "x", true); + testCreatePartitionedTableWithQuotedIdentifierCasing("X", "x", true); + testCreatePartitionedTableWithQuotedIdentifierCasing("\"x\"", "x", true); + testCreatePartitionedTableWithQuotedIdentifierCasing("\"X\"", "x", true); + testCreatePartitionedTableWithQuotedIdentifierCasing("x", "\"x\"", true); + testCreatePartitionedTableWithQuotedIdentifierCasing("X", "\"x\"", true); + testCreatePartitionedTableWithQuotedIdentifierCasing("\"x\"", "\"x\"", true); + testCreatePartitionedTableWithQuotedIdentifierCasing("\"X\"", "\"x\"", true); + testCreatePartitionedTableWithQuotedIdentifierCasing("x", "X", true); + testCreatePartitionedTableWithQuotedIdentifierCasing("X", "X", true); + testCreatePartitionedTableWithQuotedIdentifierCasing("\"x\"", "X", true); + testCreatePartitionedTableWithQuotedIdentifierCasing("\"X\"", "X", true); + testCreatePartitionedTableWithQuotedIdentifierCasing("x", "\"X\"", false); + testCreatePartitionedTableWithQuotedIdentifierCasing("X", "\"X\"", false); + testCreatePartitionedTableWithQuotedIdentifierCasing("\"x\"", "\"X\"", false); + testCreatePartitionedTableWithQuotedIdentifierCasing("\"X\"", "\"X\"", false); } - @Test(dataProvider = "partitionedTableWithQuotedIdentifierCasing") - public void testCreatePartitionedTableWithQuotedIdentifierCasing(String columnName, String partitioningField, boolean success) + private void testCreatePartitionedTableWithQuotedIdentifierCasing(String columnName, String partitioningField, boolean success) { String tableName = "partitioning_" + randomNameSuffix(); @Language("SQL") String sql = format("CREATE TABLE %s (%s bigint) WITH (partitioning = ARRAY['%s'])", tableName, columnName, partitioningField); if (success) { assertUpdate(sql); - dropTable(tableName); + assertUpdate("DROP TABLE " + tableName); } else 
{ assertQueryFails(sql, "Unable to parse partitioning value: .*"); } } + @Test + public void testPartitionColumnNameConflict() + { + try (TestTable table = newTrinoTable("test_conflict_partition", "(ts timestamp, ts_day int) WITH (partitioning = ARRAY['day(ts)'])")) { + assertUpdate("INSERT INTO " + table.getName() + " VALUES (TIMESTAMP '2021-07-24 03:43:57.987654', 1)", 1); + + assertThat(query("SELECT * FROM " + table.getName())) + .matches("VALUES (TIMESTAMP '2021-07-24 03:43:57.987654', 1)"); + assertThat(query("SELECT partition.ts_day_2 FROM \"" + table.getName() + "$partitions\"")) + .matches("VALUES DATE '2021-07-24'"); + } + + try (TestTable table = newTrinoTable("test_conflict_partition", "(ts timestamp, ts_day int)")) { + assertUpdate("ALTER TABLE " + table.getName() + " SET PROPERTIES partitioning = ARRAY['day(ts)']"); + assertUpdate("INSERT INTO " + table.getName() + " VALUES (TIMESTAMP '2021-07-24 03:43:57.987654', 1)", 1); + + assertThat(query("SELECT * FROM " + table.getName())) + .matches("VALUES (TIMESTAMP '2021-07-24 03:43:57.987654', 1)"); + assertThat(query("SELECT partition.ts_day_2 FROM \"" + table.getName() + "$partitions\"")) + .matches("VALUES DATE '2021-07-24'"); + } + } + @Test public void testSortByAllTypes() { @@ -1238,14 +1469,15 @@ public void testSortByAllTypes() .matches("VALUES " + values + ", " + highValues + ", " + lowValues); // Insert "large" number of rows, supposedly topping over iceberg.writer-sort-buffer-size so that temporary files are utilized by the sorting writer. - assertUpdate(""" - INSERT INTO %s - SELECT v.* - FROM (VALUES %s, %s, %s) v - CROSS JOIN UNNEST (sequence(1, 10_000)) a(i) - """.formatted(tableName, values, highValues, lowValues), 30000); + assertUpdate( + """ + INSERT INTO %s + SELECT v.* + FROM (VALUES %s, %s, %s) v + CROSS JOIN UNNEST (sequence(1, 10_000)) a(i) + """.formatted(tableName, values, highValues, lowValues), 30000); - dropTable(tableName); + assertUpdate("DROP TABLE " + tableName); } @Test @@ -1255,60 +1487,53 @@ public void testEmptySortedByList() assertUpdate("" + "CREATE TABLE " + tableName + " (a_boolean boolean, an_integer integer) " + " WITH (partitioning = ARRAY['an_integer'], sorted_by = ARRAY[])"); - dropTable(tableName); + assertUpdate("DROP TABLE " + tableName); } - @Test(dataProvider = "sortedTableWithQuotedIdentifierCasing") - public void testCreateSortedTableWithQuotedIdentifierCasing(String columnName, String sortField) + @Test + public void testCreateSortedTableWithQuotedIdentifierCasing() { - String tableName = "test_create_sorted_table_with_quotes_" + randomNameSuffix(); - assertUpdate(format("CREATE TABLE %s (%s bigint) WITH (sorted_by = ARRAY['%s'])", tableName, columnName, sortField)); - dropTable(tableName); + testCreateSortedTableWithQuotedIdentifierCasing("col", "col"); + testCreateSortedTableWithQuotedIdentifierCasing("COL", "col"); + testCreateSortedTableWithQuotedIdentifierCasing("\"col\"", "col"); + testCreateSortedTableWithQuotedIdentifierCasing("\"COL\"", "col"); + testCreateSortedTableWithQuotedIdentifierCasing("col", "\"col\""); + testCreateSortedTableWithQuotedIdentifierCasing("COL", "\"col\""); + testCreateSortedTableWithQuotedIdentifierCasing("\"col\"", "\"col\""); + testCreateSortedTableWithQuotedIdentifierCasing("\"COL\"", "\"col\""); } - @DataProvider(name = "sortedTableWithQuotedIdentifierCasing") - public static Object[][] sortedTableWithQuotedIdentifierCasing() + private void testCreateSortedTableWithQuotedIdentifierCasing(String columnName, String sortField) { - return new 
Object[][] { - {"col", "col"}, - {"COL", "col"}, - {"\"col\"", "col"}, - {"\"COL\"", "col"}, - {"col", "\"col\""}, - {"COL", "\"col\""}, - {"\"col\"", "\"col\""}, - {"\"COL\"", "\"col\""}, - }; + String tableName = "test_create_sorted_table_with_quotes_" + randomNameSuffix(); + assertUpdate(format("CREATE TABLE %s (%s bigint) WITH (sorted_by = ARRAY['%s'])", tableName, columnName, sortField)); + assertUpdate("DROP TABLE " + tableName); } - @Test(dataProvider = "sortedTableWithSortTransform") - public void testCreateSortedTableWithSortTransform(String columnName, String sortField) + @Test + public void testCreateSortedTableWithSortTransform() { - String tableName = "test_sort_with_transform_" + randomNameSuffix(); - assertThatThrownBy(() -> query(format("CREATE TABLE %s (%s TIMESTAMP(6)) WITH (sorted_by = ARRAY['%s'])", tableName, columnName, sortField))) - .hasMessageContaining("Unable to parse sort field"); + testCreateSortedTableWithSortTransform("col", "bucket(col, 3)"); + testCreateSortedTableWithSortTransform("col", "bucket(\"col\", 3)"); + testCreateSortedTableWithSortTransform("col", "truncate(col, 3)"); + testCreateSortedTableWithSortTransform("col", "year(col)"); + testCreateSortedTableWithSortTransform("col", "month(col)"); + testCreateSortedTableWithSortTransform("col", "date(col)"); + testCreateSortedTableWithSortTransform("col", "hour(col)"); } - @DataProvider(name = "sortedTableWithSortTransform") - public static Object[][] sortedTableWithSortTransform() + private void testCreateSortedTableWithSortTransform(String columnName, String sortField) { - return new Object[][] { - {"col", "bucket(col, 3)"}, - {"col", "bucket(\"col\", 3)"}, - {"col", "truncate(col, 3)"}, - {"col", "year(col)"}, - {"col", "month(col)"}, - {"col", "date(col)"}, - {"col", "hour(col)"}, - }; + String tableName = "test_sort_with_transform_" + randomNameSuffix(); + assertThat(query(format("CREATE TABLE %s (%s TIMESTAMP(6)) WITH (sorted_by = ARRAY['%s'])", tableName, columnName, sortField))) + .failure().hasMessageContaining("Unable to parse sort field"); } @Test public void testSortOrderChange() { Session withSmallRowGroups = withSmallRowGroups(getSession()); - try (TestTable table = new TestTable( - getQueryRunner()::execute, + try (TestTable table = newTrinoTable( "test_sort_order_change", "WITH (sorted_by = ARRAY['comment']) AS SELECT * FROM nation WITH NO DATA")) { assertUpdate(withSmallRowGroups, "INSERT INTO " + table.getName() + " SELECT * FROM nation", 25); @@ -1322,10 +1547,10 @@ public void testSortOrderChange() for (Object filePath : computeActual("SELECT file_path from \"" + table.getName() + "$files\"").getOnlyColumnAsSet()) { String path = (String) filePath; if (sortedByComment.contains(path)) { - assertTrue(isFileSorted(path, "comment")); + assertThat(isFileSorted(path, "comment")).isTrue(); } else { - assertTrue(isFileSorted(path, "name")); + assertThat(isFileSorted(path, "name")).isTrue(); } } assertQuery("SELECT * FROM " + table.getName(), "SELECT * FROM nation UNION ALL SELECT * FROM nation"); @@ -1338,13 +1563,12 @@ public void testSortingDisabled() Session withSortingDisabled = Session.builder(withSmallRowGroups(getSession())) .setCatalogSessionProperty(ICEBERG_CATALOG, "sorted_writing_enabled", "false") .build(); - try (TestTable table = new TestTable( - getQueryRunner()::execute, + try (TestTable table = newTrinoTable( "test_sorting_disabled", "WITH (sorted_by = ARRAY['comment']) AS SELECT * FROM nation WITH NO DATA")) { assertUpdate(withSortingDisabled, "INSERT INTO " + 
table.getName() + " SELECT * FROM nation", 25); for (Object filePath : computeActual("SELECT file_path from \"" + table.getName() + "$files\"").getOnlyColumnAsSet()) { - assertFalse(isFileSorted((String) filePath, "comment")); + assertThat(isFileSorted((String) filePath, "comment")).isFalse(); } assertQuery("SELECT * FROM " + table.getName(), "SELECT * FROM nation"); } @@ -1354,19 +1578,18 @@ public void testSortingDisabled() public void testOptimizeWithSortOrder() { Session withSmallRowGroups = withSmallRowGroups(getSession()); - try (TestTable table = new TestTable( - getQueryRunner()::execute, + try (TestTable table = newTrinoTable( "test_optimize_with_sort_order", "WITH (sorted_by = ARRAY['comment']) AS SELECT * FROM nation WITH NO DATA")) { assertUpdate("INSERT INTO " + table.getName() + " SELECT * FROM nation WHERE nationkey < 10", 10); assertUpdate("INSERT INTO " + table.getName() + " SELECT * FROM nation WHERE nationkey >= 10 AND nationkey < 20", 10); assertUpdate("INSERT INTO " + table.getName() + " SELECT * FROM nation WHERE nationkey >= 20", 5); assertUpdate("ALTER TABLE " + table.getName() + " SET PROPERTIES sorted_by = ARRAY['comment']"); - // For optimize we need to set task_writer_count to 1, otherwise it will create more than one file. + // For optimize we need to set task_min_writer_count to 1, otherwise it will create more than one file. assertUpdate(withSingleWriterPerTask(withSmallRowGroups), "ALTER TABLE " + table.getName() + " EXECUTE optimize"); for (Object filePath : computeActual("SELECT file_path from \"" + table.getName() + "$files\"").getOnlyColumnAsSet()) { - assertTrue(isFileSorted((String) filePath, "comment")); + assertThat(isFileSorted((String) filePath, "comment")).isTrue(); } assertQuery("SELECT * FROM " + table.getName(), "SELECT * FROM nation"); } @@ -1376,22 +1599,19 @@ public void testOptimizeWithSortOrder() public void testUpdateWithSortOrder() { Session withSmallRowGroups = withSmallRowGroups(getSession()); - try (TestTable table = new TestTable( - getQueryRunner()::execute, + try (TestTable table = newTrinoTable( "test_sorted_update", - "WITH (sorted_by = ARRAY['comment']) AS TABLE tpch.tiny.lineitem WITH NO DATA")) { + "WITH (sorted_by = ARRAY['comment']) AS TABLE tpch.tiny.customer WITH NO DATA")) { assertUpdate( withSmallRowGroups, - "INSERT INTO " + table.getName() + " TABLE tpch.tiny.lineitem", - "VALUES 60175"); - assertUpdate(withSmallRowGroups, "UPDATE " + table.getName() + " SET comment = substring(comment, 2)", 60175); + "INSERT INTO " + table.getName() + " TABLE tpch.tiny.customer", + "VALUES 1500"); + assertUpdate(withSmallRowGroups, "UPDATE " + table.getName() + " SET comment = substring(comment, 2)", 1500); assertQuery( - "SELECT orderkey, partkey, suppkey, linenumber, quantity, extendedprice, discount, tax, returnflag, linestatus, shipdate, " + - "commitdate, receiptdate, shipinstruct, shipmode, comment FROM " + table.getName(), - "SELECT orderkey, partkey, suppkey, linenumber, quantity, extendedprice, discount, tax, returnflag, linestatus, shipdate, " + - "commitdate, receiptdate, shipinstruct, shipmode, substring(comment, 2) FROM lineitem"); - for (Object filePath : computeActual("SELECT file_path from \"" + table.getName() + "$files\"").getOnlyColumnAsSet()) { - assertTrue(isFileSorted((String) filePath, "comment")); + "SELECT custkey, name, address, nationkey, phone, acctbal, mktsegment, comment FROM " + table.getName(), + "SELECT custkey, name, address, nationkey, phone, acctbal, mktsegment, substring(comment, 2) FROM 
customer"); + for (Object filePath : computeActual("SELECT file_path from \"" + table.getName() + "$files\" WHERE content != 1").getOnlyColumnAsSet()) { + assertThat(isFileSorted((String) filePath, "comment")).isTrue(); } } } @@ -1402,28 +1622,27 @@ public void testUpdateWithSortOrder() public void testSortingOnNestedField() { String tableName = "test_sorting_on_nested_field" + randomNameSuffix(); - assertThatThrownBy(() -> query("CREATE TABLE " + tableName + " (nationkey BIGINT, row_t ROW(name VARCHAR, regionkey BIGINT, comment VARCHAR)) " + + assertThat(query("CREATE TABLE " + tableName + " (nationkey BIGINT, row_t ROW(name VARCHAR, regionkey BIGINT, comment VARCHAR)) " + "WITH (sorted_by = ARRAY['row_t.comment'])")) - .hasMessageContaining("Unable to parse sort field: [row_t.comment]"); - assertThatThrownBy(() -> query("CREATE TABLE " + tableName + " (nationkey BIGINT, row_t ROW(name VARCHAR, regionkey BIGINT, comment VARCHAR)) " + + .failure().hasMessageContaining("Unable to parse sort field: [row_t.comment]"); + assertThat(query("CREATE TABLE " + tableName + " (nationkey BIGINT, row_t ROW(name VARCHAR, regionkey BIGINT, comment VARCHAR)) " + "WITH (sorted_by = ARRAY['\"row_t\".\"comment\"'])")) - .hasMessageContaining("Unable to parse sort field: [\"row_t\".\"comment\"]"); - assertThatThrownBy(() -> query("CREATE TABLE " + tableName + " (nationkey BIGINT, row_t ROW(name VARCHAR, regionkey BIGINT, comment VARCHAR)) " + + .failure().hasMessageContaining("Unable to parse sort field: [\"row_t\".\"comment\"]"); + assertThat(query("CREATE TABLE " + tableName + " (nationkey BIGINT, row_t ROW(name VARCHAR, regionkey BIGINT, comment VARCHAR)) " + "WITH (sorted_by = ARRAY['\"row_t.comment\"'])")) - .hasMessageContaining("Column not found: row_t.comment"); + .failure().hasMessageContaining("Column not found: row_t.comment"); } @Test public void testDroppingSortColumn() { Session withSmallRowGroups = withSmallRowGroups(getSession()); - try (TestTable table = new TestTable( - getQueryRunner()::execute, + try (TestTable table = newTrinoTable( "test_dropping_sort_column", "WITH (sorted_by = ARRAY['comment']) AS SELECT * FROM nation WITH NO DATA")) { assertUpdate(withSmallRowGroups, "INSERT INTO " + table.getName() + " SELECT * FROM nation", 25); - assertThatThrownBy(() -> query("ALTER TABLE " + table.getName() + " DROP COLUMN comment")) - .hasMessageContaining("Cannot find source column for sort field"); + assertThat(query("ALTER TABLE " + table.getName() + " DROP COLUMN comment")) + .failure().hasMessageContaining("Cannot find source column for sort field"); } } @@ -1453,23 +1672,30 @@ public void testTableComments() ")"; String createTableSql = format(createTableTemplate, "test table comment", format); assertUpdate(createTableSql); - assertEquals(computeScalar("SHOW CREATE TABLE test_table_comments"), createTableSql); + assertThat(computeScalar("SHOW CREATE TABLE test_table_comments")).isEqualTo(createTableSql); assertUpdate("COMMENT ON TABLE test_table_comments IS 'different test table comment'"); - assertEquals(computeScalar("SHOW CREATE TABLE test_table_comments"), format(createTableTemplate, "different test table comment", format)); + assertThat(computeScalar("SHOW CREATE TABLE test_table_comments")).isEqualTo(format(createTableTemplate, "different test table comment", format)); assertUpdate("COMMENT ON TABLE test_table_comments IS NULL"); - assertEquals(computeScalar("SHOW CREATE TABLE test_table_comments"), createTableWithoutComment); - dropTable("iceberg.tpch.test_table_comments"); + 
assertThat(computeScalar("SHOW CREATE TABLE test_table_comments")).isEqualTo(createTableWithoutComment); + assertUpdate("DROP TABLE iceberg.tpch.test_table_comments"); assertUpdate(createTableWithoutComment); - assertEquals(computeScalar("SHOW CREATE TABLE test_table_comments"), createTableWithoutComment); + assertThat(computeScalar("SHOW CREATE TABLE test_table_comments")).isEqualTo(createTableWithoutComment); - dropTable("iceberg.tpch.test_table_comments"); + assertUpdate("DROP TABLE iceberg.tpch.test_table_comments"); } @Test public void testRollbackSnapshot() + { + testRollbackSnapshot("ALTER TABLE tpch.test_rollback EXECUTE rollback_to_snapshot(%s)"); + testRollbackSnapshot("ALTER TABLE tpch.test_rollback EXECUTE rollback_to_snapshot(snapshot_id => %s)"); + testRollbackSnapshot("CALL system.rollback_to_snapshot('tpch', 'test_rollback', %s)"); + } + + private void testRollbackSnapshot(String rollbackToSnapshotFormat) { assertUpdate("CREATE TABLE test_rollback (col0 INTEGER, col1 BIGINT)"); long afterCreateTableId = getCurrentSnapshotId("test_rollback"); @@ -1479,17 +1705,17 @@ public void testRollbackSnapshot() assertQuery("SELECT * FROM test_rollback ORDER BY col0", "VALUES (123, CAST(987 AS BIGINT))"); // Check that rollback_to_snapshot can be executed also when it does not do any changes - assertUpdate(format("CALL system.rollback_to_snapshot('tpch', 'test_rollback', %s)", afterFirstInsertId)); + assertUpdate(format(rollbackToSnapshotFormat, afterFirstInsertId)); assertQuery("SELECT * FROM test_rollback ORDER BY col0", "VALUES (123, CAST(987 AS BIGINT))"); assertUpdate("INSERT INTO test_rollback (col0, col1) VALUES (456, CAST(654 AS BIGINT))", 1); assertQuery("SELECT * FROM test_rollback ORDER BY col0", "VALUES (123, CAST(987 AS BIGINT)), (456, CAST(654 AS BIGINT))"); - assertUpdate(format("CALL system.rollback_to_snapshot('tpch', 'test_rollback', %s)", afterFirstInsertId)); + assertUpdate(format(rollbackToSnapshotFormat, afterFirstInsertId)); assertQuery("SELECT * FROM test_rollback ORDER BY col0", "VALUES (123, CAST(987 AS BIGINT))"); - assertUpdate(format("CALL system.rollback_to_snapshot('tpch', 'test_rollback', %s)", afterCreateTableId)); - assertEquals((long) computeActual("SELECT COUNT(*) FROM test_rollback").getOnlyValue(), 0); + assertUpdate(format(rollbackToSnapshotFormat, afterCreateTableId)); + assertThat((long) computeActual("SELECT COUNT(*) FROM test_rollback").getOnlyValue()).isEqualTo(0); assertUpdate("INSERT INTO test_rollback (col0, col1) VALUES (789, CAST(987 AS BIGINT))", 1); long afterSecondInsertId = getCurrentSnapshotId("test_rollback"); @@ -1497,10 +1723,18 @@ public void testRollbackSnapshot() // extra insert which should be dropped on rollback assertUpdate("INSERT INTO test_rollback (col0, col1) VALUES (999, CAST(999 AS BIGINT))", 1); - assertUpdate(format("CALL system.rollback_to_snapshot('tpch', 'test_rollback', %s)", afterSecondInsertId)); + assertUpdate(format(rollbackToSnapshotFormat, afterSecondInsertId)); assertQuery("SELECT * FROM test_rollback ORDER BY col0", "VALUES (789, CAST(987 AS BIGINT))"); - dropTable("test_rollback"); + assertUpdate("DROP TABLE test_rollback"); + } + + @Test + void testRollbackToSnapshotWithNullArgument() + { + assertQueryFails("CALL system.rollback_to_snapshot(NULL, 'customer_orders', 8954597067493422955)", ".*schema cannot be null.*"); + assertQueryFails("CALL system.rollback_to_snapshot('testdb', NULL, 8954597067493422955)", ".*table cannot be null.*"); + assertQueryFails("CALL system.rollback_to_snapshot('testdb', 
'customer_orders', NULL)", ".*snapshot_id cannot be null.*"); } @Override @@ -1521,7 +1755,7 @@ public void testSchemaEvolution() assertQuery("SELECT * FROM test_schema_evolution_drop_end", "VALUES(0, 1, NULL)"); assertUpdate("INSERT INTO test_schema_evolution_drop_end VALUES (3, 4, 5)", 1); assertQuery("SELECT * FROM test_schema_evolution_drop_end", "VALUES(0, 1, NULL), (3, 4, 5)"); - dropTable("test_schema_evolution_drop_end"); + assertUpdate("DROP TABLE test_schema_evolution_drop_end"); assertUpdate("CREATE TABLE test_schema_evolution_drop_middle (col0 INTEGER, col1 INTEGER, col2 INTEGER)"); assertUpdate("INSERT INTO test_schema_evolution_drop_middle VALUES (0, 1, 2)", 1); @@ -1531,26 +1765,75 @@ public void testSchemaEvolution() assertUpdate("ALTER TABLE test_schema_evolution_drop_middle ADD COLUMN col1 INTEGER"); assertUpdate("INSERT INTO test_schema_evolution_drop_middle VALUES (3, 4, 5)", 1); assertQuery("SELECT * FROM test_schema_evolution_drop_middle", "VALUES(0, 2, NULL), (3, 4, 5)"); - dropTable("test_schema_evolution_drop_middle"); + assertUpdate("DROP TABLE test_schema_evolution_drop_middle"); } + @Test @Override public void testDropRowFieldWhenDuplicates() { // Override because Iceberg doesn't allow duplicated field names in a row type assertThatThrownBy(super::testDropRowFieldWhenDuplicates) - .hasMessage("Invalid schema: multiple fields for name col.a: 2 and 3"); + .hasMessage("Field name 'a' specified more than once"); + } + + @Test + @Override // Override because ambiguous field name is disallowed in the connector + public void testDropAmbiguousRowFieldCaseSensitivity() + { + assertThatThrownBy(super::testDropAmbiguousRowFieldCaseSensitivity) + .hasMessage("Field name 'some_field' specified more than once"); + } + + @Test + public void testDuplicatedFieldNames() + { + String tableName = "test_duplicated_field_names" + randomNameSuffix(); + + assertQueryFails("CREATE TABLE " + tableName + "(col row(x int, \"X\" int))", "Field name 'x' specified more than once"); + assertQueryFails("CREATE TABLE " + tableName + " AS SELECT cast(NULL AS row(x int, \"X\" int)) col", "Field name 'x' specified more than once"); + + assertQueryFails("CREATE TABLE " + tableName + "(col array(row(x int, \"X\" int)))", "Field name 'x' specified more than once"); + assertQueryFails("CREATE TABLE " + tableName + " AS SELECT cast(NULL AS array(row(x int, \"X\" int))) col", "Field name 'x' specified more than once"); + + assertQueryFails("CREATE TABLE " + tableName + "(col map(int, row(x int, \"X\" int)))", "Field name 'x' specified more than once"); + assertQueryFails("CREATE TABLE " + tableName + " AS SELECT cast(NULL AS map(int, row(x int, \"X\" int))) col", "Field name 'x' specified more than once"); + + assertQueryFails("CREATE TABLE " + tableName + "(col row(a row(x int, \"X\" int)))", "Field name 'x' specified more than once"); + assertQueryFails("CREATE TABLE " + tableName + " AS SELECT cast(NULL AS row(a row(x int, \"X\" int))) col", "Field name 'x' specified more than once"); + + try (TestTable table = newTrinoTable("test_duplicated_field_names_", "(id int)")) { + assertQueryFails("ALTER TABLE " + table.getName() + " ADD COLUMN col row(x int, \"X\" int)", ".* Field name 'x' specified more than once"); + + assertUpdate("ALTER TABLE " + table.getName() + " ADD COLUMN col row(\"X\" int)"); + assertQueryFails("ALTER TABLE " + table.getName() + " ADD COLUMN col.x int", "line 1:1: Field 'x' already exists"); + + assertQueryFails("ALTER TABLE " + table.getName() + " ALTER COLUMN col SET DATA TYPE 
row(x int, \"X\" int)", "Field name 'x' specified more than once"); + } } @Test public void testDropPartitionColumn() { String tableName = "test_drop_partition_column_" + randomNameSuffix(); - assertUpdate("CREATE TABLE " + tableName + " (id INTEGER, name VARCHAR, age INTEGER) WITH (partitioning = ARRAY['id', 'truncate(name, 5)', 'void(age)'])"); + assertUpdate("CREATE TABLE " + tableName + " (id INTEGER, name VARCHAR, age INTEGER, nested ROW(f1 integer, f2 integer)) " + + "WITH (partitioning = ARRAY['id', 'truncate(name, 5)', 'void(age)', '\"nested.f1\"'])"); assertQueryFails("ALTER TABLE " + tableName + " DROP COLUMN id", "Cannot drop partition field: id"); assertQueryFails("ALTER TABLE " + tableName + " DROP COLUMN name", "Cannot drop partition field: name"); assertQueryFails("ALTER TABLE " + tableName + " DROP COLUMN age", "Cannot drop partition field: age"); - dropTable(tableName); + assertQueryFails("ALTER TABLE " + tableName + " DROP COLUMN nested", "Failed to drop column.*"); + assertQueryFails("ALTER TABLE " + tableName + " DROP COLUMN nested.f1", "Cannot drop partition field: nested.f1"); + assertUpdate("DROP TABLE " + tableName); + } + + @Test + void testDropHiddenMetadataColumn() + { + try (TestTable table = newTrinoTable("test_drop_metadata_column_", "(id int, col int)")) { + assertQueryFails("ALTER TABLE " + table.getName() + " DROP COLUMN \"$partition\"", "line 1:1: Cannot drop hidden column"); + assertQueryFails("ALTER TABLE " + table.getName() + " DROP COLUMN \"$path\"", "line 1:1: Cannot drop hidden column"); + assertQueryFails("ALTER TABLE " + table.getName() + " DROP COLUMN \"$file_modified_time\"", "line 1:1: Cannot drop hidden column"); + } } @Test @@ -1562,7 +1845,7 @@ public void testDropColumnUsedInOlderPartitionSpecs() assertQueryFails("ALTER TABLE " + tableName + " DROP COLUMN id", "Cannot drop column which is used by an old partition spec: id"); assertQueryFails("ALTER TABLE " + tableName + " DROP COLUMN name", "Cannot drop column which is used by an old partition spec: name"); assertQueryFails("ALTER TABLE " + tableName + " DROP COLUMN age", "Cannot drop column which is used by an old partition spec: age"); - dropTable(tableName); + assertUpdate("DROP TABLE " + tableName); } @Test @@ -1633,7 +1916,7 @@ public void testLargeInOnPartitionedColumns() assertThat(query("SELECT * FROM test_in_predicate_large_set WHERE " + filter)) .matches("TABLE test_in_predicate_large_set"); - dropTable("test_in_predicate_large_set"); + assertUpdate("DROP TABLE test_in_predicate_large_set"); } @Test @@ -1686,49 +1969,43 @@ private void testCreateTableLikeForFormat(IcebergFileFormat otherFormat) // For this reason the source and the copied table will share the same directory. // This test does not drop intentionally the created tables to avoid affecting the source table or the information_schema. 
assertUpdate(format("CREATE TABLE test_create_table_like_original (col1 INTEGER, aDate DATE) WITH(format = '%s', location = '%s', partitioning = ARRAY['aDate'])", format, tempDirPath)); - assertEquals( - getTablePropertiesString("test_create_table_like_original"), - format( - """ - WITH ( - format = '%s', - format_version = 2, - location = '%s', - partitioning = ARRAY['adate'] - )""", - format, - tempDirPath)); + assertThat(getTablePropertiesString("test_create_table_like_original")).isEqualTo(format( + """ + WITH ( + format = '%s', + format_version = 2, + location = '%s', + partitioning = ARRAY['adate'] + )""", + format, + tempDirPath)); assertUpdate("CREATE TABLE test_create_table_like_copy0 (LIKE test_create_table_like_original, col2 INTEGER)"); assertUpdate("INSERT INTO test_create_table_like_copy0 (col1, aDate, col2) VALUES (1, CAST('1950-06-28' AS DATE), 3)", 1); assertQuery("SELECT * from test_create_table_like_copy0", "VALUES(1, CAST('1950-06-28' AS DATE), 3)"); assertUpdate("CREATE TABLE test_create_table_like_copy1 (LIKE test_create_table_like_original)"); - assertEquals( - getTablePropertiesString("test_create_table_like_copy1"), - format( - """ - WITH ( - format = '%s', - format_version = 2, - location = '%s' - )""", - format, - getTableLocation("test_create_table_like_copy1"))); + assertThat(getTablePropertiesString("test_create_table_like_copy1")).isEqualTo(format( + """ + WITH ( + format = '%s', + format_version = 2, + location = '%s' + )""", + format, + getTableLocation("test_create_table_like_copy1"))); assertUpdate("CREATE TABLE test_create_table_like_copy2 (LIKE test_create_table_like_original EXCLUDING PROPERTIES)"); - assertEquals( - getTablePropertiesString("test_create_table_like_copy2"), - format( - """ - WITH ( - format = '%s', - format_version = 2, - location = '%s' - )""", - format, - getTableLocation("test_create_table_like_copy2"))); - dropTable("test_create_table_like_copy2"); + assertThat(getTablePropertiesString("test_create_table_like_copy2")).isEqualTo(format( + """ + WITH ( + format = '%s', + format_version = 2, + location = '%s' + )""", + format, + getTableLocation("test_create_table_like_copy2"))); + assertUpdate("DROP TABLE test_create_table_like_copy2"); assertQueryFails("CREATE TABLE test_create_table_like_copy3 (LIKE test_create_table_like_original INCLUDING PROPERTIES)", "Cannot create a table on a non-empty location.*"); @@ -1751,7 +2028,7 @@ public void testPredicating() assertUpdate("CREATE TABLE test_predicating_on_real (col REAL)"); assertUpdate("INSERT INTO test_predicating_on_real VALUES 1.2", 1); assertQuery("SELECT * FROM test_predicating_on_real WHERE col = 1.2", "VALUES 1.2"); - dropTable("test_predicating_on_real"); + assertUpdate("DROP TABLE test_predicating_on_real"); } @Test @@ -2001,7 +2278,46 @@ public void testPartitionPredicatePushdownWithHistoricalPartitionSpecs() .containsAll("VALUES 1, 8, 9, 10") .isFullyPushedDown(); - dropTable(tableName); + assertUpdate("DROP TABLE " + tableName); + } + + @Test + public void testPartitionPredicatePushdownWithNestedFieldPartitioning() + { + // Start with a bucket transform, which cannot be used for predicate pushdown + String tableName = "test_partition_predicate_pushdown_with_nested_field_partitioning"; + assertUpdate("CREATE TABLE " + tableName + " (parent ROW(child1 TIMESTAMP(6), child2 INTEGER)) WITH (partitioning = ARRAY['bucket(\"parent.child2\", 3)'])"); + String selectQuery = "SELECT parent.child2 FROM " + tableName + " WHERE CAST(parent.child1 AS date) < DATE '2015-01-02'"; + + 
String initialValues = + "ROW(ROW(TIMESTAMP '1969-12-31 22:22:22.222222', 8))," + + "ROW(ROW(TIMESTAMP '1969-12-31 23:33:11.456789', 9))," + + "ROW(ROW(TIMESTAMP '1969-12-31 23:44:55.567890', 10))"; + assertUpdate("INSERT INTO " + tableName + " VALUES " + initialValues, 3); + assertThat(query(selectQuery)) + .containsAll("VALUES 8, 9, 10") + .isNotFullyPushedDown(FilterNode.class); + + String hourTransformValues = + "ROW(ROW(TIMESTAMP '2015-01-01 10:01:23.123456', 1))," + + "ROW(ROW(TIMESTAMP '2015-01-02 10:10:02.987654', 2))," + + "ROW(ROW(TIMESTAMP '2015-01-03 10:55:00.456789', 3))"; + // While the bucket transform is still used, the hour transform cannot be used for pushdown + assertUpdate("ALTER TABLE " + tableName + " SET PROPERTIES partitioning = ARRAY['hour(\"parent.child1\")']"); + assertUpdate("INSERT INTO " + tableName + " VALUES " + hourTransformValues, 3); + assertThat(query(selectQuery)) + .containsAll("VALUES 1, 8, 9, 10") + .isNotFullyPushedDown(FilterNode.class); + + // The old partition scheme is no longer used so pushdown using the hour transform is allowed + assertUpdate("DELETE FROM " + tableName + " WHERE year(parent.child1) = 1969", 3); + assertUpdate("ALTER TABLE " + tableName + " EXECUTE optimize"); + assertUpdate("INSERT INTO " + tableName + " VALUES " + initialValues, 3); + assertThat(query(selectQuery)) + .containsAll("VALUES 1, 8, 9, 10") + .isFullyPushedDown(); + + assertUpdate("DROP TABLE " + tableName); } @Test @@ -2102,7 +2418,7 @@ public void testDayTransformDate() assertThat(query("SELECT * FROM test_day_transform_date WHERE date_trunc('year', d) = DATE '2015-01-01'")) .isFullyPushedDown(); - dropTable("test_day_transform_date"); + assertUpdate("DROP TABLE test_day_transform_date"); } @Test @@ -2214,7 +2530,7 @@ else if (format == AVRO) { assertThat(query("SELECT * FROM test_day_transform_timestamp WHERE date_trunc('year', d) = DATE '2015-01-01'")) .isFullyPushedDown(); - dropTable("test_day_transform_timestamp"); + assertUpdate("DROP TABLE test_day_transform_timestamp"); } @Test @@ -2449,7 +2765,7 @@ public void testMonthTransformDate() " (NULL, NULL, NULL, NULL, 15e0, NULL, NULL)"); } - dropTable("test_month_transform_date"); + assertUpdate("DROP TABLE test_month_transform_date"); } @Test @@ -2556,7 +2872,7 @@ else if (format == AVRO) { assertThat(query("SELECT * FROM test_month_transform_timestamp WHERE date_trunc('year', d) = DATE '2015-01-01'")) .isFullyPushedDown(); - dropTable("test_month_transform_timestamp"); + assertUpdate("DROP TABLE test_month_transform_timestamp"); } @Test @@ -2782,7 +3098,7 @@ public void testYearTransformDate() " (NULL, NULL, NULL, NULL, 13e0, NULL, NULL)"); } - dropTable("test_year_transform_date"); + assertUpdate("DROP TABLE test_year_transform_date"); } @Test @@ -2884,7 +3200,7 @@ else if (format == AVRO) { assertThat(query("SELECT * FROM test_year_transform_timestamp WHERE date_trunc('year', d) = DATE '2015-01-01'")) .isFullyPushedDown(); - dropTable("test_year_transform_timestamp"); + assertUpdate("DROP TABLE test_year_transform_timestamp"); } @Test @@ -3032,7 +3348,7 @@ public void testTruncateTextTransform() assertThat(query("SHOW STATS FOR test_truncate_text_transform")) .skippingTypesCheck() .matches("VALUES " + - " ('d', " + (format == PARQUET ? "550e0" : "NULL") + ", 7e0, " + (format == AVRO ? "0.1e0" : "0.125e0") + ", NULL, NULL, NULL), " + + " ('d', " + (format == PARQUET ? "507e0" : "NULL") + ", 7e0, " + (format == AVRO ? 
"0.1e0" : "0.125e0") + ", NULL, NULL, NULL), " + " ('b', NULL, 8e0, 0e0, NULL, " + (format == AVRO ? "NULL, NULL" : "'1', '101'") + "), " + " (NULL, NULL, NULL, NULL, 8e0, NULL, NULL)"); @@ -3053,10 +3369,16 @@ public void testTruncateTextTransform() assertThat(query("SELECT * FROM test_truncate_text_transform WHERE d LIKE 'abc%'")) .isNotFullyPushedDown(FilterNode.class); - dropTable("test_truncate_text_transform"); + assertUpdate("DROP TABLE test_truncate_text_transform"); + } + + @Test + public void testTruncateIntegerTransform() + { + testTruncateIntegerTransform("integer"); + testTruncateIntegerTransform("bigint"); } - @Test(dataProvider = "truncateNumberTypesProvider") public void testTruncateIntegerTransform(String dataType) { String table = format("test_truncate_%s_transform", dataType); @@ -3147,16 +3469,7 @@ public void testTruncateIntegerTransform(String dataType) assertThat(query("SELECT * FROM " + table + " WHERE d >= 11")) .isNotFullyPushedDown(FilterNode.class); - dropTable(table); - } - - @DataProvider - public Object[][] truncateNumberTypesProvider() - { - return new Object[][] { - {"integer"}, - {"bigint"}, - }; + assertUpdate("DROP TABLE " + table); } @Test @@ -3223,13 +3536,15 @@ else if (format == AVRO) { .isFullyPushedDown(); assertThat(query("SELECT * FROM test_truncate_decimal_transform WHERE d >= 12.20")) - .isNotFullyPushedDown(FilterNode.class); // TODO subsume partition boundary filters on decimals + .isFullyPushedDown(); + assertThat(query("SELECT * FROM test_truncate_decimal_transform WHERE d > 12.19")) + .isFullyPushedDown(); assertThat(query("SELECT * FROM test_truncate_decimal_transform WHERE d > 12.20")) .isNotFullyPushedDown(FilterNode.class); assertThat(query("SELECT * FROM test_truncate_decimal_transform WHERE d >= 12.21")) .isNotFullyPushedDown(FilterNode.class); - dropTable("test_truncate_decimal_transform"); + assertUpdate("DROP TABLE test_truncate_decimal_transform"); } @Test @@ -3244,6 +3559,7 @@ public void testBucketTransform() "CAST('206caec7-68b9-4778-81b2-a12ece70c8b1' AS UUID)", "CAST('906caec7-68b9-4778-81b2-a12ece70c8b1' AS UUID)", "CAST('406caec7-68b9-4778-81b2-a12ece70c8b1' AS UUID)"); + testBucketTransformForType("VARBINARY", "x'04'", "x'21'", "x'02'"); } protected void testBucketTransformForType( @@ -3252,7 +3568,7 @@ protected void testBucketTransformForType( String greaterValueInSameBucket, String valueInOtherBucket) { - String tableName = format("test_bucket_transform%s", type.toLowerCase(Locale.ENGLISH)); + String tableName = format("test_bucket_transform%s", type.toLowerCase(ENGLISH)); assertUpdate(format("CREATE TABLE %s (d %s) WITH (partitioning = ARRAY['bucket(d, 2)'])", tableName, type)); assertUpdate(format("INSERT INTO %s VALUES (NULL), (%s), (%s), (%s)", tableName, value, greaterValueInSameBucket, valueInOtherBucket), 4); @@ -3274,8 +3590,8 @@ protected void testBucketTransformForType( } assertThat(query("SHOW STATS FOR " + tableName)) - .skippingTypesCheck() .exceptColumns("data_size", "low_value", "high_value") // these may vary between types + .skippingTypesCheck() .matches("VALUES " + " ('d', 3e0, " + (format == AVRO ? 
"0.1e0" : "0.25e0") + ", NULL), " + " (NULL, NULL, NULL, 4e0)"); @@ -3293,7 +3609,7 @@ protected void testBucketTransformForType( assertThat(query("SELECT * FROM " + tableName + " WHERE d >= " + valueInOtherBucket)) .isNotFullyPushedDown(FilterNode.class); - dropTable(tableName); + assertUpdate("DROP TABLE " + tableName); } @Test @@ -3321,7 +3637,7 @@ public void testApplyFilterWithNonEmptyConstraintPredicate() " ('b', NULL, 7e0, 0e0, NULL, '1', '7'), " + " (NULL, NULL, NULL, NULL, 7e0, NULL, NULL)"; case PARQUET -> "VALUES " + - " ('d', 364e0, 7e0, 0e0, NULL, NULL, NULL), " + + " ('d', 342e0, 7e0, 0e0, NULL, NULL, NULL), " + " ('b', NULL, 7e0, 0e0, NULL, '1', '7'), " + " (NULL, NULL, NULL, NULL, 7e0, NULL, NULL)"; case AVRO -> "VALUES " + @@ -3378,7 +3694,7 @@ public void testVoidTransform() assertThat(query("SHOW STATS FOR test_void_transform")) .skippingTypesCheck() .matches("VALUES " + - " ('d', " + (format == PARQUET ? "205e0" : "NULL") + ", 5e0, 0.2857142857142857, NULL, NULL, NULL), " + + " ('d', " + (format == PARQUET ? "194e0" : "NULL") + ", 5e0, 0.2857142857142857, NULL, NULL, NULL), " + " ('b', NULL, 7e0, 0e0, NULL, '1', '7'), " + " (NULL, NULL, NULL, NULL, 7e0, NULL, NULL)"); } @@ -3412,7 +3728,7 @@ public void testMetadataDeleteSimple() assertUpdate("DELETE FROM test_metadata_delete_simple WHERE col1 = 1", 3); assertQuery("SELECT sum(col2) FROM test_metadata_delete_simple", "SELECT 701"); assertQuery("SELECT count(*) FROM \"test_metadata_delete_simple$partitions\"", "SELECT 2"); - dropTable("test_metadata_delete_simple"); + assertUpdate("DROP TABLE test_metadata_delete_simple"); } @Test @@ -3444,7 +3760,7 @@ public void testMetadataDelete() assertQuery("SELECT count(*) FROM \"test_metadata_delete$partitions\"", "SELECT 6"); assertQuery("SELECT * FROM test_metadata_delete", "SELECT orderkey, linenumber, linestatus FROM lineitem WHERE linestatus <> 'O' AND linenumber <> 3"); - dropTable("test_metadata_delete"); + assertUpdate("DROP TABLE test_metadata_delete"); } @Test @@ -3467,7 +3783,7 @@ private void testInSet(int inCount) assertUpdate(format("INSERT INTO test_in_set VALUES %s", values), inCount); // This proves that SELECTs with large IN phrases work correctly computeActual(format("SELECT col1 FROM test_in_set WHERE col1 IN (%s)", inList)); - dropTable("test_in_set"); + assertUpdate("DROP TABLE test_in_set"); } @Test @@ -3517,7 +3833,7 @@ public void testBasicTableStatistics() } assertThat(result).containsExactlyElementsOf(expectedStatistics); - dropTable(tableName); + assertUpdate("DROP TABLE " + tableName); } /** @@ -3543,8 +3859,8 @@ public void testBasicAnalyze() " (NULL, NULL, NULL, NULL, 5e0, NULL, NULL)") : ("VALUES " + " ('regionkey', NULL, NULL, 0e0, NULL, '0', '4'), " + - " ('name', " + (format == PARQUET ? "234e0" : "NULL") + ", NULL, 0e0, NULL, NULL, NULL), " + - " ('comment', " + (format == PARQUET ? "639e0" : "NULL") + ", NULL, 0e0, NULL, NULL, NULL), " + + " ('name', " + (format == PARQUET ? "224e0" : "NULL") + ", NULL, 0e0, NULL, NULL, NULL), " + + " ('comment', " + (format == PARQUET ? "626e0" : "NULL") + ", NULL, 0e0, NULL, NULL, NULL), " + " (NULL, NULL, NULL, NULL, 5e0, NULL, NULL)"); String statsWithNdv = format == AVRO @@ -3555,8 +3871,8 @@ public void testBasicAnalyze() " (NULL, NULL, NULL, NULL, 5e0, NULL, NULL)") : ("VALUES " + " ('regionkey', NULL, 5e0, 0e0, NULL, '0', '4'), " + - " ('name', " + (format == PARQUET ? "234e0" : "NULL") + ", 5e0, 0e0, NULL, NULL, NULL), " + - " ('comment', " + (format == PARQUET ? 
"639e0" : "NULL") + ", 5e0, 0e0, NULL, NULL, NULL), " + + " ('name', " + (format == PARQUET ? "224e0" : "NULL") + ", 5e0, 0e0, NULL, NULL, NULL), " + + " ('comment', " + (format == PARQUET ? "626e0" : "NULL") + ", 5e0, 0e0, NULL, NULL, NULL), " + " (NULL, NULL, NULL, NULL, 5e0, NULL, NULL)"); assertThat(query(defaultSession, "SHOW STATS FOR " + tableName)).skippingTypesCheck().matches(statsWithNdv); @@ -3657,7 +3973,7 @@ public void testMultipleColumnTableStatistics() } assertThat(result).containsExactlyElementsOf(expectedStatistics); - dropTable(tableName); + assertUpdate("DROP TABLE " + tableName); } @Test @@ -3669,34 +3985,34 @@ public void testPartitionedTableStatistics() assertUpdate("INSERT INTO test_partitioned_table_statistics VALUES (100, 10)", 1); MaterializedResult result = computeActual("SHOW STATS FOR iceberg.tpch.test_partitioned_table_statistics"); - assertEquals(result.getRowCount(), 3); + assertThat(result.getRowCount()).isEqualTo(3); MaterializedRow row0 = result.getMaterializedRows().get(0); - assertEquals(row0.getField(0), "col1"); - assertEquals(row0.getField(3), 0.0); + assertThat(row0.getField(0)).isEqualTo("col1"); + assertThat(row0.getField(3)).isEqualTo(0.0); if (format != AVRO) { - assertEquals(row0.getField(5), "-10.0"); - assertEquals(row0.getField(6), "100.0"); + assertThat(row0.getField(5)).isEqualTo("-10.0"); + assertThat(row0.getField(6)).isEqualTo("100.0"); } else { - assertNull(row0.getField(5)); - assertNull(row0.getField(6)); + assertThat(row0.getField(5)).isNull(); + assertThat(row0.getField(6)).isNull(); } MaterializedRow row1 = result.getMaterializedRows().get(1); - assertEquals(row1.getField(0), "col2"); - assertEquals(row1.getField(3), 0.0); + assertThat(row1.getField(0)).isEqualTo("col2"); + assertThat(row1.getField(3)).isEqualTo(0.0); if (format != AVRO) { - assertEquals(row1.getField(5), "-1"); - assertEquals(row1.getField(6), "10"); + assertThat(row1.getField(5)).isEqualTo("-1"); + assertThat(row1.getField(6)).isEqualTo("10"); } else { - assertNull(row0.getField(5)); - assertNull(row0.getField(6)); + assertThat(row0.getField(5)).isNull(); + assertThat(row0.getField(6)).isNull(); } MaterializedRow row2 = result.getMaterializedRows().get(2); - assertEquals(row2.getField(4), 2.0); + assertThat(row2.getField(4)).isEqualTo(2.0); assertUpdate("INSERT INTO test_partitioned_table_statistics VALUES " + IntStream.rangeClosed(1, 5) .mapToObj(i -> format("(%d, 10)", i + 100)) @@ -3707,35 +4023,35 @@ public void testPartitionedTableStatistics() .collect(joining(", ")), 5); result = computeActual("SHOW STATS FOR iceberg.tpch.test_partitioned_table_statistics"); - assertEquals(result.getRowCount(), 3); + assertThat(result.getRowCount()).isEqualTo(3); row0 = result.getMaterializedRows().get(0); - assertEquals(row0.getField(0), "col1"); + assertThat(row0.getField(0)).isEqualTo("col1"); if (format != AVRO) { - assertEquals((double) row0.getField(3), 5.0 / 12.0, 1e-10); - assertEquals(row0.getField(5), "-10.0"); - assertEquals(row0.getField(6), "105.0"); + assertThat((double) row0.getField(3)).isCloseTo(5.0 / 12.0, offset(1e-10)); + assertThat(row0.getField(5)).isEqualTo("-10.0"); + assertThat(row0.getField(6)).isEqualTo("105.0"); } else { - assertEquals(row0.getField(3), 0.1); - assertNull(row0.getField(5)); - assertNull(row0.getField(6)); + assertThat(row0.getField(3)).isEqualTo(0.1); + assertThat(row0.getField(5)).isNull(); + assertThat(row0.getField(6)).isNull(); } row1 = result.getMaterializedRows().get(1); - assertEquals(row1.getField(0), "col2"); + 
assertThat(row1.getField(0)).isEqualTo("col2"); if (format != AVRO) { - assertEquals(row1.getField(3), 0.0); - assertEquals(row1.getField(5), "-1"); - assertEquals(row1.getField(6), "10"); + assertThat(row1.getField(3)).isEqualTo(0.0); + assertThat(row1.getField(5)).isEqualTo("-1"); + assertThat(row1.getField(6)).isEqualTo("10"); } else { - assertEquals(row0.getField(3), 0.1); - assertNull(row0.getField(5)); - assertNull(row0.getField(6)); + assertThat(row0.getField(3)).isEqualTo(0.1); + assertThat(row0.getField(5)).isNull(); + assertThat(row0.getField(6)).isNull(); } row2 = result.getMaterializedRows().get(2); - assertEquals(row2.getField(4), 12.0); + assertThat(row2.getField(4)).isEqualTo(12.0); assertUpdate("INSERT INTO test_partitioned_table_statistics VALUES " + IntStream.rangeClosed(6, 10) .mapToObj(i -> "(100, NULL)") @@ -3743,35 +4059,35 @@ public void testPartitionedTableStatistics() result = computeActual("SHOW STATS FOR iceberg.tpch.test_partitioned_table_statistics"); row0 = result.getMaterializedRows().get(0); - assertEquals(row0.getField(0), "col1"); + assertThat(row0.getField(0)).isEqualTo("col1"); if (format != AVRO) { - assertEquals(row0.getField(3), 5.0 / 17.0); - assertEquals(row0.getField(5), "-10.0"); - assertEquals(row0.getField(6), "105.0"); + assertThat(row0.getField(3)).isEqualTo(5.0 / 17.0); + assertThat(row0.getField(5)).isEqualTo("-10.0"); + assertThat(row0.getField(6)).isEqualTo("105.0"); } else { - assertEquals(row0.getField(3), 0.1); - assertNull(row0.getField(5)); - assertNull(row0.getField(6)); + assertThat(row0.getField(3)).isEqualTo(0.1); + assertThat(row0.getField(5)).isNull(); + assertThat(row0.getField(6)).isNull(); } row1 = result.getMaterializedRows().get(1); - assertEquals(row1.getField(0), "col2"); + assertThat(row1.getField(0)).isEqualTo("col2"); if (format != AVRO) { - assertEquals(row1.getField(3), 5.0 / 17.0); - assertEquals(row1.getField(5), "-1"); - assertEquals(row1.getField(6), "10"); + assertThat(row1.getField(3)).isEqualTo(5.0 / 17.0); + assertThat(row1.getField(5)).isEqualTo("-1"); + assertThat(row1.getField(6)).isEqualTo("10"); } else { - assertEquals(row0.getField(3), 0.1); - assertNull(row0.getField(5)); - assertNull(row0.getField(6)); + assertThat(row0.getField(3)).isEqualTo(0.1); + assertThat(row0.getField(5)).isNull(); + assertThat(row0.getField(6)).isNull(); } row2 = result.getMaterializedRows().get(2); - assertEquals(row2.getField(4), 17.0); + assertThat(row2.getField(4)).isEqualTo(17.0); - dropTable("iceberg.tpch.test_partitioned_table_statistics"); + assertUpdate("DROP TABLE iceberg.tpch.test_partitioned_table_statistics"); } @Test @@ -3821,7 +4137,35 @@ public void testPredicatePushdown() // Unenforced predicate is simplified during split generation, but not reflected here ImmutableMap.of("col1", multipleValues(BIGINT, values))); - dropTable(tableName.getObjectName()); + assertUpdate("DROP TABLE " + tableName.getObjectName()); + } + + @Test + public void testPredicateOnDataColumnIsNotPushedDown() + { + try (TestTable testTable = newTrinoTable( + "test_predicate_on_data_column_is_not_pushed_down", + "(a integer)")) { + assertThat(query("SELECT * FROM " + testTable.getName() + " WHERE a = 10")) + .isNotFullyPushedDown(FilterNode.class); + assertUpdate("INSERT INTO " + testTable.getName() + " VALUES 10", 1); + assertThat(query("SELECT * FROM " + testTable.getName() + " WHERE a = 10")) + .isNotFullyPushedDown(FilterNode.class); + } + } + + @Test + public void testPredicateOnDataColumnForPartitionedTableIsNotPushedDown() + { + try 
(TestTable testTable = newTrinoTable( + "test_predicate_on_data_column_for_partitioned_table_is_not_pushed_down", + "(a integer, dt date) WITH (partitioning = ARRAY['dt'])")) { + assertThat(query("SELECT * FROM " + testTable.getName() + " WHERE a = 10")) + .isNotFullyPushedDown(FilterNode.class); + assertUpdate("INSERT INTO " + testTable.getName() + " VALUES (10, date '2025-02-18')", 1); + assertThat(query("SELECT * FROM " + testTable.getName() + " WHERE a = 10")) + .isNotFullyPushedDown(FilterNode.class); + } } @Test @@ -3841,11 +4185,17 @@ public void testPredicatesWithStructuralTypes() assertQuery("SELECT id FROM " + tableName + " WHERE struct_t = ROW(21, 22)", "VALUES 21"); assertQuery("SELECT struct_t.f1 FROM " + tableName + " WHERE id = 11 AND map_t = MAP(ARRAY[11, 13], ARRAY[12, 14])", "VALUES 11"); - dropTable(tableName); + assertUpdate("DROP TABLE " + tableName); + } + + @Test + public void testPartitionsTableWithColumnNameConflict() + { + testPartitionsTableWithColumnNameConflict(true); + testPartitionsTableWithColumnNameConflict(false); } - @Test(dataProviderClass = DataProviders.class, dataProvider = "trueFalse") - public void testPartitionsTableWithColumnNameConflict(boolean partitioned) + private void testPartitionsTableWithColumnNameConflict(boolean partitioned) { assertUpdate("DROP TABLE IF EXISTS test_partitions_with_conflict"); assertUpdate("CREATE TABLE test_partitions_with_conflict (" + @@ -3898,7 +4248,14 @@ public void testPartitionsTableWithColumnNameConflict(boolean partitioned) // total_size is not exactly deterministic, so grab whatever value there is "(SELECT total_size FROM \"test_partitions_with_conflict$partitions\"), " + "CAST(" + - " NULL AS row(" + + " ROW (" + + (partitioned ? "" : " NULL, ") + + " NULL, " + + " NULL, " + + " NULL, " + + " NULL " + + " ) " + + " AS row(" + (partitioned ? 
"" : " p row(min integer, max integer, null_count bigint, nan_count bigint), ") + " row_count row(min integer, max integer, null_count bigint, nan_count bigint), " + " record_count row(min integer, max integer, null_count bigint, nan_count bigint), " + @@ -3917,7 +4274,7 @@ private void assertFilterPushdown( Map expectedEnforcedPredicate, Map expectedUnenforcedPredicate) { - Metadata metadata = getQueryRunner().getMetadata(); + Metadata metadata = getQueryRunner().getPlannerContext().getMetadata(); newTransaction().execute(getSession(), session -> { TableHandle table = metadata.getTableHandle(session, tableName) @@ -3930,24 +4287,54 @@ private void assertFilterPushdown( Optional> result = metadata.applyFilter(session, table, new Constraint(domains)); - assertEquals((expectedUnenforcedPredicate == null && expectedEnforcedPredicate == null), result.isEmpty()); + assertThat((expectedUnenforcedPredicate == null && expectedEnforcedPredicate == null)).isEqualTo(result.isEmpty()); if (result.isPresent()) { - IcebergTableHandle newTable = (IcebergTableHandle) result.get().getHandle().getConnectorHandle(); + IcebergTableHandle newTable = (IcebergTableHandle) result.get().getHandle().connectorHandle(); - assertEquals( - newTable.getEnforcedPredicate(), - TupleDomain.withColumnDomains(expectedEnforcedPredicate.entrySet().stream() - .collect(toImmutableMap(entry -> columns.get(entry.getKey()), Map.Entry::getValue)))); + assertThat(newTable.getEnforcedPredicate()).isEqualTo(TupleDomain.withColumnDomains(expectedEnforcedPredicate.entrySet().stream() + .collect(toImmutableMap(entry -> columns.get(entry.getKey()), Map.Entry::getValue)))); - assertEquals( - newTable.getUnenforcedPredicate(), - TupleDomain.withColumnDomains(expectedUnenforcedPredicate.entrySet().stream() - .collect(toImmutableMap(entry -> columns.get(entry.getKey()), Map.Entry::getValue)))); + assertThat(newTable.getUnenforcedPredicate()).isEqualTo(TupleDomain.withColumnDomains(expectedUnenforcedPredicate.entrySet().stream() + .collect(toImmutableMap(entry -> columns.get(entry.getKey()), Map.Entry::getValue)))); } }); } + @Test + public void testCreateExternalTableWithNonExistingSchemaLocation() + throws Exception + { + String schemaName = "test_schema_without_location" + randomNameSuffix(); + String schemaLocation = "/tmp/" + schemaName; + + fileSystem.createDirectory(Location.of(schemaLocation)); + assertUpdate("CREATE SCHEMA iceberg." 
+ schemaName + " WITH (location = '" + schemaLocation + "')"); + fileSystem.deleteDirectory(Location.of(schemaLocation)); + + String tableName = "test_create_external" + randomNameSuffix(); + String tableLocation = "/tmp/" + tableName; + + String schemaAndTableName = format("%s.%s", schemaName, tableName); + assertUpdate("CREATE TABLE " + schemaAndTableName + " (a bigint, b varchar) WITH (location = '" + tableLocation + "')"); + + assertUpdate( + "INSERT INTO " + schemaAndTableName + "(a, b) VALUES" + + "(NULL, NULL)," + + "(-42, 'abc')," + + "(9223372036854775807, 'abcdefghijklmnopqrstuvwxyz')", + 3); + assertThat(query("SELECT * FROM " + schemaAndTableName)) + .skippingTypesCheck() + .matches("VALUES" + + "(NULL, NULL)," + + "(-42, 'abc')," + + "(9223372036854775807, 'abcdefghijklmnopqrstuvwxyz')"); + + assertUpdate("DROP TABLE " + schemaAndTableName); + assertUpdate("DROP SCHEMA " + schemaName); + } + @Test public void testCreateNestedPartitionedTable() { @@ -3978,7 +4365,7 @@ public void testCreateNestedPartitionedTable() " (CAST(ROW(null, 'this is a random value') AS ROW(int, varchar))), " + " DATE '2021-07-24'", 1); - assertEquals(computeActual("SELECT * from test_nested_table_1").getRowCount(), 1); + assertThat(computeActual("SELECT * from test_nested_table_1").getRowCount()).isEqualTo(1); if (format != AVRO) { assertThat(query("SHOW STATS FOR test_nested_table_1")) @@ -3992,8 +4379,8 @@ public void testCreateNestedPartitionedTable() " ('dbl', NULL, 1e0, 0e0, NULL, '1.0', '1.0'), " + " ('mp', NULL, NULL, " + (format == ORC ? "0e0" : "NULL") + ", NULL, NULL, NULL), " + " ('dec', NULL, 1e0, 0e0, NULL, '1.0', '1.0'), " + - " ('vc', " + (format == PARQUET ? "116e0" : "NULL") + ", 1e0, 0e0, NULL, NULL, NULL), " + - " ('vb', " + (format == PARQUET ? "77e0" : "NULL") + ", 1e0, 0e0, NULL, NULL, NULL), " + + " ('vc', " + (format == PARQUET ? "105e0" : "NULL") + ", 1e0, 0e0, NULL, NULL, NULL), " + + " ('vb', " + (format == PARQUET ? "71e0" : "NULL") + ", 1e0, 0e0, NULL, NULL, NULL), " + " ('ts', NULL, 1e0, 0e0, NULL, '2021-07-24 02:43:57.348000', " + (format == ORC ? "'2021-07-24 02:43:57.348999'" : "'2021-07-24 02:43:57.348000'") + "), " + " ('tstz', NULL, 1e0, 0e0, NULL, '2021-07-24 02:43:57.348 UTC', '2021-07-24 02:43:57.348 UTC'), " + " ('str', NULL, NULL, " + (format == ORC ? "0e0" : "NULL") + ", NULL, NULL, NULL), " + @@ -4021,7 +4408,7 @@ public void testCreateNestedPartitionedTable() " (NULL, NULL, NULL, NULL, 1e0, NULL, NULL)"); } - dropTable("test_nested_table_1"); + assertUpdate("DROP TABLE test_nested_table_1"); assertUpdate("" + "CREATE TABLE test_nested_table_2 (" + @@ -4042,7 +4429,7 @@ public void testCreateNestedPartitionedTable() "map(array[1,2], array[array['ek', 'one'], array['don', 'do', 'two']]), CAST(1.0 as DECIMAL(5,2)), " + "CAST(ROW(1, 'this is a random value', null) AS ROW(int, varchar, array(int))), 'one'", 1); - assertEquals(computeActual("SELECT * from test_nested_table_2").getRowCount(), 1); + assertThat(computeActual("SELECT * from test_nested_table_2").getRowCount()).isEqualTo(1); if (format != AVRO) { assertThat(query("SHOW STATS FOR test_nested_table_2")) @@ -4055,7 +4442,7 @@ public void testCreateNestedPartitionedTable() " ('dbl', NULL, 1e0, 0e0, NULL, '1.0', '1.0'), " + " ('mp', NULL, NULL, " + (format == ORC ? "0e0" : "NULL") + ", NULL, NULL, NULL), " + " ('dec', NULL, 1e0, 0e0, NULL, '1.0', '1.0'), " + - " ('vc', " + (format == PARQUET ? "116e0" : "NULL") + ", 1e0, 0e0, NULL, NULL, NULL), " + + " ('vc', " + (format == PARQUET ? 
"105e0" : "NULL") + ", 1e0, 0e0, NULL, NULL, NULL), " + " ('str', NULL, NULL, " + (format == ORC ? "0e0" : "NULL") + ", NULL, NULL, NULL), " + " (NULL, NULL, NULL, NULL, 1e0, NULL, NULL)"); } @@ -4077,13 +4464,13 @@ public void testCreateNestedPartitionedTable() assertUpdate("CREATE TABLE test_nested_table_3 WITH (partitioning = ARRAY['int']) AS SELECT * FROM test_nested_table_2", 1); - assertEquals(computeActual("SELECT * FROM test_nested_table_3").getRowCount(), 1); + assertThat(computeActual("SELECT * FROM test_nested_table_3").getRowCount()).isEqualTo(1); assertThat(query("SHOW STATS FOR test_nested_table_3")) .matches("SHOW STATS FOR test_nested_table_2"); - dropTable("test_nested_table_2"); - dropTable("test_nested_table_3"); + assertUpdate("DROP TABLE test_nested_table_2"); + assertUpdate("DROP TABLE test_nested_table_3"); } @Test @@ -4103,23 +4490,16 @@ public void testSerializableReadIsolation() assertQuery("SELECT * FROM test_read_isolation", "VALUES 123, 456, 789"); - dropTable("test_read_isolation"); + assertUpdate("DROP TABLE test_read_isolation"); } private void withTransaction(Consumer consumer) { - transaction(getQueryRunner().getTransactionManager(), getQueryRunner().getAccessControl()) + transaction(getQueryRunner().getTransactionManager(), getQueryRunner().getPlannerContext().getMetadata(), getQueryRunner().getAccessControl()) .readCommitted() .execute(getSession(), consumer); } - private void dropTable(String table) - { - Session session = getSession(); - assertUpdate(session, "DROP TABLE " + table); - assertFalse(getQueryRunner().tableExists(session, table)); - } - @Test public void testOptimizedMetadataQueries() { @@ -4143,7 +4523,7 @@ public void testOptimizedMetadataQueries() // TODO: assert behavior after deleting the last row of a partition, once row-level deletes are supported. // i.e. a query like 'DELETE FROM test_metadata_optimization WHERE b = 6 AND a = 5' - dropTable("test_metadata_optimization"); + assertUpdate("DROP TABLE test_metadata_optimization"); } @Test @@ -4186,7 +4566,7 @@ public void testIncorrectIcebergFileSizes() // Get manifest file MaterializedResult result = computeActual("SELECT path FROM \"test_iceberg_file_size$manifests\""); - assertEquals(result.getRowCount(), 1); + assertThat(result.getRowCount()).isEqualTo(1); String manifestFile = (String) result.getOnlyValue(); // Read manifest file @@ -4199,26 +4579,23 @@ public void testIncorrectIcebergFileSizes() entry = dataFileReader.next(); recordCount++; } - assertEquals(recordCount, 1); + assertThat(recordCount).isEqualTo(1); } // Alter data file entry to store incorrect file size GenericData.Record dataFile = (GenericData.Record) entry.get("data_file"); long alteredValue = 50L; - assertNotEquals(dataFile.get("file_size_in_bytes"), alteredValue); + assertThat(dataFile.get("file_size_in_bytes")) + .isNotEqualTo(alteredValue); dataFile.put("file_size_in_bytes", alteredValue); - // Replace the file through HDFS client. This is required for correct checksums. 
- HdfsContext context = new HdfsContext(getSession().toConnectorSession()); - org.apache.hadoop.fs.Path manifestFilePath = new org.apache.hadoop.fs.Path(manifestFile); - FileSystem fs = HDFS_ENVIRONMENT.getFileSystem(context, manifestFilePath); - // Write altered metadata - try (OutputStream out = fs.create(manifestFilePath); - DataFileWriter dataFileWriter = new DataFileWriter<>(new GenericDatumWriter<>(schema))) { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + try (DataFileWriter dataFileWriter = new DataFileWriter<>(new GenericDatumWriter<>(schema))) { dataFileWriter.create(schema, out); dataFileWriter.append(entry); } + fileSystem.newOutputFile(Location.of(manifestFile)).createOrOverwrite(out.toByteArray()); // Ignoring Iceberg provided file size makes the query succeed Session session = Session.builder(getSession()) @@ -4227,9 +4604,11 @@ public void testIncorrectIcebergFileSizes() assertQuery(session, "SELECT * FROM test_iceberg_file_size", "VALUES (123), (456), (758)"); // Using Iceberg provided file size fails the query - assertQueryFails("SELECT * FROM test_iceberg_file_size", ".*Error opening Iceberg split.*\\QIncorrect file size (%d) for file (end of stream not reached)\\E.*".formatted(alteredValue)); + assertQueryFails( + "SELECT * FROM test_iceberg_file_size", + "(Malformed ORC file\\. Invalid file metadata.*)|(.*Malformed Parquet file.*)"); - dropTable("test_iceberg_file_size"); + assertUpdate("DROP TABLE test_iceberg_file_size"); } protected DataFileReader readManifestFile(String location) @@ -4267,6 +4646,18 @@ public void testSplitPruningForFilterOnPartitionColumn() verifySplitCount("SELECT * FROM " + tableName + " WHERE regionkey % 5 = 3", 1); assertUpdate("DROP TABLE " + tableName); + + // Partition by multiple columns + assertUpdate(noRedistributeWrites, "CREATE TABLE " + tableName + " WITH (partitioning = ARRAY['regionkey', 'nationkey']) AS SELECT * FROM nation", 25); + // Create 2 files per partition + assertUpdate(noRedistributeWrites, "INSERT INTO " + tableName + " SELECT * FROM nation", 25); + // sanity check that table contains exactly 50 files + assertThat(computeScalar("SELECT count(*) FROM \"" + tableName + "$files\"")).isEqualTo(50L); + + verifySplitCount("SELECT * FROM " + tableName + " WHERE regionkey % 5 = 3", 10); + verifySplitCount("SELECT * FROM " + tableName + " WHERE (regionkey * 2) - nationkey = 0", 6); + + assertUpdate("DROP TABLE " + tableName); } @Test @@ -4377,8 +4768,8 @@ public void testAllAvailableTypes() " ('a_double', NULL, 1e0, 0.5e0, NULL, '1.0', '1.0'), " + " ('a_short_decimal', NULL, 1e0, 0.5e0, NULL, '1.0', '1.0'), " + " ('a_long_decimal', NULL, 1e0, 0.5e0, NULL, '11.0', '11.0'), " + - " ('a_varchar', " + (format == PARQUET ? "234e0" : "NULL") + ", 1e0, 0.5e0, NULL, NULL, NULL), " + - " ('a_varbinary', " + (format == PARQUET ? "114e0" : "NULL") + ", 1e0, 0.5e0, NULL, NULL, NULL), " + + " ('a_varchar', " + (format == PARQUET ? "213e0" : "NULL") + ", 1e0, 0.5e0, NULL, NULL, NULL), " + + " ('a_varbinary', " + (format == PARQUET ? "103e0" : "NULL") + ", 1e0, 0.5e0, NULL, NULL, NULL), " + " ('a_date', NULL, 1e0, 0.5e0, NULL, '2021-07-24', '2021-07-24'), " + " ('a_time', NULL, 1e0, 0.5e0, NULL, NULL, NULL), " + " ('a_timestamp', NULL, 1e0, 0.5e0, NULL, " + (format == ORC ? 
"'2021-07-24 03:43:57.987000', '2021-07-24 03:43:57.987999'" : "'2021-07-24 03:43:57.987654', '2021-07-24 03:43:57.987654'") + "), " + @@ -4431,8 +4822,8 @@ public void testAllAvailableTypes() " ('a_double', NULL, 1e0, 0.5e0, NULL, '1.0', '1.0'), " + " ('a_short_decimal', NULL, 1e0, 0.5e0, NULL, '1.0', '1.0'), " + " ('a_long_decimal', NULL, 1e0, 0.5e0, NULL, '11.0', '11.0'), " + - " ('a_varchar', " + (format == PARQUET ? "234e0" : "NULL") + ", 1e0, 0.5e0, NULL, NULL, NULL), " + - " ('a_varbinary', " + (format == PARQUET ? "114e0" : "NULL") + ", 1e0, 0.5e0, NULL, NULL, NULL), " + + " ('a_varchar', " + (format == PARQUET ? "213e0" : "NULL") + ", 1e0, 0.5e0, NULL, NULL, NULL), " + + " ('a_varbinary', " + (format == PARQUET ? "103e0" : "NULL") + ", 1e0, 0.5e0, NULL, NULL, NULL), " + " ('a_date', NULL, 1e0, 0.5e0, NULL, '2021-07-24', '2021-07-24'), " + " ('a_time', NULL, 1e0, 0.5e0, NULL, NULL, NULL), " + " ('a_timestamp', NULL, 1e0, 0.5e0, NULL, " + (format == ORC ? "'2021-07-24 03:43:57.987000', '2021-07-24 03:43:57.987999'" : "'2021-07-24 03:43:57.987654', '2021-07-24 03:43:57.987654'") + "), " + @@ -4563,35 +4954,34 @@ public void testAllAvailableTypes() assertUpdate("DROP TABLE test_all_types"); } - @Test(dataProvider = "repartitioningDataProvider") - public void testRepartitionDataOnCtas(Session session, String partitioning, int expectedFiles) - { - testRepartitionData(session, "tpch.tiny.orders", true, partitioning, expectedFiles); - } - - @Test(dataProvider = "repartitioningDataProvider") - public void testRepartitionDataOnInsert(Session session, String partitioning, int expectedFiles) + @Test + public void testRepartitionDataOnCtas() { - testRepartitionData(session, "tpch.tiny.orders", false, partitioning, expectedFiles); + // identity partitioning column + testRepartitionData(getSession(), "tpch.tiny.orders", true, "'orderstatus'", 3); + // bucketing + testRepartitionData(getSession(), "tpch.tiny.orders", true, "'bucket(custkey, 13)'", 13); + // varchar-based + testRepartitionData(getSession(), "tpch.tiny.orders", true, "'truncate(comment, 1)'", 35); + // complex; would exceed 100 open writers limit in IcebergPageSink without write repartitioning + testRepartitionData(getSession(), "tpch.tiny.orders", true, "'bucket(custkey, 4)', 'truncate(comment, 1)'", 131); + // same column multiple times + testRepartitionData(getSession(), "tpch.tiny.orders", true, "'truncate(comment, 1)', 'orderstatus', 'bucket(comment, 2)'", 180); } - @DataProvider - public Object[][] repartitioningDataProvider() + @Test + public void testRepartitionDataOnInsert() { - Session defaultSession = getSession(); - - return new Object[][] { - // identity partitioning column - {defaultSession, "'orderstatus'", 3}, - // bucketing - {defaultSession, "'bucket(custkey, 13)'", 13}, - // varchar-based - {defaultSession, "'truncate(comment, 1)'", 35}, - // complex; would exceed 100 open writers limit in IcebergPageSink without write repartitioning - {defaultSession, "'bucket(custkey, 4)', 'truncate(comment, 1)'", 131}, - // same column multiple times - {defaultSession, "'truncate(comment, 1)', 'orderstatus', 'bucket(comment, 2)'", 180}, - }; + // identity partitioning column + testRepartitionData(getSession(), "tpch.tiny.orders", false, "'orderstatus'", 3); + // bucketing + testRepartitionData(getSession(), "tpch.tiny.orders", false, "'bucket(custkey, 13)'", 13); + // varchar-based + testRepartitionData(getSession(), "tpch.tiny.orders", false, "'truncate(comment, 1)'", 35); + // complex; would exceed 100 open writers limit in 
IcebergPageSink without write repartitioning + testRepartitionData(getSession(), "tpch.tiny.orders", false, "'bucket(custkey, 4)', 'truncate(comment, 1)'", 131); + // same column multiple times + testRepartitionData(getSession(), "tpch.tiny.orders", false, "'truncate(comment, 1)', 'orderstatus', 'bucket(comment, 2)'", 180); } @Test @@ -4681,76 +5071,91 @@ private void testRepartitionData(Session session, String sourceRelation, boolean assertUpdate(session, "DROP TABLE " + tableName); } - @Test(dataProvider = "testDataMappingSmokeTestDataProvider") - public void testSplitPruningForFilterOnNonPartitionColumn(DataMappingTestSetup testSetup) + @Test + public void testSplitPruningForFilterOnNonPartitionColumn() { - if (testSetup.isUnsupportedType()) { - return; - } - try (TestTable table = new TestTable(getQueryRunner()::execute, "test_split_pruning_non_partitioned", "(row_id int, col " + testSetup.getTrinoTypeName() + ")")) { - String tableName = table.getName(); - String sampleValue = testSetup.getSampleValueLiteral(); - String highValue = testSetup.getHighValueLiteral(); - // Insert separately to ensure two files with one value each - assertUpdate("INSERT INTO " + tableName + " VALUES (1, " + sampleValue + ")", 1); - assertUpdate("INSERT INTO " + tableName + " VALUES (2, " + highValue + ")", 1); - assertQuery("select count(*) from \"" + tableName + "$files\"", "VALUES 2"); - - int expectedSplitCount = supportsIcebergFileStatistics(testSetup.getTrinoTypeName()) ? 1 : 2; - verifySplitCount("SELECT row_id FROM " + tableName, 2); - verifySplitCount("SELECT row_id FROM " + tableName + " WHERE col = " + sampleValue, expectedSplitCount); - verifySplitCount("SELECT row_id FROM " + tableName + " WHERE col = " + highValue, expectedSplitCount); - - // ORC max timestamp statistics are truncated to millisecond precision and then appended with 999 microseconds. - // Therefore, sampleValue and highValue are within the max timestamp & there will be 2 splits. - verifySplitCount("SELECT row_id FROM " + tableName + " WHERE col > " + sampleValue, - (format == ORC && testSetup.getTrinoTypeName().contains("timestamp") ? 2 : expectedSplitCount)); - verifySplitCount("SELECT row_id FROM " + tableName + " WHERE col < " + highValue, - (format == ORC && testSetup.getTrinoTypeName().contains("timestamp") ? 2 : expectedSplitCount)); + for (DataMappingTestSetup testSetup : testDataMappingSmokeTestDataProvider()) { + if (testSetup.isUnsupportedType()) { + return; + } + try (TestTable table = newTrinoTable("test_split_pruning_non_partitioned", "(row_id int, col " + testSetup.getTrinoTypeName() + ")")) { + String tableName = table.getName(); + String sampleValue = testSetup.getSampleValueLiteral(); + String highValue = testSetup.getHighValueLiteral(); + // Insert separately to ensure two files with one value each + assertUpdate("INSERT INTO " + tableName + " VALUES (1, " + sampleValue + ")", 1); + assertUpdate("INSERT INTO " + tableName + " VALUES (2, " + highValue + ")", 1); + assertQuery("select count(*) from \"" + tableName + "$files\"", "VALUES 2"); + + int expectedSplitCount = supportsIcebergFileStatistics(testSetup.getTrinoTypeName()) ? 
1 : 2; + verifySplitCount("SELECT row_id FROM " + tableName, 2); + verifySplitCount("SELECT row_id FROM " + tableName + " WHERE col = " + sampleValue, expectedSplitCount); + verifySplitCount("SELECT row_id FROM " + tableName + " WHERE col = " + highValue, expectedSplitCount); + + // ORC max timestamp statistics are truncated to millisecond precision and then appended with 999 microseconds. + // Therefore, sampleValue and highValue are within the max timestamp & there will be 2 splits. + verifySplitCount("SELECT row_id FROM " + tableName + " WHERE col > " + sampleValue, + (format == ORC && testSetup.getTrinoTypeName().contains("timestamp") ? 2 : expectedSplitCount)); + verifySplitCount("SELECT row_id FROM " + tableName + " WHERE col < " + highValue, + (format == ORC && testSetup.getTrinoTypeName().contains("timestamp(6)") ? 2 : expectedSplitCount)); + } } } @Test - public void testGetIcebergTableProperties() + public void testGetIcebergTableWithLegacyOrcBloomFilterProperties() + throws IOException { - assertUpdate("CREATE TABLE test_iceberg_get_table_props (x BIGINT)"); - verifyIcebergTableProperties(computeActual("SELECT * FROM \"test_iceberg_get_table_props$properties\"")); - dropTable("test_iceberg_get_table_props"); - } + String tableName = "test_get_table_with_legacy_orc_bloom_filter_" + randomNameSuffix(); + assertUpdate("CREATE TABLE " + tableName + " AS SELECT 1 x, 'INDIA' y", 1); - protected void verifyIcebergTableProperties(MaterializedResult actual) - { - assertThat(actual).isNotNull(); - MaterializedResult expected = resultBuilder(getSession()) - .row("write.format.default", format.name()) - .build(); - assertEqualsIgnoreOrder(actual.getMaterializedRows(), expected.getMaterializedRows()); + String tableLocation = getTableLocation(tableName); + String metadataLocation = getLatestMetadataLocation(fileSystem, tableLocation); + + TableMetadata tableMetadata = TableMetadataParser.read(FILE_IO_FACTORY.create(fileSystem), metadataLocation); + Map newProperties = ImmutableMap.builder() + .putAll(tableMetadata.properties()) + .put("orc.bloom.filter.columns", "x,y") // legacy incorrect property + .put("orc.bloom.filter.fpp", "0.2") // legacy incorrect property + .buildOrThrow(); + TableMetadata newTableMetadata = newTableMetadata( + tableMetadata.schema(), + tableMetadata.spec(), + tableMetadata.sortOrder(), + tableMetadata.location(), + newProperties); + byte[] metadataJson = TableMetadataParser.toJson(newTableMetadata).getBytes(UTF_8); + fileSystem.newOutputFile(Location.of(metadataLocation)).createOrOverwrite(metadataJson); + + assertThat((String) computeScalar("SHOW CREATE TABLE " + tableName)) + .contains("orc_bloom_filter_columns", "orc_bloom_filter_fpp"); } protected abstract boolean supportsIcebergFileStatistics(String typeName); - @Test(dataProvider = "testDataMappingSmokeTestDataProvider") - public void testSplitPruningFromDataFileStatistics(DataMappingTestSetup testSetup) + @Test + public void testSplitPruningFromDataFileStatistics() { - if (testSetup.isUnsupportedType()) { - return; - } - try (TestTable table = new TestTable( - getQueryRunner()::execute, - "test_split_pruning_data_file_statistics", - // Random double is needed to make sure rows are different. 
Otherwise compression may deduplicate rows, resulting in only one row group - "(col " + testSetup.getTrinoTypeName() + ", r double)")) { - String tableName = table.getName(); - String values = - Stream.concat( - nCopies(100, testSetup.getSampleValueLiteral()).stream(), - nCopies(100, testSetup.getHighValueLiteral()).stream()) - .map(value -> "(" + value + ", rand())") - .collect(joining(", ")); - assertUpdate(withSmallRowGroups(getSession()), "INSERT INTO " + tableName + " VALUES " + values, 200); - - String query = "SELECT * FROM " + tableName + " WHERE col = " + testSetup.getSampleValueLiteral(); - verifyPredicatePushdownDataRead(query, supportsRowGroupStatistics(testSetup.getTrinoTypeName())); + for (DataMappingTestSetup testSetup : testDataMappingSmokeTestDataProvider()) { + if (testSetup.isUnsupportedType()) { + return; + } + try (TestTable table = newTrinoTable( + "test_split_pruning_data_file_statistics", + // Random double is needed to make sure rows are different. Otherwise compression may deduplicate rows, resulting in only one row group + "(col " + testSetup.getTrinoTypeName() + ", r double)")) { + String tableName = table.getName(); + String values = + Stream.concat( + nCopies(100, testSetup.getSampleValueLiteral()).stream(), + nCopies(100, testSetup.getHighValueLiteral()).stream()) + .map(value -> "(" + value + ", rand())") + .collect(joining(", ")); + assertUpdate(withSmallRowGroups(getSession()), "INSERT INTO " + tableName + " VALUES " + values, 200); + + String query = "SELECT * FROM " + tableName + " WHERE col = " + testSetup.getSampleValueLiteral(); + verifyPredicatePushdownDataRead(query, supportsRowGroupStatistics(testSetup.getTrinoTypeName())); + } } } @@ -4758,20 +5163,20 @@ public void testSplitPruningFromDataFileStatistics(DataMappingTestSetup testSetu private void verifySplitCount(String query, int expectedSplitCount) { - MaterializedResultWithQueryId selectAllPartitionsResult = getDistributedQueryRunner().executeWithQueryId(getSession(), query); - assertEqualsIgnoreOrder(selectAllPartitionsResult.getResult().getMaterializedRows(), computeActual(withoutPredicatePushdown(getSession()), query).getMaterializedRows()); - verifySplitCount(selectAllPartitionsResult.getQueryId(), expectedSplitCount); + MaterializedResultWithPlan selectAllPartitionsResult = getDistributedQueryRunner().executeWithPlan(getSession(), query); + assertEqualsIgnoreOrder(selectAllPartitionsResult.result().getMaterializedRows(), computeActual(withoutPredicatePushdown(getSession()), query).getMaterializedRows()); + verifySplitCount(selectAllPartitionsResult.queryId(), expectedSplitCount); } private void verifyPredicatePushdownDataRead(@Language("SQL") String query, boolean supportsPushdown) { - MaterializedResultWithQueryId resultWithPredicatePushdown = getDistributedQueryRunner().executeWithQueryId(getSession(), query); - MaterializedResultWithQueryId resultWithoutPredicatePushdown = getDistributedQueryRunner().executeWithQueryId( + MaterializedResultWithPlan resultWithPredicatePushdown = getDistributedQueryRunner().executeWithPlan(getSession(), query); + MaterializedResultWithPlan resultWithoutPredicatePushdown = getDistributedQueryRunner().executeWithPlan( withoutPredicatePushdown(getSession()), query); - DataSize withPushdownDataSize = getOperatorStats(resultWithPredicatePushdown.getQueryId()).getInputDataSize(); - DataSize withoutPushdownDataSize = getOperatorStats(resultWithoutPredicatePushdown.getQueryId()).getInputDataSize(); + DataSize withPushdownDataSize = 
getOperatorStats(resultWithPredicatePushdown.queryId()).getInputDataSize(); + DataSize withoutPushdownDataSize = getOperatorStats(resultWithoutPredicatePushdown.queryId()).getInputDataSize(); if (supportsPushdown) { assertThat(withPushdownDataSize).isLessThan(withoutPushdownDataSize); } @@ -4804,7 +5209,7 @@ private void verifySplitCount(QueryId queryId, long expectedSplitCount) } } - private OperatorStats getOperatorStats(QueryId queryId) + protected OperatorStats getOperatorStats(QueryId queryId) { try { return getDistributedQueryRunner().getCoordinator() @@ -4824,36 +5229,16 @@ private OperatorStats getOperatorStats(QueryId queryId) @Override protected TestTable createTableWithDefaultColumns() { - throw new SkipException("Iceberg connector does not support column default values"); + return abort("Iceberg connector does not support column default values"); } @Override protected Optional filterDataMappingSmokeTestData(DataMappingTestSetup dataMappingTestSetup) { String typeName = dataMappingTestSetup.getTrinoTypeName(); - if (typeName.equals("tinyint") - || typeName.equals("smallint") - || typeName.startsWith("char(")) { - // These types are not supported by Iceberg - return Optional.of(dataMappingTestSetup.asUnsupported()); - } - - // According to Iceberg specification all time and timestamp values are stored with microsecond precision. - if (typeName.equals("time") || - typeName.equals("timestamp") || - typeName.equals("timestamp(3) with time zone")) { - return Optional.of(dataMappingTestSetup.asUnsupported()); - } - - return Optional.of(dataMappingTestSetup); - } - - @Override - protected Optional filterCaseSensitiveDataMappingTestData(DataMappingTestSetup dataMappingTestSetup) - { - String typeName = dataMappingTestSetup.getTrinoTypeName(); - if (typeName.equals("char(1)")) { - return Optional.of(dataMappingTestSetup.asUnsupported()); + if (typeName.equals("char(3)")) { + // Use explicitly padded literal in char mapping test due to whitespace padding on coercion to varchar + return Optional.of(new DataMappingTestSetup(typeName, "'ab '", dataMappingTestSetup.getHighValueLiteral())); } return Optional.of(dataMappingTestSetup); } @@ -4871,7 +5256,7 @@ public void testAmbiguousColumnsWithDots() assertUpdate("CREATE TABLE ambiguous (a ROW(cow BIGINT))"); assertThatThrownBy(() -> assertUpdate("ALTER TABLE ambiguous ADD COLUMN \"a.cow\" BIGINT")) - .hasMessage("Failed to add column: Cannot add column with ambiguous name: a.cow, use addColumn(parent, name, type)"); + .hasMessage("Failed to add column: Cannot add column, name already exists: a.cow"); assertUpdate("DROP TABLE ambiguous"); } @@ -4879,22 +5264,23 @@ public void testAmbiguousColumnsWithDots() public void testSchemaEvolutionWithDereferenceProjections() { // Fields are identified uniquely based on unique id's. If a column is dropped and recreated with the same name it should not return dropped data. 
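The comment above summarizes Iceberg's column-identity model: columns are resolved by field id rather than by name, so a column that is dropped and re-added under the same name receives a fresh id and cannot see data written for the old id. The test that follows exercises this through SQL; for reference, a minimal sketch of the same idea at the Iceberg schema level (ids and types here are chosen for illustration, not taken from this patch):

    import org.apache.iceberg.Schema;
    import org.apache.iceberg.types.Types;

    // Schema before the drop: "a" is field id 2.
    private static final Schema BEFORE_DROP = new Schema(
            Types.NestedField.required(1, "dummy", Types.LongType.get()),
            Types.NestedField.optional(2, "a", Types.StructType.of(
                    Types.NestedField.optional(4, "b", Types.LongType.get()))));

    // Schema after dropping and re-adding "a": same name, but a new field id (3), so data
    // files written against field id 2 no longer surface under the re-created column.
    private static final Schema AFTER_READD = new Schema(
            Types.NestedField.required(1, "dummy", Types.LongType.get()),
            Types.NestedField.optional(3, "a", Types.StructType.of(
                    Types.NestedField.optional(5, "b", Types.StringType.get()))));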
- assertUpdate("CREATE TABLE evolve_test (dummy BIGINT, a row(b BIGINT, c VARCHAR))"); - assertUpdate("INSERT INTO evolve_test VALUES (1, ROW(1, 'abc'))", 1); - assertUpdate("ALTER TABLE evolve_test DROP COLUMN a"); - assertUpdate("ALTER TABLE evolve_test ADD COLUMN a ROW(b VARCHAR, c BIGINT)"); - assertQuery("SELECT a.b FROM evolve_test", "VALUES NULL"); - assertUpdate("DROP TABLE evolve_test"); + String tableName = "evolve_test_" + randomNameSuffix(); + assertUpdate("CREATE TABLE " + tableName + " (dummy BIGINT, a row(b BIGINT, c VARCHAR))"); + assertUpdate("INSERT INTO " + tableName + " VALUES (1, ROW(1, 'abc'))", 1); + assertUpdate("ALTER TABLE " + tableName + " DROP COLUMN a"); + assertUpdate("ALTER TABLE " + tableName + " ADD COLUMN a ROW(b VARCHAR, c BIGINT)"); + assertQuery("SELECT a.b FROM " + tableName, "VALUES NULL"); + assertUpdate("DROP TABLE " + tableName); - // Very changing subfield ordering does not revive dropped data - assertUpdate("CREATE TABLE evolve_test (dummy BIGINT, a ROW(b BIGINT, c VARCHAR), d BIGINT) with (partitioning = ARRAY['d'])"); - assertUpdate("INSERT INTO evolve_test VALUES (1, ROW(2, 'abc'), 3)", 1); - assertUpdate("ALTER TABLE evolve_test DROP COLUMN a"); - assertUpdate("ALTER TABLE evolve_test ADD COLUMN a ROW(c VARCHAR, b BIGINT)"); - assertUpdate("INSERT INTO evolve_test VALUES (4, 5, ROW('def', 6))", 1); - assertQuery("SELECT a.b FROM evolve_test WHERE d = 3", "VALUES NULL"); - assertQuery("SELECT a.b FROM evolve_test WHERE d = 5", "VALUES 6"); - assertUpdate("DROP TABLE evolve_test"); + // Verify changing subfield ordering does not revive dropped data + assertUpdate("CREATE TABLE " + tableName + " (dummy BIGINT, a ROW(b BIGINT, c VARCHAR), d BIGINT) with (partitioning = ARRAY['d'])"); + assertUpdate("INSERT INTO " + tableName + " VALUES (1, ROW(2, 'abc'), 3)", 1); + assertUpdate("ALTER TABLE " + tableName + " DROP COLUMN a"); + assertUpdate("ALTER TABLE " + tableName + " ADD COLUMN a ROW(c VARCHAR, b BIGINT)"); + assertUpdate("INSERT INTO " + tableName + " VALUES (4, 5, ROW('def', 6))", 1); + assertQuery("SELECT a.b FROM " + tableName + " WHERE d = 3", "VALUES NULL"); + assertQuery("SELECT a.b FROM " + tableName + " WHERE d = 5", "VALUES 6"); + assertUpdate("DROP TABLE " + tableName); } @Test @@ -4943,118 +5329,161 @@ public void testProjectionPushdownOnPartitionedTableWithComments() assertUpdate("DROP TABLE IF EXISTS test_projection_pushdown_comments"); } - @Test(dataProvider = "tableFormatVersion") - public void testOptimize(int formatVersion) - throws Exception + @Test + public void testMaxWriterTaskCount() { - String tableName = "test_optimize_" + randomNameSuffix(); - assertUpdate("CREATE TABLE " + tableName + " (key integer, value varchar) WITH (format_version = " + formatVersion + ")"); - - // DistributedQueryRunner sets node-scheduler.include-coordinator by default, so include coordinator int workerCount = getQueryRunner().getNodeCount(); + checkState(workerCount > 1, "testMaxWriterTaskCount requires multiple workers"); - // optimize an empty table - assertQuerySucceeds(withSingleWriterPerTask(getSession()), "ALTER TABLE " + tableName + " EXECUTE OPTIMIZE"); - assertThat(getActiveFiles(tableName)).isEmpty(); - - assertUpdate("INSERT INTO " + tableName + " VALUES (11, 'eleven')", 1); - assertUpdate("INSERT INTO " + tableName + " VALUES (12, 'zwölf')", 1); - assertUpdate("INSERT INTO " + tableName + " VALUES (13, 'trzynaście')", 1); - assertUpdate("INSERT INTO " + tableName + " VALUES (14, 'quatorze')", 1); - assertUpdate("INSERT INTO " + 
tableName + " VALUES (15, 'пʼятнадцять')", 1); - - List initialFiles = getActiveFiles(tableName); - assertThat(initialFiles) - .hasSize(5) - // Verify we have sufficiently many test rows with respect to worker count. - .hasSizeGreaterThan(workerCount); + assertUpdate("CREATE TABLE test_max_writer_task_count_insert (id BIGINT) WITH (partitioning = ARRAY['id'])"); - // For optimize we need to set task_writer_count to 1, otherwise it will create more than one file. - computeActual(withSingleWriterPerTask(getSession()), "ALTER TABLE " + tableName + " EXECUTE OPTIMIZE"); - assertThat(query("SELECT sum(key), listagg(value, ' ') WITHIN GROUP (ORDER BY key) FROM " + tableName)) - .matches("VALUES (BIGINT '65', VARCHAR 'eleven zwölf trzynaście quatorze пʼятнадцять')"); - List updatedFiles = getActiveFiles(tableName); - assertThat(updatedFiles) - .hasSizeBetween(1, workerCount) - .doesNotContainAnyElementsOf(initialFiles); - // No files should be removed (this is expire_snapshots's job, when it exists) - assertThat(getAllDataFilesFromTableDirectory(tableName)) - .containsExactlyInAnyOrderElementsOf(concat(initialFiles, updatedFiles)); - - // optimize with low retention threshold, nothing should change - // For optimize we need to set task_writer_count to 1, otherwise it will create more than one file. - computeActual(withSingleWriterPerTask(getSession()), "ALTER TABLE " + tableName + " EXECUTE OPTIMIZE (file_size_threshold => '33B')"); - assertThat(query("SELECT sum(key), listagg(value, ' ') WITHIN GROUP (ORDER BY key) FROM " + tableName)) - .matches("VALUES (BIGINT '65', VARCHAR 'eleven zwölf trzynaście quatorze пʼятнадцять')"); - assertThat(getActiveFiles(tableName)).isEqualTo(updatedFiles); - assertThat(getAllDataFilesFromTableDirectory(tableName)) - .containsExactlyInAnyOrderElementsOf(concat(initialFiles, updatedFiles)); - - // optimize with delimited procedure name - assertQueryFails("ALTER TABLE " + tableName + " EXECUTE \"optimize\"", "Table procedure not registered: optimize"); - assertUpdate("ALTER TABLE " + tableName + " EXECUTE \"OPTIMIZE\""); - // optimize with delimited parameter name (and procedure name) - assertUpdate("ALTER TABLE " + tableName + " EXECUTE \"OPTIMIZE\" (\"file_size_threshold\" => '33B')"); // TODO (https://github.com/trinodb/trino/issues/11326) this should fail - assertUpdate("ALTER TABLE " + tableName + " EXECUTE \"OPTIMIZE\" (\"FILE_SIZE_THRESHOLD\" => '33B')"); - assertUpdate("DROP TABLE " + tableName); + Session session = Session.builder(getSession()) + // disable writer scaling for the test + .setSystemProperty(SCALE_WRITERS, "false") + .setSystemProperty(TASK_SCALE_WRITERS_ENABLED, "false") + // limit number of writer tasks to 1 + .setSystemProperty(MAX_WRITER_TASK_COUNT, "1") + .setSystemProperty(MAX_HASH_PARTITION_COUNT, Integer.toString(workerCount)) + .build(); + QueryId id = getDistributedQueryRunner() + .executeWithPlan(session, """ + INSERT INTO test_max_writer_task_count_insert + SELECT * FROM TABLE(sequence(start => 0, stop => 100, step => 1)) + """) + .queryId(); + StagesInfo stagesInfo = getDistributedQueryRunner() + .getCoordinator() + .getFullQueryInfo(id) + .getStages() + .orElseThrow(); + StageId outputStageId = stagesInfo.getOutputStageId(); + StageInfo writerStage = stagesInfo.getSubStages(outputStageId).getFirst(); + assertThat(PlanNodeSearcher.searchFrom(writerStage.getPlan().getRoot()).whereIsInstanceOfAny(TableWriterNode.class).matches()).isTrue(); + assertThat(writerStage.getTasks().size()).isEqualTo(1); + + assertUpdate("DROP TABLE IF 
EXISTS test_max_writer_task_count_insert"); + } + + @Test + public void testOptimize() + throws Exception + { + for (int formatVersion = IcebergConfig.FORMAT_VERSION_SUPPORT_MIN; formatVersion < IcebergConfig.FORMAT_VERSION_SUPPORT_MAX; formatVersion++) { + String tableName = "test_optimize_" + randomNameSuffix(); + assertUpdate("CREATE TABLE " + tableName + " (key integer, value varchar) WITH (format_version = " + formatVersion + ")"); + + // DistributedQueryRunner sets node-scheduler.include-coordinator by default, so include coordinator + int workerCount = getQueryRunner().getNodeCount(); + + // optimize an empty table + assertQuerySucceeds(withSingleWriterPerTask(getSession()), "ALTER TABLE " + tableName + " EXECUTE OPTIMIZE"); + assertThat(getActiveFiles(tableName)).isEmpty(); + + assertUpdate("INSERT INTO " + tableName + " VALUES (11, 'eleven')", 1); + assertUpdate("INSERT INTO " + tableName + " VALUES (12, 'zwölf')", 1); + assertUpdate("INSERT INTO " + tableName + " VALUES (13, 'trzynaście')", 1); + assertUpdate("INSERT INTO " + tableName + " VALUES (14, 'quatorze')", 1); + assertUpdate("INSERT INTO " + tableName + " VALUES (15, 'пʼятнадцять')", 1); + + List initialFiles = getActiveFiles(tableName); + assertThat(initialFiles) + .hasSize(5) + // Verify we have sufficiently many test rows with respect to worker count. + .hasSizeGreaterThan(workerCount); + + // For optimize we need to set task_min_writer_count to 1, otherwise it will create more than one file. + computeActual(withSingleWriterPerTask(getSession()), "ALTER TABLE " + tableName + " EXECUTE OPTIMIZE"); + assertThat(query("SELECT sum(key), listagg(value, ' ') WITHIN GROUP (ORDER BY key) FROM " + tableName)) + .matches("VALUES (BIGINT '65', VARCHAR 'eleven zwölf trzynaście quatorze пʼятнадцять')"); + List updatedFiles = getActiveFiles(tableName); + assertThat(updatedFiles) + .hasSizeBetween(1, workerCount) + .doesNotContainAnyElementsOf(initialFiles); + // No files should be removed (this is expire_snapshots's job, when it exists) + assertThat(getAllDataFilesFromTableDirectory(tableName)) + .containsExactlyInAnyOrderElementsOf(concat(initialFiles, updatedFiles)); + + // optimize with low retention threshold, nothing should change + // For optimize we need to set task_min_writer_count to 1, otherwise it will create more than one file. 
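The comment above, which recurs through the OPTIMIZE tests in this patch, refers to the withSingleWriterPerTask(...) helper wrapped around getSession(). Its body is not shown in this excerpt; going by the renamed comment (task_writer_count becoming task_min_writer_count), it presumably pins that session property to 1, roughly as follows (a sketch, not the actual helper):

    import io.trino.Session;

    // Hypothetical sketch: force a single writer per task so OPTIMIZE produces one
    // output file per task instead of fanning out across writer threads.
    private static Session withSingleWriterPerTask(Session session)
    {
        return Session.builder(session)
                .setSystemProperty("task_min_writer_count", "1")
                .build();
    }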
+ computeActual(withSingleWriterPerTask(getSession()), "ALTER TABLE " + tableName + " EXECUTE OPTIMIZE (file_size_threshold => '33B')"); + assertThat(query("SELECT sum(key), listagg(value, ' ') WITHIN GROUP (ORDER BY key) FROM " + tableName)) + .matches("VALUES (BIGINT '65', VARCHAR 'eleven zwölf trzynaście quatorze пʼятнадцять')"); + assertThat(getActiveFiles(tableName)).isEqualTo(updatedFiles); + assertThat(getAllDataFilesFromTableDirectory(tableName)) + .containsExactlyInAnyOrderElementsOf(concat(initialFiles, updatedFiles)); + + // optimize with delimited procedure name + assertQueryFails("ALTER TABLE " + tableName + " EXECUTE \"optimize\"", "Table procedure not registered: optimize"); + assertUpdate("ALTER TABLE " + tableName + " EXECUTE \"OPTIMIZE\""); + // optimize with delimited parameter name (and procedure name) + assertUpdate("ALTER TABLE " + tableName + " EXECUTE \"OPTIMIZE\" (\"file_size_threshold\" => '33B')"); // TODO (https://github.com/trinodb/trino/issues/11326) this should fail + assertUpdate("ALTER TABLE " + tableName + " EXECUTE \"OPTIMIZE\" (\"FILE_SIZE_THRESHOLD\" => '33B')"); + assertUpdate("DROP TABLE " + tableName); + } } - @Test(dataProvider = "tableFormatVersion") - public void testOptimizeForPartitionedTable(int formatVersion) + @Test + public void testOptimizeForPartitionedTable() throws IOException { - // This test will have its own session to make sure partitioning is indeed forced and is not a result - // of session configuration - Session session = testSessionBuilder() - .setCatalog(getQueryRunner().getDefaultSession().getCatalog()) - .setSchema(getQueryRunner().getDefaultSession().getSchema()) - .setSystemProperty("use_preferred_write_partitioning", "true") - .build(); - String tableName = "test_repartitiong_during_optimize_" + randomNameSuffix(); - assertUpdate(session, "CREATE TABLE " + tableName + " (key varchar, value integer) WITH (format_version = " + formatVersion + ", partitioning = ARRAY['key'])"); - // optimize an empty table - assertQuerySucceeds(withSingleWriterPerTask(session), "ALTER TABLE " + tableName + " EXECUTE OPTIMIZE"); - - assertUpdate(session, "INSERT INTO " + tableName + " VALUES ('one', 1)", 1); - assertUpdate(session, "INSERT INTO " + tableName + " VALUES ('one', 2)", 1); - assertUpdate(session, "INSERT INTO " + tableName + " VALUES ('one', 3)", 1); - assertUpdate(session, "INSERT INTO " + tableName + " VALUES ('one', 4)", 1); - assertUpdate(session, "INSERT INTO " + tableName + " VALUES ('one', 5)", 1); - assertUpdate(session, "INSERT INTO " + tableName + " VALUES ('one', 6)", 1); - assertUpdate(session, "INSERT INTO " + tableName + " VALUES ('one', 7)", 1); - assertUpdate(session, "INSERT INTO " + tableName + " VALUES ('two', 8)", 1); - assertUpdate(session, "INSERT INTO " + tableName + " VALUES ('two', 9)", 1); - assertUpdate(session, "INSERT INTO " + tableName + " VALUES ('three', 10)", 1); - - List initialFiles = getActiveFiles(tableName); - assertThat(initialFiles).hasSize(10); - - // For optimize we need to set task_writer_count to 1, otherwise it will create more than one file. 
- computeActual(withSingleWriterPerTask(session), "ALTER TABLE " + tableName + " EXECUTE OPTIMIZE"); - - assertThat(query(session, "SELECT sum(value), listagg(key, ' ') WITHIN GROUP (ORDER BY key) FROM " + tableName)) - .matches("VALUES (BIGINT '55', VARCHAR 'one one one one one one one three two two')"); - - List updatedFiles = getActiveFiles(tableName); - // as we force repartitioning there should be only 3 partitions - assertThat(updatedFiles).hasSize(3); - assertThat(getAllDataFilesFromTableDirectory(tableName)).containsExactlyInAnyOrderElementsOf(concat(initialFiles, updatedFiles)); - - assertUpdate("DROP TABLE " + tableName); + for (int formatVersion = IcebergConfig.FORMAT_VERSION_SUPPORT_MIN; formatVersion < IcebergConfig.FORMAT_VERSION_SUPPORT_MAX; formatVersion++) { + // This test will have its own session to make sure partitioning is indeed forced and is not a result + // of session configuration + Session session = testSessionBuilder() + .setCatalog(getQueryRunner().getDefaultSession().getCatalog()) + .setSchema(getQueryRunner().getDefaultSession().getSchema()) + .setSystemProperty("use_preferred_write_partitioning", "true") + .build(); + String tableName = "test_repartitiong_during_optimize_" + randomNameSuffix(); + assertUpdate(session, "CREATE TABLE " + tableName + " (key varchar, value integer) WITH (format_version = " + formatVersion + ", partitioning = ARRAY['key'])"); + // optimize an empty table + assertQuerySucceeds(withSingleWriterPerTask(session), "ALTER TABLE " + tableName + " EXECUTE OPTIMIZE"); + + assertUpdate(session, "INSERT INTO " + tableName + " VALUES ('one', 1)", 1); + assertUpdate(session, "INSERT INTO " + tableName + " VALUES ('one', 2)", 1); + assertUpdate(session, "INSERT INTO " + tableName + " VALUES ('one', 3)", 1); + assertUpdate(session, "INSERT INTO " + tableName + " VALUES ('one', 4)", 1); + assertUpdate(session, "INSERT INTO " + tableName + " VALUES ('one', 5)", 1); + assertUpdate(session, "INSERT INTO " + tableName + " VALUES ('one', 6)", 1); + assertUpdate(session, "INSERT INTO " + tableName + " VALUES ('one', 7)", 1); + assertUpdate(session, "INSERT INTO " + tableName + " VALUES ('two', 8)", 1); + assertUpdate(session, "INSERT INTO " + tableName + " VALUES ('two', 9)", 1); + assertUpdate(session, "INSERT INTO " + tableName + " VALUES ('three', 10)", 1); + + List initialFiles = getActiveFiles(tableName); + assertThat(initialFiles).hasSize(10); + + // For optimize we need to set task_min_writer_count to 1, otherwise it will create more than one file. 
+ computeActual(withSingleWriterPerTask(session), "ALTER TABLE " + tableName + " EXECUTE OPTIMIZE"); + + assertThat(query(session, "SELECT sum(value), listagg(key, ' ') WITHIN GROUP (ORDER BY key) FROM " + tableName)) + .matches("VALUES (BIGINT '55', VARCHAR 'one one one one one one one three two two')"); + + List updatedFiles = getActiveFiles(tableName); + // as we force repartitioning there should be only 3 partitions + assertThat(updatedFiles).hasSize(3); + assertThat(getAllDataFilesFromTableDirectory(tableName)).containsExactlyInAnyOrderElementsOf(ImmutableSet.copyOf(concat(initialFiles, updatedFiles))); + + assertUpdate("DROP TABLE " + tableName); + } } - @DataProvider - public Object[][] tableFormatVersion() + @Test + public void testOptimizeTimePartitionedTable() { - return IntStream.rangeClosed(IcebergConfig.FORMAT_VERSION_SUPPORT_MIN, IcebergConfig.FORMAT_VERSION_SUPPORT_MAX).boxed() - .collect(DataProviders.toDataProvider()); + testOptimizeTimePartitionedTable("date", "%s", 15); + testOptimizeTimePartitionedTable("date", "day(%s)", 15); + testOptimizeTimePartitionedTable("date", "month(%s)", 3); + testOptimizeTimePartitionedTable("timestamp(6)", "day(%s)", 15); + testOptimizeTimePartitionedTable("timestamp(6)", "month(%s)", 3); + testOptimizeTimePartitionedTable("timestamp(6) with time zone", "day(%s)", 15); + testOptimizeTimePartitionedTable("timestamp(6) with time zone", "month(%s)", 3); } - @Test(dataProvider = "testOptimizeTimePartitionedTableDataProvider") - public void testOptimizeTimePartitionedTable(String dataType, String partitioningFormat, int expectedFilesAfterOptimize) + private void testOptimizeTimePartitionedTable(String dataType, String partitioningFormat, int expectedFilesAfterOptimize) { String tableName = "test_optimize_time_partitioned_" + - (dataType + "_" + partitioningFormat).toLowerCase(Locale.ENGLISH).replaceAll("[^a-z0-9_]", ""); + (dataType + "_" + partitioningFormat).toLowerCase(ENGLISH).replaceAll("[^a-z0-9_]", ""); assertUpdate(format("CREATE TABLE %s(p %s, val varchar) WITH (partitioning = ARRAY['%s'])", tableName, dataType, format(partitioningFormat, "p"))); // Do several inserts so ensure more than one input file @@ -5089,7 +5518,7 @@ public void testOptimizeTimePartitionedTable(String dataType, String partitionin .isGreaterThanOrEqualTo(5); assertUpdate( - // For optimize we need to set task_writer_count to 1, otherwise it will create more than one file. + // For optimize we need to set task_min_writer_count to 1, otherwise it will create more than one file. // Use UTC zone so that DATE and TIMESTAMP WITH TIME ZONE comparisons align with partition boundaries. withSingleWriterPerTask(Session.builder(getSession()) .setTimeZoneKey(UTC_KEY) @@ -5105,7 +5534,7 @@ public void testOptimizeTimePartitionedTable(String dataType, String partitionin // Verify that WHERE CAST(p AS date) ... form works in non-UTC zone assertUpdate( - // For optimize we need to set task_writer_count to 1, otherwise it will create more than one file. + // For optimize we need to set task_min_writer_count to 1, otherwise it will create more than one file. 
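The tableFormatVersion provider removed above, and the testOptimizeTimePartitionedTable conversion next to it, follow the TestNG-to-JUnit pattern used throughout this patch: the @DataProvider disappears and a parameter-less @Test drives a private helper once per former case (or loops over the former range). A schematic of the shape with hypothetical names; the real instances are in the surrounding hunks:

    import org.junit.jupiter.api.Test;  // the migration targets JUnit 5's plain @Test

    // Before (TestNG):
    //   @DataProvider
    //   public Object[][] cases() { return new Object[][] {{"a"}, {"b"}}; }
    //
    //   @Test(dataProvider = "cases")
    //   public void testSomething(String value) { ... }

    // After (JUnit): one plain test method enumerates the former provider values.
    @Test
    public void testSomething()
    {
        testSomething("a");
        testSomething("b");
    }

    private void testSomething(String value)
    {
        // original parameterized body, unchanged
    }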
withSingleWriterPerTask(Session.builder(getSession()) .setTimeZoneKey(getTimeZoneKey("Asia/Kathmandu")) .build()), @@ -5122,20 +5551,6 @@ public void testOptimizeTimePartitionedTable(String dataType, String partitionin assertUpdate("DROP TABLE " + tableName); } - @DataProvider - public static Object[][] testOptimizeTimePartitionedTableDataProvider() - { - return new Object[][] { - {"date", "%s", 15}, - {"date", "day(%s)", 15}, - {"date", "month(%s)", 3}, - {"timestamp(6)", "day(%s)", 15}, - {"timestamp(6)", "month(%s)", 3}, - {"timestamp(6) with time zone", "day(%s)", 15}, - {"timestamp(6) with time zone", "month(%s)", 3}, - }; - } - @Test public void testOptimizeTableAfterDeleteWithFormatVersion2() { @@ -5151,7 +5566,7 @@ public void testOptimizeTableAfterDeleteWithFormatVersion2() "SELECT summary['total-delete-files'] FROM \"" + tableName + "$snapshots\" WHERE snapshot_id = " + getCurrentSnapshotId(tableName), "VALUES '1'"); - // For optimize we need to set task_writer_count to 1, otherwise it will create more than one file. + // For optimize we need to set task_min_writer_count to 1, otherwise it will create more than one file. computeActual(withSingleWriterPerTask(getSession()), "ALTER TABLE " + tableName + " EXECUTE OPTIMIZE"); List updatedFiles = getActiveFiles(tableName); @@ -5185,24 +5600,26 @@ public void testOptimizeCleansUpDeleteFiles() List allDataFilesAfterDelete = getAllDataFilesFromTableDirectory(tableName); assertThat(allDataFilesAfterDelete).hasSize(6); - // For optimize we need to set task_writer_count to 1, otherwise it will create more than one file. - computeActual(withSingleWriterPerTask(getSession()), "ALTER TABLE " + tableName + " EXECUTE OPTIMIZE WHERE regionkey = 4"); + // For optimize we need to set task_min_writer_count to 1, otherwise it will create more than one file. + computeActual(withSingleWriterPerTask(getSession()), "ALTER TABLE " + tableName + " EXECUTE OPTIMIZE WHERE regionkey = 3"); computeActual(sessionWithShortRetentionUnlocked, "ALTER TABLE " + tableName + " EXECUTE EXPIRE_SNAPSHOTS (retention_threshold => '0s')"); computeActual(sessionWithShortRetentionUnlocked, "ALTER TABLE " + tableName + " EXECUTE REMOVE_ORPHAN_FILES (retention_threshold => '0s')"); assertQuery( "SELECT summary['total-delete-files'] FROM \"" + tableName + "$snapshots\" WHERE snapshot_id = " + getCurrentSnapshotId(tableName), - "VALUES '1'"); + "VALUES '0'"); List allDataFilesAfterOptimizeWithWhere = getAllDataFilesFromTableDirectory(tableName); assertThat(allDataFilesAfterOptimizeWithWhere) - .hasSize(6) - .doesNotContain(allDataFilesInitially.stream().filter(file -> file.contains("regionkey=4")) + .hasSize(5) + .doesNotContain(allDataFilesInitially.stream().filter(file -> file.contains("regionkey=3")) + .toArray(String[]::new)) + .contains(allDataFilesInitially.stream().filter(file -> !file.contains("regionkey=3")) .toArray(String[]::new)); assertThat(query("SELECT * FROM " + tableName)) .matches("SELECT * FROM nation WHERE nationkey != 7"); - // For optimize we need to set task_writer_count to 1, otherwise it will create more than one file. + // For optimize we need to set task_min_writer_count to 1, otherwise it will create more than one file. 
computeActual(withSingleWriterPerTask(getSession()), "ALTER TABLE " + tableName + " EXECUTE OPTIMIZE"); computeActual(sessionWithShortRetentionUnlocked, "ALTER TABLE " + tableName + " EXECUTE EXPIRE_SNAPSHOTS (retention_threshold => '0s')"); computeActual(sessionWithShortRetentionUnlocked, "ALTER TABLE " + tableName + " EXECUTE REMOVE_ORPHAN_FILES (retention_threshold => '0s')"); @@ -5213,7 +5630,44 @@ public void testOptimizeCleansUpDeleteFiles() List allDataFilesAfterFullOptimize = getAllDataFilesFromTableDirectory(tableName); assertThat(allDataFilesAfterFullOptimize) .hasSize(5) - .doesNotContain(allDataFilesInitially.toArray(new String[0])); + // All files skipped from OPTIMIZE as they have no deletes and there's only one file per partition + .contains(allDataFilesAfterOptimizeWithWhere.toArray(new String[0])); + + assertThat(query("SELECT * FROM " + tableName)) + .matches("SELECT * FROM nation WHERE nationkey != 7"); + + assertUpdate("DROP TABLE " + tableName); + } + + @Test + public void testOptimizeFilesDoNotInheritSequenceNumber() + throws IOException + { + String tableName = "test_optimize_" + randomNameSuffix(); + assertUpdate("CREATE TABLE " + tableName + " AS SELECT * FROM nation", 25); + + assertUpdate("DELETE FROM " + tableName + " WHERE nationkey = 7", 1); + + // Verify that delete file exists + assertQuery( + "SELECT summary['total-delete-files'] FROM \"" + tableName + "$snapshots\" WHERE snapshot_id = " + getCurrentSnapshotId(tableName), + "VALUES '1'"); + + // For optimize we need to set task_min_writer_count to 1, otherwise it will create more than one file. + computeActual(withSingleWriterPerTask(getSession()), "ALTER TABLE " + tableName + " EXECUTE OPTIMIZE"); + + List activeEntries = getIcebergEntries(tableName); + assertThat(activeEntries).hasSize(3); + + // New rewritten data file should not inherit sequence number as it is a rewrite + assertThat(activeEntries.stream().filter(entry -> entry.status() == 1)) + .hasSize(1) + .allMatch(entry -> entry.sequenceNumber() == 2 && entry.fileSequenceNumber() == 3); + + // Other files should inherit sequence number + assertThat(activeEntries.stream().filter(entry -> entry.status() == 2)) + .hasSize(2) + .allMatch(entry -> entry.sequenceNumber().equals(entry.fileSequenceNumber())); assertThat(query("SELECT * FROM " + tableName)) .matches("SELECT * FROM nation WHERE nationkey != 7"); @@ -5229,8 +5683,8 @@ public void testOptimizeSnapshot() assertUpdate("CREATE TABLE " + tableName + " (a) AS VALUES 11", 1); long snapshotId = getCurrentSnapshotId(tableName); assertUpdate("INSERT INTO " + tableName + " VALUES 22", 1); - assertThatThrownBy(() -> query("ALTER TABLE \"%s@%d\" EXECUTE OPTIMIZE".formatted(tableName, snapshotId))) - .hasMessage(format("Invalid Iceberg table name: %s@%d", tableName, snapshotId)); + assertThat(query("ALTER TABLE \"%s@%d\" EXECUTE OPTIMIZE".formatted(tableName, snapshotId))) + .failure().hasMessage(format("line 1:7: Table 'iceberg.tpch.\"%s@%s\"' does not exist", tableName, snapshotId)); assertThat(query("SELECT * FROM " + tableName)) .matches("VALUES 11, 22"); @@ -5240,10 +5694,69 @@ public void testOptimizeSnapshot() @Test public void testOptimizeSystemTable() { - assertThatThrownBy(() -> query("ALTER TABLE \"nation$files\" EXECUTE OPTIMIZE")) - .hasMessage("This connector does not support table procedures"); - assertThatThrownBy(() -> query("ALTER TABLE \"nation$snapshots\" EXECUTE OPTIMIZE")) - .hasMessage("This connector does not support table procedures"); + assertThat(query("ALTER TABLE 
\"nation$files\" EXECUTE OPTIMIZE")) + .failure().hasMessage("This connector does not support table procedures"); + assertThat(query("ALTER TABLE \"nation$snapshots\" EXECUTE OPTIMIZE")) + .failure().hasMessage("This connector does not support table procedures"); + } + + @Test + void testOptimizeOnlyOneFileShouldHaveNoEffect() + { + String tableName = "test_optimize_one_file_" + randomNameSuffix(); + assertUpdate("CREATE TABLE " + tableName + " (a integer)"); + assertUpdate("INSERT INTO " + tableName + " VALUES 1, 2", 2); + + List initialFiles = getActiveFiles(tableName); + assertThat(initialFiles).hasSize(1); + + computeActual("ALTER TABLE " + tableName + " EXECUTE OPTIMIZE"); + assertThat(query("SELECT a FROM " + tableName)) + .matches("VALUES 1, 2"); + assertThat(getActiveFiles(tableName)) + .containsExactlyInAnyOrderElementsOf(initialFiles); + + assertUpdate("DELETE FROM " + tableName + " WHERE a = 1", 1); + // Calling optimize after adding a DELETE should result in compaction + computeActual("ALTER TABLE " + tableName + " EXECUTE OPTIMIZE"); + assertThat(query("SELECT a FROM " + tableName)) + .matches("VALUES 2"); + assertThat(getActiveFiles(tableName)) + .hasSize(1) + .doesNotContainAnyElementsOf(initialFiles); + + assertUpdate("DROP TABLE " + tableName); + } + + @Test + void testOptimizeAfterChangeInPartitioning() + { + String tableName = "test_optimize_after_change_in_partitioning_" + randomNameSuffix(); + assertUpdate("CREATE TABLE " + tableName + " WITH (partitioning = ARRAY['bucket(nationkey, 5)']) AS SELECT * FROM tpch.tiny.supplier", 100); + List initialFiles = getActiveFiles(tableName); + assertThat(initialFiles).hasSize(5); + + // OPTIMIZE shouldn't have to rewrite files + computeActual("ALTER TABLE " + tableName + " EXECUTE OPTIMIZE"); + assertThat(query("SELECT COUNT(*) FROM " + tableName)).matches("VALUES BIGINT '100'"); + assertThat(getActiveFiles(tableName)) + .containsExactlyInAnyOrderElementsOf(initialFiles); + + // Change in partitioning should result in OPTIMIZE rewriting all files + assertUpdate("ALTER TABLE " + tableName + " SET PROPERTIES partitioning = ARRAY['nationkey']"); + computeActual("ALTER TABLE " + tableName + " EXECUTE OPTIMIZE"); + assertThat(query("SELECT COUNT(*) FROM " + tableName)).matches("VALUES BIGINT '100'"); + List filesAfterPartioningChange = getActiveFiles(tableName); + assertThat(filesAfterPartioningChange) + .hasSize(25) + .doesNotContainAnyElementsOf(initialFiles); + + // OPTIMIZE shouldn't have to rewrite files anymore + computeActual("ALTER TABLE " + tableName + " EXECUTE OPTIMIZE"); + assertThat(query("SELECT COUNT(*) FROM " + tableName)).matches("VALUES BIGINT '100'"); + assertThat(getActiveFiles(tableName)) + .hasSize(25) + .containsExactlyInAnyOrderElementsOf(filesAfterPartioningChange); } private List getActiveFiles(String tableName) @@ -5253,6 +5766,17 @@ private List getActiveFiles(String tableName) .collect(toImmutableList()); } + private List getIcebergEntries(String tableName) + { + return computeActual(format("SELECT status, data_file.file_path, sequence_number, file_sequence_number FROM \"%s$entries\"", tableName)) + .getMaterializedRows() + .stream() + .map(row -> new IcebergEntry((int) row.getField(0), (String) row.getField(1), (Long) row.getField(2), (Long) row.getField(3))) + .collect(toImmutableList()); + } + + private record IcebergEntry(int status, String filePath, Long sequenceNumber, Long fileSequenceNumber) {} + protected String getTableLocation(String tableName) { Pattern locationPattern = 
Pattern.compile(".*location = '(.*?)'.*", Pattern.DOTALL); @@ -5279,10 +5803,10 @@ public void testOptimizeParameterValidation() "\\Qline 1:7: Table 'iceberg.tpch.no_such_table_exists' does not exist"); assertQueryFails( "ALTER TABLE nation EXECUTE OPTIMIZE (file_size_threshold => '33')", - "\\QUnable to set catalog 'iceberg' table procedure 'OPTIMIZE' property 'file_size_threshold' to ['33']: size is not a valid data size string: 33"); + "\\Qline 1:38: Unable to set catalog 'iceberg' table procedure 'OPTIMIZE' property 'file_size_threshold' to ['33']: size is not a valid data size string: 33"); assertQueryFails( "ALTER TABLE nation EXECUTE OPTIMIZE (file_size_threshold => '33s')", - "\\QUnable to set catalog 'iceberg' table procedure 'OPTIMIZE' property 'file_size_threshold' to ['33s']: Unknown unit: s"); + "\\Qline 1:38: Unable to set catalog 'iceberg' table procedure 'OPTIMIZE' property 'file_size_threshold' to ['33s']: Unknown unit: s"); } @Test @@ -5292,7 +5816,7 @@ public void testTargetMaxFileSize() @Language("SQL") String createTableSql = format("CREATE TABLE %s AS SELECT * FROM tpch.sf1.lineitem LIMIT 100000", tableName); Session session = Session.builder(getSession()) - .setSystemProperty("task_writer_count", "1") + .setSystemProperty("task_min_writer_count", "1") // task scale writers should be disabled since we want to write with a single task writer .setSystemProperty("task_scale_writers_enabled", "false") .build(); @@ -5303,7 +5827,7 @@ public void testTargetMaxFileSize() DataSize maxSize = DataSize.of(40, DataSize.Unit.KILOBYTE); session = Session.builder(getSession()) - .setSystemProperty("task_writer_count", "1") + .setSystemProperty("task_min_writer_count", "1") // task scale writers should be disabled since we want to write with a single task writer .setSystemProperty("task_scale_writers_enabled", "false") .setCatalogSessionProperty("iceberg", "target_max_file_size", maxSize.toString()) @@ -5321,16 +5845,52 @@ public void testTargetMaxFileSize() .forEach(row -> assertThat((Long) row.getField(0)).isBetween(1L, maxSize.toBytes() * 6)); } + @Test + public void testTargetMaxFileSizeOnSortedTable() + { + String tableName = "test_default_max_file_size_sorted_" + randomNameSuffix(); + @Language("SQL") String createTableSql = format("CREATE TABLE %s WITH (sorted_by = ARRAY['shipdate']) AS SELECT * FROM tpch.sf1.lineitem LIMIT 100000", tableName); + + Session session = Session.builder(getSession()) + .setSystemProperty("task_min_writer_count", "1") + // task scale writers should be disabled since we want to write with a single task writer + .setSystemProperty("task_scale_writers_enabled", "false") + .build(); + assertUpdate(session, createTableSql, 100000); + List initialFiles = getActiveFiles(tableName); + assertThat(initialFiles.size()).isLessThanOrEqualTo(3); + assertUpdate(format("DROP TABLE %s", tableName)); + + DataSize maxSize = DataSize.of(40, DataSize.Unit.KILOBYTE); + session = Session.builder(getSession()) + .setSystemProperty("task_min_writer_count", "1") + // task scale writers should be disabled since we want to write with a single task writer + .setSystemProperty("task_scale_writers_enabled", "false") + .setCatalogSessionProperty("iceberg", "target_max_file_size", maxSize.toString()) + .build(); + + assertUpdate(session, createTableSql, 100000); + assertThat(query(format("SELECT count(*) FROM %s", tableName))).matches("VALUES BIGINT '100000'"); + List updatedFiles = getActiveFiles(tableName); + assertThat(updatedFiles.size()).isGreaterThan(5); + + 
computeActual(format("SELECT file_size_in_bytes FROM \"%s$files\"", tableName)) + .getMaterializedRows() + // as target_max_file_size is set to quite low value it can happen that created files are bigger, + // so just to be safe we check if it is not much bigger + .forEach(row -> assertThat((Long) row.getField(0)).isBetween(1L, maxSize.toBytes() * 20)); + } + @Test public void testDroppingIcebergAndCreatingANewTableWithTheSameNameShouldBePossible() { assertUpdate("CREATE TABLE test_iceberg_recreate (a_int) AS VALUES (1)", 1); assertThat(query("SELECT min(a_int) FROM test_iceberg_recreate")).matches("VALUES 1"); - dropTable("test_iceberg_recreate"); + assertUpdate("DROP TABLE test_iceberg_recreate"); assertUpdate("CREATE TABLE test_iceberg_recreate (a_varchar) AS VALUES ('Trino')", 1); assertThat(query("SELECT min(a_varchar) FROM test_iceberg_recreate")).matches("VALUES CAST('Trino' AS varchar)"); - dropTable("test_iceberg_recreate"); + assertUpdate("DROP TABLE test_iceberg_recreate"); } @Test @@ -5348,6 +5908,217 @@ public void testDropTableDeleteData() assertUpdate("DROP TABLE " + tableName); } + @Test + void testPartitionHiddenColumn() + { + String tableName = "test_partition_" + randomNameSuffix(); + @Language("SQL") String createTable = "CREATE TABLE " + tableName + " " + + "WITH (partitioning = ARRAY['zip']) AS " + + "SELECT * FROM (VALUES " + + "(0, 0), (3, 0), (6, 0), " + + "(1, 1), (4, 1), (7, 1), " + + "(2, 2), (5, 2) " + + " ) t(userid, zip)"; + assertUpdate(createTable, 8); + + // Describe output should not have the $partition hidden column + assertThat(query("DESCRIBE " + tableName)) + .skippingTypesCheck() + .matches("VALUES ('userid', 'integer', '', ''), ('zip', 'integer', '', '')"); + + String somePath = (String) computeScalar("SELECT \"$partition\" FROM " + tableName + " WHERE userid = 2"); + String anotherPath = (String) computeScalar("SELECT \"$partition\" FROM " + tableName + " WHERE userid = 3"); + assertThat(query("SELECT userid FROM " + tableName + " WHERE \"$partition\" = '" + somePath + "'")) + .matches("VALUES 2, 5") + .isFullyPushedDown(); + assertThat(query("SELECT userid FROM " + tableName + " WHERE \"$partition\" IN ('" + somePath + "', '" + anotherPath + "')")) + .matches("VALUES 0, 2, 3, 5, 6") + .isFullyPushedDown(); + assertThat(query("SELECT userid FROM " + tableName + " WHERE \"$partition\" <> '" + somePath + "'")) + .matches("VALUES 0, 1, 3, 4, 6, 7") + .isFullyPushedDown(); + assertThat(query("SELECT userid FROM " + tableName + " WHERE \"$partition\" = '" + somePath + "' AND userid > 0")) + .matches("VALUES 2, 5"); + + assertThat(query("SELECT userid FROM " + tableName + " WHERE \"$partition\" IS NOT NULL")) + .matches("VALUES 0, 1, 2, 3, 4, 5, 6, 7") + .isFullyPushedDown(); + assertThat(query("SELECT userid FROM " + tableName + " WHERE \"$partition\" IS NULL")) + .returnsEmptyResult() + .isFullyPushedDown(); + + String min = format == AVRO ? "NULL" : "'2'"; + String max = format == AVRO ? 
"NULL" : "'5'"; + assertThat(query("SHOW STATS FOR (SELECT userid FROM " + tableName + " WHERE \"$partition\" = '" + somePath + "')")) + .skippingTypesCheck() + .matches("VALUES " + + "('userid', NULL, 2e0, 0e0, NULL, " + min + ", " + max + "), " + + "(NULL, NULL, NULL, NULL, 2e0, NULL, NULL)"); + + // EXPLAIN triggers stats calculation and also rendering + assertQuerySucceeds("EXPLAIN SELECT userid FROM " + tableName + " WHERE \"$partition\" = '" + somePath + "'"); + + assertUpdate("DROP TABLE " + tableName); + } + + @Test + void testPartitionHiddenNestedField() + { + try (TestTable table = newTrinoTable("test_nested_partition", "WITH (partitioning = ARRAY['\"part.f\"']) AS SELECT 1 id, CAST(ROW(10) AS ROW(f int)) part")) { + assertThat(query("SELECT id, \"$partition\" FROM " + table.getName())) + .matches("VALUES (1, VARCHAR 'part.f=10')"); + } + } + + @Test + void testPartitionHiddenColumnNull() + { + try (TestTable table = newTrinoTable("test_null_partition", "WITH (partitioning = ARRAY['part']) AS SELECT 1 id, CAST(NULL AS integer) part")) { + assertThat(query("SELECT id, \"$partition\" FROM " + table.getName())) + .matches("VALUES (1, VARCHAR 'part=null')"); + } + } + + @Test + void testPartitionHiddenColumnWithNonPartitionTable() + { + try (TestTable table = newTrinoTable("test_non_partition", " AS SELECT 1 id")) { + assertThat(query("SELECT id, \"$partition\" FROM " + table.getName())) + .matches("VALUES (1, VARCHAR '')"); + } + } + + @Test + void testPartitionHiddenColumnMultiplePartitions() + { + try (TestTable table = newTrinoTable("test_multiple_partition", "WITH (partitioning = ARRAY['p1', 'p2']) AS SELECT 1 id, 10 p1, 100 p2")) { + assertThat(query("SELECT id, \"$partition\" FROM " + table.getName())) + .matches("VALUES (1, VARCHAR 'p1=10/p2=100')"); + } + } + + @Test + void testPartitionHiddenColumnTransform() + { + testPartitionHiddenColumnTransform("year(part)", "timestamp '2017-05-01 10:12:34'", "part_year=2017", "timestamp '2018-05-01 10:12:34'", "part_year=2018"); + testPartitionHiddenColumnTransform("month(part)", "timestamp '2017-05-01 10:12:34'", "part_month=2017-05", "timestamp '2018-05-01 10:12:34'", "part_month=2018-05"); + testPartitionHiddenColumnTransform("day(part)", "timestamp '2017-05-01 10:12:34'", "part_day=2017-05-01", "timestamp '2018-05-01 10:12:34'", "part_day=2018-05-01"); + testPartitionHiddenColumnTransform("hour(part)", "timestamp '2017-05-01 10:12:34'", "part_hour=2017-05-01-10", "timestamp '2018-05-01 10:12:34'", "part_hour=2018-05-01-10"); + testPartitionHiddenColumnTransform("bucket(part, 10)", "1", "part_bucket=6", "2", "part_bucket=2"); + testPartitionHiddenColumnTransform("truncate(part, 3)", "'abcde'", "part_trunc=abc", "'vwxyz'", "part_trunc=vwx"); + } + + private void testPartitionHiddenColumnTransform(String partitioning, String firstInput, String firstPartition, String secondInput, String secondPartition) + { + try (TestTable table = newTrinoTable("test_transform_partition", "WITH (partitioning = ARRAY['" + partitioning + "']) AS SELECT 1 id, " + firstInput + " part")) { + assertUpdate("INSERT INTO " + table.getName() + " VALUES (2, " + secondInput + ")", 1); + + assertThat(computeActual("SELECT \"$partition\" FROM " + table.getName()).getOnlyColumnAsSet()) + .containsExactlyInAnyOrder(firstPartition, secondPartition); + + assertThat(query("SELECT id FROM " + table.getName() + " WHERE \"$partition\" = '" + firstPartition + "'")) + .isFullyPushedDown() + .matches("VALUES 1"); + + assertThat(query("SELECT id FROM " + table.getName() + " 
WHERE \"$partition\" = '" + secondPartition + "'")) + .isFullyPushedDown() + .matches("VALUES 2"); + } + } + + @Test + void testPartitionHiddenColumnRenameColumn() + { + try (TestTable table = newTrinoTable("test_rename_partition", "WITH (partitioning = ARRAY['part']) AS SELECT 1 id, 10 part")) { + assertThat(query("SELECT id, \"$partition\" FROM " + table.getName())) + .matches("VALUES (1, VARCHAR 'part=10')"); + + assertUpdate("ALTER TABLE " + table.getName() + " RENAME COLUMN part TO renamed_part"); + assertThat(query("SELECT id, \"$partition\" FROM " + table.getName())) + .matches("VALUES (1, VARCHAR 'part=10')"); + + assertUpdate("INSERT INTO " + table.getName() + " VALUES (2, 20)", 1); + assertThat(query("SELECT id, \"$partition\" FROM " + table.getName())) + .matches("VALUES (1, VARCHAR 'part=10'), (2, VARCHAR 'part=20')"); + + assertThat(query("SELECT id FROM " + table.getName() + " WHERE \"$partition\" = 'part=10'")) + .isFullyPushedDown() + .matches("VALUES 1"); + + assertThat(query("SELECT id FROM " + table.getName() + " WHERE \"$partition\" = 'part=20'")) + .isFullyPushedDown() + .matches("VALUES 2"); + } + } + + @Test + void testPartitionHiddenColumnChangePartition() + { + try (TestTable table = newTrinoTable("test_change_partition", "WITH (partitioning = ARRAY['y']) AS SELECT 1 x, 10 y")) { + assertThat(query("SELECT x, \"$partition\" FROM " + table.getName())) + .matches("VALUES (1, VARCHAR 'y=10')"); + + assertUpdate("ALTER TABLE " + table.getName() + " SET PROPERTIES partitioning = ARRAY['x']"); + assertThat(query("SELECT x, \"$partition\" FROM " + table.getName())) + .matches("VALUES (1, VARCHAR 'y=10')"); + + assertUpdate("INSERT INTO " + table.getName() + " VALUES (2, 20)", 1); + assertThat(query("SELECT x, \"$partition\" FROM " + table.getName())) + .matches("VALUES (1, VARCHAR 'y=10'), (2, VARCHAR 'x=2')"); + + assertThat(query("SELECT x FROM " + table.getName() + " WHERE \"$partition\" = 'y=10'")) + .isFullyPushedDown() + .matches("VALUES 1"); + + assertThat(query("SELECT x FROM " + table.getName() + " WHERE \"$partition\" = 'x=2'")) + .isFullyPushedDown() + .matches("VALUES 2"); + } + } + + @Test + void testOptimizeWithPartitionHiddenColumn() + { + try (TestTable table = newTrinoTable("test_optimize_partition", "(id int, part int) WITH (partitioning = ARRAY['bucket(part, 3)'])")) { + assertUpdate("INSERT INTO " + table.getName() + " VALUES (1, 10), (2, 20), (3, 30)", 3); + assertUpdate("INSERT INTO " + table.getName() + " VALUES (4, 10), (5, 20), (6, 30)", 3); + + Set filesInBucket0Before = computeActual("SELECT \"$path\" FROM " + table.getName() + " WHERE \"$partition\" = 'part_bucket=0'").getOnlyColumnAsSet(); + Set filesInBucket1Before = computeActual("SELECT \"$path\" FROM " + table.getName() + " WHERE \"$partition\" = 'part_bucket=1'").getOnlyColumnAsSet(); + Set filesInBucket2Before = computeActual("SELECT \"$path\" FROM " + table.getName() + " WHERE \"$partition\" = 'part_bucket=2'").getOnlyColumnAsSet(); + + assertThat(filesInBucket0Before).hasSize(2); + assertThat(filesInBucket1Before).hasSize(2); + assertThat(filesInBucket2Before).hasSize(2); + + // Execute optimize procedure on the specific partition + assertUpdate("ALTER TABLE " + table.getName() + " EXECUTE OPTIMIZE WHERE \"$partition\" = 'part_bucket=0'"); + assertThat(query("SELECT * FROM " + table.getName())) + .matches("VALUES (1, 10), (2, 20), (3, 30), (4, 10), (5, 20), (6, 30)"); + + Set filesInBucket0After = computeActual("SELECT \"$path\" FROM " + table.getName() + " WHERE \"$partition\" = 
'part_bucket=0'").getOnlyColumnAsSet(); + Set filesInBucket1After = computeActual("SELECT \"$path\" FROM " + table.getName() + " WHERE \"$partition\" = 'part_bucket=1'").getOnlyColumnAsSet(); + Set filesInBucket2After = computeActual("SELECT \"$path\" FROM " + table.getName() + " WHERE \"$partition\" = 'part_bucket=2'").getOnlyColumnAsSet(); + + assertThat(filesInBucket0After).hasSize(1).doesNotContain(filesInBucket0Before); + assertThat(filesInBucket1After).hasSize(2).isEqualTo(filesInBucket1Before); + assertThat(filesInBucket2After).hasSize(2).isEqualTo(filesInBucket2Before); + + // Repeat optimize procedure on the same bucket and verify that the file isn't rewritten + assertUpdate("ALTER TABLE " + table.getName() + " EXECUTE OPTIMIZE WHERE \"$partition\" = 'part_bucket=0'"); + assertThat(query("SELECT * FROM " + table.getName())) + .matches("VALUES (1, 10), (2, 20), (3, 30), (4, 10), (5, 20), (6, 30)"); + + Set filesInBucket0Repeat = computeActual("SELECT \"$path\" FROM " + table.getName() + " WHERE \"$partition\" = 'part_bucket=0'").getOnlyColumnAsSet(); + Set filesInBucket1Repeat = computeActual("SELECT \"$path\" FROM " + table.getName() + " WHERE \"$partition\" = 'part_bucket=1'").getOnlyColumnAsSet(); + Set filesInBucket2Repeat = computeActual("SELECT \"$path\" FROM " + table.getName() + " WHERE \"$partition\" = 'part_bucket=2'").getOnlyColumnAsSet(); + + assertThat(filesInBucket0Repeat).hasSize(1).isEqualTo(filesInBucket0After); + assertThat(filesInBucket1Repeat).hasSize(2).isEqualTo(filesInBucket1After); + assertThat(filesInBucket2Repeat).hasSize(2).isEqualTo(filesInBucket2After); + } + } + @Test public void testPathHiddenColumn() { @@ -5416,7 +6187,7 @@ public void testOptimizeWithPathColumn() List initialFiles = getActiveFiles(tableName); assertThat(initialFiles).hasSize(4); - // For optimize we need to set task_writer_count to 1, otherwise it will create more than one file. + // For optimize we need to set task_min_writer_count to 1, otherwise it will create more than one file. 
assertQuerySucceeds(withSingleWriterPerTask(getSession()), "ALTER TABLE " + tableName + " EXECUTE OPTIMIZE WHERE \"$path\" = '" + firstPath + "' OR \"$path\" = '" + secondPath + "'"); assertQuerySucceeds(withSingleWriterPerTask(getSession()), "ALTER TABLE " + tableName + " EXECUTE OPTIMIZE WHERE \"$path\" = '" + thirdPath + "' OR \"$path\" = '" + fourthPath + "'"); @@ -5429,9 +6200,211 @@ public void testOptimizeWithPathColumn() } @Test - public void testDeleteWithPathColumn() + public void testCollectingStatisticsWithPathColumnPredicate() { - try (TestTable table = new TestTable(getQueryRunner()::execute, "test_delete_with_path_", "(key int)")) { + assertQuerySucceeds("EXPLAIN SELECT * FROM region WHERE \"$path\" = ''"); + + Session collectingStatisticsSession = Session.builder(getSession()) + .setSystemProperty("collect_plan_statistics_for_all_queries", "true") + .build(); + String tableName = "test_collect_statistics_with_path_" + randomNameSuffix(); + assertUpdate("CREATE TABLE " + tableName + "(id integer, value integer)"); + + assertUpdate("INSERT INTO " + tableName + " VALUES (1, 1)", 1); + assertUpdate("INSERT INTO " + tableName + " VALUES (2, 2)", 1); + assertUpdate("INSERT INTO " + tableName + " VALUES (3, null)", 1); + assertUpdate("INSERT INTO " + tableName + " VALUES (4, 4)", 1); + + // Make sure the whole table has stats + MaterializedResult tableStatistics = computeActual(collectingStatisticsSession, "SHOW STATS FOR (SELECT * FROM %s WHERE \"$path\" IS NOT NULL)".formatted(tableName)); + MaterializedResult expectedTableStatistics = + resultBuilder(collectingStatisticsSession, VARCHAR, DOUBLE, DOUBLE, DOUBLE, DOUBLE, VARCHAR, VARCHAR) + .row("id", null, 4.0, 0.0, null, "1", "4") + .row("value", null, 3.0, 0.25, null, "1", "4") + .row(null, null, null, null, 4.0, null, null) + .build(); + if (format == AVRO) { + expectedTableStatistics = + resultBuilder(collectingStatisticsSession, VARCHAR, DOUBLE, DOUBLE, DOUBLE, DOUBLE, VARCHAR, VARCHAR) + .row("id", null, 4.0, 0.0, null, null, null) + .row("value", null, 3.0, 0.1, null, null, null) + .row(null, null, null, null, 4.0, null, null) + .build(); + } + assertThat(tableStatistics).containsExactlyElementsOf(expectedTableStatistics); + + String firstPath = (String) computeScalar(collectingStatisticsSession, "SELECT \"$path\" FROM " + tableName + " WHERE id = 1"); + String secondPath = (String) computeScalar(collectingStatisticsSession, "SELECT \"$path\" FROM " + tableName + " WHERE id = 2"); + String thirdPath = (String) computeScalar(collectingStatisticsSession, "SELECT \"$path\" FROM " + tableName + " WHERE id = 3"); + String fourthPath = (String) computeScalar(collectingStatisticsSession, "SELECT \"$path\" FROM " + tableName + " WHERE id = 4"); + + String pathPredicateSql = "SELECT * FROM " + tableName + " WHERE \"$path\" = '%s'"; + // Check the predicate with path + assertQuery(collectingStatisticsSession, pathPredicateSql.formatted(firstPath), "VALUES (1, 1)"); + assertQuery(collectingStatisticsSession, pathPredicateSql.formatted(secondPath), "VALUES (2, 2)"); + assertQuery(collectingStatisticsSession, "SELECT COUNT(*) FROM %s WHERE \"$path\" = '%s' OR \"$path\" = '%s'".formatted(tableName, thirdPath, fourthPath), "VALUES 2"); + + MaterializedResult firstPathStatistics = computeActual(collectingStatisticsSession, "SHOW STATS FOR (" + pathPredicateSql.formatted(firstPath) + ")"); + MaterializedResult expectedFirstPathStatistics = + resultBuilder(collectingStatisticsSession, VARCHAR, DOUBLE, DOUBLE, DOUBLE, DOUBLE, VARCHAR, 
VARCHAR) + .row("id", null, 1.0, 0.0, null, "1", "1") + .row("value", null, 1.0, 0.0, null, "1", "1") + .row(null, null, null, null, 1.0, null, null) + .build(); + if (format == AVRO) { + expectedFirstPathStatistics = + resultBuilder(collectingStatisticsSession, VARCHAR, DOUBLE, DOUBLE, DOUBLE, DOUBLE, VARCHAR, VARCHAR) + .row("id", null, 1.0, 0.0, null, null, null) + .row("value", null, 1.0, 0.0, null, null, null) + .row(null, null, null, null, 1.0, null, null) + .build(); + } + assertThat(firstPathStatistics).containsExactlyElementsOf(expectedFirstPathStatistics); + + MaterializedResult secondThirdPathStatistics = computeActual(collectingStatisticsSession, "SHOW STATS FOR (SELECT * FROM %s WHERE \"$path\" IN ('%s', '%s'))".formatted(tableName, secondPath, thirdPath)); + MaterializedResult expectedSecondThirdPathStatistics = + resultBuilder(collectingStatisticsSession, VARCHAR, DOUBLE, DOUBLE, DOUBLE, DOUBLE, VARCHAR, VARCHAR) + .row("id", null, 2.0, 0.0, null, "2", "3") + .row("value", null, 1.0, 0.5, null, "2", "2") + .row(null, null, null, null, 2.0, null, null) + .build(); + if (format == AVRO) { + expectedSecondThirdPathStatistics = + resultBuilder(collectingStatisticsSession, VARCHAR, DOUBLE, DOUBLE, DOUBLE, DOUBLE, VARCHAR, VARCHAR) + .row("id", null, 2.0, 0.0, null, null, null) + .row("value", null, 2.0, 0.0, null, null, null) + .row(null, null, null, null, 2.0, null, null) + .build(); + } + assertThat(secondThirdPathStatistics).containsExactlyElementsOf(expectedSecondThirdPathStatistics); + + MaterializedResult fourthPathStatistics = computeActual(collectingStatisticsSession, "SHOW STATS FOR (" + pathPredicateSql.formatted(fourthPath) + ")"); + MaterializedResult expectedFourthPathStatistics = + resultBuilder(collectingStatisticsSession, VARCHAR, DOUBLE, DOUBLE, DOUBLE, DOUBLE, VARCHAR, VARCHAR) + .row("id", null, 1.0, 0.0, null, "4", "4") + .row("value", null, 1.0, 0.0, null, "4", "4") + .row(null, null, null, null, 1.0, null, null) + .build(); + if (format == AVRO) { + expectedFourthPathStatistics = + resultBuilder(collectingStatisticsSession, VARCHAR, DOUBLE, DOUBLE, DOUBLE, DOUBLE, VARCHAR, VARCHAR) + .row("id", null, 1.0, 0.0, null, null, null) + .row("value", null, 1.0, 0.0, null, null, null) + .row(null, null, null, null, 1.0, null, null) + .build(); + } + assertThat(fourthPathStatistics).containsExactlyElementsOf(expectedFourthPathStatistics); + + assertUpdate("DROP TABLE " + tableName); + } + + @Test + public void testCollectingStatisticsWithFileModifiedTimeColumnPredicate() + throws InterruptedException + { + assertQuerySucceeds("EXPLAIN SELECT * FROM region WHERE \"$file_modified_time\" = TIMESTAMP '2001-08-22 03:04:05.321 UTC'"); + + Session collectingStatisticsSession = Session.builder(getSession()) + .setSystemProperty("collect_plan_statistics_for_all_queries", "true") + .build(); + String tableName = "test_collect_statistics_with_file_modified_time_" + randomNameSuffix(); + assertUpdate("CREATE TABLE " + tableName + "(id integer, value integer)"); + + assertUpdate("INSERT INTO " + tableName + " VALUES (1, 1)", 1); + storageTimePrecision.sleep(1); + assertUpdate("INSERT INTO " + tableName + " VALUES (2, 2)", 1); + storageTimePrecision.sleep(1); + assertUpdate("INSERT INTO " + tableName + " VALUES (3, null)", 1); + storageTimePrecision.sleep(1); + assertUpdate("INSERT INTO " + tableName + " VALUES (4, 4)", 1); + + // Make sure the whole table has stats + MaterializedResult tableStatistics = computeActual(collectingStatisticsSession, "SHOW STATS FOR (SELECT * FROM %s 
WHERE \"$file_modified_time\" IS NOT NULL)".formatted(tableName)); + MaterializedResult expectedTableStatistics = + resultBuilder(collectingStatisticsSession, VARCHAR, DOUBLE, DOUBLE, DOUBLE, DOUBLE, VARCHAR, VARCHAR) + .row("id", null, 4.0, 0.0, null, "1", "4") + .row("value", null, 3.0, 0.25, null, "1", "4") + .row(null, null, null, null, 4.0, null, null) + .build(); + if (format == AVRO) { + expectedTableStatistics = + resultBuilder(collectingStatisticsSession, VARCHAR, DOUBLE, DOUBLE, DOUBLE, DOUBLE, VARCHAR, VARCHAR) + .row("id", null, 4.0, 0.0, null, null, null) + .row("value", null, 3.0, 0.1, null, null, null) + .row(null, null, null, null, 4.0, null, null) + .build(); + } + assertThat(tableStatistics).containsExactlyElementsOf(expectedTableStatistics); + + ZonedDateTime firstFileModifiedTime = (ZonedDateTime) computeScalar(collectingStatisticsSession, "SELECT \"$file_modified_time\" FROM " + tableName + " WHERE id = 1"); + ZonedDateTime secondFileModifiedTime = (ZonedDateTime) computeScalar(collectingStatisticsSession, "SELECT \"$file_modified_time\" FROM " + tableName + " WHERE id = 2"); + ZonedDateTime thirdFileModifiedTime = (ZonedDateTime) computeScalar(collectingStatisticsSession, "SELECT \"$file_modified_time\" FROM " + tableName + " WHERE id = 3"); + ZonedDateTime fourthFileModifiedTime = (ZonedDateTime) computeScalar(collectingStatisticsSession, "SELECT \"$file_modified_time\" FROM " + tableName + " WHERE id = 4"); + + String fileModifiedTimePredicateSql = "SELECT * FROM " + tableName + " WHERE \"$file_modified_time\" = from_iso8601_timestamp('%s')"; + // Check the predicate with fileModifiedTime + assertQuery(collectingStatisticsSession, fileModifiedTimePredicateSql.formatted(firstFileModifiedTime.format(ISO_OFFSET_DATE_TIME)), "SELECT 1, 1"); + assertQuery(collectingStatisticsSession, fileModifiedTimePredicateSql.formatted(secondFileModifiedTime.format(ISO_OFFSET_DATE_TIME)), "SELECT 2, 2"); + assertQuery(collectingStatisticsSession, "SELECT COUNT(*) FROM %s WHERE \"$file_modified_time\" = from_iso8601_timestamp('%s') OR \"$file_modified_time\" = from_iso8601_timestamp('%s')".formatted(tableName, thirdFileModifiedTime.format(ISO_OFFSET_DATE_TIME), fourthFileModifiedTime.format(ISO_OFFSET_DATE_TIME)), "VALUES 2"); + + MaterializedResult firstFileModifiedTimeStatistics = computeActual(collectingStatisticsSession, "SHOW STATS FOR (" + fileModifiedTimePredicateSql.formatted(firstFileModifiedTime.format(ISO_OFFSET_DATE_TIME)) + ")"); + MaterializedResult expectedFirstFileModifiedTimeStatistics = + resultBuilder(collectingStatisticsSession, VARCHAR, DOUBLE, DOUBLE, DOUBLE, DOUBLE, VARCHAR, VARCHAR) + .row("id", null, 1.0, 0.0, null, "1", "1") + .row("value", null, 1.0, 0.0, null, "1", "1") + .row(null, null, null, null, 1.0, null, null) + .build(); + if (format == AVRO) { + expectedFirstFileModifiedTimeStatistics = + resultBuilder(collectingStatisticsSession, VARCHAR, DOUBLE, DOUBLE, DOUBLE, DOUBLE, VARCHAR, VARCHAR) + .row("id", null, 1.0, 0.0, null, null, null) + .row("value", null, 1.0, 0.0, null, null, null) + .row(null, null, null, null, 1.0, null, null) + .build(); + } + assertThat(firstFileModifiedTimeStatistics).containsExactlyElementsOf(expectedFirstFileModifiedTimeStatistics); + + MaterializedResult secondThirdFileModifiedTimeStatistics = computeActual(collectingStatisticsSession, "SHOW STATS FOR (SELECT * FROM %s WHERE \"$file_modified_time\" IN (from_iso8601_timestamp('%s'), from_iso8601_timestamp('%s')))".formatted(tableName, 
secondFileModifiedTime.format(ISO_OFFSET_DATE_TIME), thirdFileModifiedTime.format(ISO_OFFSET_DATE_TIME))); + MaterializedResult expectedSecondThirdFileModifiedTimeStatistics = + resultBuilder(collectingStatisticsSession, VARCHAR, DOUBLE, DOUBLE, DOUBLE, DOUBLE, VARCHAR, VARCHAR) + .row("id", null, 2.0, 0.0, null, "2", "3") + .row("value", null, 1.0, 0.5, null, "2", "2") + .row(null, null, null, null, 2.0, null, null) + .build(); + if (format == AVRO) { + expectedSecondThirdFileModifiedTimeStatistics = + resultBuilder(collectingStatisticsSession, VARCHAR, DOUBLE, DOUBLE, DOUBLE, DOUBLE, VARCHAR, VARCHAR) + .row("id", null, 2.0, 0.0, null, null, null) + .row("value", null, 2.0, 0.0, null, null, null) + .row(null, null, null, null, 2.0, null, null) + .build(); + } + assertThat(secondThirdFileModifiedTimeStatistics).containsExactlyElementsOf(expectedSecondThirdFileModifiedTimeStatistics); + + MaterializedResult fourthFileModifiedTimeStatistics = computeActual(collectingStatisticsSession, "SHOW STATS FOR (" + fileModifiedTimePredicateSql.formatted(fourthFileModifiedTime.format(ISO_OFFSET_DATE_TIME)) + ")"); + MaterializedResult expectedFourthFileModifiedTimeStatistics = + resultBuilder(collectingStatisticsSession, VARCHAR, DOUBLE, DOUBLE, DOUBLE, DOUBLE, VARCHAR, VARCHAR) + .row("id", null, 1.0, 0.0, null, "4", "4") + .row("value", null, 1.0, 0.0, null, "4", "4") + .row(null, null, null, null, 1.0, null, null) + .build(); + if (format == AVRO) { + expectedFourthFileModifiedTimeStatistics = + resultBuilder(collectingStatisticsSession, VARCHAR, DOUBLE, DOUBLE, DOUBLE, DOUBLE, VARCHAR, VARCHAR) + .row("id", null, 1.0, 0.0, null, null, null) + .row("value", null, 1.0, 0.0, null, null, null) + .row(null, null, null, null, 1.0, null, null) + .build(); + } + assertThat(fourthFileModifiedTimeStatistics).containsExactlyElementsOf(expectedFourthFileModifiedTimeStatistics); + + assertUpdate("DROP TABLE " + tableName); + } + + @Test + public void testDeleteWithPathColumn() + { + try (TestTable table = newTrinoTable("test_delete_with_path_", "(key int)")) { assertUpdate("INSERT INTO " + table.getName() + " VALUES (1)", 1); sleepUninterruptibly(1, MILLISECONDS); assertUpdate("INSERT INTO " + table.getName() + " VALUES (2)", 1); @@ -5450,7 +6423,7 @@ public void testFileModifiedTimeHiddenColumn() if (storageTimePrecision.toMillis(1) > 1) { storageTimePrecision.sleep(1); } - try (TestTable table = new TestTable(getQueryRunner()::execute, "test_file_modified_time_", "(col) AS VALUES (1)")) { + try (TestTable table = newTrinoTable("test_file_modified_time_", "(col) AS VALUES (1)")) { // Describe output should not have the $file_modified_time hidden column assertThat(query("DESCRIBE " + table.getName())) .skippingTypesCheck() @@ -5463,7 +6436,8 @@ public void testFileModifiedTimeHiddenColumn() storageTimePrecision.sleep(1); assertUpdate("INSERT INTO " + table.getName() + " VALUES (2)", 1); ZonedDateTime anotherFileModifiedTime = (ZonedDateTime) computeScalar("SELECT max(\"$file_modified_time\") FROM " + table.getName()); - assertNotEquals(fileModifiedTime, anotherFileModifiedTime); + assertThat(fileModifiedTime) + .isNotEqualTo(anotherFileModifiedTime); assertThat(anotherFileModifiedTime).isAfter(fileModifiedTime); // to detect potential clock backward adjustment assertThat(query("SELECT col FROM " + table.getName() + " WHERE \"$file_modified_time\" = from_iso8601_timestamp('" + fileModifiedTime.format(ISO_OFFSET_DATE_TIME) + "')")) @@ -5492,50 +6466,49 @@ public void
testOptimizeWithFileModifiedTimeColumn() throws Exception { - String tableName = "test_optimize_with_file_modified_time_" + randomNameSuffix(); - assertUpdate("CREATE TABLE " + tableName + " (id integer)"); - - assertUpdate("INSERT INTO " + tableName + " VALUES (1)", 1); - storageTimePrecision.sleep(1); - assertUpdate("INSERT INTO " + tableName + " VALUES (2)", 1); - storageTimePrecision.sleep(1); - assertUpdate("INSERT INTO " + tableName + " VALUES (3)", 1); - storageTimePrecision.sleep(1); - assertUpdate("INSERT INTO " + tableName + " VALUES (4)", 1); - - ZonedDateTime firstFileModifiedTime = (ZonedDateTime) computeScalar("SELECT \"$file_modified_time\" FROM " + tableName + " WHERE id = 1"); - ZonedDateTime secondFileModifiedTime = (ZonedDateTime) computeScalar("SELECT \"$file_modified_time\" FROM " + tableName + " WHERE id = 2"); - ZonedDateTime thirdFileModifiedTime = (ZonedDateTime) computeScalar("SELECT \"$file_modified_time\" FROM " + tableName + " WHERE id = 3"); - ZonedDateTime fourthFileModifiedTime = (ZonedDateTime) computeScalar("SELECT \"$file_modified_time\" FROM " + tableName + " WHERE id = 4"); - // Sanity check - assertThat(List.of(firstFileModifiedTime, secondFileModifiedTime, thirdFileModifiedTime, fourthFileModifiedTime)) - .doesNotHaveDuplicates(); + try (TestTable table = newTrinoTable("test_optimize_with_file_modified_time_", "(id INT)")) { + String tableName = table.getName(); - List initialFiles = getActiveFiles(tableName); - assertThat(initialFiles).hasSize(4); + assertUpdate("INSERT INTO " + tableName + " VALUES (1)", 1); + storageTimePrecision.sleep(1); + assertUpdate("INSERT INTO " + tableName + " VALUES (2)", 1); + storageTimePrecision.sleep(1); + assertUpdate("INSERT INTO " + tableName + " VALUES (3)", 1); + storageTimePrecision.sleep(1); + assertUpdate("INSERT INTO " + tableName + " VALUES (4)", 1); - storageTimePrecision.sleep(1); - // For optimize we need to set task_writer_count to 1, otherwise it will create more than one file. 
- assertQuerySucceeds(withSingleWriterPerTask(getSession()), "ALTER TABLE " + tableName + " EXECUTE OPTIMIZE WHERE " + - "\"$file_modified_time\" = from_iso8601_timestamp('" + firstFileModifiedTime.format(ISO_OFFSET_DATE_TIME) + "') OR " + - "\"$file_modified_time\" = from_iso8601_timestamp('" + secondFileModifiedTime.format(ISO_OFFSET_DATE_TIME) + "')"); - assertQuerySucceeds(withSingleWriterPerTask(getSession()), "ALTER TABLE " + tableName + " EXECUTE OPTIMIZE WHERE " + - "\"$file_modified_time\" = from_iso8601_timestamp('" + thirdFileModifiedTime.format(ISO_OFFSET_DATE_TIME) + "') OR " + - "\"$file_modified_time\" = from_iso8601_timestamp('" + fourthFileModifiedTime.format(ISO_OFFSET_DATE_TIME) + "')"); + ZonedDateTime firstFileModifiedTime = (ZonedDateTime) computeScalar("SELECT \"$file_modified_time\" FROM " + tableName + " WHERE id = 1"); + ZonedDateTime secondFileModifiedTime = (ZonedDateTime) computeScalar("SELECT \"$file_modified_time\" FROM " + tableName + " WHERE id = 2"); + ZonedDateTime thirdFileModifiedTime = (ZonedDateTime) computeScalar("SELECT \"$file_modified_time\" FROM " + tableName + " WHERE id = 3"); + ZonedDateTime fourthFileModifiedTime = (ZonedDateTime) computeScalar("SELECT \"$file_modified_time\" FROM " + tableName + " WHERE id = 4"); + // Sanity check + assertThat(List.of(firstFileModifiedTime, secondFileModifiedTime, thirdFileModifiedTime, fourthFileModifiedTime)) + .doesNotHaveDuplicates(); - List updatedFiles = getActiveFiles(tableName); - assertThat(updatedFiles) - .hasSize(2) - .doesNotContainAnyElementsOf(initialFiles); + List initialFiles = getActiveFiles(tableName); + assertThat(initialFiles).hasSize(4); - assertUpdate("DROP TABLE " + tableName); + storageTimePrecision.sleep(1); + // For optimize we need to set task_min_writer_count to 1, otherwise it will create more than one file. 
+ assertQuerySucceeds(withSingleWriterPerTask(getSession()), "ALTER TABLE " + tableName + " EXECUTE OPTIMIZE WHERE " + + "\"$file_modified_time\" = from_iso8601_timestamp('" + firstFileModifiedTime.format(ISO_OFFSET_DATE_TIME) + "') OR " + + "\"$file_modified_time\" = from_iso8601_timestamp('" + secondFileModifiedTime.format(ISO_OFFSET_DATE_TIME) + "')"); + assertQuerySucceeds(withSingleWriterPerTask(getSession()), "ALTER TABLE " + tableName + " EXECUTE OPTIMIZE WHERE " + + "\"$file_modified_time\" = from_iso8601_timestamp('" + thirdFileModifiedTime.format(ISO_OFFSET_DATE_TIME) + "') OR " + + "\"$file_modified_time\" = from_iso8601_timestamp('" + fourthFileModifiedTime.format(ISO_OFFSET_DATE_TIME) + "')"); + + List updatedFiles = getActiveFiles(tableName); + assertThat(updatedFiles) + .hasSize(2) + .doesNotContainAnyElementsOf(initialFiles); + } } @Test public void testDeleteWithFileModifiedTimeColumn() throws Exception { - try (TestTable table = new TestTable(getQueryRunner()::execute, "test_delete_with_file_modified_time_", "(key int)")) { + try (TestTable table = newTrinoTable("test_delete_with_file_modified_time_", "(key int)")) { assertUpdate("INSERT INTO " + table.getName() + " VALUES (1)", 1); storageTimePrecision.sleep(1); assertUpdate("INSERT INTO " + table.getName() + " VALUES (2)", 1); @@ -5567,9 +6540,9 @@ public void testExpireSnapshots() .matches("VALUES (BIGINT '3', VARCHAR 'one two')"); List updatedFiles = getAllMetadataFilesFromTableDirectory(tableLocation); List updatedSnapshots = getSnapshotIds(tableName); - assertThat(updatedFiles.size()).isEqualTo(initialFiles.size() - 2); + assertThat(updatedFiles).hasSize(initialFiles.size() - 2); assertThat(updatedSnapshots.size()).isLessThan(initialSnapshots.size()); - assertThat(updatedSnapshots.size()).isEqualTo(1); + assertThat(updatedSnapshots).hasSize(1); assertThat(initialSnapshots).containsAll(updatedSnapshots); } @@ -5604,8 +6577,8 @@ public void testExpireSnapshotsOnSnapshot() assertUpdate("CREATE TABLE " + tableName + " (a) AS VALUES 11", 1); long snapshotId = getCurrentSnapshotId(tableName); assertUpdate("INSERT INTO " + tableName + " VALUES 22", 1); - assertThatThrownBy(() -> query("ALTER TABLE \"%s@%d\" EXECUTE EXPIRE_SNAPSHOTS".formatted(tableName, snapshotId))) - .hasMessage(format("Invalid Iceberg table name: %s@%d", tableName, snapshotId)); + assertThat(query("ALTER TABLE \"%s@%d\" EXECUTE EXPIRE_SNAPSHOTS".formatted(tableName, snapshotId))) + .failure().hasMessage(format("line 1:7: Table 'iceberg.tpch.\"%s@%s\"' does not exist", tableName, snapshotId)); assertThat(query("SELECT * FROM " + tableName)) .matches("VALUES 11, 22"); @@ -5615,10 +6588,10 @@ public void testExpireSnapshotsOnSnapshot() @Test public void testExpireSnapshotsSystemTable() { - assertThatThrownBy(() -> query("ALTER TABLE \"nation$files\" EXECUTE EXPIRE_SNAPSHOTS")) - .hasMessage("This connector does not support table procedures"); - assertThatThrownBy(() -> query("ALTER TABLE \"nation$snapshots\" EXECUTE EXPIRE_SNAPSHOTS")) - .hasMessage("This connector does not support table procedures"); + assertThat(query("ALTER TABLE \"nation$files\" EXECUTE EXPIRE_SNAPSHOTS")) + .failure().hasMessage("This connector does not support table procedures"); + assertThat(query("ALTER TABLE \"nation$snapshots\" EXECUTE EXPIRE_SNAPSHOTS")) + .failure().hasMessage("This connector does not support table procedures"); } @Test @@ -5630,7 +6603,7 @@ public void testExplainExpireSnapshotOutput() assertUpdate("INSERT INTO " + tableName + " VALUES ('two', 2)", 1); 
assertExplain("EXPLAIN ALTER TABLE " + tableName + " EXECUTE EXPIRE_SNAPSHOTS (retention_threshold => '0s')", - "SimpleTableExecute\\[table = iceberg:schemaTableName:tpch.test_expiring_snapshots.*\\{retentionThreshold=0\\.00s}.*"); + "SimpleTableExecute\\[table = iceberg:schemaTableName:tpch.test_expiring_snapshots.*\\[retentionThreshold=0\\.00s].*"); } @Test @@ -5641,13 +6614,29 @@ public void testExpireSnapshotsParameterValidation() "\\Qline 1:7: Table 'iceberg.tpch.no_such_table_exists' does not exist"); assertQueryFails( "ALTER TABLE nation EXECUTE EXPIRE_SNAPSHOTS (retention_threshold => '33')", - "\\QUnable to set catalog 'iceberg' table procedure 'EXPIRE_SNAPSHOTS' property 'retention_threshold' to ['33']: duration is not a valid data duration string: 33"); + "\\Qline 1:46: Unable to set catalog 'iceberg' table procedure 'EXPIRE_SNAPSHOTS' property 'retention_threshold' to ['33']: duration is not a valid data duration string: 33"); assertQueryFails( "ALTER TABLE nation EXECUTE EXPIRE_SNAPSHOTS (retention_threshold => '33mb')", - "\\QUnable to set catalog 'iceberg' table procedure 'EXPIRE_SNAPSHOTS' property 'retention_threshold' to ['33mb']: Unknown time unit: mb"); + "\\Qline 1:46: Unable to set catalog 'iceberg' table procedure 'EXPIRE_SNAPSHOTS' property 'retention_threshold' to ['33mb']: Unknown time unit: mb"); assertQueryFails( "ALTER TABLE nation EXECUTE EXPIRE_SNAPSHOTS (retention_threshold => '33s')", - "\\QRetention specified (33.00s) is shorter than the minimum retention configured in the system (7.00d). Minimum retention can be changed with iceberg.expire_snapshots.min-retention configuration property or iceberg.expire_snapshots_min_retention session property"); + "\\QRetention specified (33.00s) is shorter than the minimum retention configured in the system (7.00d). 
Minimum retention can be changed with iceberg.expire-snapshots.min-retention configuration property or iceberg.expire_snapshots_min_retention session property"); + } + + @Test + public void testRemoveOrphanFilesWithUnexpectedMissingManifest() + throws Exception + { + String tableName = "test_remove_orphan_files_with_missing_manifest_" + randomNameSuffix(); + assertUpdate("CREATE TABLE " + tableName + " (key varchar, value integer)"); + assertUpdate("INSERT INTO " + tableName + " VALUES ('one', 1)", 1); + String manifestFileToRemove = (String) computeScalar("SELECT path FROM \"" + tableName + "$manifests\""); + fileSystem.deleteFile(Location.of(manifestFileToRemove)); + + assertThat(query("ALTER TABLE " + tableName + " EXECUTE REMOVE_ORPHAN_FILES")) + .failure() + .hasErrorCode(ICEBERG_INVALID_METADATA) + .hasMessage("Manifest file does not exist: " + manifestFileToRemove); } @Test @@ -5750,7 +6739,7 @@ private void testCleaningUpWithTableWithSpecifiedLocation(String suffix) List prunedMetadataFiles = getAllMetadataFilesFromTableDirectory(tableDirectory); List prunedSnapshots = getSnapshotIds(tableName); assertThat(prunedMetadataFiles).as("prunedMetadataFiles") - .hasSize(initialMetadataFiles.size() - 3); + .hasSize(initialMetadataFiles.size() - 2); assertThat(prunedSnapshots).as("prunedSnapshots") .hasSizeLessThan(initialSnapshots.size()) .hasSize(1); @@ -5768,7 +6757,7 @@ public void testExplainRemoveOrphanFilesOutput() assertUpdate("INSERT INTO " + tableName + " VALUES ('two', 2)", 1); assertExplain("EXPLAIN ALTER TABLE " + tableName + " EXECUTE REMOVE_ORPHAN_FILES (retention_threshold => '0s')", - "SimpleTableExecute\\[table = iceberg:schemaTableName:tpch.test_remove_orphan_files.*\\{retentionThreshold=0\\.00s}.*"); + "SimpleTableExecute\\[table = iceberg:schemaTableName:tpch.test_remove_orphan_files.*\\[retentionThreshold=0\\.00s].*"); } @Test @@ -5779,13 +6768,13 @@ public void testRemoveOrphanFilesParameterValidation() "\\Qline 1:7: Table 'iceberg.tpch.no_such_table_exists' does not exist"); assertQueryFails( "ALTER TABLE nation EXECUTE REMOVE_ORPHAN_FILES (retention_threshold => '33')", - "\\QUnable to set catalog 'iceberg' table procedure 'REMOVE_ORPHAN_FILES' property 'retention_threshold' to ['33']: duration is not a valid data duration string: 33"); + "\\Qline 1:49: Unable to set catalog 'iceberg' table procedure 'REMOVE_ORPHAN_FILES' property 'retention_threshold' to ['33']: duration is not a valid data duration string: 33"); assertQueryFails( "ALTER TABLE nation EXECUTE REMOVE_ORPHAN_FILES (retention_threshold => '33mb')", - "\\QUnable to set catalog 'iceberg' table procedure 'REMOVE_ORPHAN_FILES' property 'retention_threshold' to ['33mb']: Unknown time unit: mb"); + "\\Qline 1:49: Unable to set catalog 'iceberg' table procedure 'REMOVE_ORPHAN_FILES' property 'retention_threshold' to ['33mb']: Unknown time unit: mb"); assertQueryFails( "ALTER TABLE nation EXECUTE REMOVE_ORPHAN_FILES (retention_threshold => '33s')", - "\\QRetention specified (33.00s) is shorter than the minimum retention configured in the system (7.00d). Minimum retention can be changed with iceberg.remove_orphan_files.min-retention configuration property or iceberg.remove_orphan_files_min_retention session property"); + "\\QRetention specified (33.00s) is shorter than the minimum retention configured in the system (7.00d). 
Minimum retention can be changed with iceberg.remove-orphan-files.min-retention configuration property or iceberg.remove_orphan_files_min_retention session property"); } @Test @@ -5796,8 +6785,8 @@ public void testRemoveOrphanFilesOnSnapshot() assertUpdate("CREATE TABLE " + tableName + " (a) AS VALUES 11", 1); long snapshotId = getCurrentSnapshotId(tableName); assertUpdate("INSERT INTO " + tableName + " VALUES 22", 1); - assertThatThrownBy(() -> query("ALTER TABLE \"%s@%d\" EXECUTE REMOVE_ORPHAN_FILES".formatted(tableName, snapshotId))) - .hasMessage(format("Invalid Iceberg table name: %s@%d", tableName, snapshotId)); + assertThat(query("ALTER TABLE \"%s@%d\" EXECUTE REMOVE_ORPHAN_FILES".formatted(tableName, snapshotId))) + .failure().hasMessage(format("line 1:7: Table 'iceberg.tpch.\"%s@%s\"' does not exist", tableName, snapshotId)); assertThat(query("SELECT * FROM " + tableName)) .matches("VALUES 11, 22"); @@ -5807,10 +6796,10 @@ public void testRemoveOrphanFilesOnSnapshot() @Test public void testRemoveOrphanFilesSystemTable() { - assertThatThrownBy(() -> query("ALTER TABLE \"nation$files\" EXECUTE REMOVE_ORPHAN_FILES")) - .hasMessage("This connector does not support table procedures"); - assertThatThrownBy(() -> query("ALTER TABLE \"nation$snapshots\" EXECUTE REMOVE_ORPHAN_FILES")) - .hasMessage("This connector does not support table procedures"); + assertThat(query("ALTER TABLE \"nation$files\" EXECUTE REMOVE_ORPHAN_FILES")) + .failure().hasMessage("This connector does not support table procedures"); + assertThat(query("ALTER TABLE \"nation$snapshots\" EXECUTE REMOVE_ORPHAN_FILES")) + .failure().hasMessage("This connector does not support table procedures"); } @Test @@ -5848,13 +6837,251 @@ public void testUpdatingFileFormat() assertUpdate("DROP TABLE " + tableName); } + @Test + public void testCreateTableAsWithCompressionCodecs() + { + String compressionProperty = getCompressionPropertyName(format); + + for (HiveCompressionCodec compressionCodec : getCompressionCodecs(Optional.empty())) { + String tableName = format("test_ctas_%s_codec_%s_%s", compressionCodec.name(), format, randomNameSuffix()); + if (isCompressionCodecSupportedForFormat(format, compressionCodec)) { + assertUpdate( + format("CREATE TABLE %s WITH (format = '%s', compression_codec = '%s') AS SELECT * FROM nation", tableName, format, compressionCodec.name()), + "SELECT count(*) FROM nation"); + + assertThat(getTableProperties(tableName)) + .containsEntry(DEFAULT_FILE_FORMAT, format.toString()) + .containsEntry(compressionProperty, compressionCodec.name()); + + assertThat(query("SELECT * FROM " + tableName)).matches("SELECT * FROM nation"); + assertThat(query(format("SELECT count(*) FROM \"%s$files\" WHERE file_path LIKE '%%.%s'", tableName, format.name().toLowerCase(ENGLISH)))) + .matches("SELECT BIGINT '1'"); + + assertUpdate( + "INSERT INTO " + tableName + " SELECT * FROM nation WHERE nationkey >= 10", + "SELECT count(*) FROM nation WHERE nationkey >= 10"); + + assertUpdate("DROP TABLE " + tableName); + } + else { + assertQueryFails( + format("CREATE TABLE %s WITH (format = '%s', compression_codec = '%s') AS SELECT * FROM nation", tableName, format, compressionCodec.name()), + "Compression codec LZ4 not supported for .*"); + } + } + } + + @Test + public void testCreateTableAsWithCompressionCodecUnsupported() + { + String tableName = format("test_ctas_unsupported_%s_%s", format, randomNameSuffix()); + + assertQueryFails("CREATE TABLE " + tableName + " WITH (format = '" + format + "', compression_codec = 'unsupported') 
AS SELECT * FROM nation", + ".* \\QUnable to set catalog 'iceberg' table property 'compression_codec' to ['unsupported']: Invalid value [unsupported]. Valid values: [NONE, SNAPPY, LZ4, ZSTD, GZIP]"); + } + + @Test + public void testUpdatingOnlyCompressionCodec() + { + String tableName = "test_updating_compression_codec_" + randomNameSuffix(); + HiveCompressionCodec initialCompressionCodec = HiveCompressionCodec.ZSTD; + + List compressionCodecs = getCompressionCodecs(Optional.of(initialCompressionCodec)); + + for (HiveCompressionCodec compressionCodec : compressionCodecs) { + String compressionProperty = getCompressionPropertyName(format); + String newCompressionCodec = compressionCodec.name(); + + assertThat(query(format("CREATE TABLE %s WITH (format = '%s', compression_codec = '%s') AS SELECT * FROM nation WHERE nationkey < 10", tableName, format, initialCompressionCodec))) + .matches("SELECT count(*) FROM nation WHERE nationkey < 10"); + assertThat(getTableProperties(tableName)) + .containsEntry(DEFAULT_FILE_FORMAT, format.toString()) + .containsEntry(compressionProperty, initialCompressionCodec.name()); + + if (isCompressionCodecSupportedForFormat(format, compressionCodec)) { + assertUpdate(format("ALTER TABLE %s SET PROPERTIES compression_codec = '%s'", tableName, newCompressionCodec)); + assertThat(getTableProperties(tableName)) + .containsEntry(DEFAULT_FILE_FORMAT, format.toString()) + .containsEntry(compressionProperty, newCompressionCodec); + assertUpdate( + "INSERT INTO " + tableName + " SELECT * FROM nation WHERE nationkey >= 10", + "SELECT count(*) FROM nation WHERE nationkey >= 10"); + + assertThat(query("SELECT * FROM " + tableName)).matches("SELECT * FROM nation"); + assertThat(query(format("SELECT count(*) FROM \"%s$files\" WHERE file_path LIKE '%%.%s'", tableName, format.name().toLowerCase(ENGLISH)))) + .matches("SELECT BIGINT '2'"); + } + else { + assertQueryFails( + format("ALTER TABLE %s SET PROPERTIES compression_codec = '%s'", tableName, newCompressionCodec), + "Compression codec LZ4 not supported for .*"); + } + assertUpdate("DROP TABLE " + tableName); + } + } + + @Test + public void testUpdatingOnlyFileFormat() + { + String tableName = "test_updating_compression_codec_" + randomNameSuffix(); + + List newFileFormats = getFileFormats(format); + + for (IcebergFileFormat fileFormat : newFileFormats) { + for (HiveCompressionCodec compressionCodec : getCompressionCodecs(Optional.empty())) { + String compressionProperty = getCompressionPropertyName(format); + + ImmutableMap.Builder fileCounter = ImmutableMap.builder(); + + if (isCompressionCodecSupportedForFormat(format, compressionCodec)) { + assertUpdate( + format("CREATE TABLE %s WITH (format = '%s', compression_codec = '%s') AS SELECT * FROM nation WHERE nationkey < 10", tableName, format, compressionCodec), + "SELECT count(*) FROM nation WHERE nationkey < 10"); + + assertThat(getTableProperties(tableName)) + .containsEntry(DEFAULT_FILE_FORMAT, format.toString()) + .containsEntry(compressionProperty, compressionCodec.name()); + + fileCounter.put(format, 1); + + compressionProperty = getCompressionPropertyName(fileFormat); + + if (isCompressionCodecSupportedForFormat(fileFormat, compressionCodec)) { + assertUpdate("ALTER TABLE " + tableName + " SET PROPERTIES format = '" + fileFormat + "'"); + assertThat(getTableProperties(tableName)) + .containsEntry(DEFAULT_FILE_FORMAT, fileFormat.toString()) + .containsEntry(compressionProperty, compressionCodec.name()); + assertUpdate( + "INSERT INTO " + tableName + " SELECT * FROM 
nation WHERE nationkey >= 10", + "SELECT count(*) FROM nation WHERE nationkey >= 10"); + fileCounter.put(fileFormat, 1); + + assertThat(query("SELECT * FROM " + tableName)).matches("SELECT * FROM nation"); + + // Verify number of files per suffix + for (Map.Entry entry : fileCounter.buildOrThrow().entrySet()) { + assertThat(query(format("SELECT count(*) FROM \"%s$files\" WHERE file_path LIKE '%%.%s'", tableName, entry.getKey().name().toLowerCase(ENGLISH)))) + .matches("SELECT BIGINT '1'"); + } + } + else { + assertQueryFails("ALTER TABLE " + tableName + " SET PROPERTIES format = '" + fileFormat + "'", + "Compression codec LZ4 not supported for .*"); + } + + assertUpdate("DROP TABLE " + tableName); + } + else { + assertQueryFails(format("CREATE TABLE %s WITH (format = '%s', compression_codec = '%s') AS SELECT * FROM nation", tableName, format, compressionCodec), + "Compression codec LZ4 not supported for .*"); + } + } + } + } + + @Test + public void testUpdatingCompressionCodecBothFileFormatAndCompression() + { + String tableName = "test_updating_file_format_compression_codec_" + randomNameSuffix(); + HiveCompressionCodec initialCompressionCodec = HiveCompressionCodec.ZSTD; + + List newFileFormats = getFileFormats(format); + + for (IcebergFileFormat fileFormat : newFileFormats) { + List compressionCodecs = getCompressionCodecs(Optional.of(initialCompressionCodec)); + for (HiveCompressionCodec compressionCodec : compressionCodecs) { + String compressionProperty = getCompressionPropertyName(format); + String newCompressionCodec = compressionCodec.name(); + + ImmutableMap.Builder fileCounter = ImmutableMap.builder(); + + // Create initial table + assertUpdate( + format("CREATE TABLE %s WITH (format = '%s', compression_codec = '%s') AS SELECT * FROM nation WHERE nationkey < 10", tableName, format, initialCompressionCodec), + "SELECT count(*) FROM nation WHERE nationkey < 10"); + assertThat(getTableProperties(tableName)) + .containsEntry(DEFAULT_FILE_FORMAT, format.toString()) + .containsEntry(compressionProperty, initialCompressionCodec.name()); + + fileCounter.put(format, 1); + + compressionProperty = getCompressionPropertyName(fileFormat); + + // Modify both storage and compression properties + if (isCompressionCodecSupportedForFormat(fileFormat, compressionCodec)) { + assertUpdate(format("ALTER TABLE %s SET PROPERTIES format = '%s', compression_codec = '%s'", tableName, fileFormat, newCompressionCodec)); + assertThat(getTableProperties(tableName)) + .containsEntry(DEFAULT_FILE_FORMAT, fileFormat.toString()) + .containsEntry(compressionProperty, newCompressionCodec); + assertUpdate( + "INSERT INTO " + tableName + " SELECT * FROM nation WHERE nationkey >= 10", + "SELECT count(*) FROM nation WHERE nationkey >= 10"); + fileCounter.put(fileFormat, 1); + + assertThat(query("SELECT * FROM " + tableName)).matches("SELECT * FROM nation"); + + // Verify number of files per suffix + for (Map.Entry entry : fileCounter.buildOrThrow().entrySet()) { + assertThat(query(format("SELECT count(*) FROM \"%s$files\" WHERE file_path LIKE '%%.%s'", tableName, entry.getKey().name().toLowerCase(ENGLISH)))) + .matches("SELECT BIGINT '1'"); + } + } + else { + assertQueryFails(format("ALTER TABLE %s SET PROPERTIES format = '%s', compression_codec = '%s'", tableName, fileFormat, newCompressionCodec), + "Compression codec LZ4 not supported for .*"); + } + + assertUpdate("DROP TABLE " + tableName); + } + } + } + + @Test + public void testUpdatingFileFormatCompressionMixed() + { + try (TestTable table = 
newTrinoTable("test_updating_file_format_compression_mixed", "WITH (format = 'AVRO') AS SELECT * FROM nation")) { + assertThat(getTableProperties(table.getName())).doesNotContainKey(PARQUET_COMPRESSION); + + assertUpdate("ALTER TABLE " + table.getName() + " SET PROPERTIES compression_codec = 'ZSTD'"); + + assertThat(getTableProperties(table.getName())) + .containsEntry(DEFAULT_FILE_FORMAT, AVRO.name()) + .containsEntry(AVRO_COMPRESSION, ZSTD.name()); + } + } + + @Test + public void testUpdatingMaxCommitRetry() + { + try (TestTable table = newTrinoTable("test_max_commit_retry", "(x int) WITH (max_commit_retry = 1)")) { + assertThat(computeScalar("SELECT value FROM \"" + table.getName() + "$properties\" WHERE key = 'commit.retry.num-retries'")) + .isEqualTo("1"); + + assertUpdate("ALTER TABLE " + table.getName() + " SET PROPERTIES max_commit_retry = 100"); + assertThat(computeScalar("SELECT value FROM \"" + table.getName() + "$properties\" WHERE key = 'commit.retry.num-retries'")) + .isEqualTo("100"); + + assertUpdate("ALTER TABLE " + table.getName() + " SET PROPERTIES max_commit_retry = 0"); + assertThat(computeScalar("SELECT value FROM \"" + table.getName() + "$properties\" WHERE key = 'commit.retry.num-retries'")) + .isEqualTo("0"); + + assertQueryFails("ALTER TABLE " + table.getName() + " SET PROPERTIES max_commit_retry = -1", ".* max_commit_retry must be greater than or equal to 0"); + assertThat(computeScalar("SELECT value FROM \"" + table.getName() + "$properties\" WHERE key = 'commit.retry.num-retries'")) + .isEqualTo("0"); + + assertQueryFails("ALTER TABLE " + table.getName() + " SET PROPERTIES max_commit_retry = NULL", ".* \\QInvalid null value for catalog 'iceberg' table property 'max_commit_retry' from [null]"); + assertThat(computeScalar("SELECT value FROM \"" + table.getName() + "$properties\" WHERE key = 'commit.retry.num-retries'")) + .isEqualTo("0"); + } + } + @Test public void testUpdatingInvalidTableProperty() { String tableName = "test_updating_invalid_table_property_" + randomNameSuffix(); assertUpdate("CREATE TABLE " + tableName + " (a INT, b INT)"); - assertThatThrownBy(() -> query("ALTER TABLE " + tableName + " SET PROPERTIES not_a_valid_table_property = 'a value'")) - .hasMessage("Catalog 'iceberg' table property 'not_a_valid_table_property' does not exist"); + assertThat(query("ALTER TABLE " + tableName + " SET PROPERTIES not_a_valid_table_property = 'a value'")) + .failure().hasMessage("line 1:76: Catalog 'iceberg' table property 'not_a_valid_table_property' does not exist"); assertUpdate("DROP TABLE " + tableName); } @@ -5930,6 +7157,15 @@ public void testEmptyDelete() assertUpdate("DROP TABLE " + tableName); } + @Test + public void testEmptyFilesTruncate() + { + try (TestTable table = newTrinoTable("test_empty_files_truncate_", "AS SELECT 1 AS id")) { + assertUpdate("TRUNCATE TABLE " + table.getName()); + assertQueryReturnsEmptyResult("SELECT * FROM \"" + table.getName() + "$files\""); + } + } + @Test public void testModifyingOldSnapshotIsNotPossible() { @@ -5939,20 +7175,20 @@ public void testModifyingOldSnapshotIsNotPossible() long oldSnapshotId = getCurrentSnapshotId(tableName); assertUpdate(format("INSERT INTO %s VALUES 4,5,6", tableName), 3); assertQuery(format("SELECT * FROM %s FOR VERSION AS OF %d", tableName, oldSnapshotId), "VALUES 1,2,3"); - assertThatThrownBy(() -> query(format("INSERT INTO \"%s@%d\" VALUES 7,8,9", tableName, oldSnapshotId))) - .hasMessage(format("Invalid Iceberg table name: %s@%d", tableName, oldSnapshotId)); - assertThatThrownBy(() 
-> query(format("DELETE FROM \"%s@%d\" WHERE col = 5", tableName, oldSnapshotId))) - .hasMessage(format("Invalid Iceberg table name: %s@%d", tableName, oldSnapshotId)); - assertThatThrownBy(() -> query(format("UPDATE \"%s@%d\" SET col = 50 WHERE col = 5", tableName, oldSnapshotId))) - .hasMessage(format("Invalid Iceberg table name: %s@%d", tableName, oldSnapshotId)); - assertThatThrownBy(() -> query(format("INSERT INTO \"%s@%d\" VALUES 7,8,9", tableName, getCurrentSnapshotId(tableName)))) - .hasMessage(format("Invalid Iceberg table name: %s@%d", tableName, getCurrentSnapshotId(tableName))); - assertThatThrownBy(() -> query(format("DELETE FROM \"%s@%d\" WHERE col = 9", tableName, getCurrentSnapshotId(tableName)))) - .hasMessage(format("Invalid Iceberg table name: %s@%d", tableName, getCurrentSnapshotId(tableName))); + assertThat(query(format("INSERT INTO \"%s@%d\" VALUES 7,8,9", tableName, oldSnapshotId))) + .failure().hasMessage(format("line 1:1: Table 'iceberg.tpch.\"%s@%s\"' does not exist", tableName, oldSnapshotId)); + assertThat(query(format("DELETE FROM \"%s@%d\" WHERE col = 5", tableName, oldSnapshotId))) + .failure().hasMessage(format("line 1:1: Table 'iceberg.tpch.\"%s@%s\"' does not exist", tableName, oldSnapshotId)); + assertThat(query(format("UPDATE \"%s@%d\" SET col = 50 WHERE col = 5", tableName, oldSnapshotId))) + .failure().hasMessage(format("line 1:1: Table 'iceberg.tpch.\"%s@%s\"' does not exist", tableName, oldSnapshotId)); + assertThat(query(format("INSERT INTO \"%s@%d\" VALUES 7,8,9", tableName, getCurrentSnapshotId(tableName)))) + .failure().hasMessage(format("line 1:1: Table 'iceberg.tpch.\"%s@%s\"' does not exist", tableName, getCurrentSnapshotId(tableName))); + assertThat(query(format("DELETE FROM \"%s@%d\" WHERE col = 9", tableName, getCurrentSnapshotId(tableName)))) + .failure().hasMessage(format("line 1:1: Table 'iceberg.tpch.\"%s@%s\"' does not exist", tableName, getCurrentSnapshotId(tableName))); assertThatThrownBy(() -> assertUpdate(format("UPDATE \"%s@%d\" set col = 50 WHERE col = 5", tableName, getCurrentSnapshotId(tableName)))) - .hasMessage(format("Invalid Iceberg table name: %s@%d", tableName, getCurrentSnapshotId(tableName))); - assertThatThrownBy(() -> query(format("ALTER TABLE \"%s@%d\" EXECUTE OPTIMIZE", tableName, oldSnapshotId))) - .hasMessage(format("Invalid Iceberg table name: %s@%d", tableName, oldSnapshotId)); + .hasMessage(format("line 1:1: Table 'iceberg.tpch.\"%s@%s\"' does not exist", tableName, getCurrentSnapshotId(tableName))); + assertThat(query(format("ALTER TABLE \"%s@%d\" EXECUTE OPTIMIZE", tableName, oldSnapshotId))) + .failure().hasMessage(format("line 1:7: Table 'iceberg.tpch.\"%s@%s\"' does not exist", tableName, oldSnapshotId)); assertQuery(format("SELECT * FROM %s", tableName), "VALUES 1,2,3,4,5,6"); assertUpdate("DROP TABLE " + tableName); @@ -6052,8 +7288,8 @@ public void testInsertIntoBucketedColumnTaskWriterCount() int taskWriterCount = 4; assertThat(taskWriterCount).isGreaterThan(getQueryRunner().getNodeCount()); Session session = Session.builder(getSession()) - .setSystemProperty(TASK_WRITER_COUNT, String.valueOf(taskWriterCount)) - .setSystemProperty(TASK_PARTITIONED_WRITER_COUNT, String.valueOf(taskWriterCount)) + .setSystemProperty(TASK_MIN_WRITER_COUNT, String.valueOf(taskWriterCount)) + .setSystemProperty(TASK_MAX_WRITER_COUNT, String.valueOf(taskWriterCount)) .build(); String tableName = "test_inserting_into_bucketed_column_task_writer_count_" + randomNameSuffix(); @@ -6073,44 +7309,54 @@ public void 
testReadFromVersionedTableWithSchemaEvolution() assertQuerySucceeds("CREATE TABLE " + tableName + "(col1 varchar)"); long v1SnapshotId = getCurrentSnapshotId(tableName); assertThat(query("SELECT * FROM " + tableName + " FOR VERSION AS OF " + v1SnapshotId)) - .hasOutputTypes(ImmutableList.of(VARCHAR)) - .returnsEmptyResult(); + .result() + .hasTypes(ImmutableList.of(VARCHAR)) + .isEmpty(); assertUpdate("ALTER TABLE " + tableName + " ADD COLUMN col2 integer"); assertThat(query("SELECT * FROM " + tableName)) - .hasOutputTypes(ImmutableList.of(VARCHAR, INTEGER)) - .returnsEmptyResult(); + .result() + .hasTypes(ImmutableList.of(VARCHAR, INTEGER)) + .isEmpty(); assertUpdate("INSERT INTO " + tableName + " VALUES ('a', 11)", 1); long v2SnapshotId = getCurrentSnapshotId(tableName); assertThat(query("SELECT * FROM " + tableName + " FOR VERSION AS OF " + v2SnapshotId)) - .hasOutputTypes(ImmutableList.of(VARCHAR, INTEGER)) + .result() + .hasTypes(ImmutableList.of(VARCHAR, INTEGER)) .matches("VALUES (VARCHAR 'a', 11)"); assertThat(query("SELECT * FROM " + tableName)) - .hasOutputTypes(ImmutableList.of(VARCHAR, INTEGER)) + .result() + .hasTypes(ImmutableList.of(VARCHAR, INTEGER)) .matches("VALUES (VARCHAR 'a', 11)"); assertUpdate("ALTER TABLE " + tableName + " ADD COLUMN col3 bigint"); assertThat(query("SELECT * FROM " + tableName + " FOR VERSION AS OF " + v2SnapshotId)) - .hasOutputTypes(ImmutableList.of(VARCHAR, INTEGER)) + .result() + .hasTypes(ImmutableList.of(VARCHAR, INTEGER)) .matches("VALUES (VARCHAR 'a', 11)"); assertThat(query("SELECT * FROM " + tableName)) - .hasOutputTypes(ImmutableList.of(VARCHAR, INTEGER, BIGINT)) + .result() + .hasTypes(ImmutableList.of(VARCHAR, INTEGER, BIGINT)) .matches("VALUES (VARCHAR 'a', 11, CAST(NULL AS bigint))"); assertUpdate("INSERT INTO " + tableName + " VALUES ('b', 22, 32)", 1); long v3SnapshotId = getCurrentSnapshotId(tableName); assertThat(query("SELECT * FROM " + tableName + " FOR VERSION AS OF " + v1SnapshotId)) - .hasOutputTypes(ImmutableList.of(VARCHAR)) - .returnsEmptyResult(); + .result() + .hasTypes(ImmutableList.of(VARCHAR)) + .isEmpty(); assertThat(query("SELECT * FROM " + tableName + " FOR VERSION AS OF " + v2SnapshotId)) - .hasOutputTypes(ImmutableList.of(VARCHAR, INTEGER)) + .result() + .hasTypes(ImmutableList.of(VARCHAR, INTEGER)) .matches("VALUES (VARCHAR 'a', 11)"); assertThat(query("SELECT * FROM " + tableName + " FOR VERSION AS OF " + v3SnapshotId)) - .hasOutputTypes(ImmutableList.of(VARCHAR, INTEGER, BIGINT)) + .result() + .hasTypes(ImmutableList.of(VARCHAR, INTEGER, BIGINT)) .matches("VALUES (VARCHAR 'a', 11, NULL), (VARCHAR 'b', 22, BIGINT '32')"); assertThat(query("SELECT * FROM " + tableName)) - .hasOutputTypes(ImmutableList.of(VARCHAR, INTEGER, BIGINT)) + .result() + .hasTypes(ImmutableList.of(VARCHAR, INTEGER, BIGINT)) .matches("VALUES (VARCHAR 'a', 11, NULL), (VARCHAR 'b', 22, BIGINT '32')"); } @@ -6122,42 +7368,51 @@ public void testReadFromVersionedTableWithSchemaEvolutionDropColumn() assertQuerySucceeds("CREATE TABLE " + tableName + "(col1 varchar, col2 integer, col3 boolean)"); long v1SnapshotId = getCurrentSnapshotId(tableName); assertThat(query("SELECT * FROM " + tableName + " FOR VERSION AS OF " + v1SnapshotId)) - .hasOutputTypes(ImmutableList.of(VARCHAR, INTEGER, BOOLEAN)) - .returnsEmptyResult(); + .result() + .hasTypes(ImmutableList.of(VARCHAR, INTEGER, BOOLEAN)) + .isEmpty(); assertUpdate("INSERT INTO " + tableName + " VALUES ('a', 1, true)", 1); long v2SnapshotId = getCurrentSnapshotId(tableName); 
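// The hasOutputTypes()/returnsEmptyResult() rewrites in the surrounding hunks all follow one
// pattern; a minimal sketch of it, assuming the QueryAssertions assertThat(query(...)) helper
// and the ImmutableList/VARCHAR/INTEGER/BOOLEAN imports already present in this test class:
//
//   before: assertThat(query(sql)).hasOutputTypes(types).returnsEmptyResult();
//   after:  assertThat(query(sql)).result().hasTypes(types).isEmpty();
//
// For example, re-checking the empty v1 snapshot of the table created above:
assertThat(query("SELECT * FROM " + tableName + " FOR VERSION AS OF " + v1SnapshotId))
        .result()
        .hasTypes(ImmutableList.of(VARCHAR, INTEGER, BOOLEAN))
        .isEmpty();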
assertThat(query("SELECT * FROM " + tableName + " FOR VERSION AS OF " + v2SnapshotId)) - .hasOutputTypes(ImmutableList.of(VARCHAR, INTEGER, BOOLEAN)) + .result() + .hasTypes(ImmutableList.of(VARCHAR, INTEGER, BOOLEAN)) .matches("VALUES (VARCHAR 'a', 1, true)"); assertUpdate("ALTER TABLE " + tableName + " DROP COLUMN col3"); assertUpdate("INSERT INTO " + tableName + " VALUES ('b', 2)", 1); long v3SnapshotId = getCurrentSnapshotId(tableName); assertThat(query("SELECT * FROM " + tableName + " FOR VERSION AS OF " + v3SnapshotId)) - .hasOutputTypes(ImmutableList.of(VARCHAR, INTEGER)) + .result() + .hasTypes(ImmutableList.of(VARCHAR, INTEGER)) .matches("VALUES (VARCHAR 'a', 1), (VARCHAR 'b', 2)"); assertThat(query("SELECT * FROM " + tableName)) - .hasOutputTypes(ImmutableList.of(VARCHAR, INTEGER)) + .result() + .hasTypes(ImmutableList.of(VARCHAR, INTEGER)) .matches("VALUES (VARCHAR 'a', 1), (VARCHAR 'b', 2)"); assertThat(query("SELECT * FROM " + tableName + " FOR VERSION AS OF " + v2SnapshotId)) - .hasOutputTypes(ImmutableList.of(VARCHAR, INTEGER, BOOLEAN)) + .result() + .hasTypes(ImmutableList.of(VARCHAR, INTEGER, BOOLEAN)) .matches("VALUES (VARCHAR 'a', 1, true)"); assertUpdate("ALTER TABLE " + tableName + " DROP COLUMN col2"); assertUpdate("INSERT INTO " + tableName + " VALUES ('c')", 1); long v4SnapshotId = getCurrentSnapshotId(tableName); assertThat(query("SELECT * FROM " + tableName + " FOR VERSION AS OF " + v4SnapshotId)) - .hasOutputTypes(ImmutableList.of(VARCHAR)) + .result() + .hasTypes(ImmutableList.of(VARCHAR)) .matches("VALUES (VARCHAR 'a'), (VARCHAR 'b'), (VARCHAR 'c')"); assertThat(query("SELECT * FROM " + tableName)) - .hasOutputTypes(ImmutableList.of(VARCHAR)) + .result() + .hasTypes(ImmutableList.of(VARCHAR)) .matches("VALUES (VARCHAR 'a'), (VARCHAR 'b'), (VARCHAR 'c')"); assertThat(query("SELECT * FROM " + tableName + " FOR VERSION AS OF " + v3SnapshotId)) - .hasOutputTypes(ImmutableList.of(VARCHAR, INTEGER)) + .result() + .hasTypes(ImmutableList.of(VARCHAR, INTEGER)) .matches("VALUES (VARCHAR 'a', 1), (VARCHAR 'b', 2)"); assertThat(query("SELECT * FROM " + tableName + " FOR VERSION AS OF " + v2SnapshotId)) - .hasOutputTypes(ImmutableList.of(VARCHAR, INTEGER, BOOLEAN)) + .result() + .hasTypes(ImmutableList.of(VARCHAR, INTEGER, BOOLEAN)) .matches("VALUES (VARCHAR 'a', 1, true)"); assertUpdate("DROP TABLE " + tableName); @@ -6228,7 +7483,7 @@ public void testReadFromVersionedTableWithExpiredHistory() assertQuerySucceeds(sessionWithShortRetentionUnlocked, "ALTER TABLE " + tableName + " EXECUTE EXPIRE_SNAPSHOTS (retention_threshold => '0s')"); List updatedSnapshots = getSnapshotIds(tableName); assertThat(updatedSnapshots.size()).isLessThan(initialSnapshots.size()); - assertThat(updatedSnapshots.size()).isEqualTo(1); + assertThat(updatedSnapshots).hasSize(1); assertThat(query("SELECT sum(value), listagg(key, ' ') WITHIN GROUP (ORDER BY key) FROM " + tableName + " FOR VERSION AS OF " + v3SnapshotId)) .matches("VALUES (BIGINT '3', VARCHAR 'one two')"); @@ -6254,92 +7509,272 @@ public void testDeleteRetainsTableHistory() List snapshotsAfterDelete = getTableHistory(tableName); assertThat(snapshotsAfterDelete.size()).isGreaterThan(snapshots.size()); assertThat(snapshotsAfterDelete).containsAll(snapshots); - assertUpdate("DROP TABLE " + tableName); } @Test - public void testMergeSimpleSelectPartitioned() + public void testDeleteRetainsMetadataFile() { - String targetTable = "merge_simple_target_" + randomNameSuffix(); - String sourceTable = "merge_simple_source_" + randomNameSuffix(); - 
assertUpdate(format("CREATE TABLE %s (customer VARCHAR, purchases INT, address VARCHAR) WITH (partitioning = ARRAY['address'])", targetTable)); + String tableName = "test_delete_retains_metadata_file_" + randomNameSuffix(); + assertUpdate("CREATE TABLE " + tableName + "(c1 INT, c2 INT)"); + assertUpdate("INSERT INTO " + tableName + " VALUES (1, 1), (2, 2), (3, 3)", 3); + assertUpdate("INSERT INTO " + tableName + " VALUES (3, 3), (4, 4), (5, 5)", 3); + List metadataLogEntries = getLatestSequenceNumbersInMetadataLogEntries(tableName); - assertUpdate(format("INSERT INTO %s (customer, purchases, address) VALUES ('Aaron', 5, 'Antioch'), ('Bill', 7, 'Buena'), ('Carol', 3, 'Cambridge'), ('Dave', 11, 'Devon')", targetTable), 4); + assertUpdate("DELETE FROM " + tableName + " WHERE c1 < 4", 4); + List metadataLogEntriesAfterDelete = getLatestSequenceNumbersInMetadataLogEntries(tableName); + assertThat(metadataLogEntriesAfterDelete) + .hasSizeGreaterThan(metadataLogEntries.size()) + .containsAll(metadataLogEntries); + assertUpdate("DROP TABLE " + tableName); + } - assertUpdate(format("CREATE TABLE %s (customer VARCHAR, purchases INT, address VARCHAR)", sourceTable)); + @Test + public void testCreateOrReplaceTableSnapshots() + { + try (TestTable table = newTrinoTable("test_create_or_replace_", " AS SELECT BIGINT '42' a, DOUBLE '-38.5' b")) { + long v1SnapshotId = getCurrentSnapshotId(table.getName()); - assertUpdate(format("INSERT INTO %s (customer, purchases, address) VALUES ('Aaron', 6, 'Arches'), ('Ed', 7, 'Etherville'), ('Carol', 9, 'Centreville'), ('Dave', 11, 'Darbyshire')", sourceTable), 4); + assertUpdate("CREATE OR REPLACE TABLE " + table.getName() + " AS SELECT BIGINT '-42' a, DOUBLE '38.5' b", 1); + assertThat(query("SELECT CAST(a AS bigint), b FROM " + table.getName())) + .matches("VALUES (BIGINT '-42', 385e-1)"); - String sql = format("MERGE INTO %s t USING %s s ON (t.customer = s.customer)", targetTable, sourceTable) + - " WHEN MATCHED AND s.address = 'Centreville' THEN DELETE" + - " WHEN MATCHED THEN UPDATE SET purchases = s.purchases + t.purchases, address = s.address" + - " WHEN NOT MATCHED THEN INSERT (customer, purchases, address) VALUES(s.customer, s.purchases, s.address)"; + assertThat(query("SELECT a, b FROM " + table.getName() + " FOR VERSION AS OF " + v1SnapshotId)) + .matches("VALUES (BIGINT '42', -385e-1)"); + } + } - assertUpdate(sql, 4); + @Test + public void testCreateOrReplaceTableChangeColumnNamesAndTypes() + { + try (TestTable table = newTrinoTable("test_create_or_replace_", " AS SELECT BIGINT '42' a, DOUBLE '-38.5' b")) { + long v1SnapshotId = getCurrentSnapshotId(table.getName()); - assertQuery("SELECT * FROM " + targetTable, "VALUES ('Aaron', 11, 'Arches'), ('Ed', 7, 'Etherville'), ('Bill', 7, 'Buena'), ('Dave', 22, 'Darbyshire')"); + assertUpdate("CREATE OR REPLACE TABLE " + table.getName() + " AS SELECT CAST(ARRAY[ROW('test')] AS ARRAY(ROW(field VARCHAR))) a, VARCHAR 'test2' b", 1); + assertThat(query("SELECT * FROM " + table.getName())) + .matches("VALUES (CAST(ARRAY[ROW('test')] AS ARRAY(ROW(field VARCHAR))), VARCHAR 'test2')"); - assertUpdate("DROP TABLE " + sourceTable); - assertUpdate("DROP TABLE " + targetTable); + assertThat(query("SELECT * FROM " + table.getName() + " FOR VERSION AS OF " + v1SnapshotId)) + .matches("VALUES (BIGINT '42', -385e-1)"); + } } - @Test(dataProvider = "partitionedAndBucketedProvider") - public void testMergeUpdateWithVariousLayouts(int writers, String partitioning) + @Test + public void 
testCreateOrReplaceTableChangePartitionedTableIntoUnpartitioned() { - Session session = Session.builder(getSession()) - .setSystemProperty(TASK_WRITER_COUNT, String.valueOf(writers)) - .build(); + try (TestTable table = newTrinoTable("test_create_or_replace_", " WITH (partitioning=ARRAY['a']) AS SELECT BIGINT '42' a, 'some data' b UNION ALL SELECT BIGINT '43' a, 'another data' b")) { + long v1SnapshotId = getCurrentSnapshotId(table.getName()); - String targetTable = "merge_formats_target_" + randomNameSuffix(); - String sourceTable = "merge_formats_source_" + randomNameSuffix(); - assertUpdate(format("CREATE TABLE %s (customer VARCHAR, purchase VARCHAR) %s", targetTable, partitioning)); + assertUpdate("CREATE OR REPLACE TABLE " + table.getName() + " WITH (sorted_by=ARRAY['a']) AS SELECT BIGINT '22' a, 'new data' b", 1); + assertThat(query("SELECT * FROM " + table.getName())) + .matches("VALUES (BIGINT '22', CAST('new data' AS VARCHAR))"); - assertUpdate(format("INSERT INTO %s (customer, purchase) VALUES ('Dave', 'dates'), ('Lou', 'limes'), ('Carol', 'candles')", targetTable), 3); - assertQuery("SELECT * FROM " + targetTable, "VALUES ('Dave', 'dates'), ('Lou', 'limes'), ('Carol', 'candles')"); + assertThat(query("SELECT partition FROM \"" + table.getName() + "$partitions\"")) + .matches("VALUES (ROW(CAST (ROW(NULL) AS ROW(a BIGINT))))"); - assertUpdate(format("CREATE TABLE %s (customer VARCHAR, purchase VARCHAR)", sourceTable)); + assertThat(query("SELECT * FROM " + table.getName() + " FOR VERSION AS OF " + v1SnapshotId)) + .matches("VALUES (BIGINT '42', CAST('some data' AS VARCHAR)), (BIGINT '43', CAST('another data' AS VARCHAR))"); - assertUpdate(format("INSERT INTO %s (customer, purchase) VALUES ('Craig', 'candles'), ('Len', 'limes'), ('Joe', 'jellybeans')", sourceTable), 3); + assertThat((String) computeScalar("SHOW CREATE TABLE " + table.getName())) + .contains("sorted_by = ARRAY['a ASC NULLS FIRST']"); + assertThat((String) computeScalar("SHOW CREATE TABLE " + table.getName())) + .doesNotContain("partitioning = ARRAY['a']"); + } + } - String sql = format("MERGE INTO %s t USING %s s ON (t.purchase = s.purchase)", targetTable, sourceTable) + - " WHEN MATCHED AND s.purchase = 'limes' THEN DELETE" + - " WHEN MATCHED THEN UPDATE SET customer = CONCAT(t.customer, '_', s.customer)" + - " WHEN NOT MATCHED THEN INSERT (customer, purchase) VALUES(s.customer, s.purchase)"; + @Test + public void testCreateOrReplaceTableChangeUnpartitionedTableIntoPartitioned() + { + try (TestTable table = newTrinoTable("test_create_or_replace_", " WITH (sorted_by=ARRAY['a']) AS SELECT BIGINT '22' a, CAST('some data' AS VARCHAR) b")) { + long v1SnapshotId = getCurrentSnapshotId(table.getName()); - assertUpdate(session, sql, 3); + assertUpdate("CREATE OR REPLACE TABLE " + table.getName() + " WITH (partitioning=ARRAY['a']) AS SELECT BIGINT '42' a, 'some data' b UNION ALL SELECT BIGINT '43' a, 'another data' b", 2); + assertThat(query("SELECT * FROM " + table.getName())) + .matches("VALUES (BIGINT '42', CAST('some data' AS VARCHAR)), (BIGINT '43', CAST('another data' AS VARCHAR))"); - assertQuery("SELECT * FROM " + targetTable, "VALUES ('Dave', 'dates'), ('Carol_Craig', 'candles'), ('Joe', 'jellybeans')"); - assertUpdate("DROP TABLE " + sourceTable); - assertUpdate("DROP TABLE " + targetTable); + assertThat(query("SELECT partition FROM \"" + table.getName() + "$partitions\"")) + .matches("VALUES (ROW(CAST (ROW(BIGINT '42') AS ROW(a BIGINT)))), (ROW(CAST (ROW(BIGINT '43') AS ROW(a BIGINT))))"); + + 
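// The CREATE OR REPLACE tests in this block all rely on the same snapshot idiom; a minimal
// sketch, assuming this class's newTrinoTable/getCurrentSnapshotId helpers (the table prefix
// below is hypothetical):
try (TestTable sketch = newTrinoTable("test_create_or_replace_sketch_", "AS SELECT 1 x")) {
    long beforeReplace = getCurrentSnapshotId(sketch.getName());
    // REPLACE writes a new snapshot into the same table rather than dropping and recreating it
    assertUpdate("CREATE OR REPLACE TABLE " + sketch.getName() + " AS SELECT 2 x", 1);
    assertThat(query("SELECT * FROM " + sketch.getName())).matches("VALUES 2");
    // so the pre-replace data stays readable through time travel
    assertThat(query("SELECT * FROM " + sketch.getName() + " FOR VERSION AS OF " + beforeReplace)).matches("VALUES 1");
}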
assertThat(query("SELECT * FROM " + table.getName() + " FOR VERSION AS OF " + v1SnapshotId)) + .matches("VALUES (BIGINT '22', CAST('some data' AS VARCHAR))"); + + assertThat((String) computeScalar("SHOW CREATE TABLE " + table.getName())) + .contains("partitioning = ARRAY['a']"); + assertThat((String) computeScalar("SHOW CREATE TABLE " + table.getName())) + .doesNotContain("sorted_by = ARRAY['a ASC NULLS FIRST']"); + } } - @DataProvider - public Object[][] partitionedAndBucketedProvider() + @Test + public void testCreateOrReplaceTableWithComments() { - List writerCounts = ImmutableList.of(1, 4); - List partitioningTypes = ImmutableList.builder() - .add("") - .add("WITH (partitioning = ARRAY['customer'])") - .add("WITH (partitioning = ARRAY['purchase'])") - .add("WITH (partitioning = ARRAY['bucket(customer, 3)'])") - .add("WITH (partitioning = ARRAY['bucket(purchase, 4)'])") - .build(); + try (TestTable table = newTrinoTable("test_create_or_replace_", " (a BIGINT COMMENT 'This is a column') COMMENT 'This is a table'")) { + long v1SnapshotId = getCurrentSnapshotId(table.getName()); - List data = new ArrayList<>(); - for (int writers : writerCounts) { - for (String partitioning : partitioningTypes) { - data.add(new Object[] {writers, partitioning}); - } + assertUpdate("CREATE OR REPLACE TABLE " + table.getName() + " AS SELECT 1 a", 1); + assertThat(query("SELECT * FROM " + table.getName())) + .matches("VALUES 1"); + + assertThat(query("SELECT * FROM " + table.getName() + " FOR VERSION AS OF " + v1SnapshotId)) + .returnsEmptyResult(); + + assertThat(getTableComment(table.getName())) + .isNull(); + assertThat(getColumnComment(table.getName(), "a")) + .isNull(); + + assertUpdate("CREATE OR REPLACE TABLE " + table.getName() + " (a BIGINT COMMENT 'This is a column') COMMENT 'This is a table'"); + + assertThat(getTableComment(table.getName())) + .isEqualTo("This is a table"); + assertThat(getColumnComment(table.getName(), "a")) + .isEqualTo("This is a column"); } - return data.toArray(Object[][]::new); } - @Test(dataProvider = "partitionedAndBucketedProvider") - public void testMergeMultipleOperations(int writers, String partitioning) + @Test + public void testCreateOrReplaceTableWithSameLocation() + { + try (TestTable table = newTrinoTable( + "test_create_or_replace_with_same_location_", + "(a integer)")) { + String initialTableLocation = getTableLocation(table.getName()); + assertUpdate("INSERT INTO " + table.getName() + " VALUES 1", 1); + assertThat(query("SELECT * FROM " + table.getName())) + .matches("VALUES 1"); + long v1SnapshotId = getCurrentSnapshotId(table.getName()); + assertUpdate("CREATE OR REPLACE TABLE " + table.getName() + " (a integer)"); + assertThat(getTableLocation(table.getName())) + .isEqualTo(initialTableLocation); + assertUpdate("CREATE OR REPLACE TABLE " + table.getName() + " (a integer) WITH (location = '" + initialTableLocation + "')"); + String initialTableLocationWithTrailingSlash = initialTableLocation.endsWith("/") ? 
initialTableLocation : initialTableLocation + "/"; + assertUpdate("CREATE OR REPLACE TABLE " + table.getName() + " (a integer) WITH (location = '" + initialTableLocationWithTrailingSlash + "')"); + assertThat(getTableLocation(table.getName())) + .isEqualTo(initialTableLocation); + assertThat(query("SELECT * FROM " + table.getName())) + .returnsEmptyResult(); + assertUpdate("CREATE OR REPLACE TABLE " + table.getName() + " WITH (location = '" + initialTableLocation + "') AS SELECT 2 as a", 1); + assertThat(query("SELECT * FROM " + table.getName())) + .matches("VALUES 2"); + assertThat(getTableLocation(table.getName())) + .isEqualTo(initialTableLocation); + assertThat(query("SELECT * FROM " + table.getName() + " FOR VERSION AS OF " + v1SnapshotId)) + .matches("VALUES 1"); + } + } + + @Test + public void testCreateOrReplaceTableWithChangeInLocation() + { + try (TestTable table = newTrinoTable("test_create_or_replace_change_location_", "(a integer) ")) { + String initialTableLocation = getTableLocation(table.getName()) + randomNameSuffix(); + long v1SnapshotId = getCurrentSnapshotId(table.getName()); + assertQueryFails( + "CREATE OR REPLACE TABLE " + table.getName() + " (a integer) WITH (location = '%s')".formatted(initialTableLocation), + "The provided location '%s' does not match the existing table location '.*'".formatted(initialTableLocation)); + assertQueryFails( + "CREATE OR REPLACE TABLE " + table.getName() + " WITH (location = '%s') AS SELECT 1 AS a".formatted(initialTableLocation), + "The provided location '%s' does not match the existing table location '.*'".formatted(initialTableLocation)); + assertThat(getCurrentSnapshotId(table.getName())) + .isEqualTo(v1SnapshotId); + } + } + + @Test + public void testMergeSimpleSelectPartitioned() + { + String targetTable = "merge_simple_target_" + randomNameSuffix(); + String sourceTable = "merge_simple_source_" + randomNameSuffix(); + assertUpdate(format("CREATE TABLE %s (customer VARCHAR, purchases INT, address VARCHAR) WITH (partitioning = ARRAY['address'])", targetTable)); + + assertUpdate(format("INSERT INTO %s (customer, purchases, address) VALUES ('Aaron', 5, 'Antioch'), ('Bill', 7, 'Buena'), ('Carol', 3, 'Cambridge'), ('Dave', 11, 'Devon')", targetTable), 4); + + assertUpdate(format("CREATE TABLE %s (customer VARCHAR, purchases INT, address VARCHAR)", sourceTable)); + + assertUpdate(format("INSERT INTO %s (customer, purchases, address) VALUES ('Aaron', 6, 'Arches'), ('Ed', 7, 'Etherville'), ('Carol', 9, 'Centreville'), ('Dave', 11, 'Darbyshire')", sourceTable), 4); + + String sql = format("MERGE INTO %s t USING %s s ON (t.customer = s.customer)", targetTable, sourceTable) + + " WHEN MATCHED AND s.address = 'Centreville' THEN DELETE" + + " WHEN MATCHED THEN UPDATE SET purchases = s.purchases + t.purchases, address = s.address" + + " WHEN NOT MATCHED THEN INSERT (customer, purchases, address) VALUES(s.customer, s.purchases, s.address)"; + + assertUpdate(sql, 4); + + assertQuery("SELECT * FROM " + targetTable, "VALUES ('Aaron', 11, 'Arches'), ('Ed', 7, 'Etherville'), ('Bill', 7, 'Buena'), ('Dave', 22, 'Darbyshire')"); + + assertUpdate("DROP TABLE " + sourceTable); + assertUpdate("DROP TABLE " + targetTable); + } + + @Test + public void testMergeUpdateWithVariousLayouts() + { + testMergeUpdateWithVariousLayouts(1, ""); + testMergeUpdateWithVariousLayouts(4, ""); + testMergeUpdateWithVariousLayouts(1, "WITH (partitioning = ARRAY['customer'])"); + testMergeUpdateWithVariousLayouts(4, "WITH (partitioning = ARRAY['customer'])"); + 
testMergeUpdateWithVariousLayouts(1, "WITH (partitioning = ARRAY['purchase'])"); + testMergeUpdateWithVariousLayouts(4, "WITH (partitioning = ARRAY['purchase'])"); + testMergeUpdateWithVariousLayouts(1, "WITH (partitioning = ARRAY['bucket(customer, 3)'])"); + testMergeUpdateWithVariousLayouts(4, "WITH (partitioning = ARRAY['bucket(customer, 3)'])"); + testMergeUpdateWithVariousLayouts(1, "WITH (partitioning = ARRAY['bucket(purchase, 4)'])"); + testMergeUpdateWithVariousLayouts(4, "WITH (partitioning = ARRAY['bucket(purchase, 4)'])"); + } + + private void testMergeUpdateWithVariousLayouts(int writers, String partitioning) + { + Session session = Session.builder(getSession()) + .setSystemProperty(TASK_MIN_WRITER_COUNT, String.valueOf(writers)) + .build(); + + String targetTable = "merge_formats_target_" + randomNameSuffix(); + String sourceTable = "merge_formats_source_" + randomNameSuffix(); + assertUpdate(format("CREATE TABLE %s (customer VARCHAR, purchase VARCHAR) %s", targetTable, partitioning)); + + assertUpdate(format("INSERT INTO %s (customer, purchase) VALUES ('Dave', 'dates'), ('Lou', 'limes'), ('Carol', 'candles')", targetTable), 3); + assertQuery("SELECT * FROM " + targetTable, "VALUES ('Dave', 'dates'), ('Lou', 'limes'), ('Carol', 'candles')"); + + assertUpdate(format("CREATE TABLE %s (customer VARCHAR, purchase VARCHAR)", sourceTable)); + + assertUpdate(format("INSERT INTO %s (customer, purchase) VALUES ('Craig', 'candles'), ('Len', 'limes'), ('Joe', 'jellybeans')", sourceTable), 3); + + String sql = format("MERGE INTO %s t USING %s s ON (t.purchase = s.purchase)", targetTable, sourceTable) + + " WHEN MATCHED AND s.purchase = 'limes' THEN DELETE" + + " WHEN MATCHED THEN UPDATE SET customer = CONCAT(t.customer, '_', s.customer)" + + " WHEN NOT MATCHED THEN INSERT (customer, purchase) VALUES(s.customer, s.purchase)"; + + assertUpdate(session, sql, 3); + + assertQuery("SELECT * FROM " + targetTable, "VALUES ('Dave', 'dates'), ('Carol_Craig', 'candles'), ('Joe', 'jellybeans')"); + assertUpdate("DROP TABLE " + sourceTable); + assertUpdate("DROP TABLE " + targetTable); + } + + @Test + @Override + public void testMergeMultipleOperations() + { + testMergeMultipleOperations(1, "", false); + testMergeMultipleOperations(4, "", false); + testMergeMultipleOperations(1, "WITH (partitioning = ARRAY['customer'])", false); + testMergeMultipleOperations(4, "WITH (partitioning = ARRAY['customer'])", false); + testMergeMultipleOperations(1, "WITH (partitioning = ARRAY['purchase'])", false); + testMergeMultipleOperations(4, "WITH (partitioning = ARRAY['purchase'])", false); + testMergeMultipleOperations(1, "WITH (partitioning = ARRAY['bucket(customer, 3)'])", false); + testMergeMultipleOperations(4, "WITH (partitioning = ARRAY['bucket(customer, 3)'])", false); + testMergeMultipleOperations(1, "WITH (partitioning = ARRAY['bucket(purchase, 4)'])", false); + testMergeMultipleOperations(4, "WITH (partitioning = ARRAY['bucket(purchase, 4)'])", false); + testMergeMultipleOperations(1, "", true); + testMergeMultipleOperations(4, "WITH (partitioning = ARRAY['customer'])", true); + testMergeMultipleOperations(1, "WITH (partitioning = ARRAY['bucket(customer, 3)'])", true); + testMergeMultipleOperations(4, "WITH (partitioning = ARRAY['bucket(purchase, 4)'])", true); + } + + private void testMergeMultipleOperations(int writers, String partitioning, boolean determinePartitionCountForWrite) { Session session = Session.builder(getSession()) - .setSystemProperty(TASK_WRITER_COUNT, String.valueOf(writers)) - 
.setSystemProperty(TASK_PARTITIONED_WRITER_COUNT, String.valueOf(writers)) + .setSystemProperty(TASK_MIN_WRITER_COUNT, String.valueOf(writers)) + .setSystemProperty(TASK_MAX_WRITER_COUNT, String.valueOf(writers)) + .setSystemProperty(DETERMINE_PARTITION_COUNT_FOR_WRITE_ENABLED, Boolean.toString(determinePartitionCountForWrite)) .build(); int targetCustomerCount = 32; @@ -6426,8 +7861,18 @@ public void testMergeSimpleQueryPartitioned() assertUpdate("DROP TABLE " + targetTable); } - @Test(dataProvider = "partitionedBucketedFailure") - public void testMergeMultipleRowsMatchFails(String createTableSql) + @Test + @Override + public void testMergeMultipleRowsMatchFails() + { + testMergeMultipleRowsMatchFails("CREATE TABLE %s (customer VARCHAR, purchases INT, address VARCHAR)"); + testMergeMultipleRowsMatchFails("CREATE TABLE %s (customer VARCHAR, purchases INT, address VARCHAR) WITH (partitioning = ARRAY['bucket(customer, 3)'])"); + testMergeMultipleRowsMatchFails("CREATE TABLE %s (customer VARCHAR, purchases INT, address VARCHAR) WITH (partitioning = ARRAY['customer'])"); + testMergeMultipleRowsMatchFails("CREATE TABLE %s (customer VARCHAR, address VARCHAR, purchases INT) WITH (partitioning = ARRAY['address'])"); + testMergeMultipleRowsMatchFails("CREATE TABLE %s (purchases INT, customer VARCHAR, address VARCHAR) WITH (partitioning = ARRAY['address', 'customer'])"); + } + + private void testMergeMultipleRowsMatchFails(String createTableSql) { String targetTable = "merge_multiple_target_" + randomNameSuffix(); String sourceTable = "merge_multiple_source_" + randomNameSuffix(); @@ -6451,20 +7896,36 @@ public void testMergeMultipleRowsMatchFails(String createTableSql) assertUpdate("DROP TABLE " + targetTable); } - @DataProvider - public Object[][] partitionedBucketedFailure() - { - return new Object[][] { - {"CREATE TABLE %s (customer VARCHAR, purchases INT, address VARCHAR)"}, - {"CREATE TABLE %s (customer VARCHAR, purchases INT, address VARCHAR) WITH (partitioning = ARRAY['bucket(customer, 3)'])"}, - {"CREATE TABLE %s (customer VARCHAR, purchases INT, address VARCHAR) WITH (partitioning = ARRAY['customer'])"}, - {"CREATE TABLE %s (customer VARCHAR, address VARCHAR, purchases INT) WITH (partitioning = ARRAY['address'])"}, - {"CREATE TABLE %s (purchases INT, customer VARCHAR, address VARCHAR) WITH (partitioning = ARRAY['address', 'customer'])"} - }; - } - - @Test(dataProvider = "targetAndSourceWithDifferentPartitioning") - public void testMergeWithDifferentPartitioning(String testDescription, String createTargetTableSql, String createSourceTableSql) + @Test + public void testMergeWithDifferentPartitioning() + { + testMergeWithDifferentPartitioning( + "target_partitioned_source_and_target_partitioned_and_bucketed", + "CREATE TABLE %s (customer VARCHAR, purchases INT, address VARCHAR) WITH (partitioning = ARRAY['address', 'bucket(customer, 3)'])", + "CREATE TABLE %s (customer VARCHAR, purchases INT, address VARCHAR) WITH (partitioning = ARRAY['address', 'bucket(customer, 3)'])"); + testMergeWithDifferentPartitioning( + "target_flat_source_partitioned_by_customer", + "CREATE TABLE %s (customer VARCHAR, purchases INT, address VARCHAR)", + "CREATE TABLE %s (purchases INT, address VARCHAR, customer VARCHAR) WITH (partitioning = ARRAY['customer'])"); + testMergeWithDifferentPartitioning( + "target_partitioned_by_customer_source_flat", + "CREATE TABLE %s (customer VARCHAR, purchases INT, address VARCHAR) WITH (partitioning = ARRAY['customer'])", + "CREATE TABLE %s (customer VARCHAR, purchases INT, address 
VARCHAR)"); + testMergeWithDifferentPartitioning( + "target_bucketed_by_customer_source_flat", + "CREATE TABLE %s (customer VARCHAR, purchases INT, address VARCHAR) WITH (partitioning = ARRAY['bucket(customer, 3)'])", + "CREATE TABLE %s (customer VARCHAR, purchases INT, address VARCHAR)"); + testMergeWithDifferentPartitioning( + "target_partitioned_source_partitioned_and_bucketed", + "CREATE TABLE %s (customer VARCHAR, purchases INT, address VARCHAR) WITH (partitioning = ARRAY['customer'])", + "CREATE TABLE %s (customer VARCHAR, purchases INT, address VARCHAR) WITH (partitioning = ARRAY['address', 'bucket(customer, 3)'])"); + testMergeWithDifferentPartitioning( + "target_partitioned_target_partitioned_and_bucketed", + "CREATE TABLE %s (customer VARCHAR, purchases INT, address VARCHAR) WITH (partitioning = ARRAY['address', 'bucket(customer, 3)'])", + "CREATE TABLE %s (customer VARCHAR, purchases INT, address VARCHAR) WITH (partitioning = ARRAY['customer'])"); + } + + private void testMergeWithDifferentPartitioning(String testDescription, String createTargetTableSql, String createSourceTableSql) { String targetTable = format("%s_target_%s", testDescription, randomNameSuffix()); String sourceTable = format("%s_source_%s", testDescription, randomNameSuffix()); @@ -6489,43 +7950,6 @@ public void testMergeWithDifferentPartitioning(String testDescription, String cr assertUpdate("DROP TABLE " + targetTable); } - @DataProvider - public Object[][] targetAndSourceWithDifferentPartitioning() - { - return new Object[][] { - { - "target_partitioned_source_and_target_partitioned_and_bucketed", - "CREATE TABLE %s (customer VARCHAR, purchases INT, address VARCHAR) WITH (partitioning = ARRAY['address', 'bucket(customer, 3)'])", - "CREATE TABLE %s (customer VARCHAR, purchases INT, address VARCHAR) WITH (partitioning = ARRAY['address', 'bucket(customer, 3)'])", - }, - { - "target_flat_source_partitioned_by_customer", - "CREATE TABLE %s (customer VARCHAR, purchases INT, address VARCHAR)", - "CREATE TABLE %s (purchases INT, address VARCHAR, customer VARCHAR) WITH (partitioning = ARRAY['customer'])" - }, - { - "target_partitioned_by_customer_source_flat", - "CREATE TABLE %s (customer VARCHAR, purchases INT, address VARCHAR) WITH (partitioning = ARRAY['customer'])", - "CREATE TABLE %s (customer VARCHAR, purchases INT, address VARCHAR)", - }, - { - "target_bucketed_by_customer_source_flat", - "CREATE TABLE %s (customer VARCHAR, purchases INT, address VARCHAR) WITH (partitioning = ARRAY['bucket(customer, 3)'])", - "CREATE TABLE %s (customer VARCHAR, purchases INT, address VARCHAR)", - }, - { - "target_partitioned_source_partitioned_and_bucketed", - "CREATE TABLE %s (customer VARCHAR, purchases INT, address VARCHAR) WITH (partitioning = ARRAY['customer'])", - "CREATE TABLE %s (customer VARCHAR, purchases INT, address VARCHAR) WITH (partitioning = ARRAY['address', 'bucket(customer, 3)'])", - }, - { - "target_partitioned_target_partitioned_and_bucketed", - "CREATE TABLE %s (customer VARCHAR, purchases INT, address VARCHAR) WITH (partitioning = ARRAY['address', 'bucket(customer, 3)'])", - "CREATE TABLE %s (customer VARCHAR, purchases INT, address VARCHAR) WITH (partitioning = ARRAY['customer'])", - } - }; - } - @Override protected OptionalInt maxSchemaNameLength() { @@ -6545,17 +7969,17 @@ public void testSnapshotSummariesHaveTrinoQueryIdFormatV1() String tableName = "test_snapshot_query_ids_v1" + randomNameSuffix(); // Create empty table - assertQueryIdStored(tableName, executeWithQueryId(format("CREATE TABLE %s (a 
bigint, b bigint) WITH (format_version = 1, partitioning = ARRAY['a'])", tableName))); + assertQueryIdAndUserStored(tableName, executeWithQueryId(format("CREATE TABLE %s (a bigint, b bigint) WITH (format_version = 1, partitioning = ARRAY['a'])", tableName))); // Insert some records, creating 3 partitions - assertQueryIdStored(tableName, executeWithQueryId(format("INSERT INTO %s VALUES (1, 100), (2, 300), (2, 350), (3, 250)", tableName))); + assertQueryIdAndUserStored(tableName, executeWithQueryId(format("INSERT INTO %s VALUES (1, 100), (2, 300), (2, 350), (3, 250)", tableName))); // Delete whole partition - assertQueryIdStored(tableName, executeWithQueryId(format("DELETE FROM %s WHERE a = 2", tableName))); + assertQueryIdAndUserStored(tableName, executeWithQueryId(format("DELETE FROM %s WHERE a = 2", tableName))); // Insert some more and then optimize - assertQueryIdStored(tableName, executeWithQueryId(format("INSERT INTO %s VALUES (1, 400)", tableName))); - assertQueryIdStored(tableName, executeWithQueryId(format("ALTER TABLE %s EXECUTE OPTIMIZE", tableName))); + assertQueryIdAndUserStored(tableName, executeWithQueryId(format("INSERT INTO %s VALUES (1, 400)", tableName))); + assertQueryIdAndUserStored(tableName, executeWithQueryId(format("ALTER TABLE %s EXECUTE OPTIMIZE", tableName))); } @Test @@ -6567,50 +7991,26 @@ public void testSnapshotSummariesHaveTrinoQueryIdFormatV2() assertUpdate(format("INSERT INTO %s VALUES (1, 1), (1, 4), (1, 20), (2, 2)", sourceTableName), 4); // Create table with CTAS - assertQueryIdStored(tableName, executeWithQueryId(format("CREATE TABLE %s WITH (format_version = 2, partitioning = ARRAY['a']) " + + assertQueryIdAndUserStored(tableName, executeWithQueryId(format("CREATE TABLE %s WITH (format_version = 2, partitioning = ARRAY['a']) " + "AS SELECT * FROM %s", tableName, sourceTableName))); // Insert records - assertQueryIdStored(tableName, executeWithQueryId(format("INSERT INTO %s VALUES (1, 100), (2, 300), (3, 250)", tableName))); + assertQueryIdAndUserStored(tableName, executeWithQueryId(format("INSERT INTO %s VALUES (1, 100), (2, 300), (3, 250)", tableName))); // Delete a whole partition - assertQueryIdStored(tableName, executeWithQueryId(format("DELETE FROM %s WHERE a = 2", tableName))); + assertQueryIdAndUserStored(tableName, executeWithQueryId(format("DELETE FROM %s WHERE a = 2", tableName))); // Delete an individual row - assertQueryIdStored(tableName, executeWithQueryId(format("DELETE FROM %s WHERE a = 1 AND b = 4", tableName))); + assertQueryIdAndUserStored(tableName, executeWithQueryId(format("DELETE FROM %s WHERE a = 1 AND b = 4", tableName))); // Update an individual row - assertQueryIdStored(tableName, executeWithQueryId(format("UPDATE %s SET b = 900 WHERE a = 1 AND b = 1", tableName))); + assertQueryIdAndUserStored(tableName, executeWithQueryId(format("UPDATE %s SET b = 900 WHERE a = 1 AND b = 1", tableName))); // Merge - assertQueryIdStored(tableName, executeWithQueryId(format("MERGE INTO %s t USING %s s ON t.a = s.a AND t.b = s.b " + + assertQueryIdAndUserStored(tableName, executeWithQueryId(format("MERGE INTO %s t USING %s s ON t.a = s.a AND t.b = s.b " + "WHEN MATCHED THEN UPDATE SET b = t.b * 50", tableName, sourceTableName))); } - @Test - public void testMaterializedViewSnapshotSummariesHaveTrinoQueryId() - { - String matViewName = "test_materialized_view_snapshot_query_ids" + randomNameSuffix(); - String sourceTableName = "test_source_table_for_mat_view" + randomNameSuffix(); - assertUpdate(format("CREATE TABLE %s (a bigint, b bigint)", 
sourceTableName)); - assertUpdate(format("INSERT INTO %s VALUES (1, 1), (1, 4), (2, 2)", sourceTableName), 3); - - // create a materialized view - QueryId matViewCreateQueryId = getDistributedQueryRunner() - .executeWithQueryId(getSession(), format("CREATE MATERIALIZED VIEW %s WITH (partitioning = ARRAY['a']) AS SELECT * FROM %s", matViewName, sourceTableName)) - .getQueryId(); - - // fetch the underlying storage table name so we can inspect its snapshot summary after the REFRESH - // running queries against "materialized_view$snapshots" is not supported - String storageTable = (String) getDistributedQueryRunner() - .execute(getSession(), format("SELECT storage_table FROM system.metadata.materialized_views WHERE name = '%s'", matViewName)) - .getOnlyValue(); - - assertQueryIdStored(storageTable, matViewCreateQueryId); - - assertQueryIdStored(storageTable, executeWithQueryId(format("REFRESH MATERIALIZED VIEW %s", matViewName))); - } - @Override protected OptionalInt maxTableNameLength() { @@ -6628,7 +8028,7 @@ protected OptionalInt maxTableRenameLength() @Test public void testSetPartitionedColumnType() { - try (TestTable table = new TestTable(getQueryRunner()::execute, "test_set_partitioned_column_type_", "WITH (partitioning = ARRAY['part']) AS SELECT 1 AS id, CAST(123 AS integer) AS part")) { + try (TestTable table = newTrinoTable("test_set_partitioned_column_type_", "WITH (partitioning = ARRAY['part']) AS SELECT 1 AS id, CAST(123 AS integer) AS part")) { assertUpdate("ALTER TABLE " + table.getName() + " ALTER COLUMN part SET DATA TYPE bigint"); assertThat(query("SELECT part FROM " + table.getName())) @@ -6643,7 +8043,7 @@ public void testSetPartitionedColumnType() @Test public void testSetTransformPartitionedColumnType() { - try (TestTable table = new TestTable(getQueryRunner()::execute, "test_set_partitioned_column_type_", "WITH (partitioning = ARRAY['bucket(part, 10)']) AS SELECT CAST(123 AS integer) AS part")) { + try (TestTable table = newTrinoTable("test_set_partitioned_column_type_", "WITH (partitioning = ARRAY['bucket(part, 10)']) AS SELECT CAST(123 AS integer) AS part")) { assertUpdate("ALTER TABLE " + table.getName() + " ALTER COLUMN part SET DATA TYPE bigint"); assertThat(query("SELECT * FROM " + table.getName())) @@ -6660,12 +8060,8 @@ public void testAlterTableWithUnsupportedProperties() assertUpdate("CREATE TABLE " + tableName + " (a bigint)"); - assertQueryFails("ALTER TABLE " + tableName + " SET PROPERTIES orc_bloom_filter_columns = ARRAY['a']", - "The following properties cannot be updated: orc_bloom_filter_columns"); assertQueryFails("ALTER TABLE " + tableName + " SET PROPERTIES location = '/var/data/table/', orc_bloom_filter_fpp = 0.5", "The following properties cannot be updated: location, orc_bloom_filter_fpp"); - assertQueryFails("ALTER TABLE " + tableName + " SET PROPERTIES format = 'ORC', orc_bloom_filter_columns = ARRAY['a']", - "The following properties cannot be updated: orc_bloom_filter_columns"); assertUpdate("DROP TABLE " + tableName); } @@ -6682,12 +8078,16 @@ public void testDropTableWithMissingMetadataFile() // Delete current metadata file fileSystem.deleteFile(metadataLocation); - assertFalse(fileSystem.newInputFile(metadataLocation).exists(), "Current metadata file should not exist"); + assertThat(fileSystem.newInputFile(metadataLocation).exists()) + .describedAs("Current metadata file should not exist") + .isFalse(); // try to drop table assertUpdate("DROP TABLE " + tableName); - assertFalse(getQueryRunner().tableExists(getSession(), tableName)); - 
assertFalse(fileSystem.listFiles(Location.of(tableLocation)).hasNext(), "Table location should not exist"); + assertThat(getQueryRunner().tableExists(getSession(), tableName)).isFalse(); + assertThat(fileSystem.listFiles(Location.of(tableLocation)).hasNext()) + .describedAs("Table location should not exist") + .isFalse(); } @Test @@ -6699,17 +8099,21 @@ public void testDropTableWithMissingSnapshotFile() String tableLocation = getTableLocation(tableName); String metadataLocation = getLatestMetadataLocation(fileSystem, tableLocation); - TableMetadata tableMetadata = TableMetadataParser.read(new ForwardingFileIo(fileSystem), metadataLocation); + TableMetadata tableMetadata = TableMetadataParser.read(FILE_IO_FACTORY.create(fileSystem), metadataLocation); Location currentSnapshotFile = Location.of(tableMetadata.currentSnapshot().manifestListLocation()); // Delete current snapshot file fileSystem.deleteFile(currentSnapshotFile); - assertFalse(fileSystem.newInputFile(currentSnapshotFile).exists(), "Current snapshot file should not exist"); + assertThat(fileSystem.newInputFile(currentSnapshotFile).exists()) + .describedAs("Current snapshot file should not exist") + .isFalse(); // try to drop table assertUpdate("DROP TABLE " + tableName); - assertFalse(getQueryRunner().tableExists(getSession(), tableName)); - assertFalse(fileSystem.listFiles(Location.of(tableLocation)).hasNext(), "Table location should not exist"); + assertThat(getQueryRunner().tableExists(getSession(), tableName)).isFalse(); + assertThat(fileSystem.listFiles(Location.of(tableLocation)).hasNext()) + .describedAs("Table location should not exist") + .isFalse(); } @Test @@ -6721,18 +8125,22 @@ public void testDropTableWithMissingManifestListFile() String tableLocation = getTableLocation(tableName); String metadataLocation = getLatestMetadataLocation(fileSystem, tableLocation); - FileIO fileIo = new ForwardingFileIo(fileSystem); + FileIO fileIo = FILE_IO_FACTORY.create(fileSystem); TableMetadata tableMetadata = TableMetadataParser.read(fileIo, metadataLocation); Location manifestListFile = Location.of(tableMetadata.currentSnapshot().allManifests(fileIo).get(0).path()); // Delete Manifest List file fileSystem.deleteFile(manifestListFile); - assertFalse(fileSystem.newInputFile(manifestListFile).exists(), "Manifest list file should not exist"); + assertThat(fileSystem.newInputFile(manifestListFile).exists()) + .describedAs("Manifest list file should not exist") + .isFalse(); // try to drop table assertUpdate("DROP TABLE " + tableName); - assertFalse(getQueryRunner().tableExists(getSession(), tableName)); - assertFalse(fileSystem.listFiles(Location.of(tableLocation)).hasNext(), "Table location should not exist"); + assertThat(getQueryRunner().tableExists(getSession(), tableName)).isFalse(); + assertThat(fileSystem.listFiles(Location.of(tableLocation)).hasNext()) + .describedAs("Table location should not exist") + .isFalse(); } @Test @@ -6746,17 +8154,21 @@ public void testDropTableWithMissingDataFile() Location tableLocation = Location.of(getTableLocation(tableName)); Location tableDataPath = tableLocation.appendPath("data"); FileIterator fileIterator = fileSystem.listFiles(tableDataPath); - assertTrue(fileIterator.hasNext()); + assertThat(fileIterator.hasNext()).isTrue(); Location dataFile = fileIterator.next().location(); // Delete data file fileSystem.deleteFile(dataFile); - assertFalse(fileSystem.newInputFile(dataFile).exists(), "Data file should not exist"); + assertThat(fileSystem.newInputFile(dataFile).exists()) + 
.describedAs("Data file should not exist") + .isFalse(); // try to drop table assertUpdate("DROP TABLE " + tableName); - assertFalse(getQueryRunner().tableExists(getSession(), tableName)); - assertFalse(fileSystem.listFiles(tableLocation).hasNext(), "Table location should not exist"); + assertThat(getQueryRunner().tableExists(getSession(), tableName)).isFalse(); + assertThat(fileSystem.listFiles(tableLocation).hasNext()) + .describedAs("Table location should not exist") + .isFalse(); } @Test @@ -6771,11 +8183,13 @@ public void testDropTableWithNonExistentTableLocation() // Delete table location fileSystem.deleteDirectory(tableLocation); - assertFalse(fileSystem.listFiles(tableLocation).hasNext(), "Table location should not exist"); + assertThat(fileSystem.listFiles(tableLocation).hasNext()) + .describedAs("Table location should not exist") + .isFalse(); // try to drop table assertUpdate("DROP TABLE " + tableName); - assertFalse(getQueryRunner().tableExists(getSession(), tableName)); + assertThat(getQueryRunner().tableExists(getSession(), tableName)).isFalse(); } @Test @@ -6792,7 +8206,9 @@ public void testCorruptedTableLocation() // break the table by deleting all metadata files fileSystem.deleteDirectory(metadataLocation); - assertFalse(fileSystem.listFiles(metadataLocation).hasNext(), "Metadata location should not exist"); + assertThat(fileSystem.listFiles(metadataLocation).hasNext()) + .describedAs("Metadata location should not exist") + .isFalse(); // Assert queries fail cleanly assertQueryFails("TABLE " + tableName, "Metadata not found in metadata location for table " + schemaTableName); @@ -6800,6 +8216,8 @@ public void testCorruptedTableLocation() assertQueryFails("SELECT 1 FROM " + tableName + " WHERE false", "Metadata not found in metadata location for table " + schemaTableName); assertQueryFails("SHOW CREATE TABLE " + tableName, "Metadata not found in metadata location for table " + schemaTableName); assertQueryFails("CREATE TABLE a_new_table (LIKE " + tableName + " EXCLUDING PROPERTIES)", "Metadata not found in metadata location for table " + schemaTableName); + assertQueryFails("CREATE OR REPLACE TABLE " + tableName + " (id INT, country VARCHAR, independence ROW(month VARCHAR, year INT))", "Metadata not found in metadata location for table " + schemaTableName); + assertQueryFails("CREATE OR REPLACE TABLE " + tableName + " AS SELECT 1 x, 'IRELAND' y", "Metadata not found in metadata location for table " + schemaTableName); assertQueryFails("DESCRIBE " + tableName, "Metadata not found in metadata location for table " + schemaTableName); assertQueryFails("SHOW COLUMNS FROM " + tableName, "Metadata not found in metadata location for table " + schemaTableName); assertQueryFails("SHOW STATS FOR " + tableName, "Metadata not found in metadata location for table " + schemaTableName); @@ -6808,7 +8226,7 @@ public void testCorruptedTableLocation() assertQueryFails("ALTER TABLE " + tableName + " EXECUTE vacuum", "Metadata not found in metadata location for table " + schemaTableName); assertQueryFails("ALTER TABLE " + tableName + " RENAME TO bad_person_some_new_name", "Metadata not found in metadata location for table " + schemaTableName); assertQueryFails("ALTER TABLE " + tableName + " ADD COLUMN foo int", "Metadata not found in metadata location for table " + schemaTableName); - // TODO (https://github.com/trinodb/trino/issues/16248) ADD field + assertQueryFails("ALTER TABLE " + tableName + " ADD COLUMN independence.month int", "Metadata not found in metadata location for table " + 
schemaTableName); assertQueryFails("ALTER TABLE " + tableName + " DROP COLUMN country", "Metadata not found in metadata location for table " + schemaTableName); assertQueryFails("ALTER TABLE " + tableName + " DROP COLUMN independence.month", "Metadata not found in metadata location for table " + schemaTableName); assertQueryFails("ALTER TABLE " + tableName + " SET PROPERTIES format = 'PARQUET'", "Metadata not found in metadata location for table " + schemaTableName); @@ -6816,10 +8234,10 @@ public void testCorruptedTableLocation() assertQueryFails("UPDATE " + tableName + " SET country = 'AUSTRIA'", "Metadata not found in metadata location for table " + schemaTableName); assertQueryFails("DELETE FROM " + tableName, "Metadata not found in metadata location for table " + schemaTableName); assertQueryFails("MERGE INTO " + tableName + " USING (SELECT 1 a) input ON true WHEN MATCHED THEN DELETE", "Metadata not found in metadata location for table " + schemaTableName); - assertQueryFails("TRUNCATE TABLE " + tableName, "This connector does not support truncating tables"); + assertQueryFails("TRUNCATE TABLE " + tableName, "Metadata not found in metadata location for table " + schemaTableName); assertQueryFails("COMMENT ON TABLE " + tableName + " IS NULL", "Metadata not found in metadata location for table " + schemaTableName); assertQueryFails("COMMENT ON COLUMN " + tableName + ".foo IS NULL", "Metadata not found in metadata location for table " + schemaTableName); - assertQueryFails("CALL iceberg.system.rollback_to_snapshot(CURRENT_SCHEMA, '" + tableName + "', 8954597067493422955)", "Metadata not found in metadata location for table " + schemaTableName); + assertQueryFails("ALTER TABLE " + tableName + " EXECUTE rollback_to_snapshot(8954597067493422955)", "Metadata not found in metadata location for table " + schemaTableName); // Avoid failing metadata queries assertQuery("SHOW TABLES LIKE 'test_corrupted_table_location_%' ESCAPE '\\'", "VALUES '" + tableName + "'"); @@ -6830,8 +8248,10 @@ public void testCorruptedTableLocation() // DROP TABLE should succeed so that users can remove their corrupted table assertQuerySucceeds("DROP TABLE " + tableName); - assertFalse(getQueryRunner().tableExists(getSession(), tableName)); - assertFalse(fileSystem.listFiles(tableLocation).hasNext(), "Table location should not exist"); + assertThat(getQueryRunner().tableExists(getSession(), tableName)).isFalse(); + assertThat(fileSystem.listFiles(tableLocation).hasNext()) + .describedAs("Table location should not exist") + .isFalse(); } @Test @@ -6853,7 +8273,7 @@ public void testDropCorruptedTableWithHiveRedirection() .setSchema(schema) .build(); - DistributedQueryRunner queryRunner = DistributedQueryRunner.builder(icebergSession) + QueryRunner queryRunner = DistributedQueryRunner.builder(icebergSession) .build(); queryRunner.installPlugin(new IcebergPlugin()); queryRunner.createCatalog( @@ -6861,9 +8281,10 @@ public void testDropCorruptedTableWithHiveRedirection() "iceberg", ImmutableMap.of( "iceberg.catalog.type", "TESTING_FILE_METASTORE", - "hive.metastore.catalog.dir", dataDirectory.getPath())); + "hive.metastore.catalog.dir", dataDirectory.getPath(), + "fs.hadoop.enabled", "true")); - queryRunner.installPlugin(new TestingHivePlugin(createTestingFileHiveMetastore(dataDirectory))); + queryRunner.installPlugin(new TestingHivePlugin(dataDirectory.toPath())); queryRunner.createCatalog( hiveRedirectionCatalog, "hive", @@ -6881,15 +8302,20 @@ public void testDropCorruptedTableWithHiveRedirection() // break the table by 
deleting all metadata files fileSystem.deleteDirectory(metadataLocation); - assertFalse(fileSystem.listFiles(metadataLocation).hasNext(), "Metadata location should not exist"); + assertThat(fileSystem.listFiles(metadataLocation).hasNext()) + .describedAs("Metadata location should not exist") + .isFalse(); // DROP TABLE should succeed using hive redirection queryRunner.execute("DROP TABLE " + hiveTableName); - assertFalse(queryRunner.tableExists(getSession(), icebergTableName)); - assertFalse(fileSystem.listFiles(tableLocation).hasNext(), "Table location should not exist"); + assertThat(queryRunner.tableExists(getSession(), icebergTableName)).isFalse(); + assertThat(fileSystem.listFiles(tableLocation).hasNext()) + .describedAs("Table location should not exist") + .isFalse(); } - @Test(timeOut = 10_000) + @Test + @Timeout(10) public void testNoRetryWhenMetadataFileInvalid() throws Exception { @@ -6907,27 +8333,924 @@ public void testNoRetryWhenMetadataFileInvalid() // Add duplicate field to produce validation error while reading the metadata file fieldsNode.add(newFieldNode); - String modifiedJson = mapper.writerWithDefaultPrettyPrinter().writeValueAsString(jsonNode); - try (OutputStream outputStream = fileSystem.newOutputFile(Location.of(metadataFileLocation)).createOrOverwrite()) { - // Corrupt metadata file by overwriting the invalid metadata content - outputStream.write(modifiedJson.getBytes(UTF_8)); - } - assertThatThrownBy(() -> query("SELECT * FROM " + tableName)) - .hasMessage("Invalid metadata file for table tpch.%s".formatted(tableName)); + byte[] modifiedJson = mapper.writerWithDefaultPrettyPrinter().writeValueAsBytes(jsonNode); + + // Corrupt metadata file by overwriting the invalid metadata content + fileSystem.newOutputFile(Location.of(metadataFileLocation)).createOrOverwrite(modifiedJson); + + assertThat(query("SELECT * FROM " + tableName)) + .failure().hasMessage("Invalid metadata file for table tpch.%s".formatted(tableName)); assertUpdate("DROP TABLE " + tableName); } + @Test + public void testTableChangesFunctionAfterSchemaChange() + { + try (TestTable table = newTrinoTable( + "test_table_changes_function_", + "AS SELECT nationkey, name FROM tpch.tiny.nation WITH NO DATA")) { + long initialSnapshot = getCurrentSnapshotId(table.getName()); + assertUpdate("INSERT INTO " + table.getName() + " SELECT nationkey, name FROM nation WHERE nationkey < 5", 5); + long snapshotAfterInsert = getCurrentSnapshotId(table.getName()); + + assertUpdate("ALTER TABLE " + table.getName() + " DROP COLUMN name"); + assertUpdate("INSERT INTO " + table.getName() + " SELECT nationkey FROM nation WHERE nationkey >= 5 AND nationkey < 10", 5); + long snapshotAfterDropColumn = getCurrentSnapshotId(table.getName()); + + assertUpdate("ALTER TABLE " + table.getName() + " ADD COLUMN comment VARCHAR"); + assertUpdate("INSERT INTO " + table.getName() + " SELECT nationkey, comment FROM nation WHERE nationkey >= 10 AND nationkey < 15", 5); + long snapshotAfterAddColumn = getCurrentSnapshotId(table.getName()); + + assertUpdate("ALTER TABLE " + table.getName() + " ADD COLUMN name VARCHAR"); + assertUpdate("INSERT INTO " + table.getName() + " SELECT nationkey, comment, name FROM nation WHERE nationkey >= 15", 10); + long snapshotAfterReaddingNameColumn = getCurrentSnapshotId(table.getName()); + + assertQuery( + "SELECT nationkey, name, _change_type, _change_version_id, _change_ordinal " + + "FROM TABLE(system.table_changes(CURRENT_SCHEMA, '%s', %s, %s))".formatted(table.getName(), initialSnapshot, 
snapshotAfterInsert), + "SELECT nationkey, name, 'insert', %s, 0 FROM nation WHERE nationkey < 5".formatted(snapshotAfterInsert)); + + assertQuery( + "SELECT nationkey, _change_type, _change_version_id, _change_ordinal " + + "FROM TABLE(system.table_changes(CURRENT_SCHEMA, '%s', %s, %s))".formatted(table.getName(), initialSnapshot, snapshotAfterDropColumn), + "SELECT nationkey, 'insert', %s, 0 FROM nation WHERE nationkey < 5 UNION SELECT nationkey, 'insert', %s, 1 FROM nation WHERE nationkey >= 5 AND nationkey < 10 ".formatted(snapshotAfterInsert, snapshotAfterDropColumn)); + + assertQuery( + "SELECT nationkey, comment, _change_type, _change_version_id, _change_ordinal " + + "FROM TABLE(system.table_changes(CURRENT_SCHEMA, '%s', %s, %s))".formatted(table.getName(), initialSnapshot, snapshotAfterAddColumn), + ("SELECT nationkey, NULL, 'insert', %s, 0 FROM nation WHERE nationkey < 5 " + + "UNION SELECT nationkey, NULL, 'insert', %s, 1 FROM nation WHERE nationkey >= 5 AND nationkey < 10 " + + "UNION SELECT nationkey, comment, 'insert', %s, 2 FROM nation WHERE nationkey >= 10 AND nationkey < 15").formatted(snapshotAfterInsert, snapshotAfterDropColumn, snapshotAfterAddColumn)); + + assertQuery( + "SELECT nationkey, comment, name, _change_type, _change_version_id, _change_ordinal " + + "FROM TABLE(system.table_changes(CURRENT_SCHEMA, '%s', %s, %s))".formatted(table.getName(), initialSnapshot, snapshotAfterReaddingNameColumn), + ("SELECT nationkey, NULL, NULL, 'insert', %s, 0 FROM nation WHERE nationkey < 5 " + + "UNION SELECT nationkey, NULL, NULL, 'insert', %s, 1 FROM nation WHERE nationkey >= 5 AND nationkey < 10 " + + "UNION SELECT nationkey, comment, NULL, 'insert', %s, 2 FROM nation WHERE nationkey >= 10 AND nationkey < 15" + + "UNION SELECT nationkey, comment, name, 'insert', %s, 3 FROM nation WHERE nationkey >= 15").formatted(snapshotAfterInsert, snapshotAfterDropColumn, snapshotAfterAddColumn, snapshotAfterReaddingNameColumn)); + } + } + + @Test + public void testTableChangesFunctionInvalidArguments() + { + assertQueryFails( + "SELECT * FROM TABLE(system.table_changes(start_snapshot_id => 1, end_snapshot_id => 2))", + ".*: Missing argument: SCHEMA_NAME"); + assertQueryFails( + "SELECT * FROM TABLE(system.table_changes(schema_name => 'tpch', start_snapshot_id => 1, end_snapshot_id => 2))", + ".* Missing argument: TABLE_NAME"); + } + + @Test + public void testIdentityPartitionFilterMissing() + { + String tableName = "test_partition_" + randomNameSuffix(); + + Session session = withPartitionFilterRequired(getSession()); + + assertUpdate(session, "CREATE TABLE " + tableName + " (id integer, a varchar, b varchar, ds varchar) WITH (partitioning = ARRAY['ds'])"); + assertUpdate(session, "INSERT INTO " + tableName + " (id, a, ds) VALUES (1, 'a', 'a')", 1); + assertQueryFails(session, "SELECT id FROM " + tableName + " WHERE ds IS NOT null OR true", "Filter required for tpch\\." 
+ tableName + " on at least one of the partition columns: ds"); + assertUpdate(session, "DROP TABLE " + tableName); + } + + @Test + public void testBucketPartitionFilterMissing() + { + String tableName = "test_partition_" + randomNameSuffix(); + + Session session = withPartitionFilterRequired(getSession()); + + assertUpdate(session, "CREATE TABLE " + tableName + " (id integer, a varchar, b varchar, ds varchar) WITH (partitioning = ARRAY['bucket(ds, 16)'])"); + assertUpdate(session, "INSERT INTO " + tableName + " (id, a, ds) VALUES (1, 'a', 'a')", 1); + assertQueryFails(session, "SELECT id FROM " + tableName + " WHERE ds IS NOT null OR true", "Filter required for tpch\\." + tableName + " on at least one of the partition columns: ds"); + assertUpdate(session, "DROP TABLE " + tableName); + } + + @Test + public void testIdentityPartitionFilterIncluded() + { + String tableName = "test_partition_" + randomNameSuffix(); + + Session session = withPartitionFilterRequired(getSession()); + + assertUpdate(session, "CREATE TABLE " + tableName + " (id integer, a varchar, b varchar, ds varchar) WITH (partitioning = ARRAY['ds'])"); + assertUpdate(session, "INSERT INTO " + tableName + " (id, a, ds) VALUES (1, 'a', 'a')", 1); + String query = "SELECT id FROM " + tableName + " WHERE ds = 'a'"; + assertQuery(session, query, "VALUES 1"); + assertUpdate(session, "DROP TABLE " + tableName); + } + + @Test + public void testBucketedSelect() + { + try { + // table is partitioned on (key1), (key2), or (key1, key2) + assertUpdate( + "CREATE TABLE test_bucketed_select WITH (partitioning = ARRAY['bucket(key1, 13)', 'bucket(key2, 17)']) AS SELECT orderkey key1, custkey key2, comment value1 FROM orders", + 15000); + Session planWithTableNodePartitioning = Session.builder(getSession()) + .setCatalogSessionProperty("iceberg", BUCKET_EXECUTION_ENABLED, "true") + .build(); + Session planWithoutTableNodePartitioning = Session.builder(getSession()) + .setCatalogSessionProperty("iceberg", BUCKET_EXECUTION_ENABLED, "false") + .build(); + + // a basic scan should not use a partitioned read as it is not helpful to the query + @Language("SQL") String query = "SELECT value1 FROM test_bucketed_select WHERE key1 < 10"; + @Language("SQL") String expectedQuery = "SELECT comment FROM orders WHERE orderkey < 10"; + assertQuery(planWithTableNodePartitioning, query, expectedQuery, assertNoReadPartitioning("key1", "key2")); + + // aggregation on key1, key2, or (key1, key2) should not require a remote exchange + query = "SELECT count(value1) FROM test_bucketed_select GROUP BY key1"; + expectedQuery = "SELECT count(comment) FROM orders GROUP BY orderkey"; + assertQuery(planWithTableNodePartitioning, query, expectedQuery, assertRemoteExchangesCount(0)); + assertQuery(planWithoutTableNodePartitioning, query, expectedQuery, assertRemoteExchangesCount(1)); + + query = "SELECT count(value1) FROM test_bucketed_select GROUP BY key2"; + expectedQuery = "SELECT count(comment) FROM orders GROUP BY custkey"; + assertQuery(planWithTableNodePartitioning, query, expectedQuery, assertRemoteExchangesCount(0)); + assertQuery(planWithoutTableNodePartitioning, query, expectedQuery, assertRemoteExchangesCount(1)); + + query = "SELECT count(value1) FROM test_bucketed_select GROUP BY key2, key1"; + expectedQuery = "SELECT count(comment) FROM orders GROUP BY custkey, orderkey"; + assertQuery(planWithTableNodePartitioning, query, expectedQuery, assertRemoteExchangesCount(0)); + assertQuery(planWithoutTableNodePartitioning, query, expectedQuery, 
assertRemoteExchangesCount(1)); + + // join on key1, key2, or (key1, key2) should not require a remote exchange + query = "SELECT key1 FROM test_bucketed_select JOIN test_bucketed_select USING (key1)"; + expectedQuery = "SELECT a.orderkey FROM orders a JOIN orders USING (orderkey)"; + assertQuery(planWithTableNodePartitioning, query, expectedQuery, assertRemoteExchangesCount(0)); + assertQuery(planWithoutTableNodePartitioning, query, expectedQuery, assertRemoteExchangesCount(2)); + + query = "SELECT key2 FROM test_bucketed_select JOIN test_bucketed_select USING (key2)"; + expectedQuery = "SELECT a.custkey FROM orders a JOIN orders USING (custkey)"; + assertQuery(planWithTableNodePartitioning, query, expectedQuery, assertRemoteExchangesCount(0)); + assertQuery(planWithoutTableNodePartitioning, query, expectedQuery, assertRemoteExchangesCount(2)); + + query = "SELECT key2, key1 FROM test_bucketed_select JOIN test_bucketed_select USING (key2, key1)"; + expectedQuery = "SELECT a.custkey, a.orderkey FROM orders a JOIN orders USING (custkey, orderkey)"; + assertQuery(planWithTableNodePartitioning, query, expectedQuery, assertRemoteExchangesCount(0)); + assertQuery(planWithoutTableNodePartitioning, query, expectedQuery, assertRemoteExchangesCount(2)); + + query = "SELECT a.key2, b.key1 FROM test_bucketed_select a JOIN test_bucketed_select b on a.key1 = b.key2"; + expectedQuery = "SELECT a.custkey, b.orderkey FROM orders a JOIN orders b on a.orderkey = b.custkey"; + assertQuery(planWithTableNodePartitioning, query, expectedQuery, assertRemoteExchangesCount(1)); + assertQuery(planWithoutTableNodePartitioning, query, expectedQuery, assertRemoteExchangesCount(2)); + } + finally { + assertUpdate("DROP TABLE IF EXISTS test_bucketed_select"); + } + } + + @Test + public void testBucketPartitionFilterIncluded() + { + String tableName = "test_partition_" + randomNameSuffix(); + + Session session = withPartitionFilterRequired(getSession()); + + assertUpdate(session, "CREATE TABLE " + tableName + " (id integer, a varchar, b varchar, ds varchar) WITH (partitioning = ARRAY['bucket(ds, 16)'])"); + assertUpdate(session, "INSERT INTO " + tableName + " (id, a, ds) VALUES (1, 'a', 'a'), (2, 'b', 'b')", 2); + String query = "SELECT id FROM " + tableName + " WHERE ds = 'a'"; + assertQuery(session, query, "VALUES 1"); + assertUpdate(session, "DROP TABLE " + tableName); + } + + @Test + public void testMultiPartitionedTableFilterIncluded() + { + String tableName = "test_partition_" + randomNameSuffix(); + + Session session = withPartitionFilterRequired(getSession()); + + assertUpdate(session, "CREATE TABLE " + tableName + " (id integer, a varchar, b varchar, ds varchar) WITH (partitioning = ARRAY['id', 'bucket(ds, 16)'])"); + assertUpdate(session, "INSERT INTO " + tableName + " (id, a, ds) VALUES (1, 'a', 'a'), (2, 'b', 'b')", 2); + // include predicate only on 'id', not on 'ds' + String query = "SELECT id, ds FROM " + tableName + " WHERE id = 2"; + assertQuery(session, query, "VALUES (2, 'b')"); + assertUpdate(session, "DROP TABLE " + tableName); + } + + @Test + public void testIdentityPartitionIsNotNullFilter() + { + String tableName = "test_partition_" + randomNameSuffix(); + + Session session = withPartitionFilterRequired(getSession()); + + assertUpdate(session, "CREATE TABLE " + tableName + " (id integer, a varchar, b varchar, ds varchar) WITH (partitioning = ARRAY['ds'])"); + assertUpdate(session, "INSERT INTO " + tableName + " (id, a, ds) VALUES (1, 'a', 'a')", 1); + assertQuery(session, "SELECT id FROM " + 
tableName + " WHERE ds IS NOT null", "VALUES 1"); + assertUpdate(session, "DROP TABLE " + tableName); + } + + @Test + public void testJoinPartitionFilterIncluded() + { + String tableName1 = "test_partition_" + randomNameSuffix(); + String tableName2 = "test_partition_" + randomNameSuffix(); + + Session session = withPartitionFilterRequired(getSession()); + + assertUpdate(session, "CREATE TABLE " + tableName1 + " (id integer, a varchar, b varchar, ds varchar) WITH (partitioning = ARRAY['ds'])"); + assertUpdate(session, "INSERT INTO " + tableName1 + " (id, a, ds) VALUES (1, 'a', 'a')", 1); + assertUpdate(session, "CREATE TABLE " + tableName2 + " (id integer, a varchar, b varchar, ds varchar) WITH (partitioning = ARRAY['ds'])"); + assertUpdate(session, "INSERT INTO " + tableName2 + " (id, a, ds) VALUES (1, 'a', 'a')", 1); + assertQuery(session, "SELECT a.id, b.id FROM " + tableName1 + " a JOIN " + tableName2 + " b ON (a.ds = b.ds) WHERE a.ds = 'a'", "VALUES (1, 1)"); + assertUpdate(session, "DROP TABLE " + tableName1); + assertUpdate(session, "DROP TABLE " + tableName2); + } + + @Test + public void testJoinWithMissingPartitionFilter() + { + String tableName1 = "test_partition_" + randomNameSuffix(); + String tableName2 = "test_partition_" + randomNameSuffix(); + + Session session = withPartitionFilterRequired(getSession()); + + assertUpdate(session, "CREATE TABLE " + tableName1 + " (id integer, a varchar, b varchar, ds varchar) WITH (partitioning = ARRAY['ds'])"); + assertUpdate(session, "INSERT INTO " + tableName1 + " (id, a, ds) VALUES (1, 'a', 'a')", 1); + assertUpdate(session, "CREATE TABLE " + tableName2 + " (id integer, a varchar, b varchar, ds varchar) WITH (partitioning = ARRAY['ds'])"); + assertUpdate(session, "INSERT INTO " + tableName2 + " (id, a, ds) VALUES (1, 'a', 'a')", 1); + assertQueryFails(session, "SELECT a.id, b.id FROM " + tableName1 + " a JOIN " + tableName2 + " b ON (a.id = b.id) WHERE a.ds = 'a'", "Filter required for tpch\\." 
+ tableName2 + " on at least one of the partition columns: ds"); + assertUpdate(session, "DROP TABLE " + tableName1); + assertUpdate(session, "DROP TABLE " + tableName2); + } + + @Test + public void testJoinWithPartitionFilterOnPartitionedTable() + { + String tableName1 = "test_partition_" + randomNameSuffix(); + String tableName2 = "test_partition_" + randomNameSuffix(); + + Session session = withPartitionFilterRequired(getSession()); + + assertUpdate(session, "CREATE TABLE " + tableName1 + " (id integer, a varchar, b varchar, ds varchar) WITH (partitioning = ARRAY['ds'])"); + assertUpdate(session, "INSERT INTO " + tableName1 + " (id, a, ds) VALUES (1, 'a', 'a')", 1); + assertUpdate(session, "CREATE TABLE " + tableName2 + " (id integer, a varchar, b varchar, ds varchar)"); + assertUpdate(session, "INSERT INTO " + tableName2 + " (id, a, ds) VALUES (1, 'a', 'a')", 1); + assertQuery(session, "SELECT a.id, b.id FROM " + tableName1 + " a JOIN " + tableName2 + " b ON (a.id = b.id) WHERE a.ds = 'a'", "VALUES (1, 1)"); + assertUpdate(session, "DROP TABLE " + tableName1); + assertUpdate(session, "DROP TABLE " + tableName2); + } + + @Test + public void testPartitionPredicateWithCasting() + { + String tableName = "test_partition_" + randomNameSuffix(); + + Session session = withPartitionFilterRequired(getSession()); + + assertUpdate(session, "CREATE TABLE " + tableName + " (id integer, a varchar, b varchar, ds varchar) WITH (partitioning = ARRAY['ds'])"); + assertUpdate(session, "INSERT INTO " + tableName + " (id, a, ds) VALUES (1, '1', '1')", 1); + String query = "SELECT id FROM " + tableName + " WHERE cast(ds as integer) = 1"; + assertQuery(session, query, "VALUES 1"); + assertUpdate(session, "DROP TABLE " + tableName); + } + + @Test + public void testNestedQueryWithInnerPartitionPredicate() + { + String tableName = "test_partition_" + randomNameSuffix(); + + Session session = withPartitionFilterRequired(getSession()); + + assertUpdate(session, "CREATE TABLE " + tableName + " (id integer, a varchar, b varchar, ds varchar) WITH (partitioning = ARRAY['ds'])"); + assertUpdate(session, "INSERT INTO " + tableName + " (id, a, ds) VALUES (1, '1', '1')", 1); + String query = "SELECT id FROM (SELECT * FROM " + tableName + " WHERE cast(ds as integer) = 1) WHERE cast(a as integer) = 1"; + assertQuery(session, query, "VALUES 1"); + assertUpdate(session, "DROP TABLE " + tableName + ""); + } + + @Test + public void testPredicateOnNonPartitionColumn() + { + String tableName = "test_partition_" + randomNameSuffix(); + + Session session = withPartitionFilterRequired(getSession()); + + assertUpdate(session, "CREATE TABLE " + tableName + " (id integer, a varchar, b varchar, ds varchar) WITH (partitioning = ARRAY['ds'])"); + assertUpdate(session, "INSERT INTO " + tableName + " (id, a, ds) VALUES (1, '1', '1')", 1); + String query = "SELECT id FROM " + tableName + " WHERE cast(b as integer) = 1"; + assertQueryFails(session, query, "Filter required for tpch\\." 
+ tableName + " on at least one of the partition columns: ds"); + assertUpdate(session, "DROP TABLE " + tableName); + } + + @Test + public void testNonSelectStatementsWithPartitionFilterRequired() + { + String tableName1 = "test_partition_" + randomNameSuffix(); + String tableName2 = "test_partition_" + randomNameSuffix(); + + Session session = withPartitionFilterRequired(getSession()); + + assertUpdate(session, "CREATE TABLE " + tableName1 + " (id integer, a varchar, b varchar, ds varchar) WITH (partitioning = ARRAY['ds'])"); + assertUpdate(session, "CREATE TABLE " + tableName2 + " (id integer, a varchar, b varchar, ds varchar) WITH (partitioning = ARRAY['ds'])"); + assertUpdate(session, "INSERT INTO " + tableName1 + " (id, a, ds) VALUES (1, '1', '1'), (2, '2', '2')", 2); + assertUpdate(session, "INSERT INTO " + tableName2 + " (id, a, ds) VALUES (1, '1', '1'), (3, '3', '3')", 2); + + // These non-SELECT statements fail without a partition filter + String errorMessage = "Filter required for tpch\\." + tableName1 + " on at least one of the partition columns: ds"; + assertQueryFails(session, "ALTER TABLE " + tableName1 + " EXECUTE optimize", errorMessage); + assertQueryFails(session, "UPDATE " + tableName1 + " SET a = 'New'", errorMessage); + assertQueryFails(session, "MERGE INTO " + tableName1 + " AS a USING " + tableName2 + " AS b ON (a.ds = b.ds) WHEN MATCHED THEN UPDATE SET a = 'New'", errorMessage); + assertQueryFails(session, "DELETE FROM " + tableName1 + " WHERE a = '1'", errorMessage); + + // Adding partition filters to each solves the problem + assertQuerySucceeds(session, "ALTER TABLE " + tableName1 + " EXECUTE optimize WHERE ds in ('2', '4')"); + assertQuerySucceeds(session, "UPDATE " + tableName1 + " SET a = 'New' WHERE ds = '2'"); + assertQuerySucceeds(session, "MERGE INTO " + tableName1 + " AS a USING (SELECT * FROM " + tableName2 + " WHERE ds = '1') AS b ON (a.ds = b.ds) WHEN MATCHED THEN UPDATE SET a = 'New'"); + assertQuerySucceeds(session, "DELETE FROM " + tableName1 + " WHERE ds = '1'"); + + // Analyze should always succeed, since currently it cannot take a partition argument like Hive + assertQuerySucceeds(session, "ANALYZE " + tableName1); + assertQuerySucceeds(session, "ANALYZE " + tableName2 + " WITH (columns = ARRAY['id', 'a'])"); + + assertUpdate(session, "DROP TABLE " + tableName1); + assertUpdate(session, "DROP TABLE " + tableName2); + } + + @Test + public void testPartitionFilterRequiredSchemas() + { + String schemaName = "test_unenforced_schema_" + randomNameSuffix(); + String tableName = "test_partition_" + randomNameSuffix(); + + Session session = Session.builder(withPartitionFilterRequired(getSession())) + .setCatalogSessionProperty("iceberg", "query_partition_filter_required_schemas", "[\"tpch\"]") + .build(); + + assertUpdate(session, "CREATE SCHEMA " + schemaName); + assertUpdate(session, format("CREATE TABLE %s.%s (id, a, ds) WITH (partitioning = ARRAY['ds']) AS SELECT 1, '1', '1'", schemaName, tableName), 1); + assertUpdate(session, "CREATE TABLE " + tableName + " (id, a, ds) WITH (partitioning = ARRAY['ds']) AS SELECT 1, '1', '1'", 1); + + String enforcedQuery = "SELECT id FROM tpch." + tableName + " WHERE a = '1'"; + assertQueryFails(session, enforcedQuery, "Filter required for tpch\\." 
+ tableName + " on at least one of the partition columns: ds"); + + String unenforcedQuery = format("SELECT id FROM %s.%s WHERE a = '1'", schemaName, tableName); + assertQuerySucceeds(session, unenforcedQuery); + + assertUpdate(session, "DROP TABLE " + tableName); + assertUpdate(session, "DROP SCHEMA " + schemaName + " CASCADE"); + } + + @Test + public void testIgnorePartitionFilterRequiredSchemas() + { + String tableName = "test_partition_" + randomNameSuffix(); + + Session session = Session.builder(getSession()) + .setCatalogSessionProperty("iceberg", "query_partition_filter_required_schemas", "[\"tpch\"]") + .build(); + assertUpdate(session, "CREATE TABLE " + tableName + " (id, a, ds) WITH (partitioning = ARRAY['ds']) AS SELECT 1, '1', '1'", 1); + assertQuerySucceeds(session, "SELECT id FROM " + tableName + " WHERE a = '1'"); + assertUpdate(session, "DROP TABLE " + tableName); + } + + private static Session withPartitionFilterRequired(Session session) + { + return Session.builder(session) + .setCatalogSessionProperty("iceberg", "query_partition_filter_required", "true") + .build(); + } + + @Test + public void testUuidDynamicFilter() + { + String catalog = getSession().getCatalog().orElseThrow(); + try (TestTable dataTable = newTrinoTable("data_table", "(value uuid)"); + TestTable filteringTable = newTrinoTable("filtering_table", "(filtering_value uuid)")) { + assertUpdate("INSERT INTO " + dataTable.getName() + " VALUES UUID 'f73894f0-5447-41c5-a727-436d04c7f8ab', UUID '4f676658-67c9-4e80-83be-ec75f0b9d0c9'", 2); + assertUpdate("INSERT INTO " + filteringTable.getName() + " VALUES UUID 'f73894f0-5447-41c5-a727-436d04c7f8ab'", 1); + + assertThat(query( + Session.builder(getSession()) + .setCatalogSessionProperty(catalog, DYNAMIC_FILTERING_WAIT_TIMEOUT, "10s") + .build(), + "SELECT value FROM " + dataTable.getName() + " WHERE EXISTS (SELECT 1 FROM " + filteringTable.getName() + " WHERE filtering_value = value)")) + .matches("VALUES UUID 'f73894f0-5447-41c5-a727-436d04c7f8ab'"); + } + } + + @Test + public void testDynamicFilterWithExplicitPartitionFilter() + { + String catalog = getSession().getCatalog().orElseThrow(); + try (TestTable salesTable = newTrinoTable("sales_table", "(date date, receipt_id varchar, amount decimal(10,2)) with (partitioning=array['date'])"); + TestTable dimensionTable = newTrinoTable("dimension_table", "(date date, following_holiday boolean, year int)")) { + assertUpdate( + """ + INSERT INTO %s + VALUES + (DATE '2023-01-01' , false, 2023), + (DATE '2023-01-02' , true, 2023), + (DATE '2023-01-03' , false, 2023)""".formatted(dimensionTable.getName()), 3); + assertUpdate( + """ + INSERT INTO %s + VALUES + (DATE '2023-01-02' , '#2023#1', DECIMAL '122.12'), + (DATE '2023-01-02' , '#2023#2', DECIMAL '124.12'), + (DATE '2023-01-02' , '#2023#3', DECIMAL '99.99'), + (DATE '2023-01-02' , '#2023#4', DECIMAL '95.12'), + (DATE '2023-01-03' , '#2023#5', DECIMAL '199.12'), + (DATE '2023-01-04' , '#2023#6', DECIMAL '99.55'), + (DATE '2023-01-05' , '#2023#7', DECIMAL '50.11'), + (DATE '2023-01-05' , '#2023#8', DECIMAL '60.20'), + (DATE '2023-01-05' , '#2023#9', DECIMAL '70.75'), + (DATE '2023-01-05' , '#2023#10', DECIMAL '80.12')""".formatted(salesTable.getName()), 10); + + String selectQuery = + """ + SELECT receipt_id + FROM %s s + JOIN %s d + ON s.date = d.date + WHERE + d.following_holiday = true AND + d.date BETWEEN DATE '2023-01-01' AND DATE '2024-01-01'""".formatted(salesTable.getName(), dimensionTable.getName()); + MaterializedResultWithPlan result = 
getDistributedQueryRunner().executeWithPlan( + Session.builder(getSession()) + .setCatalogSessionProperty(catalog, DYNAMIC_FILTERING_WAIT_TIMEOUT, "10s") + .build(), + selectQuery); + MaterializedResult expected = computeActual( + Session.builder(getSession()) + .setSystemProperty(ENABLE_DYNAMIC_FILTERING, "false") + .build(), + selectQuery); + assertEqualsIgnoreOrder(result.result(), expected); + + DynamicFilterService.DynamicFiltersStats dynamicFiltersStats = getDistributedQueryRunner().getCoordinator() + .getQueryManager() + .getFullQueryInfo(result.queryId()) + .getQueryStats() + .getDynamicFiltersStats(); + // The dynamic filter reduces the range specified for the partition column `date` from `date :: [[2023-01-01, 2024-01-01]]` to `date :: {[2023-01-02]}` + assertThat(dynamicFiltersStats.getTotalDynamicFilters()).isEqualTo(1L); + assertThat(dynamicFiltersStats.getLazyDynamicFilters()).isEqualTo(1L); + assertThat(dynamicFiltersStats.getReplicatedDynamicFilters()).isEqualTo(0L); + assertThat(dynamicFiltersStats.getDynamicFiltersCompleted()).isEqualTo(1L); + } + } + @Override protected void verifyTableNameLengthFailurePermissible(Throwable e) { - assertThat(e).hasMessageMatching("Table name must be shorter than or equal to '128' characters but got .*"); + assertThat(e).hasMessageMatching(".*Table name must be shorter than or equal to '128' characters but got .*"); + } + + @Test + public void testTypeCoercionOnCreateTableAsSelect() + { + for (TypeCoercionTestSetup setup : typeCoercionOnCreateTableAsSelectProvider()) { + try (TestTable testTable = newTrinoTable( + "test_coercion_show_create_table", + format("AS SELECT %s a", setup.sourceValueLiteral))) { + assertThat(getColumnType(testTable.getName(), "a")).isEqualTo(setup.newColumnType); + assertThat(query("SELECT * FROM " + testTable.getName())) + .as("source value: %s, new type: %s, new value: %s", setup.sourceValueLiteral, setup.newColumnType, setup.newValueLiteral) + .skippingTypesCheck() + .matches("SELECT " + setup.newValueLiteral); + } + } + } + + @Test + public void testTypeCoercionOnCreateTableAsSelectWithNoData() + { + for (TypeCoercionTestSetup setup : typeCoercionOnCreateTableAsSelectProvider()) { + try (TestTable testTable = newTrinoTable( + "test_coercion_show_create_table", + format("AS SELECT %s a WITH NO DATA", setup.sourceValueLiteral))) { + assertThat(getColumnType(testTable.getName(), "a")).isEqualTo(setup.newColumnType); + } + } + } + + private List typeCoercionOnCreateTableAsSelectProvider() + { + return typeCoercionOnCreateTableAsSelectData().stream() + .map(this::filterTypeCoercionOnCreateTableAsSelectProvider) + .flatMap(Optional::stream) + .collect(toList()); + } + + protected Optional filterTypeCoercionOnCreateTableAsSelectProvider(TypeCoercionTestSetup setup) + { + return Optional.of(setup); + } + + private List typeCoercionOnCreateTableAsSelectData() + { + return ImmutableList.builder() + .add(new TypeCoercionTestSetup("TINYINT '127'", "integer", "INTEGER '127'")) + .add(new TypeCoercionTestSetup("SMALLINT '32767'", "integer", "INTEGER '32767'")) + .add(new TypeCoercionTestSetup("TIMESTAMP '1970-01-01 00:00:00'", "timestamp(6)", "TIMESTAMP '1970-01-01 00:00:00.000000'")) + .add(new TypeCoercionTestSetup("TIMESTAMP '1970-01-01 00:00:00.9'", "timestamp(6)", "TIMESTAMP '1970-01-01 00:00:00.900000'")) + .add(new TypeCoercionTestSetup("TIMESTAMP '1970-01-01 00:00:00.56'", "timestamp(6)", "TIMESTAMP '1970-01-01 00:00:00.560000'")) + .add(new TypeCoercionTestSetup("TIMESTAMP '1970-01-01 00:00:00.123'", 
"timestamp(6)", "TIMESTAMP '1970-01-01 00:00:00.123000'")) + .add(new TypeCoercionTestSetup("TIMESTAMP '1970-01-01 00:00:00.4896'", "timestamp(6)", "TIMESTAMP '1970-01-01 00:00:00.489600'")) + .add(new TypeCoercionTestSetup("TIMESTAMP '1970-01-01 00:00:00.89356'", "timestamp(6)", "TIMESTAMP '1970-01-01 00:00:00.893560'")) + .add(new TypeCoercionTestSetup("TIMESTAMP '1970-01-01 00:00:00.123000'", "timestamp(6)", "TIMESTAMP '1970-01-01 00:00:00.123000'")) + .add(new TypeCoercionTestSetup("TIMESTAMP '1970-01-01 00:00:00.999'", "timestamp(6)", "TIMESTAMP '1970-01-01 00:00:00.999000'")) + .add(new TypeCoercionTestSetup("TIMESTAMP '1970-01-01 00:00:00.123456'", "timestamp(6)", "TIMESTAMP '1970-01-01 00:00:00.123456'")) + .add(new TypeCoercionTestSetup("TIMESTAMP '2020-09-27 12:34:56.1'", "timestamp(6)", "TIMESTAMP '2020-09-27 12:34:56.100000'")) + .add(new TypeCoercionTestSetup("TIMESTAMP '2020-09-27 12:34:56.9'", "timestamp(6)", "TIMESTAMP '2020-09-27 12:34:56.900000'")) + .add(new TypeCoercionTestSetup("TIMESTAMP '2020-09-27 12:34:56.123'", "timestamp(6)", "TIMESTAMP '2020-09-27 12:34:56.123000'")) + .add(new TypeCoercionTestSetup("TIMESTAMP '2020-09-27 12:34:56.123000'", "timestamp(6)", "TIMESTAMP '2020-09-27 12:34:56.123000'")) + .add(new TypeCoercionTestSetup("TIMESTAMP '2020-09-27 12:34:56.999'", "timestamp(6)", "TIMESTAMP '2020-09-27 12:34:56.999000'")) + .add(new TypeCoercionTestSetup("TIMESTAMP '2020-09-27 12:34:56.123456'", "timestamp(6)", "TIMESTAMP '2020-09-27 12:34:56.123456'")) + .add(new TypeCoercionTestSetup("TIMESTAMP '1970-01-01 00:00:00.1234561'", "timestamp(6)", "TIMESTAMP '1970-01-01 00:00:00.123456'")) + .add(new TypeCoercionTestSetup("TIMESTAMP '1970-01-01 00:00:00.123456499'", "timestamp(6)", "TIMESTAMP '1970-01-01 00:00:00.123456'")) + .add(new TypeCoercionTestSetup("TIMESTAMP '1970-01-01 00:00:00.123456499999'", "timestamp(6)", "TIMESTAMP '1970-01-01 00:00:00.123456'")) + .add(new TypeCoercionTestSetup("TIMESTAMP '1970-01-01 00:00:00.1234565'", "timestamp(6)", "TIMESTAMP '1970-01-01 00:00:00.123457'")) + .add(new TypeCoercionTestSetup("TIMESTAMP '1970-01-01 00:00:00.111222333444'", "timestamp(6)", "TIMESTAMP '1970-01-01 00:00:00.111222'")) + .add(new TypeCoercionTestSetup("TIMESTAMP '1970-01-01 00:00:00.9999995'", "timestamp(6)", "TIMESTAMP '1970-01-01 00:00:01.000000'")) + .add(new TypeCoercionTestSetup("TIMESTAMP '1970-01-01 23:59:59.9999995'", "timestamp(6)", "TIMESTAMP '1970-01-02 00:00:00.000000'")) + .add(new TypeCoercionTestSetup("TIMESTAMP '1969-12-31 23:59:59.9999995'", "timestamp(6)", "TIMESTAMP '1970-01-01 00:00:00.000000'")) + .add(new TypeCoercionTestSetup("TIMESTAMP '1969-12-31 23:59:59.999999499999'", "timestamp(6)", "TIMESTAMP '1969-12-31 23:59:59.999999'")) + .add(new TypeCoercionTestSetup("TIMESTAMP '1969-12-31 23:59:59.9999994'", "timestamp(6)", "TIMESTAMP '1969-12-31 23:59:59.999999'")) + .add(new TypeCoercionTestSetup("TIME '00:00:00'", "time(6)", "TIME '00:00:00.000000'")) + .add(new TypeCoercionTestSetup("TIME '00:00:00.9'", "time(6)", "TIME '00:00:00.900000'")) + .add(new TypeCoercionTestSetup("TIME '00:00:00.56'", "time(6)", "TIME '00:00:00.560000'")) + .add(new TypeCoercionTestSetup("TIME '00:00:00.123'", "time(6)", "TIME '00:00:00.123000'")) + .add(new TypeCoercionTestSetup("TIME '00:00:00.4896'", "time(6)", "TIME '00:00:00.489600'")) + .add(new TypeCoercionTestSetup("TIME '00:00:00.89356'", "time(6)", "TIME '00:00:00.893560'")) + .add(new TypeCoercionTestSetup("TIME '00:00:00.123000'", "time(6)", "TIME '00:00:00.123000'")) + .add(new 
TypeCoercionTestSetup("TIME '00:00:00.999'", "time(6)", "TIME '00:00:00.999000'")) + .add(new TypeCoercionTestSetup("TIME '00:00:00.123456'", "time(6)", "TIME '00:00:00.123456'")) + .add(new TypeCoercionTestSetup("TIME '12:34:56.1'", "time(6)", "TIME '12:34:56.100000'")) + .add(new TypeCoercionTestSetup("TIME '12:34:56.9'", "time(6)", "TIME '12:34:56.900000'")) + .add(new TypeCoercionTestSetup("TIME '12:34:56.123'", "time(6)", "TIME '12:34:56.123000'")) + .add(new TypeCoercionTestSetup("TIME '12:34:56.123000'", "time(6)", "TIME '12:34:56.123000'")) + .add(new TypeCoercionTestSetup("TIME '12:34:56.999'", "time(6)", "TIME '12:34:56.999000'")) + .add(new TypeCoercionTestSetup("TIME '12:34:56.123456'", "time(6)", "TIME '12:34:56.123456'")) + .add(new TypeCoercionTestSetup("TIME '00:00:00.1234561'", "time(6)", "TIME '00:00:00.123456'")) + .add(new TypeCoercionTestSetup("TIME '00:00:00.123456499'", "time(6)", "TIME '00:00:00.123456'")) + .add(new TypeCoercionTestSetup("TIME '00:00:00.123456499999'", "time(6)", "TIME '00:00:00.123456'")) + .add(new TypeCoercionTestSetup("TIME '00:00:00.1234565'", "time(6)", "TIME '00:00:00.123457'")) + .add(new TypeCoercionTestSetup("TIME '00:00:00.111222333444'", "time(6)", "TIME '00:00:00.111222'")) + .add(new TypeCoercionTestSetup("TIME '00:00:00.9999995'", "time(6)", "TIME '00:00:01.000000'")) + .add(new TypeCoercionTestSetup("TIME '23:59:59.9999995'", "time(6)", "TIME '00:00:00.000000'")) + .add(new TypeCoercionTestSetup("TIME '23:59:59.999999499999'", "time(6)", "TIME '23:59:59.999999'")) + .add(new TypeCoercionTestSetup("TIME '23:59:59.9999994'", "time(6)", "TIME '23:59:59.999999'")) + .add(new TypeCoercionTestSetup("CHAR 'A'", "varchar", "'A'")) + .add(new TypeCoercionTestSetup("CHAR 'é'", "varchar", "'é'")) + .add(new TypeCoercionTestSetup("CHAR 'A '", "varchar", "'A '")) + .add(new TypeCoercionTestSetup("CHAR ' A'", "varchar", "' A'")) + .add(new TypeCoercionTestSetup("CHAR 'ABc'", "varchar", "'ABc'")) + .add(new TypeCoercionTestSetup("ARRAY[CHAR 'A']", "array(varchar)", "ARRAY['A']")) + .add(new TypeCoercionTestSetup("ARRAY[ARRAY[CHAR 'nested']]", "array(array(varchar))", "ARRAY[ARRAY['nested']]")) + .add(new TypeCoercionTestSetup("MAP(ARRAY[CHAR 'key'], ARRAY[CHAR 'value'])", "map(varchar, varchar)", "MAP(ARRAY['key'], ARRAY['value'])")) + .add(new TypeCoercionTestSetup("MAP(ARRAY[CHAR 'key'], ARRAY[ARRAY[CHAR 'value']])", "map(varchar, array(varchar))", "MAP(ARRAY['key'], ARRAY[ARRAY['value']])")) + // TODO Add test case for MAP type with ARRAY keys once https://github.com/trinodb/trino/issues/1146 is resolved + .add(new TypeCoercionTestSetup("CAST(ROW('a') AS ROW(x CHAR))", "row(x varchar)", "CAST(ROW('a') AS ROW(x VARCHAR))")) + .add(new TypeCoercionTestSetup("CAST(ROW(ROW('a')) AS ROW(x ROW(y CHAR)))", "row(x row(y varchar))", "CAST(ROW(ROW('a')) AS ROW(x ROW(y VARCHAR)))")) + // tinyint -> integer + .add(new TypeCoercionTestSetup("ARRAY[TINYINT '127']", "array(integer)", "ARRAY[127]")) + .add(new TypeCoercionTestSetup("ARRAY[ARRAY[TINYINT '127']]", "array(array(integer))", "ARRAY[ARRAY[127]]")) + .add(new TypeCoercionTestSetup("MAP(ARRAY[TINYINT '1'], ARRAY[TINYINT '10'])", "map(integer, integer)", "MAP(ARRAY[1], ARRAY[10])")) + .add(new TypeCoercionTestSetup("MAP(ARRAY[TINYINT '1'], ARRAY[ARRAY[TINYINT '10']])", "map(integer, array(integer))", "MAP(ARRAY[1], ARRAY[ARRAY[10]])")) + .add(new TypeCoercionTestSetup("CAST(ROW(127) AS ROW(x TINYINT))", "row(x integer)", "CAST(ROW(127) AS ROW(x INTEGER))")) + .add(new 
TypeCoercionTestSetup("CAST(ROW(ROW(127)) AS ROW(x ROW(y TINYINT)))", "row(x row(y integer))", "CAST(ROW(ROW(127)) AS ROW(x ROW(y INTEGER)))")) + // smallint -> integer + .add(new TypeCoercionTestSetup("ARRAY[SMALLINT '32767']", "array(integer)", "ARRAY[32767]")) + .add(new TypeCoercionTestSetup("ARRAY[ARRAY[SMALLINT '32767']]", "array(array(integer))", "ARRAY[ARRAY[32767]]")) + .add(new TypeCoercionTestSetup("MAP(ARRAY[SMALLINT '1'], ARRAY[SMALLINT '10'])", "map(integer, integer)", "MAP(ARRAY[1], ARRAY[10])")) + .add(new TypeCoercionTestSetup("MAP(ARRAY[SMALLINT '1'], ARRAY[ARRAY[SMALLINT '10']])", "map(integer, array(integer))", "MAP(ARRAY[1], ARRAY[ARRAY[10]])")) + .add(new TypeCoercionTestSetup("CAST(ROW(32767) AS ROW(x SMALLINT))", "row(x integer)", "CAST(ROW(32767) AS ROW(x INTEGER))")) + .add(new TypeCoercionTestSetup("CAST(ROW(ROW(32767)) AS ROW(x ROW(y SMALLINT)))", "row(x row(y integer))", "CAST(ROW(ROW(32767)) AS ROW(x ROW(y INTEGER)))")) + .build(); + } + + public record TypeCoercionTestSetup(@Language("SQL") String sourceValueLiteral, String newColumnType, @Language("SQL") String newValueLiteral) + { + public TypeCoercionTestSetup + { + requireNonNull(sourceValueLiteral, "sourceValueLiteral is null"); + requireNonNull(newColumnType, "newColumnType is null"); + requireNonNull(newValueLiteral, "newValueLiteral is null"); + } + + public TypeCoercionTestSetup withNewValueLiteral(String newValueLiteral) + { + return new TypeCoercionTestSetup(sourceValueLiteral, newColumnType, newValueLiteral); + } + } + + @Test + public void testAddColumnWithTypeCoercion() + { + testAddColumnWithTypeCoercion("tinyint", "integer"); + testAddColumnWithTypeCoercion("smallint", "integer"); + + testAddColumnWithTypeCoercion("timestamp with time zone", "timestamp(6) with time zone"); + testAddColumnWithTypeCoercion("timestamp(0) with time zone", "timestamp(6) with time zone"); + testAddColumnWithTypeCoercion("timestamp(1) with time zone", "timestamp(6) with time zone"); + testAddColumnWithTypeCoercion("timestamp(2) with time zone", "timestamp(6) with time zone"); + testAddColumnWithTypeCoercion("timestamp(3) with time zone", "timestamp(6) with time zone"); + testAddColumnWithTypeCoercion("timestamp(4) with time zone", "timestamp(6) with time zone"); + testAddColumnWithTypeCoercion("timestamp(5) with time zone", "timestamp(6) with time zone"); + testAddColumnWithTypeCoercion("timestamp(6) with time zone", "timestamp(6) with time zone"); + testAddColumnWithTypeCoercion("timestamp(7) with time zone", "timestamp(6) with time zone"); + testAddColumnWithTypeCoercion("timestamp(8) with time zone", "timestamp(6) with time zone"); + testAddColumnWithTypeCoercion("timestamp(9) with time zone", "timestamp(6) with time zone"); + testAddColumnWithTypeCoercion("timestamp(10) with time zone", "timestamp(6) with time zone"); + testAddColumnWithTypeCoercion("timestamp(11) with time zone", "timestamp(6) with time zone"); + testAddColumnWithTypeCoercion("timestamp(12) with time zone", "timestamp(6) with time zone"); + + testAddColumnWithTypeCoercion("timestamp", "timestamp(6)"); + testAddColumnWithTypeCoercion("timestamp(0)", "timestamp(6)"); + testAddColumnWithTypeCoercion("timestamp(1)", "timestamp(6)"); + testAddColumnWithTypeCoercion("timestamp(2)", "timestamp(6)"); + testAddColumnWithTypeCoercion("timestamp(3)", "timestamp(6)"); + testAddColumnWithTypeCoercion("timestamp(4)", "timestamp(6)"); + testAddColumnWithTypeCoercion("timestamp(5)", "timestamp(6)"); + testAddColumnWithTypeCoercion("timestamp(6)", 
"timestamp(6)"); + testAddColumnWithTypeCoercion("timestamp(7)", "timestamp(6)"); + testAddColumnWithTypeCoercion("timestamp(8)", "timestamp(6)"); + testAddColumnWithTypeCoercion("timestamp(9)", "timestamp(6)"); + testAddColumnWithTypeCoercion("timestamp(10)", "timestamp(6)"); + testAddColumnWithTypeCoercion("timestamp(11)", "timestamp(6)"); + testAddColumnWithTypeCoercion("timestamp(12)", "timestamp(6)"); + + testAddColumnWithTypeCoercion("time", "time(6)"); + testAddColumnWithTypeCoercion("time(0)", "time(6)"); + testAddColumnWithTypeCoercion("time(1)", "time(6)"); + testAddColumnWithTypeCoercion("time(2)", "time(6)"); + testAddColumnWithTypeCoercion("time(3)", "time(6)"); + testAddColumnWithTypeCoercion("time(4)", "time(6)"); + testAddColumnWithTypeCoercion("time(5)", "time(6)"); + testAddColumnWithTypeCoercion("time(6)", "time(6)"); + testAddColumnWithTypeCoercion("time(7)", "time(6)"); + testAddColumnWithTypeCoercion("time(8)", "time(6)"); + testAddColumnWithTypeCoercion("time(9)", "time(6)"); + testAddColumnWithTypeCoercion("time(10)", "time(6)"); + testAddColumnWithTypeCoercion("time(11)", "time(6)"); + testAddColumnWithTypeCoercion("time(12)", "time(6)"); + + testAddColumnWithTypeCoercion("char(1)", "varchar"); + + testAddColumnWithTypeCoercion("array(char(10))", "array(varchar)"); + testAddColumnWithTypeCoercion("map(char(20), char(30))", "map(varchar, varchar)"); + testAddColumnWithTypeCoercion("row(x char(40))", "row(x varchar)"); + + testAddColumnWithTypeCoercion("array(tinyint)", "array(integer)"); + testAddColumnWithTypeCoercion("map(tinyint, tinyint)", "map(integer, integer)"); + testAddColumnWithTypeCoercion("row(x tinyint)", "row(x integer)"); + + testAddColumnWithTypeCoercion("array(smallint)", "array(integer)"); + testAddColumnWithTypeCoercion("map(smallint, smallint)", "map(integer, integer)"); + testAddColumnWithTypeCoercion("row(x smallint)", "row(x integer)"); + } + + private void testAddColumnWithTypeCoercion(String columnType, String expectedColumnType) + { + try (TestTable testTable = newTrinoTable("test_coercion_add_column", "(a varchar, b row(x integer))")) { + assertUpdate("ALTER TABLE " + testTable.getName() + " ADD COLUMN b.y " + columnType); + assertThat(getColumnType(testTable.getName(), "b")).isEqualTo("row(x integer, y %s)".formatted(expectedColumnType)); + + assertUpdate("ALTER TABLE " + testTable.getName() + " ADD COLUMN c " + columnType); + assertThat(getColumnType(testTable.getName(), "c")).isEqualTo(expectedColumnType); + } + } + + @Test + public void testSystemTables() + { + String catalog = getSession().getCatalog().orElseThrow(); + String schema = getSession().getSchema().orElseThrow(); + for (TableType tableType : TableType.values()) { + if (tableType != TableType.DATA) { + // Like a system table. Make sure this is "table not found". 
+ assertQueryFails( + "TABLE \"$%s\"".formatted(tableType.name().toLowerCase(ENGLISH)), + "\\Qline 1:1: Table '%s.%s.\"$%s\"' does not exist".formatted(catalog, schema, tableType.name().toLowerCase(ENGLISH))); + } + } + + // given the base table exists + assertQuerySucceeds("TABLE nation"); + // verify that $ results in table not found + assertQueryFails("TABLE \"nation$foo\"", "\\Qline 1:1: Table '%s.%s.\"nation$foo\"' does not exist".formatted(catalog, schema)); + } + + @Test + public void testExtraProperties() + { + String tableName = "test_create_table_with_multiple_extra_properties_" + randomNameSuffix(); + assertUpdate("CREATE TABLE " + tableName + " (c1 integer) WITH (extra_properties = MAP(ARRAY['extra.property.one', 'extra.property.TWO'], ARRAY['one', 'two']))"); + + assertThat(query("SELECT key, value FROM \"" + tableName + "$properties\" WHERE key IN ('extra.property.one', 'extra.property.two')")) + .skippingTypesCheck() + .matches("VALUES ('extra.property.one', 'one'), ('extra.property.two', 'two')"); + + assertUpdate("ALTER TABLE " + tableName + " SET PROPERTIES extra_properties = MAP(ARRAY['extra.property.one'], ARRAY['updated'])"); + assertThat(query("SELECT key, value FROM \"" + tableName + "$properties\" WHERE key IN ('extra.property.one', 'extra.property.two')")) + .skippingTypesCheck() + .matches("VALUES ('extra.property.one', 'updated'), ('extra.property.two', 'two')"); + + assertUpdate("DROP TABLE " + tableName); + } + + @Test + public void testReplaceTableExtraProperties() + { + String tableName = "test_replace_table_with_multiple_extra_properties_" + randomNameSuffix(); + assertUpdate("CREATE TABLE " + tableName + " (c1 integer) WITH (extra_properties = MAP(ARRAY['extra.property.one', 'extra.property.two'], ARRAY['one', 'two']))"); + assertUpdate("CREATE OR REPLACE TABLE " + tableName + " (c1 integer) WITH (extra_properties = MAP(ARRAY['extra.property.three'], ARRAY['three']))"); + + assertThat(query("SELECT key, value FROM \"" + tableName + "$properties\" WHERE key IN ('extra.property.one', 'extra.property.two', 'extra.property.three')")) + .skippingTypesCheck() + .matches("VALUES ('extra.property.three', 'three')"); + + assertUpdate("DROP TABLE " + tableName); + } + + @Test + public void testCreateTableAsSelectWithExtraProperties() + { + String tableName = "test_ctas_with_extra_properties_" + randomNameSuffix(); + assertUpdate("CREATE TABLE " + tableName + " WITH (extra_properties = MAP(ARRAY['extra.property.one', 'extra.property.two'], ARRAY['one', 'two'])) " + + "AS SELECT 1 as c1", 1); + + assertThat(query("SELECT key, value FROM \"" + tableName + "$properties\" WHERE key IN ('extra.property.one', 'extra.property.two')")) + .skippingTypesCheck() + .matches("VALUES ('extra.property.one', 'one'), ('extra.property.two', 'two')"); + + assertUpdate("DROP TABLE " + tableName); + } + + @Test + public void testShowCreateNotContainExtraProperties() + { + String tableName = "test_show_create_table_with_extra_properties_" + randomNameSuffix(); + assertUpdate("CREATE TABLE " + tableName + " (c1 integer) WITH (extra_properties = MAP(ARRAY['extra.property.one', 'extra.property.two'], ARRAY['one', 'two']))"); + + assertThat((String) computeScalar("SHOW CREATE TABLE " + tableName)).doesNotContain("extra_properties =", "extra.property.one", "extra.property.two"); + + assertUpdate("DROP TABLE " + tableName); + } + + @Test + public void testNullExtraProperty() + { + assertQueryFails( + "CREATE TABLE test_create_table_with_null_extra_properties (c1 integer) WITH (extra_properties = 
MAP(ARRAY['null.property'], ARRAY[null]))", + ".*\\QUnable to set catalog 'iceberg' table property 'extra_properties' to [MAP(ARRAY['null.property'], ARRAY[null])]: Extra table property value cannot be null '{null.property=null}'\\E"); + + assertQueryFails( + "CREATE TABLE test_create_table_with_as_null_extra_properties WITH (extra_properties = MAP(ARRAY['null.property'], ARRAY[null])) AS SELECT 1 as c1", + ".*\\QUnable to set catalog 'iceberg' table property 'extra_properties' to [MAP(ARRAY['null.property'], ARRAY[null])]: Extra table property value cannot be null '{null.property=null}'\\E"); + } + + @Test + public void testIllegalExtraPropertyKey() + { + assertQueryFails( + "CREATE TABLE test_create_table_with_illegal_extra_properties (c1 integer) WITH (extra_properties = MAP(ARRAY['sorted_by'], ARRAY['id']))", + "\\QIllegal keys in extra_properties: [sorted_by]"); + + assertQueryFails( + "CREATE TABLE test_create_table_as_with_illegal_extra_properties WITH (extra_properties = MAP(ARRAY['extra_properties'], ARRAY['some_value'])) AS SELECT 1 as c1", + "\\QIllegal keys in extra_properties: [extra_properties]"); + + assertQueryFails( + "CREATE TABLE test_create_table_with_as_illegal_extra_properties WITH (extra_properties = MAP(ARRAY['write.format.default'], ARRAY['ORC'])) AS SELECT 1 as c1", + "\\QIllegal keys in extra_properties: [write.format.default]"); + + assertQueryFails( + "CREATE TABLE test_create_table_with_as_illegal_extra_properties WITH (extra_properties = MAP(ARRAY['comment'], ARRAY['some comment'])) AS SELECT 1 as c1", + "\\QIllegal keys in extra_properties: [comment]"); + + assertQueryFails( + "CREATE TABLE test_create_table_with_as_illegal_extra_properties WITH (extra_properties = MAP(ARRAY['not_allowed_property'], ARRAY['foo'])) AS SELECT 1 as c1", + "\\QIllegal keys in extra_properties: [not_allowed_property]"); + } + + @Test + public void testSetIllegalExtraPropertyKey() + { + try (TestTable table = newTrinoTable("test_set_illegal_table_properties", "(x int)")) { + assertQueryFails( + "ALTER TABLE " + table.getName() + " SET PROPERTIES extra_properties = MAP(ARRAY['sorted_by'], ARRAY['id'])", + "\\QIllegal keys in extra_properties: [sorted_by]"); + assertQueryFails( + "ALTER TABLE " + table.getName() + " SET PROPERTIES extra_properties = MAP(ARRAY['comment'], ARRAY['some comment'])", + "\\QIllegal keys in extra_properties: [comment]"); + assertQueryFails( + "ALTER TABLE " + table.getName() + " SET PROPERTIES extra_properties = MAP(ARRAY['not_allowed_property'], ARRAY['foo'])", + "\\QIllegal keys in extra_properties: [not_allowed_property]"); + } + } + + @Test + void testExplainAnalyzeSplitSourceMetrics() + { + assertExplainAnalyze( + "EXPLAIN ANALYZE VERBOSE SELECT * FROM nation a", + "splits generation metrics"); + } + + // regression test for https://github.com/trinodb/trino/issues/22922 + @Test + void testArrayElementChange() + { + try (TestTable table = newTrinoTable( + "test_array_schema_change", + "(col array(row(a varchar, b varchar)))", + List.of("CAST(array[row('a', 'b')] AS array(row(a varchar, b varchar)))"))) { + assertUpdate("ALTER TABLE " + table.getName() + " DROP COLUMN col.element.a"); + assertUpdate("ALTER TABLE " + table.getName() + " ADD COLUMN col.element.c varchar"); + assertUpdate("ALTER TABLE " + table.getName() + " DROP COLUMN col.element.b"); + + String expected = format == ORC || format == AVRO ? 
"CAST(array[row(NULL)] AS array(row(c varchar)))" : "CAST(NULL AS array(row(c varchar)))"; + assertThat(query("SELECT * FROM " + table.getName())) + .matches("VALUES " + expected); + } + } + + // MAP type is tested in TestIcebergV2.testMapValueSchemaChange + + @Test + void testRowFieldChange() + { + try (TestTable table = newTrinoTable( + "test_row_schema_change", + "(col row(a varchar, b varchar))")) { + assertUpdate("INSERT INTO " + table.getName() + " SELECT CAST(row('a', 'b') AS row(a varchar, b varchar))", 1); + + assertUpdate("ALTER TABLE " + table.getName() + " DROP COLUMN col.a"); + assertUpdate("ALTER TABLE " + table.getName() + " ADD COLUMN col.c varchar"); + assertUpdate("ALTER TABLE " + table.getName() + " DROP COLUMN col.b"); + + String expected = format == ORC || format == AVRO ? "CAST(row(NULL) AS row(c varchar))" : "CAST(NULL AS row(c varchar))"; + assertThat(query("SELECT * FROM " + table.getName())) + .matches("SELECT " + expected); + } + } + + @Test + public void testObjectStoreLayoutEnabledAndDataLocation() + throws Exception + { + String tableName = "test_object_store_layout_enabled_data_location" + randomNameSuffix(); + assertUpdate("CREATE TABLE " + tableName + " WITH (object_store_layout_enabled = true, data_location = 'local:///data-location/xyz') AS SELECT 1 AS val", 1); + + Location tableLocation = Location.of(getTableLocation(tableName)); + assertThat(fileSystem.directoryExists(tableLocation).get()).isTrue(); + + String filePath = (String) computeScalar("SELECT file_path FROM \"" + tableName + "$files\""); + Location dataFileLocation = Location.of(filePath); + assertThat(fileSystem.newInputFile(dataFileLocation).exists()).isTrue(); + assertThat(filePath).matches("local:///data-location/xyz/.{6}/tpch/%s.*".formatted(tableName)); + + assertUpdate("DROP TABLE " + tableName); + assertThat(fileSystem.newInputFile(dataFileLocation).exists()).isFalse(); + assertThat(fileSystem.newInputFile(tableLocation).exists()).isFalse(); + } + + @Test + public void testCreateTableWithDataLocationButObjectStoreLayoutDisabled() + { + assertQueryFails( + "CREATE TABLE test_data_location WITH (data_location = 'local:///data-location/xyz') AS SELECT 1 AS val", + "Data location can only be set when object store layout is enabled"); + } + + @Test + @Override + public void testSetFieldMapKeyType() + { + // Iceberg doesn't support change a map 'key' column. Only map values can be changed. + assertThatThrownBy(super::testSetFieldMapKeyType) + .hasMessageContaining("Failed to set field type: Cannot alter map keys"); + } + + @Test + @Override + public void testSetNestedFieldMapKeyType() + { + // Iceberg doesn't support change a map 'key' column. Only map values can be changed. 
+ assertThatThrownBy(super::testSetNestedFieldMapKeyType) + .hasMessageContaining("Failed to set field type: Cannot alter map keys"); } @Override protected Optional filterSetColumnTypesDataProvider(SetColumnTypeSetup setup) { + if (setup.sourceColumnType().equals("timestamp(3) with time zone")) { + // The connector returns UTC instead of the given time zone + return Optional.of(setup.withNewValueLiteral("TIMESTAMP '2020-02-12 14:03:00.123000 +00:00'")); + } switch ("%s -> %s".formatted(setup.sourceColumnType(), setup.newColumnType())) { + case "tinyint -> smallint": case "bigint -> integer": case "decimal(5,3) -> decimal(5,2)": case "varchar -> char(20)": @@ -6953,13 +9276,14 @@ protected void verifySetColumnTypeFailurePermissible(Throwable e) { assertThat(e).hasMessageMatching(".*(Failed to set column type: Cannot change (column type:|type from .* to )" + "|Time(stamp)? precision \\(3\\) not supported for Iceberg. Use \"time(stamp)?\\(6\\)\" instead" + - "|Type not supported for Iceberg: char\\(20\\)).*"); + "|Type not supported for Iceberg: smallint|char\\(20\\)).*"); } @Override protected Optional filterSetFieldTypesDataProvider(SetColumnTypeSetup setup) { switch ("%s -> %s".formatted(setup.sourceColumnType(), setup.newColumnType())) { + case "tinyint -> smallint": case "bigint -> integer": case "decimal(5,3) -> decimal(5,2)": case "varchar -> char(20)": @@ -6994,21 +9318,23 @@ protected void verifySetFieldTypeFailurePermissible(Throwable e) { assertThat(e).hasMessageMatching(".*(Failed to set field type: Cannot change (column type:|type from .* to )" + "|Time(stamp)? precision \\(3\\) not supported for Iceberg. Use \"time(stamp)?\\(6\\)\" instead" + - "|Type not supported for Iceberg: char\\(20\\)" + + "|Type not supported for Iceberg: smallint|char\\(20\\)" + "|Iceberg doesn't support changing field type (from|to) non-primitive types).*"); } @Override - protected boolean supportsPhysicalPushdown() + protected Session withoutSmallFileThreshold(Session session) { - // TODO https://github.com/trinodb/trino/issues/17156 - return false; + return Session.builder(session) + .setCatalogSessionProperty(getSession().getCatalog().orElseThrow(), "parquet_small_file_threshold", "0B") + .setCatalogSessionProperty(getSession().getCatalog().orElseThrow(), "orc_tiny_stripe_threshold", "0B") + .build(); } private Session withSingleWriterPerTask(Session session) { return Session.builder(session) - .setSystemProperty("task_writer_count", "1") + .setSystemProperty("task_min_writer_count", "1") .build(); } @@ -7069,6 +9395,14 @@ private List getTableHistory(String tableName) .collect(toImmutableList()); } + private List getLatestSequenceNumbersInMetadataLogEntries(String tableName) + { + return getQueryRunner().execute(format("SELECT latest_sequence_number FROM \"%s$metadata_log_entries\"", tableName)) + .getOnlyColumn() + .map(Long.class::cast) + .collect(toImmutableList()); + } + private long getCurrentSnapshotId(String tableName) { return (long) computeScalar("SELECT snapshot_id FROM \"" + tableName + "$snapshots\" ORDER BY committed_at DESC FETCH FIRST 1 ROW WITH TIES"); @@ -7118,13 +9452,65 @@ private String getFieldFromLatestSnapshotSummary(String tableName, String summar private QueryId executeWithQueryId(String sql) { return getDistributedQueryRunner() - .executeWithQueryId(getSession(), sql) - .getQueryId(); + .executeWithPlan(getSession(), sql) + .queryId(); } - private void assertQueryIdStored(String tableName, QueryId queryId) + private void assertQueryIdAndUserStored(String tableName, QueryId 
queryId) { assertThat(getFieldFromLatestSnapshotSummary(tableName, TRINO_QUERY_ID_NAME)) .isEqualTo(queryId.toString()); + assertThat(getFieldFromLatestSnapshotSummary(tableName, TRINO_USER_NAME)) + .isEqualTo("user"); + } + + private Consumer assertRemoteExchangesCount(int expectedRemoteExchangesCount) + { + return plan -> { + int actualRemoteExchangesCount = searchFrom(plan.getRoot()) + .where(node -> node instanceof ExchangeNode exchangeNode && exchangeNode.getScope() == ExchangeNode.Scope.REMOTE) + .count(); + assertThat(actualRemoteExchangesCount).isEqualTo(expectedRemoteExchangesCount); + }; + } + + private Consumer assertNoReadPartitioning(String... columnNames) + { + return plan -> { + List tableScanNodes = searchFrom(plan.getRoot()).where(node -> node instanceof TableScanNode) + .findAll().stream() + .map(TableScanNode.class::cast) + .collect(toImmutableList()); + for (TableScanNode tableScanNode : tableScanNodes) { + assertThat(tableScanNode.getUseConnectorNodePartitioning().orElseThrow()).isFalse(); + IcebergTableHandle connectorTableHandle = (IcebergTableHandle) tableScanNode.getTable().connectorHandle(); + assertThat(connectorTableHandle.getTablePartitioning()).isPresent(); + // iceberg table should have partitioning for the columns but should not be active + IcebergTablePartitioning tablePartitioning = connectorTableHandle.getTablePartitioning().orElseThrow(); + Set actualPartitionColumns = tablePartitioning.partitioningColumns().stream().map(IcebergColumnHandle::getName).collect(Collectors.toSet()); + assertThat(actualPartitionColumns).containsExactlyInAnyOrder(columnNames); + assertThat(tablePartitioning.active()).isFalse(); + } + }; + } + + private Map getTableProperties(String tableName) + { + return computeActual("SELECT key, value FROM \"" + tableName + "$properties\"").getMaterializedRows().stream() + .collect(toImmutableMap(row -> (String) row.getField(0), row -> (String) row.getField(1))); + } + + private static List getCompressionCodecs(Optional codecToExclude) + { + return Arrays.stream(HiveCompressionCodec.values()) + .filter(codec -> !(codecToExclude.isPresent() && codec.equals(codecToExclude.get()))) + .collect(toImmutableList()); + } + + private static List getFileFormats(IcebergFileFormat fileFormatToExclude) + { + return Arrays.stream(IcebergFileFormat.values()) + .filter(format -> !format.equals(fileFormatToExclude)) + .collect(toImmutableList()); } } diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/BaseIcebergFailureRecoveryTest.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/BaseIcebergFailureRecoveryTest.java index 4d5e9d6bf89d..ac6cd2385d3a 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/BaseIcebergFailureRecoveryTest.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/BaseIcebergFailureRecoveryTest.java @@ -16,7 +16,7 @@ import io.trino.operator.RetryPolicy; import io.trino.spi.ErrorType; import io.trino.testing.BaseFailureRecoveryTest; -import org.testng.annotations.DataProvider; +import org.junit.jupiter.api.Test; import java.util.Optional; @@ -26,6 +26,7 @@ import static io.trino.execution.FailureInjector.InjectedFailureType.TASK_GET_RESULTS_REQUEST_TIMEOUT; import static io.trino.execution.FailureInjector.InjectedFailureType.TASK_MANAGEMENT_REQUEST_FAILURE; import static io.trino.execution.FailureInjector.InjectedFailureType.TASK_MANAGEMENT_REQUEST_TIMEOUT; +import static io.trino.operator.RetryPolicy.TASK; public abstract class BaseIcebergFailureRecoveryTest extends 
BaseFailureRecoveryTest @@ -41,17 +42,7 @@ protected boolean areWriteRetriesSupported() return true; } - @Override - @DataProvider(name = "parallelTests", parallel = true) - public Object[][] parallelTests() - { - return moreParallelTests(super.parallelTests(), - parallelTest("testCreatePartitionedTable", this::testCreatePartitionedTable), - parallelTest("testInsertIntoNewPartition", this::testInsertIntoNewPartition), - parallelTest("testInsertIntoExistingPartition", this::testInsertIntoExistingPartition), - parallelTest("testMergePartitionedTable", this::testMergePartitionedTable)); - } - + @Test protected void testCreatePartitionedTable() { testTableModification( @@ -61,29 +52,54 @@ protected void testCreatePartitionedTable() } // Copied from BaseDeltaFailureRecoveryTest + @Test @Override protected void testDelete() { - // Test method is overriden because method from superclass assumes more complex plan for `DELETE` query. + // Test method is overridden because method from superclass assumes more complex plan for `DELETE` query. // Assertions do not play well if plan consists of just two fragments. Optional setupQuery = Optional.of("CREATE TABLE
<table> AS SELECT * FROM orders"); Optional<String> cleanupQuery = Optional.of("DROP TABLE <table>"); String deleteQuery = "DELETE FROM <table>
WHERE orderkey = 1"; - assertThatQuery(deleteQuery) - .withSetupQuery(setupQuery) - .withCleanupQuery(cleanupQuery) - .experiencing(TASK_FAILURE, Optional.of(ErrorType.INTERNAL_ERROR)) - .at(boundaryCoordinatorStage()) - .failsAlways(failure -> failure.hasMessageContaining(FAILURE_INJECTION_MESSAGE)); + if (getRetryPolicy() == TASK) { + assertThatQuery(deleteQuery) + .withSetupQuery(setupQuery) + .withCleanupQuery(cleanupQuery) + .experiencing(TASK_FAILURE, Optional.of(ErrorType.INTERNAL_ERROR)) + .at(boundaryCoordinatorStage()) + .finishesSuccessfully() + .cleansUpTemporaryTables(); + } + else { + assertThatQuery(deleteQuery) + .withSetupQuery(setupQuery) + .withCleanupQuery(cleanupQuery) + .experiencing(TASK_FAILURE, Optional.of(ErrorType.INTERNAL_ERROR)) + .at(boundaryCoordinatorStage()) + .failsAlways(failure -> failure.hasMessageContaining(FAILURE_INJECTION_MESSAGE)) + .cleansUpTemporaryTables(); + } - assertThatQuery(deleteQuery) - .withSetupQuery(setupQuery) - .withCleanupQuery(cleanupQuery) - .experiencing(TASK_FAILURE, Optional.of(ErrorType.INTERNAL_ERROR)) - .at(rootStage()) - .failsAlways(failure -> failure.hasMessageContaining(FAILURE_INJECTION_MESSAGE)); + if (getRetryPolicy() == TASK) { + assertThatQuery(deleteQuery) + .withSetupQuery(setupQuery) + .withCleanupQuery(cleanupQuery) + .experiencing(TASK_FAILURE, Optional.of(ErrorType.INTERNAL_ERROR)) + .at(rootStage()) + .finishesSuccessfully() + .cleansUpTemporaryTables(); + } + else { + assertThatQuery(deleteQuery) + .withSetupQuery(setupQuery) + .withCleanupQuery(cleanupQuery) + .experiencing(TASK_FAILURE, Optional.of(ErrorType.INTERNAL_ERROR)) + .at(rootStage()) + .failsAlways(failure -> failure.hasMessageContaining(FAILURE_INJECTION_MESSAGE)) + .cleansUpTemporaryTables(); + } assertThatQuery(deleteQuery) .withSetupQuery(setupQuery) @@ -91,7 +107,8 @@ protected void testDelete() .experiencing(TASK_FAILURE, Optional.of(ErrorType.INTERNAL_ERROR)) .at(leafStage()) .failsWithoutRetries(failure -> failure.hasMessageContaining(FAILURE_INJECTION_MESSAGE)) - .finishesSuccessfully(); + .finishesSuccessfully() + .cleansUpTemporaryTables(); // note: this is effectively same as test with `leafStage`. Should it be dropped? 
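+                // The remaining injections below target worker/task communication failures; with retries enabled the query is still expected to finish successfully, and the added cleansUpTemporaryTables() assertion verifies that no temporary tables are left behind afterwards.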
assertThatQuery(deleteQuery) @@ -100,7 +117,8 @@ protected void testDelete() .experiencing(TASK_FAILURE, Optional.of(ErrorType.INTERNAL_ERROR)) .at(boundaryDistributedStage()) .failsWithoutRetries(failure -> failure.hasMessageContaining(FAILURE_INJECTION_MESSAGE)) - .finishesSuccessfully(); + .finishesSuccessfully() + .cleansUpTemporaryTables(); assertThatQuery(deleteQuery) .withSetupQuery(setupQuery) @@ -108,7 +126,8 @@ protected void testDelete() .experiencing(TASK_MANAGEMENT_REQUEST_FAILURE) .at(boundaryDistributedStage()) .failsWithoutRetries(failure -> failure.hasMessageFindingMatch("Error 500 Internal Server Error|Error closing remote buffer, expected 204 got 500")) - .finishesSuccessfully(); + .finishesSuccessfully() + .cleansUpTemporaryTables(); assertThatQuery(deleteQuery) .withSetupQuery(setupQuery) @@ -116,7 +135,8 @@ protected void testDelete() .experiencing(TASK_MANAGEMENT_REQUEST_TIMEOUT) .at(boundaryDistributedStage()) .failsWithoutRetries(failure -> failure.hasMessageFindingMatch("Encountered too many errors talking to a worker node|Error closing remote buffer")) - .finishesSuccessfully(); + .finishesSuccessfully() + .cleansUpTemporaryTables(); if (getRetryPolicy() == RetryPolicy.QUERY) { assertThatQuery(deleteQuery) @@ -125,7 +145,8 @@ protected void testDelete() .experiencing(TASK_GET_RESULTS_REQUEST_FAILURE) .at(boundaryDistributedStage()) .failsWithoutRetries(failure -> failure.hasMessageFindingMatch("Error 500 Internal Server Error|Error closing remote buffer, expected 204 got 500")) - .finishesSuccessfully(); + .finishesSuccessfully() + .cleansUpTemporaryTables(); assertThatQuery(deleteQuery) .withSetupQuery(setupQuery) @@ -133,34 +154,60 @@ protected void testDelete() .experiencing(TASK_GET_RESULTS_REQUEST_TIMEOUT) .at(boundaryDistributedStage()) .failsWithoutRetries(failure -> failure.hasMessageFindingMatch("Encountered too many errors talking to a worker node|Error closing remote buffer")) - .finishesSuccessfully(); + .finishesSuccessfully() + .cleansUpTemporaryTables(); } } // Copied from BaseDeltaFailureRecoveryTest + @Test @Override protected void testUpdate() { - // Test method is overriden because method from superclass assumes more complex plan for `UPDATE` query. + // Test method is overridden because method from superclass assumes more complex plan for `UPDATE` query. // Assertions do not play well if plan consists of just two fragments. Optional setupQuery = Optional.of("CREATE TABLE
<table> AS SELECT * FROM orders"); Optional<String> cleanupQuery = Optional.of("DROP TABLE <table>
"); String updateQuery = "UPDATE <table>
SET shippriority = 101 WHERE custkey = 1"; - assertThatQuery(updateQuery) - .withSetupQuery(setupQuery) - .withCleanupQuery(cleanupQuery) - .experiencing(TASK_FAILURE, Optional.of(ErrorType.INTERNAL_ERROR)) - .at(boundaryCoordinatorStage()) - .failsAlways(failure -> failure.hasMessageContaining(FAILURE_INJECTION_MESSAGE)); + if (getRetryPolicy() == TASK) { + assertThatQuery(updateQuery) + .withSetupQuery(setupQuery) + .withCleanupQuery(cleanupQuery) + .experiencing(TASK_FAILURE, Optional.of(ErrorType.INTERNAL_ERROR)) + .at(boundaryCoordinatorStage()) + .finishesSuccessfully() + .cleansUpTemporaryTables(); + } + else { + assertThatQuery(updateQuery) + .withSetupQuery(setupQuery) + .withCleanupQuery(cleanupQuery) + .experiencing(TASK_FAILURE, Optional.of(ErrorType.INTERNAL_ERROR)) + .at(boundaryCoordinatorStage()) + .failsAlways(failure -> failure.hasMessageContaining(FAILURE_INJECTION_MESSAGE)) + .cleansUpTemporaryTables(); + } - assertThatQuery(updateQuery) - .withSetupQuery(setupQuery) - .withCleanupQuery(cleanupQuery) - .experiencing(TASK_FAILURE, Optional.of(ErrorType.INTERNAL_ERROR)) - .at(rootStage()) - .failsAlways(failure -> failure.hasMessageContaining(FAILURE_INJECTION_MESSAGE)); + if (getRetryPolicy() == TASK) { + assertThatQuery(updateQuery) + .withSetupQuery(setupQuery) + .withCleanupQuery(cleanupQuery) + .experiencing(TASK_FAILURE, Optional.of(ErrorType.INTERNAL_ERROR)) + .at(rootStage()) + .finishesSuccessfully() + .cleansUpTemporaryTables(); + } + else { + assertThatQuery(updateQuery) + .withSetupQuery(setupQuery) + .withCleanupQuery(cleanupQuery) + .experiencing(TASK_FAILURE, Optional.of(ErrorType.INTERNAL_ERROR)) + .at(rootStage()) + .failsAlways(failure -> failure.hasMessageContaining(FAILURE_INJECTION_MESSAGE)) + .cleansUpTemporaryTables(); + } assertThatQuery(updateQuery) .withSetupQuery(setupQuery) @@ -168,7 +215,8 @@ protected void testUpdate() .experiencing(TASK_FAILURE, Optional.of(ErrorType.INTERNAL_ERROR)) .at(leafStage()) .failsWithoutRetries(failure -> failure.hasMessageContaining(FAILURE_INJECTION_MESSAGE)) - .finishesSuccessfully(); + .finishesSuccessfully() + .cleansUpTemporaryTables(); assertThatQuery(updateQuery) .withSetupQuery(setupQuery) @@ -176,7 +224,8 @@ protected void testUpdate() .experiencing(TASK_FAILURE, Optional.of(ErrorType.INTERNAL_ERROR)) .at(boundaryDistributedStage()) .failsWithoutRetries(failure -> failure.hasMessageContaining(FAILURE_INJECTION_MESSAGE)) - .finishesSuccessfully(); + .finishesSuccessfully() + .cleansUpTemporaryTables(); assertThatQuery(updateQuery) .withSetupQuery(setupQuery) @@ -184,7 +233,8 @@ protected void testUpdate() .experiencing(TASK_MANAGEMENT_REQUEST_FAILURE) .at(boundaryDistributedStage()) .failsWithoutRetries(failure -> failure.hasMessageFindingMatch("Error 500 Internal Server Error|Error closing remote buffer, expected 204 got 500")) - .finishesSuccessfully(); + .finishesSuccessfully() + .cleansUpTemporaryTables(); assertThatQuery(updateQuery) .withSetupQuery(setupQuery) @@ -201,7 +251,8 @@ protected void testUpdate() .experiencing(TASK_GET_RESULTS_REQUEST_FAILURE) .at(boundaryDistributedStage()) .failsWithoutRetries(failure -> failure.hasMessageFindingMatch("Error 500 Internal Server Error|Error closing remote buffer, expected 204 got 500")) - .finishesSuccessfully(); + .finishesSuccessfully() + .cleansUpTemporaryTables(); assertThatQuery(updateQuery) .withSetupQuery(setupQuery) @@ -209,10 +260,12 @@ protected void testUpdate() .experiencing(TASK_GET_RESULTS_REQUEST_TIMEOUT) .at(boundaryDistributedStage()) 
.failsWithoutRetries(failure -> failure.hasMessageFindingMatch("Encountered too many errors talking to a worker node|Error closing remote buffer")) - .finishesSuccessfully(); + .finishesSuccessfully() + .cleansUpTemporaryTables(); } } + @Test protected void testInsertIntoNewPartition() { testTableModification( @@ -221,6 +274,7 @@ protected void testInsertIntoNewPartition() Optional.of("DROP TABLE
")); } + @Test protected void testInsertIntoExistingPartition() { testTableModification( @@ -229,6 +283,7 @@ protected void testInsertIntoExistingPartition() Optional.of("DROP TABLE
")); } + @Test protected void testMergePartitionedTable() { testTableModification( diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/BaseIcebergMaterializedViewTest.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/BaseIcebergMaterializedViewTest.java index 73eaa253a4ac..d840314fe501 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/BaseIcebergMaterializedViewTest.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/BaseIcebergMaterializedViewTest.java @@ -13,50 +13,102 @@ */ package io.trino.plugin.iceberg; +import com.fasterxml.jackson.annotation.JsonIgnore; +import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableSet; import io.trino.Session; -import io.trino.metadata.MaterializedViewDefinition; -import io.trino.metadata.QualifiedObjectName; -import io.trino.spi.connector.SchemaTableName; +import io.trino.connector.MockConnectorFactory; +import io.trino.connector.MockConnectorPlugin; +import io.trino.filesystem.Location; +import io.trino.filesystem.TrinoFileSystem; +import io.trino.spi.Page; +import io.trino.spi.QueryId; +import io.trino.spi.SplitWeight; +import io.trino.spi.block.BlockBuilder; +import io.trino.spi.connector.ConnectorAccessControl; +import io.trino.spi.connector.ConnectorSession; +import io.trino.spi.connector.ConnectorSplit; +import io.trino.spi.connector.ConnectorTransactionHandle; +import io.trino.spi.connector.FixedSplitSource; +import io.trino.spi.function.FunctionProvider; +import io.trino.spi.function.table.AbstractConnectorTableFunction; +import io.trino.spi.function.table.Argument; +import io.trino.spi.function.table.ConnectorTableFunctionHandle; +import io.trino.spi.function.table.Descriptor; +import io.trino.spi.function.table.TableFunctionAnalysis; +import io.trino.spi.function.table.TableFunctionProcessorProvider; +import io.trino.spi.function.table.TableFunctionProcessorState; +import io.trino.spi.function.table.TableFunctionSplitProcessor; +import io.trino.spi.security.ConnectorIdentity; import io.trino.sql.tree.ExplainType; import io.trino.testing.AbstractTestQueryFramework; import io.trino.testing.MaterializedRow; -import io.trino.transaction.TransactionId; -import io.trino.transaction.TransactionManager; -import org.assertj.core.api.Condition; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - +import io.trino.testing.QueryRunner; +import org.apache.iceberg.PartitionField; +import org.apache.iceberg.TableMetadata; +import org.apache.iceberg.TableMetadataParser; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +import java.time.ZonedDateTime; +import java.util.List; +import java.util.Map; import java.util.Optional; import java.util.Set; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; import static com.google.common.collect.ImmutableSet.toImmutableSet; -import static io.trino.SystemSessionProperties.LEGACY_MATERIALIZED_VIEW_GRACE_PERIOD; +import static com.google.common.collect.Iterables.getOnlyElement; +import static io.airlift.slice.SizeOf.instanceSize; +import static io.trino.plugin.iceberg.IcebergTestUtils.FILE_IO_FACTORY; +import static io.trino.plugin.iceberg.IcebergTestUtils.getFileSystemFactory; +import static io.trino.spi.function.table.ReturnTypeSpecification.GenericTable.GENERIC_TABLE; +import static io.trino.spi.function.table.TableFunctionProcessorState.Finished.FINISHED; 
+import static io.trino.spi.function.table.TableFunctionProcessorState.Processed.produced; +import static io.trino.spi.type.IntegerType.INTEGER; import static io.trino.testing.MaterializedResult.DEFAULT_PRECISION; -import static io.trino.testing.TestingAccessControlManager.TestingPrivilegeType.DELETE_TABLE; import static io.trino.testing.TestingAccessControlManager.TestingPrivilegeType.DROP_MATERIALIZED_VIEW; -import static io.trino.testing.TestingAccessControlManager.TestingPrivilegeType.INSERT_TABLE; import static io.trino.testing.TestingAccessControlManager.TestingPrivilegeType.REFRESH_MATERIALIZED_VIEW; import static io.trino.testing.TestingAccessControlManager.TestingPrivilegeType.RENAME_MATERIALIZED_VIEW; -import static io.trino.testing.TestingAccessControlManager.TestingPrivilegeType.SELECT_COLUMN; -import static io.trino.testing.TestingAccessControlManager.TestingPrivilegeType.UPDATE_TABLE; import static io.trino.testing.TestingAccessControlManager.privilege; import static io.trino.testing.TestingNames.randomNameSuffix; import static java.lang.String.format; -import static org.assertj.core.api.Assertions.anyOf; import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; -import static org.testng.Assert.assertEquals; public abstract class BaseIcebergMaterializedViewTest extends AbstractTestQueryFramework { - protected final String storageSchemaName = "testing_storage_schema_" + randomNameSuffix(); - protected abstract String getSchemaDirectory(); - @BeforeClass + protected abstract String getStorageMetadataLocation(String materializedViewName); + + protected static MockConnectorPlugin createMockConnectorPlugin() + { + return new MockConnectorPlugin(MockConnectorFactory.builder() + .withTableFunctions(ImmutableSet.of(new SequenceTableFunction())) + .withFunctionProvider(Optional.of(new FunctionProvider() + { + @Override + public TableFunctionProcessorProvider getTableFunctionProcessorProvider(ConnectorTableFunctionHandle functionHandle) + { + if (functionHandle instanceof SequenceTableFunctionHandle) { + return new SequenceTableFunctionProcessorProvider(); + } + throw new IllegalArgumentException("This ConnectorTableFunctionHandle is not supported"); + } + })) + .withTableFunctionSplitSources(functionHandle -> { + if (functionHandle instanceof SequenceTableFunctionHandle) { + return new FixedSplitSource(ImmutableList.of(new SequenceConnectorSplit())); + } + throw new IllegalArgumentException("This ConnectorTableFunctionHandle is not supported"); + }) + .build()); + } + + @BeforeAll public void setUp() { assertUpdate("CREATE TABLE base_table1(_bigint BIGINT, _date DATE) WITH (partitioning = ARRAY['_date'])"); @@ -65,17 +117,13 @@ public void setUp() assertUpdate("CREATE TABLE base_table2 (_varchar VARCHAR, _bigint BIGINT, _date DATE) WITH (partitioning = ARRAY['_bigint', '_date'])"); assertUpdate("INSERT INTO base_table2 VALUES ('a', 0, DATE '2019-09-08'), ('a', 1, DATE '2019-09-08'), ('a', 0, DATE '2019-09-09')", 3); - - assertUpdate("CREATE SCHEMA " + storageSchemaName); } @Test public void testShowTables() { assertUpdate("CREATE MATERIALIZED VIEW materialized_view_show_tables_test AS SELECT * FROM base_table1"); - SchemaTableName storageTableName = getStorageTable("materialized_view_show_tables_test"); - - Set expectedTables = ImmutableSet.of("base_table1", "base_table2", "materialized_view_show_tables_test", storageTableName.getTableName()); + Set expectedTables = ImmutableSet.of("base_table1", "base_table2", 
"materialized_view_show_tables_test"); Set actualTables = computeActual("SHOW TABLES").getOnlyColumnAsSet().stream() .map(String.class::cast) .collect(toImmutableSet()); @@ -100,38 +148,22 @@ public void testCommentColumnMaterializedView() @Test public void testMaterializedViewsMetadata() { - String catalogName = getSession().getCatalog().orElseThrow(); - String schemaName = getSession().getSchema().orElseThrow(); String materializedViewName = "test_materialized_view_" + randomNameSuffix(); computeActual("CREATE TABLE small_region AS SELECT * FROM tpch.tiny.region LIMIT 1"); computeActual(format("CREATE MATERIALIZED VIEW %s AS SELECT * FROM small_region LIMIT 1", materializedViewName)); - // test storage table name - assertQuery( - format( - "SELECT storage_catalog, storage_schema, CONCAT(storage_schema, '.', storage_table)" + - "FROM system.metadata.materialized_views WHERE schema_name = '%s' AND name = '%s'", - // TODO (https://github.com/trinodb/trino/issues/9039) remove redundant schema_name filter - schemaName, - materializedViewName), - format( - "VALUES ('%s', '%s', '%s')", - catalogName, - schemaName, - getStorageTable(catalogName, schemaName, materializedViewName))); - // test freshness update assertQuery( // TODO (https://github.com/trinodb/trino/issues/9039) remove redundant schema_name filter - format("SELECT freshness FROM system.metadata.materialized_views WHERE schema_name = '%s' AND name = '%s'", schemaName, materializedViewName), + format("SELECT freshness FROM system.metadata.materialized_views WHERE catalog_name = CURRENT_CATALOG AND schema_name = CURRENT_SCHEMA AND name = '%s'", materializedViewName), "VALUES 'STALE'"); computeActual(format("REFRESH MATERIALIZED VIEW %s", materializedViewName)); assertQuery( // TODO (https://github.com/trinodb/trino/issues/9039) remove redundant schema_name filter - format("SELECT freshness FROM system.metadata.materialized_views WHERE schema_name = '%s' AND name = '%s'", schemaName, materializedViewName), + format("SELECT freshness FROM system.metadata.materialized_views WHERE catalog_name = CURRENT_CATALOG AND schema_name = CURRENT_SCHEMA AND name = '%s'", materializedViewName), "VALUES 'FRESH'"); assertUpdate("DROP TABLE small_region"); @@ -144,7 +176,7 @@ public void testCreateWithInvalidPropertyFails() assertThatThrownBy(() -> computeActual("CREATE MATERIALIZED VIEW materialized_view_with_property " + "WITH (invalid_property = ARRAY['_date']) AS " + "SELECT _bigint, _date FROM base_table1")) - .hasMessage("Catalog 'iceberg' materialized view property 'invalid_property' does not exist"); + .hasMessage("line 1:64: Catalog 'iceberg' materialized view property 'invalid_property' does not exist"); } @Test @@ -182,7 +214,7 @@ public void testShowCreate() "WITH (\n" + " format = 'ORC',\n" + " format_version = 2,\n" + - " location = '" + getSchemaDirectory() + "/st_\\E[0-9a-f]+-[0-9a-f]+\\Q',\n" + + " location = '" + getSchemaDirectory() + "/test_mv_show_create-\\E[0-9a-f]+\\Q',\n" + " orc_bloom_filter_columns = ARRAY['_date'],\n" + " orc_bloom_filter_fpp = 1E-1,\n" + " partitioning = ARRAY['_date'],\n" + @@ -269,22 +301,6 @@ public void testRefreshDenyPermission() assertUpdate("DROP MATERIALIZED VIEW materialized_view_refresh_deny"); } - @Test - public void testRefreshAllowedWithRestrictedStorageTable() - { - assertUpdate("CREATE MATERIALIZED VIEW materialized_view_refresh AS SELECT * FROM base_table1"); - SchemaTableName storageTable = getStorageTable("materialized_view_refresh"); - - assertAccessAllowed( - "REFRESH MATERIALIZED VIEW 
materialized_view_refresh", - privilege(storageTable.getTableName(), INSERT_TABLE), - privilege(storageTable.getTableName(), DELETE_TABLE), - privilege(storageTable.getTableName(), UPDATE_TABLE), - privilege(storageTable.getTableName(), SELECT_COLUMN)); - - assertUpdate("DROP MATERIALIZED VIEW materialized_view_refresh"); - } - @Test public void testCreateRefreshSelect() { @@ -310,40 +326,40 @@ public void testCreateRefreshSelect() // 4. Select the data from refreshed materialized view, verify the number of rows in the result // 5. Ensure that the plan uses the storage table // 6. In some cases validate the result data - assertEquals(computeActual("SELECT * FROM materialized_view_no_part").getRowCount(), 6); + assertThat(computeActual("SELECT * FROM materialized_view_no_part").getRowCount()).isEqualTo(6); assertThat(getExplainPlan("SELECT * FROM materialized_view_no_part", ExplainType.Type.IO)) .contains("base_table1"); assertUpdate("REFRESH MATERIALIZED VIEW materialized_view_no_part", 6); - assertEquals(computeActual("SELECT * FROM materialized_view_no_part").getRowCount(), 6); + assertThat(computeActual("SELECT * FROM materialized_view_no_part").getRowCount()).isEqualTo(6); assertThat(getExplainPlan("SELECT * FROM materialized_view_no_part", ExplainType.Type.IO)).doesNotContain("base_table1"); - assertEquals(computeActual("SELECT * FROM materialized_view_agg").getRowCount(), 3); + assertThat(computeActual("SELECT * FROM materialized_view_agg").getRowCount()).isEqualTo(3); assertThat(getExplainPlan("SELECT * FROM materialized_view_agg", ExplainType.Type.IO)) .contains("base_table1"); assertUpdate("REFRESH MATERIALIZED VIEW materialized_view_agg", 3); - assertEquals(computeActual("SELECT * FROM materialized_view_agg").getRowCount(), 3); + assertThat(computeActual("SELECT * FROM materialized_view_agg").getRowCount()).isEqualTo(3); assertThat(getExplainPlan("SELECT * FROM materialized_view_agg", ExplainType.Type.IO)) .doesNotContain("base_table1"); assertQuery(session, "SELECT * FROM materialized_view_agg", "VALUES (DATE '2019-09-10', 2)," + "(DATE '2019-09-08', 1), (DATE '2019-09-09', 3)"); - assertEquals(computeActual("SELECT * FROM materialized_view_part").getRowCount(), 3); + assertThat(computeActual("SELECT * FROM materialized_view_part").getRowCount()).isEqualTo(3); assertThat(getExplainPlan("SELECT * FROM materialized_view_part", ExplainType.Type.IO)) .contains("base_table1"); assertUpdate("REFRESH MATERIALIZED VIEW materialized_view_part", 3); - assertEquals(computeActual("SELECT * FROM materialized_view_part").getRowCount(), 3); + assertThat(computeActual("SELECT * FROM materialized_view_part").getRowCount()).isEqualTo(3); assertThat(getExplainPlan("SELECT * FROM materialized_view_part", ExplainType.Type.IO)).doesNotContain("base_table1"); - assertEquals(computeActual("SELECT * FROM materialized_view_join").getRowCount(), 5); + assertThat(computeActual("SELECT * FROM materialized_view_join").getRowCount()).isEqualTo(5); assertThat(getExplainPlan("SELECT * FROM materialized_view_join", ExplainType.Type.IO)).contains("base_table1", "base_table2"); assertUpdate("REFRESH MATERIALIZED VIEW materialized_view_join", 5); - assertEquals(computeActual("SELECT * FROM materialized_view_join").getRowCount(), 5); + assertThat(computeActual("SELECT * FROM materialized_view_join").getRowCount()).isEqualTo(5); assertThat(getExplainPlan("SELECT * FROM materialized_view_join", ExplainType.Type.IO)).doesNotContain("base_table1", "base_table2"); - assertEquals(computeActual("SELECT * FROM 
materialized_view_join_part").getRowCount(), 4); + assertThat(computeActual("SELECT * FROM materialized_view_join_part").getRowCount()).isEqualTo(4); assertThat(getExplainPlan("SELECT * FROM materialized_view_join_part", ExplainType.Type.IO)).contains("base_table1", "base_table2"); assertUpdate("REFRESH MATERIALIZED VIEW materialized_view_join_part", 4); - assertEquals(computeActual("SELECT * FROM materialized_view_join_part").getRowCount(), 4); + assertThat(computeActual("SELECT * FROM materialized_view_join_part").getRowCount()).isEqualTo(4); assertThat(getExplainPlan("SELECT * FROM materialized_view_join_part", ExplainType.Type.IO)).doesNotContain("base_table1", "base_table2"); assertQuery(session, "SELECT * FROM materialized_view_join_part", "VALUES (2, 'a', DATE '2019-09-09', 1), " + "(0, 'a', DATE '2019-09-08', 2), (3, 'a', DATE '2019-09-09', 1), (1, 'a', DATE '2019-09-09', 1)"); @@ -358,10 +374,6 @@ public void testCreateRefreshSelect() @Test public void testDetectStaleness() { - Session legacySession = Session.builder(getSession()) - .setSystemProperty(LEGACY_MATERIALIZED_VIEW_GRACE_PERIOD, "true") - .build(); - // Base tables and materialized views for staleness check assertUpdate("CREATE TABLE base_table3(_bigint BIGINT, _date DATE) WITH (partitioning = ARRAY['_date'])"); assertUpdate("INSERT INTO base_table3 VALUES (0, DATE '2019-09-08'), (1, DATE '2019-09-09'), (2, DATE '2019-09-09')", 3); @@ -384,20 +396,12 @@ public void testDetectStaleness() assertUpdate("REFRESH MATERIALIZED VIEW materialized_view_join_part_stale", 3); assertUpdate("INSERT INTO base_table3 VALUES (3, DATE '2019-09-09'), (4, DATE '2019-09-10'), (5, DATE '2019-09-10')", 3); - assertThat(getExplainPlan(legacySession, "SELECT * FROM materialized_view_part_stale", ExplainType.Type.IO)) - .contains("base_table3"); assertThat(getExplainPlan("SELECT * FROM materialized_view_part_stale", ExplainType.Type.IO)) .doesNotContain("base_table"); - Condition containsTable3 = new Condition<>(p -> p.contains("base_table3"), "base_table3"); - Condition containsTable4 = new Condition<>(p -> p.contains("base_table4"), "base_table4"); - assertThat(getExplainPlan(legacySession, "SELECT * FROM materialized_view_join_stale", ExplainType.Type.IO)) - .is(anyOf(containsTable3, containsTable4)); assertThat(getExplainPlan("SELECT * FROM materialized_view_join_stale", ExplainType.Type.IO)) .doesNotContain("base_table"); - assertThat(getExplainPlan(legacySession, "SELECT * FROM materialized_view_join_part_stale", ExplainType.Type.IO)) - .is(anyOf(containsTable3, containsTable4)); assertThat(getExplainPlan("SELECT * FROM materialized_view_join_part_stale", ExplainType.Type.IO)) .doesNotContain("base_table"); @@ -429,7 +433,8 @@ public void testMaterializedViewOnExpiredTable() .build(); assertUpdate("CREATE TABLE mv_on_expired_base_table AS SELECT 10 a", 1); - assertUpdate(""" + assertUpdate( + """ CREATE MATERIALIZED VIEW mv_on_expired_the_mv GRACE PERIOD INTERVAL '0' SECOND AS SELECT sum(a) s FROM mv_on_expired_base_table"""); @@ -463,7 +468,8 @@ public void testMaterializedViewOnExpiredTable() public void testMaterializedViewOnTableRolledBack() { assertUpdate("CREATE TABLE mv_on_rolled_back_base_table(a integer)"); - assertUpdate(""" + assertUpdate( + """ CREATE MATERIALIZED VIEW mv_on_rolled_back_the_mv GRACE PERIOD INTERVAL '0' SECOND AS SELECT sum(a) s FROM mv_on_rolled_back_base_table"""); @@ -475,7 +481,7 @@ public void testMaterializedViewOnTableRolledBack() // Base MV on a snapshot "in the future" assertUpdate("REFRESH 
MATERIALIZED VIEW mv_on_rolled_back_the_mv", 1); - assertUpdate(format("CALL system.rollback_to_snapshot(CURRENT_SCHEMA, 'mv_on_rolled_back_base_table', %s)", firstSnapshot)); + assertUpdate(format("ALTER TABLE mv_on_rolled_back_base_table EXECUTE rollback_to_snapshot(%s)", firstSnapshot)); // View still can be queried assertThat(query("TABLE mv_on_rolled_back_the_mv")) @@ -524,7 +530,7 @@ public void testSqlFeatures() "WITH (\n" + " format = 'PARQUET',\n" + " format_version = 2,\n" + - " location = '" + getSchemaDirectory() + "/st_\\E[0-9a-f]+-[0-9a-f]+\\Q',\n" + + " location = '" + getSchemaDirectory() + "/materialized_view_window-\\E[0-9a-f]+\\Q',\n" + " partitioning = ARRAY['_date'],\n" + " storage_schema = '" + schema + "'\n" + ") AS\n" + @@ -535,7 +541,7 @@ public void testSqlFeatures() " base_table1"); assertQueryFails("INSERT INTO materialized_view_window VALUES (0, '2019-09-08'), (1, DATE '2019-09-09'), (2, DATE '2019-09-09')", - "Inserting into materialized views is not supported"); + "line 1:1: Inserting into materialized views is not supported"); computeScalar("EXPLAIN (TYPE LOGICAL) REFRESH MATERIALIZED VIEW materialized_view_window"); computeScalar("EXPLAIN (TYPE DISTRIBUTED) REFRESH MATERIALIZED VIEW materialized_view_window"); @@ -571,10 +577,10 @@ public void testCreateMaterializedViewWhenTableExists() { String schema = getSession().getSchema().orElseThrow(); assertUpdate("CREATE TABLE test_create_materialized_view_when_table_exists (a INT, b INT)"); - assertThatThrownBy(() -> query("CREATE OR REPLACE MATERIALIZED VIEW test_create_materialized_view_when_table_exists AS SELECT sum(1) AS num_rows FROM base_table2")) - .hasMessage("Existing table is not a Materialized View: " + schema + ".test_create_materialized_view_when_table_exists"); - assertThatThrownBy(() -> query("CREATE MATERIALIZED VIEW IF NOT EXISTS test_create_materialized_view_when_table_exists AS SELECT sum(1) AS num_rows FROM base_table2")) - .hasMessage("Existing table is not a Materialized View: " + schema + ".test_create_materialized_view_when_table_exists"); + assertThat(query("CREATE OR REPLACE MATERIALIZED VIEW test_create_materialized_view_when_table_exists AS SELECT sum(1) AS num_rows FROM base_table2")) + .failure().hasMessage("Existing table is not a Materialized View: " + schema + ".test_create_materialized_view_when_table_exists"); + assertThat(query("CREATE MATERIALIZED VIEW IF NOT EXISTS test_create_materialized_view_when_table_exists AS SELECT sum(1) AS num_rows FROM base_table2")) + .failure().hasMessage("Existing table is not a Materialized View: " + schema + ".test_create_materialized_view_when_table_exists"); assertUpdate("DROP TABLE test_create_materialized_view_when_table_exists"); } @@ -583,8 +589,8 @@ public void testDropMaterializedViewCannotDropTable() { String schema = getSession().getSchema().orElseThrow(); assertUpdate("CREATE TABLE test_drop_materialized_view_cannot_drop_table (a INT, b INT)"); - assertThatThrownBy(() -> query("DROP MATERIALIZED VIEW test_drop_materialized_view_cannot_drop_table")) - .hasMessageContaining("Materialized view 'iceberg." + schema + ".test_drop_materialized_view_cannot_drop_table' does not exist, but a table with that name exists"); + assertThat(query("DROP MATERIALIZED VIEW test_drop_materialized_view_cannot_drop_table")) + .failure().hasMessageContaining("Materialized view 'iceberg." 
+ schema + ".test_drop_materialized_view_cannot_drop_table' does not exist, but a table with that name exists"); assertUpdate("DROP TABLE test_drop_materialized_view_cannot_drop_table"); } @@ -593,8 +599,8 @@ public void testRenameMaterializedViewCannotRenameTable() { String schema = getSession().getSchema().orElseThrow(); assertUpdate("CREATE TABLE test_rename_materialized_view_cannot_rename_table (a INT, b INT)"); - assertThatThrownBy(() -> query("ALTER MATERIALIZED VIEW test_rename_materialized_view_cannot_rename_table RENAME TO new_materialized_view_name")) - .hasMessageContaining("Materialized View 'iceberg." + schema + ".test_rename_materialized_view_cannot_rename_table' does not exist, but a table with that name exists"); + assertThat(query("ALTER MATERIALIZED VIEW test_rename_materialized_view_cannot_rename_table RENAME TO new_materialized_view_name")) + .failure().hasMessageContaining("Materialized View 'iceberg." + schema + ".test_rename_materialized_view_cannot_rename_table' does not exist, but a table with that name exists"); assertUpdate("DROP TABLE test_rename_materialized_view_cannot_rename_table"); } @@ -633,48 +639,22 @@ public void testNestedMaterializedViews() } @Test - public void testStorageSchemaProperty() + public void testBucketPartitioning() { - String schemaName = getSession().getSchema().orElseThrow(); - String viewName = "storage_schema_property_test"; - assertUpdate( - "CREATE MATERIALIZED VIEW " + viewName + " " + - "WITH (storage_schema = '" + storageSchemaName + "') AS " + - "SELECT * FROM base_table1"); - SchemaTableName storageTable = getStorageTable(viewName); - assertThat(storageTable.getSchemaName()).isEqualTo(storageSchemaName); - - assertUpdate("REFRESH MATERIALIZED VIEW " + viewName, 6); - assertThat(computeActual("SELECT * FROM " + viewName).getRowCount()).isEqualTo(6); - assertThat(getExplainPlan("SELECT * FROM " + viewName, ExplainType.Type.IO)) - .doesNotContain("base_table1") - .contains(storageSchemaName); - - assertThat((String) computeScalar("SHOW CREATE MATERIALIZED VIEW " + viewName)) - .contains("storage_schema = '" + storageSchemaName + "'"); - - Set storageSchemaTables = computeActual("SHOW TABLES IN " + storageSchemaName).getOnlyColumnAsSet().stream() - .map(String.class::cast) - .collect(toImmutableSet()); - assertThat(storageSchemaTables).contains(storageTable.getTableName()); - - assertUpdate("DROP MATERIALIZED VIEW " + viewName); - storageSchemaTables = computeActual("SHOW TABLES IN " + storageSchemaName).getOnlyColumnAsSet().stream() - .map(String.class::cast) - .collect(toImmutableSet()); - assertThat(storageSchemaTables).doesNotContain(storageTable.getTableName()); - - assertThatThrownBy(() -> query( - "CREATE MATERIALIZED VIEW " + viewName + " " + - "WITH (storage_schema = 'non_existent') AS " + - "SELECT * FROM base_table1")) - .hasMessageContaining("non_existent not found"); - assertThatThrownBy(() -> query("DESCRIBE " + viewName)) - .hasMessageContaining(format("'iceberg.%s.%s' does not exist", schemaName, viewName)); + testBucketPartitioning("integer", "20050909"); + testBucketPartitioning("bigint", "200509091331001234"); + testBucketPartitioning("decimal(8,5)", "DECIMAL '876.54321'"); + testBucketPartitioning("decimal(28,21)", "DECIMAL '1234567.890123456789012345678'"); + testBucketPartitioning("date", "DATE '2005-09-09'"); + testBucketPartitioning("time(6)", "TIME '13:31:00.123456'"); + testBucketPartitioning("timestamp(6)", "TIMESTAMP '2005-09-10 13:31:00.123456'"); + testBucketPartitioning("timestamp(6) with time zone", 
"TIMESTAMP '2005-09-10 13:00:00.123456 Europe/Warsaw'"); + testBucketPartitioning("varchar", "VARCHAR 'Greetings from Warsaw!'"); + testBucketPartitioning("uuid", "UUID '406caec7-68b9-4778-81b2-a12ece70c8b1'"); + testBucketPartitioning("varbinary", "X'66696E6465706920726F636B7321'"); } - @Test(dataProvider = "testBucketPartitioningDataProvider") - public void testBucketPartitioning(String dataType, String exampleValue) + private void testBucketPartitioning(String dataType, String exampleValue) { // validate the example value type assertThat(query("SELECT " + exampleValue)) @@ -683,9 +663,11 @@ public void testBucketPartitioning(String dataType, String exampleValue) assertUpdate("CREATE MATERIALIZED VIEW test_bucket_partitioning WITH (partitioning=ARRAY['bucket(col, 4)']) AS SELECT * FROM (VALUES CAST(NULL AS %s), %s) t(col)" .formatted(dataType, exampleValue)); try { - SchemaTableName storageTable = getStorageTable("test_bucket_partitioning"); - assertThat((String) computeScalar("SHOW CREATE TABLE " + storageTable)) - .contains("partitioning = ARRAY['bucket(col, 4)']"); + TableMetadata storageMetadata = getStorageTableMetadata("test_bucket_partitioning"); + assertThat(storageMetadata.spec().fields()).hasSize(1); + PartitionField bucketPartitionField = getOnlyElement(storageMetadata.spec().fields()); + assertThat(bucketPartitionField.name()).isEqualTo("col_bucket"); + assertThat(bucketPartitionField.transform().toString()).isEqualTo("bucket[4]"); assertThat(query("SELECT * FROM test_bucket_partitioning WHERE col = " + exampleValue)) .matches("SELECT " + exampleValue); @@ -695,27 +677,17 @@ public void testBucketPartitioning(String dataType, String exampleValue) } } - @DataProvider - public Object[][] testBucketPartitioningDataProvider() + @Test + public void testTruncatePartitioning() { - // Iceberg supports bucket partitioning on int, long, decimal, date, time, timestamp, timestamptz, string, uuid, fixed, binary - return new Object[][] { - {"integer", "20050909"}, - {"bigint", "200509091331001234"}, - {"decimal(8,5)", "DECIMAL '876.54321'"}, - {"decimal(28,21)", "DECIMAL '1234567.890123456789012345678'"}, - {"date", "DATE '2005-09-09'"}, - {"time(6)", "TIME '13:31:00.123456'"}, - {"timestamp(6)", "TIMESTAMP '2005-09-10 13:31:00.123456'"}, - {"timestamp(6) with time zone", "TIMESTAMP '2005-09-10 13:00:00.123456 Europe/Warsaw'"}, - {"varchar", "VARCHAR 'Greetings from Warsaw!'"}, - {"uuid", "UUID '406caec7-68b9-4778-81b2-a12ece70c8b1'"}, - {"varbinary", "X'66696E6465706920726F636B7321'"}, - }; + testTruncatePartitioning("integer", "20050909"); + testTruncatePartitioning("bigint", "200509091331001234"); + testTruncatePartitioning("decimal(8,5)", "DECIMAL '876.54321'"); + testTruncatePartitioning("decimal(28,21)", "DECIMAL '1234567.890123456789012345678'"); + testTruncatePartitioning("varchar", "VARCHAR 'Greetings from Warsaw!'"); } - @Test(dataProvider = "testTruncatePartitioningDataProvider") - public void testTruncatePartitioning(String dataType, String exampleValue) + private void testTruncatePartitioning(String dataType, String exampleValue) { // validate the example value type assertThat(query("SELECT " + exampleValue)) @@ -724,9 +696,11 @@ public void testTruncatePartitioning(String dataType, String exampleValue) assertUpdate("CREATE MATERIALIZED VIEW test_truncate_partitioning WITH (partitioning=ARRAY['truncate(col, 4)']) AS SELECT * FROM (VALUES CAST(NULL AS %s), %s) t(col)" .formatted(dataType, exampleValue)); try { - SchemaTableName storageTable = 
getStorageTable("test_truncate_partitioning"); - assertThat((String) computeScalar("SHOW CREATE TABLE " + storageTable)) - .contains("partitioning = ARRAY['truncate(col, 4)']"); + TableMetadata storageMetadata = getStorageTableMetadata("test_truncate_partitioning"); + assertThat(storageMetadata.spec().fields()).hasSize(1); + PartitionField bucketPartitionField = getOnlyElement(storageMetadata.spec().fields()); + assertThat(bucketPartitionField.name()).isEqualTo("col_trunc"); + assertThat(bucketPartitionField.transform().toString()).isEqualTo("truncate[4]"); assertThat(query("SELECT * FROM test_truncate_partitioning WHERE col = " + exampleValue)) .matches("SELECT " + exampleValue); @@ -736,21 +710,23 @@ public void testTruncatePartitioning(String dataType, String exampleValue) } } - @DataProvider - public Object[][] testTruncatePartitioningDataProvider() + @Test + public void testTemporalPartitioning() { - // Iceberg supports truncate partitioning on int, long, decimal, string - return new Object[][] { - {"integer", "20050909"}, - {"bigint", "200509091331001234"}, - {"decimal(8,5)", "DECIMAL '876.54321'"}, - {"decimal(28,21)", "DECIMAL '1234567.890123456789012345678'"}, - {"varchar", "VARCHAR 'Greetings from Warsaw!'"}, - }; + testTemporalPartitioning("year", "date", "DATE '2005-09-09'"); + testTemporalPartitioning("year", "timestamp(6)", "TIMESTAMP '2005-09-10 13:31:00.123456'"); + testTemporalPartitioning("year", "timestamp(6) with time zone", "TIMESTAMP '2005-09-10 13:00:00.123456 Europe/Warsaw'"); + testTemporalPartitioning("month", "date", "DATE '2005-09-09'"); + testTemporalPartitioning("month", "timestamp(6)", "TIMESTAMP '2005-09-10 13:31:00.123456'"); + testTemporalPartitioning("month", "timestamp(6) with time zone", "TIMESTAMP '2005-09-10 13:00:00.123456 Europe/Warsaw'"); + testTemporalPartitioning("day", "date", "DATE '2005-09-09'"); + testTemporalPartitioning("day", "timestamp(6)", "TIMESTAMP '2005-09-10 13:31:00.123456'"); + testTemporalPartitioning("day", "timestamp(6) with time zone", "TIMESTAMP '2005-09-10 13:00:00.123456 Europe/Warsaw'"); + testTemporalPartitioning("hour", "timestamp(6)", "TIMESTAMP '2005-09-10 13:31:00.123456'"); + testTemporalPartitioning("hour", "timestamp(6) with time zone", "TIMESTAMP '2005-09-10 13:00:00.123456 Europe/Warsaw'"); } - @Test(dataProvider = "testTemporalPartitioningDataProvider") - public void testTemporalPartitioning(String partitioning, String dataType, String exampleValue) + private void testTemporalPartitioning(String partitioning, String dataType, String exampleValue) { // validate the example value type assertThat(query("SELECT " + exampleValue)) @@ -759,9 +735,11 @@ public void testTemporalPartitioning(String partitioning, String dataType, Strin assertUpdate("CREATE MATERIALIZED VIEW test_temporal_partitioning WITH (partitioning=ARRAY['%s(col)']) AS SELECT * FROM (VALUES CAST(NULL AS %s), %s) t(col)" .formatted(partitioning, dataType, exampleValue)); try { - SchemaTableName storageTable = getStorageTable("test_temporal_partitioning"); - assertThat((String) computeScalar("SHOW CREATE TABLE " + storageTable)) - .contains("partitioning = ARRAY['%s(col)']".formatted(partitioning)); + TableMetadata storageMetadata = getStorageTableMetadata("test_temporal_partitioning"); + assertThat(storageMetadata.spec().fields()).hasSize(1); + PartitionField bucketPartitionField = getOnlyElement(storageMetadata.spec().fields()); + assertThat(bucketPartitionField.name()).isEqualTo("col_" + partitioning); + 
assertThat(bucketPartitionField.transform().toString()).isEqualTo(partitioning); assertThat(query("SELECT * FROM test_temporal_partitioning WHERE col = " + exampleValue)) .matches("SELECT " + exampleValue); @@ -771,47 +749,460 @@ public void testTemporalPartitioning(String partitioning, String dataType, Strin } } - @DataProvider - public Object[][] testTemporalPartitioningDataProvider() + @Test + public void testMaterializedViewSnapshotSummariesHaveTrinoQueryId() { - return new Object[][] { - {"year", "date", "DATE '2005-09-09'"}, - {"year", "timestamp(6)", "TIMESTAMP '2005-09-10 13:31:00.123456'"}, - {"year", "timestamp(6) with time zone", "TIMESTAMP '2005-09-10 13:00:00.123456 Europe/Warsaw'"}, - {"month", "date", "DATE '2005-09-09'"}, - {"month", "timestamp(6)", "TIMESTAMP '2005-09-10 13:31:00.123456'"}, - {"month", "timestamp(6) with time zone", "TIMESTAMP '2005-09-10 13:00:00.123456 Europe/Warsaw'"}, - {"day", "date", "DATE '2005-09-09'"}, - {"day", "timestamp(6)", "TIMESTAMP '2005-09-10 13:31:00.123456'"}, - {"day", "timestamp(6) with time zone", "TIMESTAMP '2005-09-10 13:00:00.123456 Europe/Warsaw'"}, - {"hour", "timestamp(6)", "TIMESTAMP '2005-09-10 13:31:00.123456'"}, - {"hour", "timestamp(6) with time zone", "TIMESTAMP '2005-09-10 13:00:00.123456 Europe/Warsaw'"}, - }; + String materializedViewName = "test_materialized_view_snapshot_query_ids" + randomNameSuffix(); + String sourceTableName = "test_source_table_for_mat_view" + randomNameSuffix(); + assertUpdate(format("CREATE TABLE %s (a bigint, b bigint)", sourceTableName)); + assertUpdate(format("CREATE MATERIALIZED VIEW %s WITH (partitioning = ARRAY['a']) AS SELECT * FROM %s", materializedViewName, sourceTableName)); + + try { + assertUpdate(format("INSERT INTO %s VALUES (1, 1), (1, 4), (2, 2)", sourceTableName), 3); + + QueryId refreshQueryId = getDistributedQueryRunner() + .executeWithPlan(getSession(), format("REFRESH MATERIALIZED VIEW %s", materializedViewName)) + .queryId(); + String savedQueryId = getStorageTableMetadata(materializedViewName).currentSnapshot().summary().get("trino_query_id"); + assertThat(savedQueryId).isEqualTo(refreshQueryId.getId()); + } + finally { + assertUpdate("DROP TABLE " + sourceTableName); + assertUpdate("DROP MATERIALIZED VIEW " + materializedViewName); + } } - protected String getColumnComment(String tableName, String columnName) + @Test + public void testMaterializedViewStorageTypeCoercions() { - return (String) computeScalar("SELECT comment FROM information_schema.columns WHERE table_schema = '" + getSession().getSchema().orElseThrow() + "' AND table_name = '" + tableName + "' AND column_name = '" + columnName + "'"); + String materializedViewName = "test_materialized_view_storage_type_coercion" + randomNameSuffix(); + String sourceTableName = "test_materialized_view_storage" + randomNameSuffix(); + + assertUpdate(format( + """ + CREATE TABLE %s ( + t_3 time(3), + t_9 time(9), + ts_3 timestamp(3), + ts_9 timestamp(9), + tswtz_3 timestamp(3) with time zone, + tswtz_9 timestamp(9) with time zone + ) + """, sourceTableName)); + assertUpdate(format("INSERT INTO %s VALUES (localtime, localtime, localtimestamp, localtimestamp, current_timestamp, current_timestamp)", sourceTableName), 1); + + assertUpdate(format("CREATE MATERIALIZED VIEW %s AS SELECT * FROM %s", materializedViewName, sourceTableName)); + + assertThat(query(format("SELECT * FROM %s WHERE t_3 < localtime", materializedViewName))).succeeds(); + assertThat(query(format("SELECT * FROM %s WHERE t_9 < localtime", 
materializedViewName))).succeeds(); + assertThat(query(format("SELECT * FROM %s WHERE ts_3 < localtimestamp", materializedViewName))).succeeds(); + assertThat(query(format("SELECT * FROM %s WHERE ts_9 < localtimestamp", materializedViewName))).succeeds(); + assertThat(query(format("SELECT * FROM %s WHERE tswtz_3 < current_timestamp", materializedViewName))).succeeds(); + assertThat(query(format("SELECT * FROM %s WHERE tswtz_9 < current_timestamp", materializedViewName))).succeeds(); + + assertUpdate(format("REFRESH MATERIALIZED VIEW %s", materializedViewName), 1); + + assertThat(query(format("SELECT * FROM %s WHERE t_3 < localtime", materializedViewName))).succeeds(); + assertThat(query(format("SELECT * FROM %s WHERE t_9 < localtime", materializedViewName))).succeeds(); + assertThat(query(format("SELECT * FROM %s WHERE ts_3 < localtimestamp", materializedViewName))).succeeds(); + assertThat(query(format("SELECT * FROM %s WHERE ts_9 < localtimestamp", materializedViewName))).succeeds(); + assertThat(query(format("SELECT * FROM %s WHERE tswtz_3 < current_timestamp", materializedViewName))).succeeds(); + assertThat(query(format("SELECT * FROM %s WHERE tswtz_9 < current_timestamp", materializedViewName))).succeeds(); } - private SchemaTableName getStorageTable(String materializedViewName) + @Test + public void testDropLegacyMaterializedView() { - return getStorageTable(getSession().getCatalog().orElseThrow(), getSession().getSchema().orElseThrow(), materializedViewName); + String schemaName = getSession().getSchema().orElseThrow(); + String materializedViewName = "test_drop_legacy_materialized_view" + randomNameSuffix(); + String sourceTableName = "test_source_table_for_mat_view" + randomNameSuffix(); + assertUpdate(format("CREATE TABLE %s (a bigint, b bigint)", sourceTableName)); + assertUpdate(format("CREATE MATERIALIZED VIEW iceberg_legacy_mv.%s.%s AS SELECT * FROM %s", schemaName, materializedViewName, sourceTableName)); + + try { + // Refresh with legacy enabled + assertUpdate(format("INSERT INTO %s VALUES (1, 1), (1, 4), (2, 2)", sourceTableName), 3); + assertUpdate(format("REFRESH MATERIALIZED VIEW iceberg_legacy_mv.%s.%s", schemaName, materializedViewName), 3); + + // Refresh with legacy disabled + assertUpdate(format("INSERT INTO %s VALUES (10, 10), (10, 40), (20, 20)", sourceTableName), 3); + assertUpdate("REFRESH MATERIALIZED VIEW " + materializedViewName, 6); + + String storageTableName = (String) computeScalar("SELECT storage_table FROM system.metadata.materialized_views WHERE catalog_name = CURRENT_CATALOG AND schema_name = CURRENT_SCHEMA AND name = '" + materializedViewName + "'"); + assertThat(storageTableName) + .isEqualTo(computeScalar("SELECT storage_table FROM system.metadata.materialized_views WHERE catalog_name = 'iceberg_legacy_mv' AND schema_name = CURRENT_SCHEMA AND name = '" + materializedViewName + "'")) + .startsWith("st_"); + + assertThat(query("TABLE " + materializedViewName)).matches("TABLE " + sourceTableName); + assertThat(query("TABLE " + storageTableName)).matches("TABLE " + sourceTableName); + assertUpdate("DROP MATERIALIZED VIEW " + materializedViewName); + assertThat(query("TABLE " + materializedViewName)).failure().hasMessageMatching(".* does not exist"); + assertThat(query("TABLE " + storageTableName)).failure().hasMessageMatching(".* does not exist"); + } + finally { + assertUpdate("DROP TABLE " + sourceTableName); + assertUpdate(format("DROP MATERIALIZED VIEW IF EXISTS iceberg_legacy_mv.%s.%s", schemaName, materializedViewName)); + } } - private 
SchemaTableName getStorageTable(String catalogName, String schemaName, String materializedViewName) + @Test + public void testMaterializedViewCreatedFromTableFunction() + { + String viewName = "materialized_view_for_ptf_" + randomNameSuffix(); + assertUpdate("CREATE MATERIALIZED VIEW " + viewName + " AS SELECT * FROM TABLE(mock.system.sequence_function())"); + + assertFreshness(viewName, "STALE"); + assertThat(computeActual("SELECT last_fresh_time FROM system.metadata.materialized_views WHERE catalog_name = CURRENT_CATALOG AND schema_name = CURRENT_SCHEMA AND name = '" + viewName + "'").getOnlyValue()).isNull(); + int result1 = (int) computeActual("SELECT * FROM " + viewName).getOnlyValue(); + + int result2 = (int) computeActual("SELECT * FROM " + viewName).getOnlyValue(); + assertThat(result2).isNotEqualTo(result1); // differs because PTF sequence_function is called directly as mv is considered stale + assertFreshness(viewName, "STALE"); + assertThat(computeActual("SELECT last_fresh_time FROM system.metadata.materialized_views WHERE catalog_name = CURRENT_CATALOG AND schema_name = CURRENT_SCHEMA AND name = '" + viewName + "'").getOnlyValue()).isNull(); + + assertUpdate("REFRESH MATERIALIZED VIEW " + viewName, 1); + assertFreshness(viewName, "UNKNOWN"); + ZonedDateTime lastFreshTime = (ZonedDateTime) computeActual("SELECT last_fresh_time FROM system.metadata.materialized_views WHERE catalog_name = CURRENT_CATALOG AND schema_name = CURRENT_SCHEMA AND name = '" + viewName + "'").getOnlyValue(); + assertThat(lastFreshTime).isNotNull(); + int result3 = (int) computeActual("SELECT * FROM " + viewName).getOnlyValue(); + assertThat(result3).isNotEqualTo(result2); // mv is not stale anymore so all selects until next refresh returns same result + int result4 = (int) computeActual("SELECT * FROM " + viewName).getOnlyValue(); + int result5 = (int) computeActual("SELECT * FROM " + viewName).getOnlyValue(); + assertThat(result4).isEqualTo(result3); + assertThat(result4).isEqualTo(result5); + + assertUpdate("REFRESH MATERIALIZED VIEW " + viewName, 1); + assertThat((ZonedDateTime) computeActual("SELECT last_fresh_time FROM system.metadata.materialized_views WHERE catalog_name = CURRENT_CATALOG AND schema_name = CURRENT_SCHEMA AND name = '" + viewName + "'").getOnlyValue()).isAfter(lastFreshTime); + assertFreshness(viewName, "UNKNOWN"); + int result6 = (int) computeActual("SELECT * FROM " + viewName).getOnlyValue(); + assertThat(result6).isNotEqualTo(result5); + } + + @Test + public void testMaterializedViewCreatedFromTableFunctionAndTable() + { + String sourceTableName = "source_table_" + randomNameSuffix(); + assertUpdate("CREATE TABLE " + sourceTableName + " (VALUE INTEGER)"); + assertUpdate("INSERT INTO " + sourceTableName + " VALUES 2", 1); + String viewName = "materialized_view_for_ptf_adn_table_" + randomNameSuffix(); + assertUpdate("CREATE MATERIALIZED VIEW " + viewName + " AS SELECT * FROM TABLE(mock.system.sequence_function()) CROSS JOIN " + sourceTableName); + + List materializedRows = computeActual("SELECT * FROM " + viewName).getMaterializedRows(); + assertThat(materializedRows).hasSize(1); + assertThat(materializedRows.get(0).getField(1)).isEqualTo(2); + int valueFromPtf1 = (int) materializedRows.get(0).getField(0); + assertFreshness(viewName, "STALE"); + assertThat(computeActual("SELECT last_fresh_time FROM system.metadata.materialized_views WHERE catalog_name = CURRENT_CATALOG AND schema_name = CURRENT_SCHEMA AND name = '" + viewName + "'").getOnlyValue()).isNull(); + + materializedRows = 
computeActual("SELECT * FROM " + viewName).getMaterializedRows(); + assertThat(materializedRows).hasSize(1); + assertThat(materializedRows.get(0).getField(1)).isEqualTo(2); + int valueFromPtf2 = (int) materializedRows.get(0).getField(0); + assertThat(valueFromPtf2).isNotEqualTo(valueFromPtf1); // differs because PTF sequence_function is called directly as mv is considered stale + assertFreshness(viewName, "STALE"); + assertThat(computeActual("SELECT last_fresh_time FROM system.metadata.materialized_views WHERE catalog_name = CURRENT_CATALOG AND schema_name = CURRENT_SCHEMA AND name = '" + viewName + "'").getOnlyValue()).isNull(); + + assertUpdate("REFRESH MATERIALIZED VIEW " + viewName, 1); + assertFreshness(viewName, "UNKNOWN"); + ZonedDateTime lastFreshTime = (ZonedDateTime) computeActual("SELECT last_fresh_time FROM system.metadata.materialized_views WHERE catalog_name = CURRENT_CATALOG AND schema_name = CURRENT_SCHEMA AND name = '" + viewName + "'").getOnlyValue(); + assertThat(lastFreshTime).isNotNull(); + materializedRows = computeActual("SELECT * FROM " + viewName).getMaterializedRows(); + assertThat(materializedRows).hasSize(1); + assertThat(materializedRows.get(0).getField(1)).isEqualTo(2); + int valueFromPtf3 = (int) materializedRows.get(0).getField(0); + assertThat(valueFromPtf3).isNotEqualTo(valueFromPtf1); + assertThat(valueFromPtf3).isNotEqualTo(valueFromPtf2); + + materializedRows = computeActual("SELECT * FROM " + viewName).getMaterializedRows(); + assertThat(materializedRows).hasSize(1); + assertThat(materializedRows.get(0).getField(1)).isEqualTo(2); + int valueFromPtf4 = (int) materializedRows.get(0).getField(0); + assertThat(valueFromPtf4).isNotEqualTo(valueFromPtf1); + assertThat(valueFromPtf4).isNotEqualTo(valueFromPtf2); + assertThat(valueFromPtf4).isEqualTo(valueFromPtf3); // mv is not stale anymore so all selects until next refresh returns same result + } + + @Test + public void testMaterializedViewCreatedFromTableFunctionWithGracePeriod() + throws InterruptedException { - TransactionManager transactionManager = getQueryRunner().getTransactionManager(); - TransactionId transactionId = transactionManager.beginTransaction(false); - Session session = getSession().beginTransactionId(transactionId, transactionManager, getQueryRunner().getAccessControl()); - Optional materializedView = getQueryRunner().getMetadata() - .getMaterializedView(session, new QualifiedObjectName(catalogName, schemaName, materializedViewName)); - assertThat(materializedView).isPresent(); - return materializedView.get().getStorageTable().get().getSchemaTableName(); + String viewName = "materialized_view_for_ptf_with_grace_period_" + randomNameSuffix(); + assertUpdate("CREATE MATERIALIZED VIEW " + viewName + " GRACE PERIOD INTERVAL '1' SECOND AS SELECT * FROM TABLE(mock.system.sequence_function())"); + + int result1 = (int) computeActual("SELECT * FROM " + viewName).getOnlyValue(); + int result2 = (int) computeActual("SELECT * FROM " + viewName).getOnlyValue(); + assertThat(result2).isNotEqualTo(result1); + + assertUpdate("REFRESH MATERIALIZED VIEW " + viewName, 1); + int result3 = (int) computeActual("SELECT * FROM " + viewName).getOnlyValue(); + assertThat(result3).isNotEqualTo(result2); + Thread.sleep(1001); + int result4 = (int) computeActual("SELECT * FROM " + viewName).getOnlyValue(); + assertThat(result4).isNotEqualTo(result3); + } + + @Test + public void testIncrementalRefresh() + { + String sourceTableName = "source_table" + randomNameSuffix(); + String materializedViewName = 
"test_materialized_view_" + randomNameSuffix(); + + Session defaultSession = getSession(); + Session incrementalRefreshDisabled = Session.builder(getSession()) + .setCatalogSessionProperty("iceberg", "incremental_refresh_enabled", "false") + .build(); + + String matViewDef = "SELECT a, b FROM %s WHERE a < 3 OR a > 5".formatted(sourceTableName); + + // create source table and two identical MVs + assertUpdate("CREATE TABLE %s (a int, b varchar)".formatted(sourceTableName)); + assertUpdate("INSERT INTO %s VALUES (1, 'abc'), (2, 'def')".formatted(sourceTableName), 2); + assertUpdate("CREATE MATERIALIZED VIEW %s_1 AS %s".formatted(materializedViewName, matViewDef)); + assertUpdate("CREATE MATERIALIZED VIEW %s_2 AS %s".formatted(materializedViewName, matViewDef)); + + // execute first refresh: afterwards both MVs will contain: (1, 'abc'), (2, 'def') + assertUpdate("REFRESH MATERIALIZED VIEW %s_1".formatted(materializedViewName), 2); + assertUpdate("REFRESH MATERIALIZED VIEW %s_2".formatted(materializedViewName), 2); + + // add some new rows to source + assertUpdate("INSERT INTO %s VALUES (3, 'ghi'), (4, 'jkl'), (5, 'mno'), (6, 'pqr')".formatted(sourceTableName), 4); + + // will do incremental refresh, and only add: (6, 'pqr') + assertUpdate(defaultSession, "REFRESH MATERIALIZED VIEW %s_1".formatted(materializedViewName), 1); + // will do full refresh, and (re)add: (1, 'abc'), (2, 'def'), (6, 'pqr') + assertUpdate(incrementalRefreshDisabled, "REFRESH MATERIALIZED VIEW %s_2".formatted(materializedViewName), 3); + + // verify that view contents are the same + assertThat(query("TABLE %s_1".formatted(materializedViewName))).matches("VALUES (1, VARCHAR 'abc'), (2, VARCHAR 'def'), (6, VARCHAR 'pqr')"); + assertThat(query("TABLE %s_2".formatted(materializedViewName))).matches("VALUES (1, VARCHAR 'abc'), (2, VARCHAR 'def'), (6, VARCHAR 'pqr')"); + + // cleanup + assertUpdate("DROP MATERIALIZED VIEW %s_1".formatted(materializedViewName)); + assertUpdate("DROP MATERIALIZED VIEW %s_2".formatted(materializedViewName)); + assertUpdate("DROP TABLE %s".formatted(sourceTableName)); + } + + @Test + public void testFullRefreshForUnion() + { + String sourceTableName = "source_table" + randomNameSuffix(); + String materializedViewName = "test_materialized_view_" + randomNameSuffix(); + + Session defaultSession = getSession(); + + String matViewDef = + """ + SELECT a, b FROM %s a WHERE a.a < 3 UNION ALL + SELECT * FROM %s b WHERE b.a > 5""".formatted(sourceTableName, sourceTableName); + + // create source table and two identical MVs + assertUpdate("CREATE TABLE %s (a int, b varchar)".formatted(sourceTableName)); + assertUpdate("INSERT INTO %s VALUES (1, 'abc'), (2, 'def')".formatted(sourceTableName), 2); + assertUpdate("CREATE MATERIALIZED VIEW %s AS %s".formatted(materializedViewName, matViewDef)); + + // execute first refresh: afterwards both MVs will contain: (1, 'abc'), (2, 'def') + assertUpdate("REFRESH MATERIALIZED VIEW %s".formatted(materializedViewName), 2); + + // add some new rows to source + assertUpdate("INSERT INTO %s VALUES (3, 'ghi'), (4, 'jkl'), (5, 'mno'), (6, 'pqr')".formatted(sourceTableName), 4); + + // will do a full refresh + assertUpdate(defaultSession, "REFRESH MATERIALIZED VIEW %s".formatted(materializedViewName), 3); + + // verify that view contents are the same + assertThat(query("TABLE %s".formatted(materializedViewName))).matches("VALUES (1, VARCHAR 'abc'), (2, VARCHAR 'def'), (6, VARCHAR 'pqr')"); + + // cleanup + assertUpdate("DROP MATERIALIZED VIEW %s".formatted(materializedViewName)); 
+ assertUpdate("DROP TABLE %s".formatted(sourceTableName)); + } + + @Test + public void testFullRefreshForUpdates() + { + String sourceTableName = "source_table" + randomNameSuffix(); + String materializedViewName = "test_materialized_view_" + randomNameSuffix(); + + Session defaultSession = getSession(); + + String matViewDef = "SELECT a, b FROM %s WHERE a < 3 OR a > 5".formatted(sourceTableName); + + // create source table and an MV + assertUpdate("CREATE TABLE %s (a int, b varchar)".formatted(sourceTableName)); + assertUpdate("INSERT INTO %s VALUES (1, 'abc'), (2, 'def')".formatted(sourceTableName), 2); + assertUpdate("CREATE MATERIALIZED VIEW %s AS %s".formatted(materializedViewName, matViewDef)); + + // execute first refresh: afterwards both MVs will contain: (1, 'abc'), (2, 'def') + assertUpdate("REFRESH MATERIALIZED VIEW %s".formatted(materializedViewName), 2); + + // add some new rows to source + assertUpdate("INSERT INTO %s VALUES (3, 'ghi'), (4, 'jkl'), (5, 'mno'), (6, 'pqr')".formatted(sourceTableName), 4); + + // will do incremental refresh, and only add: (6, 'pqr') + assertUpdate(defaultSession, "REFRESH MATERIALIZED VIEW %s".formatted(materializedViewName), 1); + + // update one row and append one + assertUpdate("UPDATE %s SET b = 'updated' WHERE a = 1".formatted(sourceTableName), 1); + assertUpdate("INSERT INTO %s VALUES (7, 'stv')".formatted(sourceTableName), 1); + + // will do full refresh due to the above update command + assertUpdate(defaultSession, "REFRESH MATERIALIZED VIEW %s".formatted(materializedViewName), 4); + // verify view contents + assertThat(query("TABLE %s".formatted(materializedViewName))).matches("VALUES (1, VARCHAR 'updated'), (2, VARCHAR 'def'), (6, VARCHAR 'pqr'), (7, VARCHAR 'stv')"); + + // add some new row to source + assertUpdate("INSERT INTO %s VALUES (8, 'wxy')".formatted(sourceTableName), 1); + // will do incremental refresh now since refresh window now does not contain the delete anymore, and only add: (8, 'wxy') + assertUpdate(defaultSession, "REFRESH MATERIALIZED VIEW %s".formatted(materializedViewName), 1); + // verify view contents + assertThat(query("TABLE %s".formatted(materializedViewName))).matches("VALUES (1, VARCHAR 'updated'), (2, VARCHAR 'def'), (6, VARCHAR 'pqr'), (7, VARCHAR 'stv'), (8, VARCHAR 'wxy')"); + + // cleanup + assertUpdate("DROP MATERIALIZED VIEW %s".formatted(materializedViewName)); + assertUpdate("DROP TABLE %s".formatted(sourceTableName)); + } + + @Test + public void testRefreshWithCompaction() + { + String sourceTableName = "source_table" + randomNameSuffix(); + String materializedViewName = "test_materialized_view_" + randomNameSuffix(); + + Session defaultSession = getSession(); + + String matViewDef = "SELECT a, b FROM %s WHERE a < 3 OR a > 5".formatted(sourceTableName); + + // create source table and an MV + assertUpdate("CREATE TABLE %s (a int, b varchar)".formatted(sourceTableName)); + assertUpdate("INSERT INTO %s VALUES (1, 'abc'), (2, 'def')".formatted(sourceTableName), 2); + assertUpdate("CREATE MATERIALIZED VIEW %s AS %s".formatted(materializedViewName, matViewDef)); + + // execute first refresh: afterwards both MVs will contain: (1, 'abc'), (2, 'def') + assertUpdate("REFRESH MATERIALIZED VIEW %s".formatted(materializedViewName), 2); + + // add some new rows to source + assertUpdate("INSERT INTO %s VALUES (3, 'ghi'), (4, 'jkl'), (5, 'mno'), (6, 'pqr')".formatted(sourceTableName), 4); + + // will do incremental refresh, and only add: (6, 'pqr') + assertUpdate(defaultSession, "REFRESH MATERIALIZED VIEW 
%s".formatted(materializedViewName), 1); + // verify view contents + assertThat(query("TABLE %s".formatted(materializedViewName))).matches("VALUES (1, VARCHAR 'abc'), (2, VARCHAR 'def'), (6, VARCHAR 'pqr')"); + + // run compaction - after that, refresh will update 0 rows + assertUpdate(defaultSession, "ALTER TABLE %s EXECUTE OPTIMIZE".formatted(sourceTableName)); + assertUpdate(defaultSession, "REFRESH MATERIALIZED VIEW %s".formatted(materializedViewName), 0); + // verify view contents + assertThat(query("TABLE %s".formatted(materializedViewName))).matches("VALUES (1, VARCHAR 'abc'), (2, VARCHAR 'def'), (6, VARCHAR 'pqr')"); + + // add some new rows to source + assertUpdate("INSERT INTO %s VALUES (7, 'stv'), (8, 'wxy')".formatted(sourceTableName), 2); + // will do incremental refresh, and only add: (7, 'stv'), (8, 'wxy') + assertUpdate(defaultSession, "REFRESH MATERIALIZED VIEW %s".formatted(materializedViewName), 2); + // verify view contents + assertThat(query("TABLE %s".formatted(materializedViewName))).matches("VALUES (1, VARCHAR 'abc'), (2, VARCHAR 'def'), (6, VARCHAR 'pqr'), (7, VARCHAR 'stv'), (8, VARCHAR 'wxy')"); + + // cleanup + assertUpdate("DROP MATERIALIZED VIEW %s".formatted(materializedViewName)); + assertUpdate("DROP TABLE %s".formatted(sourceTableName)); + } + + protected String getColumnComment(String tableName, String columnName) + { + return (String) computeScalar("SELECT comment FROM information_schema.columns WHERE table_schema = '" + getSession().getSchema().orElseThrow() + "' AND table_name = '" + tableName + "' AND column_name = '" + columnName + "'"); + } + + private TableMetadata getStorageTableMetadata(String materializedViewName) + { + QueryRunner queryRunner = getQueryRunner(); + TrinoFileSystem fileSystemFactory = getFileSystemFactory(queryRunner).create(ConnectorIdentity.ofUser("test")); + Location metadataLocation = Location.of(getStorageMetadataLocation(materializedViewName)); + return TableMetadataParser.read(FILE_IO_FACTORY.create(fileSystemFactory), metadataLocation.toString()); } private long getLatestSnapshotId(String tableName) { return (long) computeScalar(format("SELECT snapshot_id FROM \"%s$snapshots\" ORDER BY committed_at DESC FETCH FIRST 1 ROW WITH TIES", tableName)); } + + private void assertFreshness(String viewName, String expected) + { + assertThat((String) computeScalar("SELECT freshness FROM system.metadata.materialized_views WHERE catalog_name = CURRENT_CATALOG AND schema_name = CURRENT_SCHEMA AND name = '" + viewName + "'")).isEqualTo(expected); + } + + public static class SequenceTableFunction + extends AbstractConnectorTableFunction + { + public SequenceTableFunction() + { + super("system", "sequence_function", List.of(), GENERIC_TABLE); + } + + @Override + public TableFunctionAnalysis analyze(ConnectorSession session, ConnectorTransactionHandle transaction, Map arguments, ConnectorAccessControl accessControl) + { + return TableFunctionAnalysis.builder() + .handle(new SequenceTableFunctionHandle()) + .returnedType(new Descriptor(ImmutableList.of(new Descriptor.Field("next_value", Optional.of(INTEGER))))) + .build(); + } + } + + public record SequenceTableFunctionHandle() + implements ConnectorTableFunctionHandle {} + + public static class SequenceTableFunctionProcessorProvider + implements TableFunctionProcessorProvider + { + private final SequenceFunctionProcessor sequenceFunctionProcessor = new SequenceFunctionProcessor(); + + @Override + public TableFunctionSplitProcessor getSplitProcessor(ConnectorSession session, 
ConnectorTableFunctionHandle handle, ConnectorSplit split) + { + sequenceFunctionProcessor.reset(); + return sequenceFunctionProcessor; + } + } + + public static class SequenceFunctionProcessor + implements TableFunctionSplitProcessor + { + private static final AtomicInteger generator = new AtomicInteger(10); + private final AtomicBoolean finished = new AtomicBoolean(false); + + @Override + public TableFunctionProcessorState process() + { + if (finished.get()) { + return FINISHED; + } + BlockBuilder builder = INTEGER.createFixedSizeBlockBuilder(1); + INTEGER.writeInt(builder, generator.getAndIncrement()); + finished.set(true); + return produced(new Page(builder.build())); + } + + public void reset() + { + finished.set(false); + } + } + + public record SequenceConnectorSplit() + implements ConnectorSplit + { + private static final int INSTANCE_SIZE = instanceSize(SequenceConnectorSplit.class); + + @JsonIgnore + @Override + public SplitWeight getSplitWeight() + { + return SplitWeight.standard(); + } + + @Override + public long getRetainedSizeInBytes() + { + return INSTANCE_SIZE; + } + } } diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/BaseIcebergMinioConnectorSmokeTest.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/BaseIcebergMinioConnectorSmokeTest.java index d60cbc15378f..0e877ee133a0 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/BaseIcebergMinioConnectorSmokeTest.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/BaseIcebergMinioConnectorSmokeTest.java @@ -13,41 +13,56 @@ */ package io.trino.plugin.iceberg; +import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import io.minio.messages.Event; import io.trino.Session; -import io.trino.plugin.hive.containers.HiveMinioDataLake; -import io.trino.plugin.hive.metastore.HiveMetastore; +import io.trino.metastore.Column; +import io.trino.metastore.HiveMetastore; +import io.trino.metastore.HiveType; +import io.trino.metastore.Table; +import io.trino.plugin.hive.containers.Hive3MinioDataLake; import io.trino.plugin.hive.metastore.thrift.BridgingHiveMetastore; import io.trino.testing.QueryRunner; import io.trino.testing.minio.MinioClient; +import io.trino.testing.sql.TestTable; import org.apache.iceberg.FileFormat; import org.intellij.lang.annotations.Language; -import org.testng.annotations.Test; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.parallel.Execution; import java.util.List; import java.util.Map; +import java.util.Optional; import java.util.Queue; import java.util.concurrent.ConcurrentLinkedQueue; import static com.google.common.collect.ImmutableList.toImmutableList; import static com.google.common.collect.ImmutableSet.toImmutableSet; +import static io.trino.metastore.PrincipalPrivileges.NO_PRIVILEGES; +import static io.trino.plugin.hive.TableType.EXTERNAL_TABLE; import static io.trino.plugin.hive.TestingThriftHiveMetastoreBuilder.testingThriftHiveMetastoreBuilder; +import static io.trino.plugin.iceberg.IcebergTestUtils.getHiveMetastore; +import static io.trino.plugin.iceberg.catalog.AbstractIcebergTableOperations.ICEBERG_METASTORE_STORAGE_FORMAT; import static io.trino.testing.TestingNames.randomNameSuffix; import static io.trino.testing.containers.Minio.MINIO_ACCESS_KEY; import static io.trino.testing.containers.Minio.MINIO_REGION; import static io.trino.testing.containers.Minio.MINIO_SECRET_KEY; import static java.lang.String.format; import static java.util.Locale.ENGLISH; +import static 
org.apache.iceberg.BaseMetastoreTableOperations.ICEBERG_TABLE_TYPE_VALUE; +import static org.apache.iceberg.BaseMetastoreTableOperations.TABLE_TYPE_PROP; import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD; +@Execution(SAME_THREAD) public abstract class BaseIcebergMinioConnectorSmokeTest extends BaseIcebergConnectorSmokeTest { private final String schemaName; private final String bucketName; - private HiveMinioDataLake hiveMinioDataLake; + private Hive3MinioDataLake hiveMinioDataLake; protected BaseIcebergMinioConnectorSmokeTest(FileFormat format) { @@ -60,7 +75,7 @@ protected BaseIcebergMinioConnectorSmokeTest(FileFormat format) protected QueryRunner createQueryRunner() throws Exception { - this.hiveMinioDataLake = closeAfterClass(new HiveMinioDataLake(bucketName)); + this.hiveMinioDataLake = closeAfterClass(new Hive3MinioDataLake(bucketName)); this.hiveMinioDataLake.start(); return IcebergQueryRunner.builder() @@ -68,9 +83,8 @@ protected QueryRunner createQueryRunner() ImmutableMap.builder() .put("iceberg.file-format", format.name()) .put("iceberg.catalog.type", "HIVE_METASTORE") - .put("hive.metastore.uri", "thrift://" + hiveMinioDataLake.getHiveHadoop().getHiveMetastoreEndpoint()) - .put("hive.metastore-timeout", "1m") // read timed out sometimes happens with the default timeout - .put("fs.hadoop.enabled", "false") + .put("hive.metastore.uri", hiveMinioDataLake.getHiveMetastoreEndpoint().toString()) + .put("hive.metastore.thrift.client.read-timeout", "1m") // read timed out sometimes happens with the default timeout .put("fs.native-s3.enabled", "true") .put("s3.aws-access-key", MINIO_ACCESS_KEY) .put("s3.aws-secret-key", MINIO_SECRET_KEY) @@ -81,6 +95,8 @@ protected QueryRunner createQueryRunner() .put("s3.max-connections", "2") // verify no leaks .put("iceberg.register-table-procedure.enabled", "true") .put("iceberg.writer-sort-buffer-size", "1MB") + .put("iceberg.allowed-extra-properties", "write.metadata.delete-after-commit.enabled,write.metadata.previous-versions-max") + .putAll(getAdditionalIcebergProperties()) .buildOrThrow()) .setSchemaInitializer( SchemaInitializer.builder() @@ -91,6 +107,11 @@ protected QueryRunner createQueryRunner() .build(); } + public Map getAdditionalIcebergProperties() + { + return ImmutableMap.of(); + } + @Override protected String createSchemaSql(String schemaName) { @@ -116,11 +137,10 @@ public void testS3LocationWithTrailingSlash() assertThat(location).doesNotContain("#"); assertUpdate("CREATE TABLE " + tableName + " WITH (location='" + location + "') AS SELECT 1 col", 1); - - List dataFiles = hiveMinioDataLake.getMinioClient().listObjects(bucketName, "/%s/%s/data".formatted(schemaName, tableName)); + List dataFiles = hiveMinioDataLake.getMinioClient().listObjects(bucketName, "%s/%s/data".formatted(schemaName, tableName)); assertThat(dataFiles).isNotEmpty().filteredOn(filePath -> filePath.contains("#")).isEmpty(); - List metadataFiles = hiveMinioDataLake.getMinioClient().listObjects(bucketName, "/%s/%s/metadata".formatted(schemaName, tableName)); + List metadataFiles = hiveMinioDataLake.getMinioClient().listObjects(bucketName, "%s/%s/metadata".formatted(schemaName, tableName)); assertThat(metadataFiles).isNotEmpty().filteredOn(filePath -> filePath.contains("#")).isEmpty(); // Verify ALTER TABLE succeeds https://github.com/trinodb/trino/issues/14552 @@ -155,6 +175,48 @@ public void testMetadataLocationWithDoubleSlash() assertUpdate("DROP TABLE " + tableName); } + @Test + 
void testHiveMetastoreTableParameter() + { + try (TestTable table = newTrinoTable("test_table_params", "(id int)")) { + String snapshotId = getTableParameterValue(table.getName(), "current-snapshot-id"); + String snapshotTimestamp = getTableParameterValue(table.getName(), "current-snapshot-timestamp-ms"); + assertThat(snapshotId).isNotNull(); + assertThat(snapshotTimestamp).isNotNull(); + + assertUpdate("INSERT INTO " + table.getName() + " VALUES 1", 1); + assertThat(getTableParameterValue(table.getName(), "current-snapshot-id")).isNotEqualTo(snapshotId); + assertThat(getTableParameterValue(table.getName(), "current-snapshot-timestamp-ms")).isNotEqualTo(snapshotTimestamp); + } + } + + @Test + void testHiveMetastoreMaterializedParameter() + { + String mvName = "test_mv_params_" + randomNameSuffix(); + try (TestTable table = newTrinoTable("test_mv_params", "(id int)")) { + assertUpdate("CREATE MATERIALIZED VIEW " + mvName + " AS SELECT * FROM " + table.getName()); + String snapshotId = getTableParameterValue(mvName, "current-snapshot-id"); + String snapshotTimestamp = getTableParameterValue(mvName, "current-snapshot-timestamp-ms"); + assertThat(snapshotId).isNotNull(); + assertThat(snapshotTimestamp).isNotNull(); + + assertUpdate("INSERT INTO " + table.getName() + " VALUES 1", 1); + assertUpdate("REFRESH MATERIALIZED VIEW " + mvName, 1); + assertThat(getTableParameterValue(mvName, "current-snapshot-id")).isNotEqualTo(snapshotId); + assertThat(getTableParameterValue(mvName, "current-snapshot-timestamp-ms")).isNotEqualTo(snapshotTimestamp); + } + finally { + assertUpdate("DROP MATERIALIZED VIEW IF EXISTS " + mvName); + } + } + + private String getTableParameterValue(String tableName, String parameterKey) + { + String tableId = onMetastore("SELECT tbl_id FROM TBLS t INNER JOIN DBS db ON t.db_id = db.db_id WHERE db.name = '" + schemaName + "' and t.tbl_name = '" + tableName + "'"); + return onMetastore("SELECT param_value FROM TABLE_PARAMS WHERE param_key = '" + parameterKey + "' AND tbl_id = " + tableId); + } + @Test public void testExpireSnapshotsBatchDeletes() { @@ -173,7 +235,7 @@ public void testExpireSnapshotsBatchDeletes() assertUpdate("INSERT INTO " + tableName + " VALUES ('two', 2)", 1); assertThat(query("SELECT * FROM " + tableName)).matches("VALUES (VARCHAR 'one', 1), (VARCHAR 'two', 2)"); - List initialMetadataFiles = hiveMinioDataLake.getMinioClient().listObjects(bucketName, "/%s/%s/metadata".formatted(schemaName, tableName)); + List initialMetadataFiles = hiveMinioDataLake.getMinioClient().listObjects(bucketName, "%s/%s/metadata".formatted(schemaName, tableName)); assertThat(initialMetadataFiles).isNotEmpty(); List initialSnapshots = getSnapshotIds(tableName); @@ -181,7 +243,7 @@ public void testExpireSnapshotsBatchDeletes() assertQuerySucceeds(sessionWithShortRetentionUnlocked, "ALTER TABLE " + tableName + " EXECUTE EXPIRE_SNAPSHOTS (retention_threshold => '0s')"); - List updatedMetadataFiles = hiveMinioDataLake.getMinioClient().listObjects(bucketName, "/%s/%s/metadata".formatted(schemaName, tableName)); + List updatedMetadataFiles = hiveMinioDataLake.getMinioClient().listObjects(bucketName, "%s/%s/metadata".formatted(schemaName, tableName)); assertThat(updatedMetadataFiles).isNotEmpty().hasSizeLessThan(initialMetadataFiles.size()); List updatedSnapshots = getSnapshotIds(tableName); @@ -190,14 +252,62 @@ public void testExpireSnapshotsBatchDeletes() assertThat(query("SELECT * FROM " + tableName)) .matches("VALUES (VARCHAR 'one', 1), (VARCHAR 'two', 2)"); 
assertThat(events).hasSize(3); - // if files were deleted in batch there should be only one request id because there was one request only + // since we have delegated the batch delete operation to iceberg there are two requests. + // the first request is for data and the second is for statistics files. + // https://github.com/apache/iceberg/blob/9a23420592e2b8be5f792c8e6eb32a64e92e4088/core/src/main/java/org/apache/iceberg/IncrementalFileCleanup.java#L58 assertThat(events.stream() .map(event -> event.responseElements().get("x-amz-request-id")) - .collect(toImmutableSet())).hasSize(1); + .collect(toImmutableSet())).hasSize(2); assertUpdate("DROP TABLE " + tableName); } + @Test + public void testPathContainsSpecialCharacter() + { + String tableName = "test_path_special_character" + randomNameSuffix(); + String location = "s3://%s/%s/%s/".formatted(bucketName, schemaName, tableName); + assertUpdate(format( + "CREATE TABLE %s (id bigint, part varchar) WITH (partitioning = ARRAY['part'], location='%s')", + tableName, + location)); + + String values = "(1, 'with-hyphen')," + + "(2, 'with.dot')," + + "(3, 'with:colon')," + + "(4, 'with/slash')," + + "(5, 'with\\\\backslashes')," + + "(6, 'with\\backslash')," + + "(7, 'with=equal')," + + "(8, 'with?question')," + + "(9, 'with!exclamation')," + + "(10, 'with%percent')," + + "(11, 'with%%percents')," + + "(12, 'with space')"; + assertUpdate("INSERT INTO " + tableName + " VALUES " + values, 12); + assertQuery("SELECT * FROM " + tableName, "VALUES " + values); + assertUpdate("DROP TABLE " + tableName); + } + + @Override + protected AutoCloseable createSparkIcebergTable(String schema) + { + HiveMetastore metastore = getHiveMetastore(getQueryRunner()); + // simulate iceberg table created by spark with lowercase table type + Table lowerCaseTableType = io.trino.metastore.Table.builder() + .setDatabaseName(schema) + .setTableName("lowercase_type_" + randomNameSuffix()) + .setOwner(Optional.empty()) + .setDataColumns(ImmutableList.of(new Column("id", HiveType.HIVE_STRING, Optional.empty(), ImmutableMap.of()))) + .setTableType(EXTERNAL_TABLE.name()) + .withStorage(storage -> storage.setStorageFormat(ICEBERG_METASTORE_STORAGE_FORMAT)) + .setParameter("EXTERNAL", "TRUE") + .setParameter(TABLE_TYPE_PROP, ICEBERG_TABLE_TYPE_VALUE.toLowerCase(ENGLISH)) + .build(); + metastore.createTable(lowerCaseTableType, NO_PRIVILEGES); + return () -> metastore.dropTable(lowerCaseTableType.getDatabaseName(), lowerCaseTableType.getTableName(), true); + } + private String onMetastore(@Language("SQL") String sql) { return hiveMinioDataLake.getHiveHadoop().runOnMetastore(sql); @@ -223,8 +333,8 @@ protected void dropTableFromMetastore(String tableName) { HiveMetastore metastore = new BridgingHiveMetastore( testingThriftHiveMetastoreBuilder() - .metastoreClient(hiveMinioDataLake.getHiveHadoop().getHiveMetastoreEndpoint()) - .build()); + .metastoreClient(hiveMinioDataLake.getHiveMetastoreEndpoint()) + .build(this::closeAfterClass)); metastore.dropTable(schemaName, tableName, false); assertThat(metastore.getTable(schemaName, tableName)).isEmpty(); } @@ -234,8 +344,8 @@ protected String getMetadataLocation(String tableName) { HiveMetastore metastore = new BridgingHiveMetastore( testingThriftHiveMetastoreBuilder() - .metastoreClient(hiveMinioDataLake.getHiveHadoop().getHiveMetastoreEndpoint()) - .build()); + .metastoreClient(hiveMinioDataLake.getHiveMetastoreEndpoint()) + .build(this::closeAfterClass)); return metastore .getTable(schemaName, tableName).orElseThrow() 
.getParameters().get("metadata_location"); diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/BaseIcebergSystemTables.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/BaseIcebergSystemTables.java index 32697ad6090e..98434d18b652 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/BaseIcebergSystemTables.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/BaseIcebergSystemTables.java @@ -13,30 +13,58 @@ */ package io.trino.plugin.iceberg; +import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; +import io.trino.filesystem.TrinoFileSystemFactory; +import io.trino.metastore.HiveMetastore; +import io.trino.spi.type.ArrayType; import io.trino.testing.AbstractTestQueryFramework; import io.trino.testing.DistributedQueryRunner; import io.trino.testing.MaterializedResult; import io.trino.testing.MaterializedRow; -import org.testng.annotations.AfterClass; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; +import io.trino.testing.QueryRunner; +import io.trino.testing.sql.TestTable; +import org.apache.iceberg.BaseTable; +import org.apache.iceberg.FileContent; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.Table; +import org.intellij.lang.annotations.Language; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.TestInstance; import java.time.LocalDate; +import java.util.ArrayList; +import java.util.List; import java.util.Map; +import java.util.Optional; import java.util.function.Function; import static com.google.common.collect.ImmutableMap.toImmutableMap; +import static io.trino.plugin.iceberg.IcebergFileFormat.ORC; import static io.trino.plugin.iceberg.IcebergFileFormat.PARQUET; +import static io.trino.plugin.iceberg.IcebergTestUtils.getFileSystemFactory; +import static io.trino.plugin.iceberg.IcebergTestUtils.getHiveMetastore; +import static io.trino.plugin.iceberg.util.EqualityDeleteUtils.writeEqualityDeleteForTable; +import static io.trino.spi.type.BigintType.BIGINT; import static io.trino.testing.MaterializedResult.DEFAULT_PRECISION; +import static io.trino.testing.MaterializedResult.resultBuilder; +import static java.util.Locale.ENGLISH; +import static java.util.Map.entry; import static java.util.Objects.requireNonNull; +import static org.apache.iceberg.MetadataColumns.DELETE_FILE_PATH; +import static org.apache.iceberg.MetadataColumns.DELETE_FILE_POS; import static org.assertj.core.api.Assertions.assertThat; -import static org.testng.Assert.assertEquals; +import static org.junit.jupiter.api.TestInstance.Lifecycle.PER_CLASS; +@TestInstance(PER_CLASS) public abstract class BaseIcebergSystemTables extends AbstractTestQueryFramework { private final IcebergFileFormat format; + private HiveMetastore metastore; + private TrinoFileSystemFactory fileSystemFactory; protected BaseIcebergSystemTables(IcebergFileFormat format) { @@ -44,15 +72,18 @@ protected BaseIcebergSystemTables(IcebergFileFormat format) } @Override - protected DistributedQueryRunner createQueryRunner() + protected QueryRunner createQueryRunner() throws Exception { - return IcebergQueryRunner.builder() + DistributedQueryRunner queryRunner = IcebergQueryRunner.builder() .setIcebergProperties(ImmutableMap.of("iceberg.file-format", format.name())) .build(); + metastore = getHiveMetastore(queryRunner); + fileSystemFactory = getFileSystemFactory(queryRunner); + return queryRunner; } - 
@BeforeClass + @BeforeAll public void setUp() { assertUpdate("CREATE SCHEMA test_schema"); @@ -90,7 +121,7 @@ public void setUp() assertQuery("SELECT count(*) FROM test_schema.test_table_with_dml", "VALUES 7"); } - @AfterClass(alwaysRun = true) + @AfterAll public void tearDown() { assertUpdate("DROP TABLE IF EXISTS test_schema.test_table"); @@ -98,11 +129,12 @@ public void tearDown() assertUpdate("DROP TABLE IF EXISTS test_schema.test_table_drop_column"); assertUpdate("DROP TABLE IF EXISTS test_schema.test_table_nan"); assertUpdate("DROP TABLE IF EXISTS test_schema.test_table_with_dml"); + assertUpdate("DROP TABLE IF EXISTS test_schema.test_metadata_log_entries"); assertUpdate("DROP SCHEMA IF EXISTS test_schema"); } @Test - public void testPartitionTable() + public void testPartitionsTable() { assertQuery("SELECT count(*) FROM test_schema.test_table", "VALUES 6"); assertQuery("SHOW COLUMNS FROM test_schema.\"test_table$partitions\"", @@ -113,74 +145,82 @@ public void testPartitionTable() "('data', 'row(_bigint row(min bigint, max bigint, null_count bigint, nan_count bigint))', '', '')"); MaterializedResult result = computeActual("SELECT * from test_schema.\"test_table$partitions\""); - assertEquals(result.getRowCount(), 3); + assertThat(result.getRowCount()).isEqualTo(3); Map rowsByPartition = result.getMaterializedRows().stream() .collect(toImmutableMap(row -> ((LocalDate) ((MaterializedRow) row.getField(0)).getField(0)), Function.identity())); // Test if row counts are computed correctly - assertEquals(rowsByPartition.get(LocalDate.parse("2019-09-08")).getField(1), 1L); - assertEquals(rowsByPartition.get(LocalDate.parse("2019-09-09")).getField(1), 3L); - assertEquals(rowsByPartition.get(LocalDate.parse("2019-09-10")).getField(1), 2L); + assertThat(rowsByPartition.get(LocalDate.parse("2019-09-08")).getField(1)).isEqualTo(1L); + assertThat(rowsByPartition.get(LocalDate.parse("2019-09-09")).getField(1)).isEqualTo(3L); + assertThat(rowsByPartition.get(LocalDate.parse("2019-09-10")).getField(1)).isEqualTo(2L); // Test if min/max values, null value count and nan value count are computed correctly. 
- assertEquals(rowsByPartition.get(LocalDate.parse("2019-09-08")).getField(4), new MaterializedRow(DEFAULT_PRECISION, new MaterializedRow(DEFAULT_PRECISION, 0L, 0L, 0L, null))); - assertEquals(rowsByPartition.get(LocalDate.parse("2019-09-09")).getField(4), new MaterializedRow(DEFAULT_PRECISION, new MaterializedRow(DEFAULT_PRECISION, 1L, 3L, 0L, null))); - assertEquals(rowsByPartition.get(LocalDate.parse("2019-09-10")).getField(4), new MaterializedRow(DEFAULT_PRECISION, new MaterializedRow(DEFAULT_PRECISION, 4L, 5L, 0L, null))); + assertThat(rowsByPartition.get(LocalDate.parse("2019-09-08")).getField(4)).isEqualTo(new MaterializedRow(DEFAULT_PRECISION, new MaterializedRow(DEFAULT_PRECISION, 0L, 0L, 0L, null))); + assertThat(rowsByPartition.get(LocalDate.parse("2019-09-09")).getField(4)).isEqualTo(new MaterializedRow(DEFAULT_PRECISION, new MaterializedRow(DEFAULT_PRECISION, 1L, 3L, 0L, null))); + assertThat(rowsByPartition.get(LocalDate.parse("2019-09-10")).getField(4)).isEqualTo(new MaterializedRow(DEFAULT_PRECISION, new MaterializedRow(DEFAULT_PRECISION, 4L, 5L, 0L, null))); } @Test - public void testPartitionTableWithNan() + public void testPartitionsTableWithNan() { assertQuery("SELECT count(*) FROM test_schema.test_table_nan", "VALUES 6"); MaterializedResult result = computeActual("SELECT * from test_schema.\"test_table_nan$partitions\""); - assertEquals(result.getRowCount(), 4); + assertThat(result.getRowCount()).isEqualTo(4); Map rowsByPartition = result.getMaterializedRows().stream() .collect(toImmutableMap(row -> ((LocalDate) ((MaterializedRow) row.getField(0)).getField(0)), Function.identity())); // Test if row counts are computed correctly - assertEquals(rowsByPartition.get(LocalDate.parse("2022-01-01")).getField(1), 1L); - assertEquals(rowsByPartition.get(LocalDate.parse("2022-01-02")).getField(1), 1L); - assertEquals(rowsByPartition.get(LocalDate.parse("2022-01-03")).getField(1), 1L); - assertEquals(rowsByPartition.get(LocalDate.parse("2022-01-04")).getField(1), 3L); + assertThat(rowsByPartition.get(LocalDate.parse("2022-01-01")).getField(1)).isEqualTo(1L); + assertThat(rowsByPartition.get(LocalDate.parse("2022-01-02")).getField(1)).isEqualTo(1L); + assertThat(rowsByPartition.get(LocalDate.parse("2022-01-03")).getField(1)).isEqualTo(1L); + assertThat(rowsByPartition.get(LocalDate.parse("2022-01-04")).getField(1)).isEqualTo(3L); // Test if min/max values, null value count and nan value count are computed correctly. 
- assertEquals( - rowsByPartition.get(LocalDate.parse("2022-01-01")).getField(4), - new MaterializedRow(DEFAULT_PRECISION, - new MaterializedRow(DEFAULT_PRECISION, 1L, 1L, 0L, null), - new MaterializedRow(DEFAULT_PRECISION, 1.1d, 1.1d, 0L, null), - new MaterializedRow(DEFAULT_PRECISION, 1.2f, 1.2f, 0L, null))); - assertEquals( - rowsByPartition.get(LocalDate.parse("2022-01-02")).getField(4), - new MaterializedRow(DEFAULT_PRECISION, - new MaterializedRow(DEFAULT_PRECISION, 2L, 2L, 0L, null), - new MaterializedRow(DEFAULT_PRECISION, null, null, 0L, nanCount(1L)), - new MaterializedRow(DEFAULT_PRECISION, 2.2f, 2.2f, 0L, null))); - assertEquals( - rowsByPartition.get(LocalDate.parse("2022-01-03")).getField(4), - new MaterializedRow(DEFAULT_PRECISION, - new MaterializedRow(DEFAULT_PRECISION, 3L, 3L, 0L, null), - new MaterializedRow(DEFAULT_PRECISION, 3.3, 3.3d, 0L, null), - new MaterializedRow(DEFAULT_PRECISION, null, null, 0L, nanCount(1L)))); - assertEquals( - rowsByPartition.get(LocalDate.parse("2022-01-04")).getField(4), - new MaterializedRow(DEFAULT_PRECISION, - new MaterializedRow(DEFAULT_PRECISION, 4L, 6L, 0L, null), - new MaterializedRow(DEFAULT_PRECISION, null, null, 0L, nanCount(2L)), - new MaterializedRow(DEFAULT_PRECISION, null, null, 0L, nanCount(2L)))); + assertThat(rowsByPartition.get(LocalDate.parse("2022-01-01")).getField(4)).isEqualTo(new MaterializedRow(DEFAULT_PRECISION, + new MaterializedRow(DEFAULT_PRECISION, 1L, 1L, 0L, null), + new MaterializedRow(DEFAULT_PRECISION, 1.1d, 1.1d, 0L, null), + new MaterializedRow(DEFAULT_PRECISION, 1.2f, 1.2f, 0L, null))); + assertThat(rowsByPartition.get(LocalDate.parse("2022-01-02")).getField(4)).isEqualTo(new MaterializedRow(DEFAULT_PRECISION, + new MaterializedRow(DEFAULT_PRECISION, 2L, 2L, 0L, null), + new MaterializedRow(DEFAULT_PRECISION, null, null, 0L, nanCount(1L)), + new MaterializedRow(DEFAULT_PRECISION, 2.2f, 2.2f, 0L, null))); + assertThat(rowsByPartition.get(LocalDate.parse("2022-01-03")).getField(4)).isEqualTo(new MaterializedRow(DEFAULT_PRECISION, + new MaterializedRow(DEFAULT_PRECISION, 3L, 3L, 0L, null), + new MaterializedRow(DEFAULT_PRECISION, 3.3, 3.3d, 0L, null), + new MaterializedRow(DEFAULT_PRECISION, null, null, 0L, nanCount(1L)))); + assertThat(rowsByPartition.get(LocalDate.parse("2022-01-04")).getField(4)).isEqualTo(new MaterializedRow(DEFAULT_PRECISION, + new MaterializedRow(DEFAULT_PRECISION, 4L, 6L, 0L, null), + new MaterializedRow(DEFAULT_PRECISION, null, null, 0L, nanCount(2L)), + new MaterializedRow(DEFAULT_PRECISION, null, null, 0L, nanCount(2L)))); } @Test - public void testPartitionTableOnDropColumn() + public void testPartitionsTableAfterAddColumn() + { + try (TestTable table = newTrinoTable("test_partitions_new_column", "AS SELECT 1 col")) { + assertThat(computeScalar("SELECT data.col FROM \"" + table.getName() + "$partitions\"")) + .isEqualTo(new MaterializedRow(DEFAULT_PRECISION, 1, 1, 0L, null)); + + assertUpdate("ALTER TABLE " + table.getName() + " ADD COLUMN new_col int"); + + assertThat(computeScalar("SELECT data.col FROM \"" + table.getName() + "$partitions\"")) + .isEqualTo(new MaterializedRow(DEFAULT_PRECISION, 1, 1, 0L, null)); + assertThat(computeScalar("SELECT data.new_col FROM \"" + table.getName() + "$partitions\"")) + .isNull(); + } + } + + @Test + public void testPartitionsTableOnDropColumn() { MaterializedResult resultAfterDrop = computeActual("SELECT * from test_schema.\"test_table_drop_column$partitions\""); - assertEquals(resultAfterDrop.getRowCount(), 3); + 
assertThat(resultAfterDrop.getRowCount()).isEqualTo(3); Map rowsByPartitionAfterDrop = resultAfterDrop.getMaterializedRows().stream() .collect(toImmutableMap(row -> ((LocalDate) ((MaterializedRow) row.getField(0)).getField(0)), Function.identity())); - assertEquals(rowsByPartitionAfterDrop.get(LocalDate.parse("2019-09-08")).getField(4), new MaterializedRow(DEFAULT_PRECISION, + assertThat(rowsByPartitionAfterDrop.get(LocalDate.parse("2019-09-08")).getField(4)).isEqualTo(new MaterializedRow(DEFAULT_PRECISION, new MaterializedRow(DEFAULT_PRECISION, 0L, 0L, 0L, null))); } @@ -203,6 +243,77 @@ public void testHistoryTable() assertQuery("SELECT count(*) FROM test_schema.\"test_table$history\"", "VALUES 3"); } + @Test + public void testMetadataLogEntriesTable() + { + assertQuery("SHOW COLUMNS FROM test_schema.\"test_table$metadata_log_entries\"", + "VALUES ('timestamp', 'timestamp(3) with time zone', '', '')," + + "('file', 'varchar', '', '')," + + "('latest_snapshot_id', 'bigint', '', '')," + + "('latest_schema_id', 'integer', '', '')," + + "('latest_sequence_number', 'bigint', '', '')"); + + List latestSchemaIds = new ArrayList<>(); + List latestSequenceNumbers = new ArrayList<>(); + + assertUpdate("CREATE TABLE test_schema.test_metadata_log_entries (c1 BIGINT)"); + latestSchemaIds.add(0); + latestSequenceNumbers.add(1L); + assertMetadataLogEntries(latestSchemaIds, latestSequenceNumbers); + + assertUpdate("INSERT INTO test_schema.test_metadata_log_entries VALUES (1)", 1); + // INSERT create two commits (https://github.com/trinodb/trino/issues/15439) and share a same snapshotId + latestSchemaIds.add(0); + latestSchemaIds.add(0); + latestSequenceNumbers.add(2L); + latestSequenceNumbers.add(2L); + assertMetadataLogEntries(latestSchemaIds, latestSequenceNumbers); + + assertUpdate("ALTER TABLE test_schema.test_metadata_log_entries ADD COLUMN c2 VARCHAR"); + latestSchemaIds.add(0); + latestSequenceNumbers.add(2L); + assertMetadataLogEntries(latestSchemaIds, latestSequenceNumbers); + + assertUpdate("DELETE FROM test_schema.test_metadata_log_entries WHERE c1 = 1", 1); + latestSchemaIds.add(1); + latestSequenceNumbers.add(3L); + assertMetadataLogEntries(latestSchemaIds, latestSequenceNumbers); + + // OPTIMIZE create two commits: update snapshot and rewrite statistics + assertUpdate("ALTER TABLE test_schema.test_metadata_log_entries execute optimize"); + latestSchemaIds.add(1); + latestSchemaIds.add(1); + latestSequenceNumbers.add(4L); + latestSequenceNumbers.add(4L); + assertMetadataLogEntries(latestSchemaIds, latestSequenceNumbers); + + assertUpdate("CREATE OR REPLACE TABLE test_schema.test_metadata_log_entries (c3 INTEGER)"); + latestSchemaIds.add(2); + latestSequenceNumbers.add(5L); + assertMetadataLogEntries(latestSchemaIds, latestSequenceNumbers); + + assertUpdate("INSERT INTO test_schema.test_metadata_log_entries VALUES (1)", 1); + latestSchemaIds.add(2); + latestSequenceNumbers.add(6L); + latestSchemaIds.add(2); + latestSequenceNumbers.add(6L); + assertMetadataLogEntries(latestSchemaIds, latestSequenceNumbers); + + assertUpdate("DROP TABLE IF EXISTS test_schema.test_metadata_log_entries"); + } + + private void assertMetadataLogEntries(List latestSchemaIds, List latestSequenceNumbers) + { + MaterializedResult result = computeActual("SELECT latest_schema_id, latest_sequence_number FROM test_schema.\"test_metadata_log_entries$metadata_log_entries\" ORDER BY timestamp"); + List materializedRows = result.getMaterializedRows(); + + assertThat(result.getRowCount()).isEqualTo(latestSchemaIds.size()); + 
for (int i = 0; i < result.getRowCount(); i++) { + assertThat(materializedRows.get(i).getField(0)).isEqualTo(latestSchemaIds.get(i)); + assertThat(materializedRows.get(i).getField(1)).isEqualTo(latestSequenceNumbers.get(i)); + } + } + @Test public void testSnapshotsTable() { @@ -218,6 +329,54 @@ public void testSnapshotsTable() assertQuery("SELECT summary['total-records'] FROM test_schema.\"test_table$snapshots\"", "VALUES '0', '3', '6'"); } + @Test + void testAllManifests() + { + try (TestTable table = newTrinoTable("test_all_manifests", "(x) AS VALUES 1, 2")) { + assertThat(query("SHOW COLUMNS FROM \"" + table.getName() + "$all_manifests\"")) + .skippingTypesCheck() + .matches("VALUES " + + "('path', 'varchar', '', '')," + + "('length', 'bigint', '', '')," + + "('partition_spec_id', 'integer', '', '')," + + "('added_snapshot_id', 'bigint', '', '')," + + "('added_data_files_count', 'integer', '', '')," + + "('existing_data_files_count', 'integer', '', '')," + + "('deleted_data_files_count', 'integer', '', '')," + + "('added_delete_files_count', 'integer', '', '')," + + "('existing_delete_files_count', 'integer', '', '')," + + "('deleted_delete_files_count', 'integer', '', '')," + + "('partition_summaries', 'array(row(contains_null boolean, contains_nan boolean, lower_bound varchar, upper_bound varchar))', '', '')," + + "('reference_snapshot_id', 'bigint', '', '')"); + + assertThat((String) computeScalar("SELECT path FROM \"" + table.getName() + "$all_manifests\"")).endsWith("-m0.avro"); + assertThat((Long) computeScalar("SELECT length FROM \"" + table.getName() + "$all_manifests\"")).isPositive(); + assertThat((Integer) computeScalar("SELECT partition_spec_id FROM \"" + table.getName() + "$all_manifests\"")).isZero(); + assertThat((Long) computeScalar("SELECT added_snapshot_id FROM \"" + table.getName() + "$all_manifests\"")).isPositive(); + assertThat((Integer) computeScalar("SELECT added_data_files_count FROM \"" + table.getName() + "$all_manifests\"")).isEqualTo(1); + assertThat((Integer) computeScalar("SELECT existing_data_files_count FROM \"" + table.getName() + "$all_manifests\"")).isZero(); + assertThat((Integer) computeScalar("SELECT deleted_data_files_count FROM \"" + table.getName() + "$all_manifests\"")).isZero(); + assertThat((Integer) computeScalar("SELECT added_delete_files_count FROM \"" + table.getName() + "$all_manifests\"")).isZero(); + assertThat((Integer) computeScalar("SELECT existing_delete_files_count FROM \"" + table.getName() + "$all_manifests\"")).isZero(); + assertThat((Integer) computeScalar("SELECT deleted_delete_files_count FROM \"" + table.getName() + "$all_manifests\"")).isZero(); + assertThat((List) computeScalar("SELECT partition_summaries FROM \"" + table.getName() + "$all_manifests\"")).isEmpty(); + assertThat((Long) computeScalar("SELECT reference_snapshot_id FROM \"" + table.getName() + "$all_manifests\"")).isPositive(); + + assertUpdate("DELETE FROM " + table.getName() + " WHERE x = 1", 1); + assertThat((Long) computeScalar("SELECT count(1) FROM \"" + table.getName() + "$all_manifests\"")).isEqualTo(3); + assertThat((Long) computeScalar("SELECT count(1) FROM \"" + table.getName() + "$all_manifests\" WHERE added_delete_files_count > 0")).isEqualTo(1); + } + } + + @Test + void testAllManifestsWithPartitionTable() + { + try (TestTable table = newTrinoTable("test_all_manifests", "WITH (partitioning = ARRAY['dt']) AS SELECT 1 x, DATE '2021-01-01' dt")) { + assertThat(query("SELECT partition_summaries FROM \"" + table.getName() + "$all_manifests\"")) + 
.matches("VALUES CAST(ARRAY[ROW(false, false, VARCHAR '2021-01-01', VARCHAR '2021-01-01')] AS array(row(contains_null boolean, contains_nan boolean, lower_bound varchar, upper_bound varchar)))"); + } + } + @Test public void testManifestsTable() { @@ -232,22 +391,22 @@ public void testManifestsTable() "('existing_rows_count', 'bigint', '', '')," + "('deleted_data_files_count', 'integer', '', '')," + "('deleted_rows_count', 'bigint', '', '')," + - "('partitions', 'array(row(contains_null boolean, contains_nan boolean, lower_bound varchar, upper_bound varchar))', '', '')"); + "('partition_summaries', 'array(row(contains_null boolean, contains_nan boolean, lower_bound varchar, upper_bound varchar))', '', '')"); assertQuerySucceeds("SELECT * FROM test_schema.\"test_table$manifests\""); - assertThat(query("SELECT added_data_files_count, existing_rows_count, added_rows_count, deleted_data_files_count, deleted_rows_count, partitions FROM test_schema.\"test_table$manifests\"")) + assertThat(query("SELECT added_data_files_count, existing_rows_count, added_rows_count, deleted_data_files_count, deleted_rows_count, partition_summaries FROM test_schema.\"test_table$manifests\"")) .matches( "VALUES " + " (2, BIGINT '0', BIGINT '3', 0, BIGINT '0', CAST(ARRAY[ROW(false, false, '2019-09-08', '2019-09-09')] AS array(row(contains_null boolean, contains_nan boolean, lower_bound varchar, upper_bound varchar)))) , " + " (2, BIGINT '0', BIGINT '3', 0, BIGINT '0', CAST(ARRAY[ROW(false, false, '2019-09-09', '2019-09-10')] AS array(row(contains_null boolean, contains_nan boolean, lower_bound varchar, upper_bound varchar))))"); assertQuerySucceeds("SELECT * FROM test_schema.\"test_table_multilevel_partitions$manifests\""); - assertThat(query("SELECT added_data_files_count, existing_rows_count, added_rows_count, deleted_data_files_count, deleted_rows_count, partitions FROM test_schema.\"test_table_multilevel_partitions$manifests\"")) + assertThat(query("SELECT added_data_files_count, existing_rows_count, added_rows_count, deleted_data_files_count, deleted_rows_count, partition_summaries FROM test_schema.\"test_table_multilevel_partitions$manifests\"")) .matches( "VALUES " + "(3, BIGINT '0', BIGINT '3', 0, BIGINT '0', CAST(ARRAY[ROW(false, false, '0', '1'), ROW(false, false, '2019-09-08', '2019-09-09')] AS array(row(contains_null boolean, contains_nan boolean, lower_bound varchar, upper_bound varchar))))"); assertQuerySucceeds("SELECT * FROM test_schema.\"test_table_with_dml$manifests\""); - assertThat(query("SELECT added_data_files_count, existing_rows_count, added_rows_count, deleted_data_files_count, deleted_rows_count, partitions FROM test_schema.\"test_table_with_dml$manifests\"")) + assertThat(query("SELECT added_data_files_count, existing_rows_count, added_rows_count, deleted_data_files_count, deleted_rows_count, partition_summaries FROM test_schema.\"test_table_with_dml$manifests\"")) .matches( "VALUES " + // INSERT on '2022-01-01', '2022-02-02', '2022-03-03' partitions @@ -263,11 +422,24 @@ public void testManifestsTable() @Test public void testFilesTable() + { + try (TestTable table = newTrinoTable("test_files_table", "AS SELECT 1 x")) { + MaterializedResult result = computeActual("DESCRIBE " + table.getName()); + assertThat(result.getMaterializedRows().stream().map(row -> (String) row.getField(0))) + .doesNotContain("partition"); + assertQuerySucceeds("SELECT * FROM \"" + table.getName() + "$files\""); + } + } + + @Test + public void testFilesPartitionTable() { assertQuery("SHOW COLUMNS FROM 
test_schema.\"test_table$files\"", "VALUES ('content', 'integer', '', '')," + "('file_path', 'varchar', '', '')," + "('file_format', 'varchar', '', '')," + + "('spec_id', 'integer', '', '')," + + "('partition', 'row(_date date)', '', '')," + "('record_count', 'bigint', '', '')," + "('file_size_in_bytes', 'bigint', '', '')," + "('column_sizes', 'map(integer, bigint)', '', '')," + @@ -278,8 +450,420 @@ public void testFilesTable() "('upper_bounds', 'map(integer, varchar)', '', '')," + "('key_metadata', 'varbinary', '', '')," + "('split_offsets', 'array(bigint)', '', '')," + - "('equality_ids', 'array(integer)', '', '')"); + "('equality_ids', 'array(integer)', '', '')," + + "('sort_order_id', 'integer', '', '')," + + "('readable_metrics', 'json', '', '')"); assertQuerySucceeds("SELECT * FROM test_schema.\"test_table$files\""); + + long offset = format == PARQUET ? 4L : 3L; + assertThat(computeActual("SELECT split_offsets FROM test_schema.\"test_table$files\"")) + .isEqualTo(resultBuilder(getSession(), ImmutableList.of(new ArrayType(BIGINT))) + .row(ImmutableList.of(offset)) + .row(ImmutableList.of(offset)) + .row(ImmutableList.of(offset)) + .row(ImmutableList.of(offset)) + .build()); + } + + @Test + void testFilesTableReadableMetrics() + { + testFilesTableReadableMetrics( + "boolean", + "VALUES true, false, NULL", + "{\"x\":{\"column_size\":" + columnSize(33) + ",\"value_count\":3,\"null_value_count\":1,\"nan_value_count\":null,\"lower_bound\":false,\"upper_bound\":true}}"); + testFilesTableReadableMetrics( + "int", + "VALUES -1, 1", + "{\"x\":{\"column_size\":" + columnSize(40) + ",\"value_count\":2,\"null_value_count\":0,\"nan_value_count\":null,\"lower_bound\":-1,\"upper_bound\":1}}"); + testFilesTableReadableMetrics( + "bigint", + "VALUES -123, 999", + "{\"x\":{\"column_size\":" + columnSize(48) + ",\"value_count\":2,\"null_value_count\":0,\"nan_value_count\":null,\"lower_bound\":-123,\"upper_bound\":999}}"); + testFilesTableReadableMetrics( + "real", + "VALUES -1.1, 1.1, nan()", + "{\"x\":{\"column_size\":" + columnSize(44) + ",\"value_count\":3,\"null_value_count\":0,\"nan_value_count\":" + nanCount(1) + ",\"lower_bound\":null,\"upper_bound\":null}}"); + testFilesTableReadableMetrics( + "double", + "VALUES -1.1, 1.1, nan()", + "{\"x\":{\"column_size\":" + columnSize(53) + ",\"value_count\":3,\"null_value_count\":0,\"nan_value_count\":" + nanCount(1) + ",\"lower_bound\":null,\"upper_bound\":null}}"); + testFilesTableReadableMetrics( + "decimal(3,1)", + "VALUES -3.14, 3.14", + "{\"x\":{\"column_size\":" + columnSize(40) + ",\"value_count\":2,\"null_value_count\":0,\"nan_value_count\":null,\"lower_bound\":\"-3.1\",\"upper_bound\":\"3.1\"}}"); + testFilesTableReadableMetrics( + "date", + "VALUES DATE '1960-01-01', DATE '9999-12-31'", + "{\"x\":{\"column_size\":" + columnSize(40) + ",\"value_count\":2,\"null_value_count\":0,\"nan_value_count\":null,\"lower_bound\":\"1960-01-01\",\"upper_bound\":\"9999-12-31\"}}"); + testFilesTableReadableMetrics( + "time", + "VALUES TIME '00:00:00.000', TIME '12:34:56.999999'", + "{\"x\":{\"column_size\":" + columnSize(48) + ",\"value_count\":2,\"null_value_count\":0,\"nan_value_count\":null,\"lower_bound\":\"00:00:00\",\"upper_bound\":\"12:34:56.999999\"}}"); + testFilesTableReadableMetrics( + "timestamp", + "VALUES TIMESTAMP '1960-01-01 00:00:00', TIMESTAMP '9999-12-31 12:34:56.999999'", + "{\"x\":{\"column_size\":" + columnSize(48) + 
",\"value_count\":2,\"null_value_count\":0,\"nan_value_count\":null,\"lower_bound\":\"1960-01-01T00:00:00\",\"upper_bound\":\"9999-12-31T12:34:56.999999\"}}"); + testFilesTableReadableMetrics( + "timestamp with time zone", + "VALUES TIMESTAMP '1960-01-01 00:00:00 UTC', TIMESTAMP '9999-12-31 12:34:56.999999 UTC'", + "{\"x\":{\"column_size\":" + columnSize(48) + ",\"value_count\":2,\"null_value_count\":0,\"nan_value_count\":null,\"lower_bound\":\"1960-01-01T00:00:00+00:00\",\"upper_bound\":\"9999-12-31T12:34:56.999999+00:00\"}}"); + testFilesTableReadableMetrics( + "varchar", + "VALUES 'alice', 'bob'", + "{\"x\":{\"column_size\":" + columnSize(48) + ",\"value_count\":2,\"null_value_count\":0,\"nan_value_count\":null,\"lower_bound\":\"alice\",\"upper_bound\":\"bob\"}}"); + testFilesTableReadableMetrics( + "uuid", + "VALUES UUID '09e1efb9-9e87-465e-abaf-0c67f4841114', UUID '0f2ef2b3-3c5a-4834-ba91-61be53ff8fbb'", + "{\"x\":{\"column_size\":" + columnSize(64) + ",\"value_count\":2,\"null_value_count\":0,\"nan_value_count\":null,\"lower_bound\":" + value("\"09e1efb9-9e87-465e-abaf-0c67f4841114\"", null) + ",\"upper_bound\":" + value("\"0f2ef2b3-3c5a-4834-ba91-61be53ff8fbb\"", null) + "}}"); + testFilesTableReadableMetrics( + "varbinary", + "VALUES x'12', x'34'", + "{\"x\":{\"column_size\":" + columnSize(42) + ",\"value_count\":2,\"null_value_count\":0,\"nan_value_count\":null,\"lower_bound\":" + value("\"12\"", null) + ",\"upper_bound\":" + value("\"34\"", null) + "}}"); + testFilesTableReadableMetrics( + "row(y int)", + "SELECT (CAST(ROW(123) AS ROW(y int)))", + "{\"x.y\":{\"column_size\":" + columnSize(37) + ",\"value_count\":1,\"null_value_count\":0,\"nan_value_count\":null,\"lower_bound\":123,\"upper_bound\":123}}"); + testFilesTableReadableMetrics( + "array(int)", + "VALUES ARRAY[123]", + "{\"x.element\":{\"column_size\":" + columnSize(43) + ",\"value_count\":" + value(1, null) + ",\"null_value_count\":" + value(0, null) + ",\"nan_value_count\":null,\"lower_bound\":null,\"upper_bound\":null}}"); + testFilesTableReadableMetrics( + "map(int, int)", + "VALUES map(ARRAY[1,3], ARRAY[2,4])", + "{" + + "\"x.key\":{\"column_size\":" + columnSize(47) + ",\"value_count\":" + value(2, null) + ",\"null_value_count\":" + value(0, null) + ",\"nan_value_count\":null,\"lower_bound\":null,\"upper_bound\":null}," + + "\"x.value\":{\"column_size\":" + columnSize(47) + ",\"value_count\":" + value(2, null) + ",\"null_value_count\":" + value(0, null) + ",\"nan_value_count\":null,\"lower_bound\":null,\"upper_bound\":null}" + + "}"); + } + + private void testFilesTableReadableMetrics(@Language("SQL") String type, @Language("SQL") String values, @Language("JSON") String... 
readableMetrics) + { + try (TestTable table = newTrinoTable("test_files_table", "(x " + type + ")")) { + getQueryRunner().execute("INSERT INTO " + table.getName() + " " + values); + assertThat(computeActual("SELECT readable_metrics FROM \"" + table.getName() + "$files\"").getOnlyColumnAsSet()) + .containsExactlyInAnyOrder(readableMetrics); + } + } + + @Test + public void testFilesSchemaEvolution() + { + try (TestTable table = newTrinoTable("test_files_table", "WITH (partitioning = ARRAY['part']) AS SELECT 1 x, 2 part")) { + assertThat(query("SELECT partition FROM \"" + table.getName() + "$files\"")) + .matches("SELECT CAST(ROW(2) AS ROW(part int))"); + + assertUpdate("ALTER TABLE " + table.getName() + " ADD COLUMN another_part int"); + assertUpdate("ALTER TABLE " + table.getName() + " SET PROPERTIES partitioning = ARRAY['part', 'another_part']"); + assertThat(query("SELECT partition FROM \"" + table.getName() + "$files\"")) + .matches("SELECT CAST(ROW(2, NULL) AS ROW(part int, another_part int))"); + + assertUpdate("ALTER TABLE " + table.getName() + " RENAME COLUMN part TO part_renamed"); + assertThat(query("SELECT partition FROM \"" + table.getName() + "$files\"")) + .matches("SELECT CAST(ROW(2, NULL) AS ROW(part int, another_part int))"); + } + } + + @Test + public void testFilesNestedPartition() + { + try (TestTable table = newTrinoTable( + "test_files_table", + "WITH (partitioning = ARRAY['\"part.nested\"']) AS SELECT 1 x, CAST(ROW(2) AS ROW(nested int)) part")) { + assertThat(query("SELECT partition.\"part.nested\" FROM \"" + table.getName() + "$files\"")) + .matches("VALUES 2"); + } + } + + @Test + public void testFilesTableWithDelete() + { + assertUpdate("CREATE TABLE test_schema.test_table_with_delete (_bigint BIGINT, _date DATE) WITH (partitioning = ARRAY['_date'])"); + assertUpdate("INSERT INTO test_schema.test_table_with_delete VALUES (0, CAST('2019-09-08' AS DATE)), (1, CAST('2019-09-09' AS DATE)), (2, CAST('2019-09-09' AS DATE))", 3); + assertUpdate("INSERT INTO test_schema.test_table_with_delete VALUES (3, CAST('2019-09-09' AS DATE)), (4, CAST('2019-09-10' AS DATE)), (5, CAST('2019-09-10' AS DATE))", 3); + assertUpdate("DELETE FROM test_schema.test_table_with_delete WHERE _bigint = 5", 1); + assertUpdate("DELETE FROM test_schema.test_table_with_delete WHERE _bigint = 2", 1); + + assertQuery("SELECT count(*) FROM test_schema.test_table_with_delete", "VALUES 4"); + assertQuery("SELECT count(*) FROM test_schema.\"test_table_with_delete$files\" WHERE content = " + FileContent.DATA.id(), "VALUES 4"); + assertQuery("SELECT count(*) FROM test_schema.\"test_table_with_delete$files\" WHERE content = " + FileContent.POSITION_DELETES.id(), "VALUES 2"); + assertQuery("SELECT count(*) FROM test_schema.\"test_table_with_delete$files\" WHERE content = " + FileContent.EQUALITY_DELETES.id(), "VALUES 0"); + + assertUpdate("DROP TABLE IF EXISTS test_schema.test_table_with_delete"); + } + + @Test + void testAllEntriesTable() + { + try (TestTable table = newTrinoTable("test_all_entries", "AS SELECT 1 id, DATE '2014-01-01' dt")) { + assertThat(query("DESCRIBE \"" + table.getName() + "$all_entries\"")) + .matches("DESCRIBE \"" + table.getName() + "$entries\""); + + assertThat(query("SELECT * FROM \"" + table.getName() + "$all_entries\"")) + .matches("SELECT * FROM \"" + table.getName() + "$entries\""); + + assertUpdate("DELETE FROM " + table.getName(), 1); + + assertThat(computeActual("SELECT status FROM \"" + table.getName() + "$all_entries\"").getOnlyColumnAsSet()) + .containsExactly(1, 2); + 
assertThat(computeActual("SELECT status FROM \"" + table.getName() + "$entries\"").getOnlyColumnAsSet()) + .containsExactly(2); + assertThat(query("SELECT * FROM \"" + table.getName() + "$all_entries\" WHERE status = 2")) + .matches("SELECT * FROM \"" + table.getName() + "$entries\""); + } + } + + @Test + void testEntriesTable() + { + try (TestTable table = newTrinoTable("test_entries", "AS SELECT 1 id, DATE '2014-01-01' dt")) { + assertQuery("SHOW COLUMNS FROM \"" + table.getName() + "$entries\"", + "VALUES ('status', 'integer', '', '')," + + "('snapshot_id', 'bigint', '', '')," + + "('sequence_number', 'bigint', '', '')," + + "('file_sequence_number', 'bigint', '', '')," + + "('data_file', 'row(content integer, file_path varchar, file_format varchar, spec_id integer, record_count bigint, file_size_in_bytes bigint, " + + "column_sizes map(integer, bigint), value_counts map(integer, bigint), null_value_counts map(integer, bigint), nan_value_counts map(integer, bigint), " + + "lower_bounds map(integer, varchar), upper_bounds map(integer, varchar), key_metadata varbinary, split_offsets array(bigint), " + + "equality_ids array(integer), sort_order_id integer)', '', '')," + + "('readable_metrics', 'json', '', '')"); + + Table icebergTable = loadTable(table.getName()); + Snapshot snapshot = icebergTable.currentSnapshot(); + long snapshotId = snapshot.snapshotId(); + long sequenceNumber = snapshot.sequenceNumber(); + + assertThat(computeScalar("SELECT status FROM \"" + table.getName() + "$entries\"")) + .isEqualTo(1); + assertThat(computeScalar("SELECT snapshot_id FROM \"" + table.getName() + "$entries\"")) + .isEqualTo(snapshotId); + assertThat(computeScalar("SELECT sequence_number FROM \"" + table.getName() + "$entries\"")) + .isEqualTo(sequenceNumber); + assertThat(computeScalar("SELECT file_sequence_number FROM \"" + table.getName() + "$entries\"")) + .isEqualTo(1L); + + MaterializedRow dataFile = (MaterializedRow) computeScalar("SELECT data_file FROM \"" + table.getName() + "$entries\""); + assertThat(dataFile.getFieldCount()).isEqualTo(16); + assertThat(dataFile.getField(0)).isEqualTo(0); // content + assertThat((String) dataFile.getField(1)).endsWith(format.toString().toLowerCase(ENGLISH)); // file_path + assertThat(dataFile.getField(2)).isEqualTo(format.toString()); // file_format + assertThat(dataFile.getField(3)).isEqualTo(0); // spec_id + assertThat(dataFile.getField(4)).isEqualTo(1L); // record_count + assertThat((long) dataFile.getField(5)).isPositive(); // file_size_in_bytes + assertThat(dataFile.getField(6)).isEqualTo(value(Map.of(1, 36L, 2, 36L), null)); // column_sizes + assertThat(dataFile.getField(7)).isEqualTo(Map.of(1, 1L, 2, 1L)); // value_counts + assertThat(dataFile.getField(8)).isEqualTo(Map.of(1, 0L, 2, 0L)); // null_value_counts + assertThat(dataFile.getField(9)).isEqualTo(value(Map.of(), null)); // nan_value_counts + assertThat(dataFile.getField(10)).isEqualTo(Map.of(1, "1", 2, "2014-01-01")); // lower_bounds + assertThat(dataFile.getField(11)).isEqualTo(Map.of(1, "1", 2, "2014-01-01")); // upper_bounds + assertThat(dataFile.getField(12)).isNull(); // key_metadata + assertThat(dataFile.getField(13)).isEqualTo(List.of(value(4L, 3L))); // split_offsets + assertThat(dataFile.getField(14)).isNull(); // equality_ids + assertThat(dataFile.getField(15)).isEqualTo(0); // sort_order_id + + assertThat(computeScalar("SELECT readable_metrics FROM \"" + table.getName() + "$entries\"")) + .isEqualTo("{" + + "\"dt\":{\"column_size\":" + value(36, null) + 
",\"value_count\":1,\"null_value_count\":0,\"nan_value_count\":null,\"lower_bound\":\"2014-01-01\",\"upper_bound\":\"2014-01-01\"}," + + "\"id\":{\"column_size\":" + value(36, null) + ",\"value_count\":1,\"null_value_count\":0,\"nan_value_count\":null,\"lower_bound\":1,\"upper_bound\":1}" + + "}"); + } + } + + @Test + void testEntriesAfterPositionDelete() + { + try (TestTable table = new TestTable(getQueryRunner()::execute, "test_entries", "AS SELECT 1 id, DATE '2014-01-01' dt")) { + assertUpdate("DELETE FROM " + table.getName() + " WHERE id = 1", 1); + + Table icebergTable = loadTable(table.getName()); + Snapshot snapshot = icebergTable.currentSnapshot(); + long snapshotId = snapshot.snapshotId(); + long sequenceNumber = snapshot.sequenceNumber(); + + assertThat(computeScalar("SELECT status FROM \"" + table.getName() + "$entries\"" + " WHERE snapshot_id = " + snapshotId)) + .isEqualTo(1); + assertThat(computeScalar("SELECT snapshot_id FROM \"" + table.getName() + "$entries\"" + " WHERE snapshot_id = " + snapshotId)) + .isEqualTo(snapshotId); + assertThat(computeScalar("SELECT sequence_number FROM \"" + table.getName() + "$entries\"" + " WHERE snapshot_id = " + snapshotId)) + .isEqualTo(sequenceNumber); + assertThat(computeScalar("SELECT file_sequence_number FROM \"" + table.getName() + "$entries\"" + " WHERE snapshot_id = " + snapshotId)) + .isEqualTo(2L); + + MaterializedRow deleteFile = (MaterializedRow) computeScalar("SELECT data_file FROM \"" + table.getName() + "$entries\"" + " WHERE snapshot_id = " + snapshotId); + assertThat(deleteFile.getFieldCount()).isEqualTo(16); + assertThat(deleteFile.getField(0)).isEqualTo(1); // content + assertThat((String) deleteFile.getField(1)).endsWith(format.toString().toLowerCase(ENGLISH)); // file_path + assertThat(deleteFile.getField(2)).isEqualTo(format.toString()); // file_format + assertThat(deleteFile.getField(3)).isEqualTo(0); // spec_id + assertThat(deleteFile.getField(4)).isEqualTo(1L); // record_count + assertThat((long) deleteFile.getField(5)).isPositive(); // file_size_in_bytes + + //noinspection unchecked + Map columnSizes = (Map) deleteFile.getField(6); + switch (format) { + case ORC -> assertThat(columnSizes).isNull(); + case PARQUET -> assertThat(columnSizes) + .hasSize(2) + .satisfies(_ -> assertThat(columnSizes.get(DELETE_FILE_POS.fieldId())).isPositive()) + .satisfies(_ -> assertThat(columnSizes.get(DELETE_FILE_PATH.fieldId())).isPositive()); + default -> throw new IllegalArgumentException("Unsupported format: " + format); + } + + assertThat(deleteFile.getField(7)).isEqualTo(Map.of(DELETE_FILE_POS.fieldId(), 1L, DELETE_FILE_PATH.fieldId(), 1L)); // value_counts + assertThat(deleteFile.getField(8)).isEqualTo(Map.of(DELETE_FILE_POS.fieldId(), 0L, DELETE_FILE_PATH.fieldId(), 0L)); // null_value_counts + assertThat(deleteFile.getField(9)).isEqualTo(value(Map.of(), null)); // nan_value_counts + + // lower_bounds + //noinspection unchecked + Map lowerBounds = (Map) deleteFile.getField(10); + assertThat(lowerBounds) + .hasSize(2) + .satisfies(_ -> assertThat(lowerBounds.get(DELETE_FILE_POS.fieldId())).isEqualTo("0")) + .satisfies(_ -> assertThat(lowerBounds.get(DELETE_FILE_PATH.fieldId())).contains(table.getName())); + + // upper_bounds + //noinspection unchecked + Map upperBounds = (Map) deleteFile.getField(11); + assertThat(upperBounds) + .hasSize(2) + .satisfies(_ -> assertThat(upperBounds.get(DELETE_FILE_POS.fieldId())).isEqualTo("0")) + .satisfies(_ -> 
assertThat(upperBounds.get(DELETE_FILE_PATH.fieldId())).contains(table.getName())); + + assertThat(deleteFile.getField(12)).isNull(); // key_metadata + assertThat(deleteFile.getField(13)).isEqualTo(List.of(value(4L, 3L))); // split_offsets + assertThat(deleteFile.getField(14)).isNull(); // equality_ids + assertThat(deleteFile.getField(15)).isNull(); // sort_order_id + + assertThat(computeScalar("SELECT readable_metrics FROM \"" + table.getName() + "$entries\"" + " WHERE snapshot_id = " + snapshotId)) + .isEqualTo(""" + {\ + "dt":{"column_size":null,"value_count":null,"null_value_count":null,"nan_value_count":null,"lower_bound":null,"upper_bound":null},\ + "id":{"column_size":null,"value_count":null,"null_value_count":null,"nan_value_count":null,"lower_bound":null,"upper_bound":null}\ + }"""); + } + } + + @Test + void testEntriesAfterEqualityDelete() + throws Exception + { + try (TestTable table = new TestTable(getQueryRunner()::execute, "test_entries", "AS SELECT 1 id, DATE '2014-01-01' dt")) { + Table icebergTable = loadTable(table.getName()); + assertThat(icebergTable.currentSnapshot().summary()).containsEntry("total-equality-deletes", "0"); + writeEqualityDeleteForTable(icebergTable, fileSystemFactory, Optional.empty(), Optional.empty(), ImmutableMap.of("id", 1), Optional.empty()); + assertThat(icebergTable.currentSnapshot().summary()).containsEntry("total-equality-deletes", "1"); + + Snapshot snapshot = icebergTable.currentSnapshot(); + long snapshotId = snapshot.snapshotId(); + long sequenceNumber = snapshot.sequenceNumber(); + + assertThat(computeScalar("SELECT status FROM \"" + table.getName() + "$entries\"" + " WHERE snapshot_id = " + snapshotId)) + .isEqualTo(1); + assertThat(computeScalar("SELECT snapshot_id FROM \"" + table.getName() + "$entries\"" + " WHERE snapshot_id = " + snapshotId)) + .isEqualTo(snapshotId); + assertThat(computeScalar("SELECT sequence_number FROM \"" + table.getName() + "$entries\"" + " WHERE snapshot_id = " + snapshotId)) + .isEqualTo(sequenceNumber); + assertThat(computeScalar("SELECT file_sequence_number FROM \"" + table.getName() + "$entries\"" + " WHERE snapshot_id = " + snapshotId)) + .isEqualTo(2L); + + MaterializedRow dataFile = (MaterializedRow) computeScalar("SELECT data_file FROM \"" + table.getName() + "$entries\"" + " WHERE snapshot_id = " + snapshotId); + assertThat(dataFile.getFieldCount()).isEqualTo(16); + assertThat(dataFile.getField(0)).isEqualTo(2); // content + assertThat(dataFile.getField(3)).isEqualTo(0); // spec_id + assertThat(dataFile.getField(4)).isEqualTo(1L); // record_count + assertThat((long) dataFile.getField(5)).isPositive(); // file_size_in_bytes + assertThat(dataFile.getField(6)).isEqualTo(Map.of(1, 51L)); // column_sizes + assertThat(dataFile.getField(7)).isEqualTo(Map.of(1, 1L)); // value_counts + assertThat(dataFile.getField(8)).isEqualTo(Map.of(1, 0L)); // null_value_counts + assertThat(dataFile.getField(9)).isEqualTo(Map.of()); // nan_value_counts + assertThat(dataFile.getField(10)).isEqualTo(Map.of(1, "1")); // lower_bounds + assertThat(dataFile.getField(11)).isEqualTo(Map.of(1, "1")); // upper_bounds + assertThat(dataFile.getField(12)).isNull(); // key_metadata + assertThat(dataFile.getField(13)).isEqualTo(List.of(4L)); // split_offsets + assertThat(dataFile.getField(14)).isEqualTo(List.of(1)); // equality_ids + assertThat(dataFile.getField(15)).isEqualTo(0); // sort_order_id + + assertThat(computeScalar("SELECT readable_metrics FROM \"" + table.getName() + "$entries\"" + " WHERE snapshot_id = " + snapshotId)) + 
.isEqualTo(""" + {\ + "dt":{"column_size":null,"value_count":null,"null_value_count":null,"nan_value_count":null,"lower_bound":null,"upper_bound":null},\ + "id":{"column_size":51,"value_count":1,"null_value_count":0,"nan_value_count":null,"lower_bound":1,"upper_bound":1}\ + }"""); + } + } + + @Test + public void testPartitionsColumns() + { + try (TestTable testTable = newTrinoTable("test_partition_columns", """ + WITH (partitioning = ARRAY[ + '"r1.f1"', + 'bucket(b1, 4)' + ]) AS + SELECT + CAST(ROW(1, 2) AS ROW(f1 INTEGER, f2 integeR)) as r1 + , CAST('b' AS VARCHAR) as b1""")) { + assertThat(query("SELECT partition FROM \"" + testTable.getName() + "$partitions\"")) + .matches("SELECT CAST(ROW(1, 3) AS ROW(\"r1.f1\" INTEGER, b1_bucket INTEGER))"); + } + + try (TestTable testTable = newTrinoTable("test_partition_columns", """ + WITH (partitioning = ARRAY[ + '"r1.f2"', + 'bucket(b1, 4)', + '"r1.f1"' + ]) AS + SELECT + CAST(ROW('f1', 'f2') AS ROW(f1 VARCHAR, f2 VARCHAR)) as r1 + , CAST('b' AS VARCHAR) as b1""")) { + assertThat(query("SELECT partition FROM \"" + testTable.getName() + "$partitions\"")) + .matches("SELECT CAST(ROW('f2', 3, 'f1') AS ROW(\"r1.f2\" VARCHAR, b1_bucket INTEGER, \"r1.f1\" VARCHAR))"); + } + } + + @Test + void testEntriesPartitionTable() + { + try (TestTable table = newTrinoTable( + "test_entries_partition", + "WITH (partitioning = ARRAY['dt']) AS SELECT 1 id, DATE '2014-01-01' dt")) { + assertQuery("SHOW COLUMNS FROM \"" + table.getName() + "$entries\"", + "VALUES ('status', 'integer', '', '')," + + "('snapshot_id', 'bigint', '', '')," + + "('sequence_number', 'bigint', '', '')," + + "('file_sequence_number', 'bigint', '', '')," + + "('data_file', 'row(content integer, file_path varchar, file_format varchar, spec_id integer, partition row(dt date), record_count bigint, file_size_in_bytes bigint, " + + "column_sizes map(integer, bigint), value_counts map(integer, bigint), null_value_counts map(integer, bigint), nan_value_counts map(integer, bigint), " + + "lower_bounds map(integer, varchar), upper_bounds map(integer, varchar), key_metadata varbinary, split_offsets array(bigint), " + + "equality_ids array(integer), sort_order_id integer)', '', '')," + + "('readable_metrics', 'json', '', '')"); + + assertThat(query("SELECT data_file.partition FROM \"" + table.getName() + "$entries\"")) + .matches("SELECT CAST(ROW(DATE '2014-01-01') AS ROW(dt date))"); + } + } + + @Test + public void testPropertiesTable() + { + try (TestTable table = newTrinoTable("test_properties", "(x BIGINT,y DOUBLE) WITH (sorted_by = ARRAY['y'])")) { + Table icebergTable = loadTable(table.getName()); + Map actualProperties = getTableProperties(table.getName()); + assertThat(actualProperties).contains( + entry("format", "iceberg/" + format.name()), + entry("provider", "iceberg"), + entry("current-snapshot-id", Long.toString(icebergTable.currentSnapshot().snapshotId())), + entry("location", icebergTable.location()), + entry("format-version", "2"), + entry("sort-order", "y ASC NULLS FIRST"), + entry("write.format.default", format.name())); + } + } + + private Map getTableProperties(String tableName) + { + return computeActual("SELECT key, value FROM \"" + tableName + "$properties\"").getMaterializedRows().stream() + .collect(toImmutableMap(row -> (String) row.getField(0), row -> (String) row.getField(1))); } private Long nanCount(long value) @@ -287,4 +871,20 @@ private Long nanCount(long value) // Parquet does not have nan count metrics return format == PARQUET ? 
null : value; } + + private Long columnSize(long value) + { + // ORC does not have column size in readable metrics + return format == ORC ? null : value; + } + + private Object value(Object parquet, Object orc) + { + return format == PARQUET ? parquet : orc; + } + + private BaseTable loadTable(String tableName) + { + return IcebergTestUtils.loadTable(tableName, metastore, fileSystemFactory, "hive", "tpch"); + } } diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/BaseSharedMetastoreTest.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/BaseSharedMetastoreTest.java index 2f497f4180d9..e9dc81d27752 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/BaseSharedMetastoreTest.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/BaseSharedMetastoreTest.java @@ -14,7 +14,7 @@ package io.trino.plugin.iceberg; import io.trino.testing.AbstractTestQueryFramework; -import org.testng.annotations.Test; +import org.junit.jupiter.api.Test; import java.time.Instant; import java.time.ZonedDateTime; @@ -24,13 +24,15 @@ import static java.lang.String.format; import static java.time.ZoneOffset.UTC; import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; -import static org.testng.Assert.assertEquals; public abstract class BaseSharedMetastoreTest extends AbstractTestQueryFramework { - protected final String schema = "test_shared_schema_" + randomNameSuffix(); + /** + * This schema should contain only nation and region tables. Use {@link #testSchema} when creating new tables. + */ + protected final String tpchSchema = "test_tpch_shared_schema_" + randomNameSuffix(); + protected final String testSchema = "test_mutable_shared_schema_" + randomNameSuffix(); protected abstract String getExpectedHiveCreateSchema(String catalogName); @@ -39,69 +41,92 @@ public abstract class BaseSharedMetastoreTest @Test public void testSelect() { - assertQuery("SELECT * FROM iceberg." + schema + ".nation", "SELECT * FROM nation"); - assertQuery("SELECT * FROM hive." + schema + ".region", "SELECT * FROM region"); - assertQuery("SELECT * FROM hive_with_redirections." + schema + ".nation", "SELECT * FROM nation"); - assertQuery("SELECT * FROM hive_with_redirections." + schema + ".region", "SELECT * FROM region"); - assertQuery("SELECT * FROM iceberg_with_redirections." + schema + ".nation", "SELECT * FROM nation"); - assertQuery("SELECT * FROM iceberg_with_redirections." + schema + ".region", "SELECT * FROM region"); - - assertThatThrownBy(() -> query("SELECT * FROM iceberg." + schema + ".region")) - .hasMessageContaining("Not an Iceberg table"); - assertThatThrownBy(() -> query("SELECT * FROM hive." + schema + ".nation")) - .hasMessageContaining("Cannot query Iceberg table"); + assertQuery("SELECT * FROM iceberg." + tpchSchema + ".nation", "SELECT * FROM nation"); + assertQuery("SELECT * FROM hive." + tpchSchema + ".region", "SELECT * FROM region"); + assertQuery("SELECT * FROM hive_with_redirections." + tpchSchema + ".nation", "SELECT * FROM nation"); + assertQuery("SELECT * FROM hive_with_redirections." + tpchSchema + ".region", "SELECT * FROM region"); + assertQuery("SELECT * FROM iceberg_with_redirections." + tpchSchema + ".nation", "SELECT * FROM nation"); + assertQuery("SELECT * FROM iceberg_with_redirections." + tpchSchema + ".region", "SELECT * FROM region"); + + assertThat(query("SELECT * FROM iceberg." 
+ tpchSchema + ".region")) + .failure().hasMessageContaining("Not an Iceberg table"); + assertThat(query("SELECT * FROM iceberg." + tpchSchema + ".\"region$data\"")) + .failure().hasMessageMatching(".* Table .* does not exist"); + assertThat(query("SELECT * FROM iceberg." + tpchSchema + ".\"region$files\"")) + .failure().hasMessageMatching(".* Table .* does not exist"); + + assertThat(query("SELECT * FROM hive." + tpchSchema + ".nation")) + .failure().hasMessageContaining("Cannot query Iceberg table"); + assertThat(query("SELECT * FROM hive." + tpchSchema + ".\"nation$partitions\"")) + .failure().hasMessageMatching(".* Table .* does not exist"); + assertThat(query("SELECT * FROM hive." + tpchSchema + ".\"nation$properties\"")) + .failure().hasMessageMatching(".* Table .* does not exist"); } @Test public void testReadInformationSchema() { - assertThat(query("SELECT table_schema FROM hive.information_schema.tables WHERE table_name = 'region' AND table_schema='" + schema + "'")) + assertThat(query("SELECT table_schema FROM hive.information_schema.tables WHERE table_name = 'region' AND table_schema='" + tpchSchema + "'")) .skippingTypesCheck() - .containsAll("VALUES '" + schema + "'"); - assertThat(query("SELECT table_schema FROM iceberg.information_schema.tables WHERE table_name = 'nation' AND table_schema='" + schema + "'")) + .containsAll("VALUES '" + tpchSchema + "'"); + assertThat(query("SELECT table_schema FROM iceberg.information_schema.tables WHERE table_name = 'nation' AND table_schema='" + tpchSchema + "'")) .skippingTypesCheck() - .containsAll("VALUES '" + schema + "'"); - assertThat(query("SELECT table_schema FROM hive_with_redirections.information_schema.tables WHERE table_name = 'region' AND table_schema='" + schema + "'")) + .containsAll("VALUES '" + tpchSchema + "'"); + assertThat(query("SELECT table_schema FROM hive_with_redirections.information_schema.tables WHERE table_name = 'region' AND table_schema='" + tpchSchema + "'")) .skippingTypesCheck() - .containsAll("VALUES '" + schema + "'"); - assertThat(query("SELECT table_schema FROM hive_with_redirections.information_schema.tables WHERE table_name = 'nation' AND table_schema='" + schema + "'")) + .containsAll("VALUES '" + tpchSchema + "'"); + assertThat(query("SELECT table_schema FROM hive_with_redirections.information_schema.tables WHERE table_name = 'nation' AND table_schema='" + tpchSchema + "'")) .skippingTypesCheck() - .containsAll("VALUES '" + schema + "'"); - assertThat(query("SELECT table_schema FROM iceberg_with_redirections.information_schema.tables WHERE table_name = 'region' AND table_schema='" + schema + "'")) + .containsAll("VALUES '" + tpchSchema + "'"); + assertThat(query("SELECT table_schema FROM iceberg_with_redirections.information_schema.tables WHERE table_name = 'region' AND table_schema='" + tpchSchema + "'")) .skippingTypesCheck() - .containsAll("VALUES '" + schema + "'"); + .containsAll("VALUES '" + tpchSchema + "'"); - assertQuery("SELECT table_name, column_name from hive.information_schema.columns WHERE table_schema = '" + schema + "'", + assertQuery("SELECT table_name, column_name from hive.information_schema.columns WHERE table_schema = '" + tpchSchema + "'", "VALUES ('region', 'regionkey'), ('region', 'name'), ('region', 'comment')"); - assertQuery("SELECT table_name, column_name from iceberg.information_schema.columns WHERE table_schema = '" + schema + "'", + assertQuery("SELECT table_name, column_name from iceberg.information_schema.columns WHERE table_schema = '" + tpchSchema + "'", "VALUES 
('nation', 'nationkey'), ('nation', 'name'), ('nation', 'regionkey'), ('nation', 'comment')"); - assertQuery("SELECT table_name, column_name from hive_with_redirections.information_schema.columns WHERE table_schema = '" + schema + "'", + assertQuery("SELECT table_name, column_name from hive_with_redirections.information_schema.columns WHERE table_schema = '" + tpchSchema + "'", "VALUES" + "('region', 'regionkey'), ('region', 'name'), ('region', 'comment'), " + "('nation', 'nationkey'), ('nation', 'name'), ('nation', 'regionkey'), ('nation', 'comment')"); - assertQuery("SELECT table_name, column_name from iceberg_with_redirections.information_schema.columns WHERE table_schema = '" + schema + "'", + assertQuery("SELECT table_name, column_name from iceberg_with_redirections.information_schema.columns WHERE table_schema = '" + tpchSchema + "'", "VALUES" + "('region', 'regionkey'), ('region', 'name'), ('region', 'comment'), " + "('nation', 'nationkey'), ('nation', 'name'), ('nation', 'regionkey'), ('nation', 'comment')"); } + @Test + void testHiveSelectTableColumns() + { + assertThat(query("SELECT table_cat, table_schem, table_name, column_name FROM system.jdbc.columns WHERE table_cat = 'hive' AND table_schem = '" + tpchSchema + "' AND table_name = 'region'")) + .skippingTypesCheck() + .matches("VALUES " + + "('hive', '" + tpchSchema + "', 'region', 'regionkey')," + + "('hive', '" + tpchSchema + "', 'region', 'name')," + + "('hive', '" + tpchSchema + "', 'region', 'comment')"); + + // Hive does not show any information about tables with unsupported format + assertQueryReturnsEmptyResult("SELECT table_cat, table_schem, table_name, column_name FROM system.jdbc.columns WHERE table_cat = 'hive' AND table_schem = '" + tpchSchema + "' AND table_name = 'nation'"); + } + @Test public void testShowTables() { - assertQuery("SHOW TABLES FROM iceberg." + schema, "VALUES 'region', 'nation'"); - assertQuery("SHOW TABLES FROM hive." + schema, "VALUES 'region', 'nation'"); - assertQuery("SHOW TABLES FROM hive_with_redirections." + schema, "VALUES 'region', 'nation'"); - assertQuery("SHOW TABLES FROM iceberg_with_redirections." + schema, "VALUES 'region', 'nation'"); - - assertThatThrownBy(() -> query("SHOW CREATE TABLE iceberg." + schema + ".region")) - .hasMessageContaining("Not an Iceberg table"); - assertThatThrownBy(() -> query("SHOW CREATE TABLE hive." + schema + ".nation")) - .hasMessageContaining("Cannot query Iceberg table"); - - assertThatThrownBy(() -> query("DESCRIBE iceberg." + schema + ".region")) - .hasMessageContaining("Not an Iceberg table"); - assertThatThrownBy(() -> query("DESCRIBE hive." + schema + ".nation")) - .hasMessageContaining("Cannot query Iceberg table"); + assertQuery("SHOW TABLES FROM iceberg." + tpchSchema, "VALUES 'region', 'nation'"); + assertQuery("SHOW TABLES FROM hive." + tpchSchema, "VALUES 'region', 'nation'"); + assertQuery("SHOW TABLES FROM hive_with_redirections." + tpchSchema, "VALUES 'region', 'nation'"); + assertQuery("SHOW TABLES FROM iceberg_with_redirections." + tpchSchema, "VALUES 'region', 'nation'"); + + assertThat(query("SHOW CREATE TABLE iceberg." + tpchSchema + ".region")) + .failure().hasMessageContaining("Not an Iceberg table"); + assertThat(query("SHOW CREATE TABLE hive." + tpchSchema + ".nation")) + .failure().hasMessageContaining("Cannot query Iceberg table"); + + assertThat(query("DESCRIBE iceberg." + tpchSchema + ".region")) + .failure().hasMessageContaining("Not an Iceberg table"); + assertThat(query("DESCRIBE hive." 
+ tpchSchema + ".nation")) + .failure().hasMessageContaining("Cannot query Iceberg table"); } @Test @@ -109,75 +134,131 @@ public void testShowSchemas() { assertThat(query("SHOW SCHEMAS FROM hive")) .skippingTypesCheck() - .containsAll("VALUES '" + schema + "'"); + .containsAll("VALUES '" + tpchSchema + "'"); assertThat(query("SHOW SCHEMAS FROM iceberg")) .skippingTypesCheck() - .containsAll("VALUES '" + schema + "'"); + .containsAll("VALUES '" + tpchSchema + "'"); assertThat(query("SHOW SCHEMAS FROM hive_with_redirections")) .skippingTypesCheck() - .containsAll("VALUES '" + schema + "'"); - - String showCreateHiveSchema = (String) computeActual("SHOW CREATE SCHEMA hive." + schema).getOnlyValue(); - assertEquals( - showCreateHiveSchema, - getExpectedHiveCreateSchema("hive")); - String showCreateIcebergSchema = (String) computeActual("SHOW CREATE SCHEMA iceberg." + schema).getOnlyValue(); - assertEquals( - showCreateIcebergSchema, - getExpectedIcebergCreateSchema("iceberg")); - String showCreateHiveWithRedirectionsSchema = (String) computeActual("SHOW CREATE SCHEMA hive_with_redirections." + schema).getOnlyValue(); - assertEquals( - showCreateHiveWithRedirectionsSchema, - getExpectedHiveCreateSchema("hive_with_redirections")); - String showCreateIcebergWithRedirectionsSchema = (String) computeActual("SHOW CREATE SCHEMA iceberg_with_redirections." + schema).getOnlyValue(); - assertEquals( - showCreateIcebergWithRedirectionsSchema, - getExpectedIcebergCreateSchema("iceberg_with_redirections")); + .containsAll("VALUES '" + tpchSchema + "'"); + + String showCreateHiveSchema = (String) computeActual("SHOW CREATE SCHEMA hive." + tpchSchema).getOnlyValue(); + assertThat(showCreateHiveSchema).isEqualTo(getExpectedHiveCreateSchema("hive")); + String showCreateIcebergSchema = (String) computeActual("SHOW CREATE SCHEMA iceberg." + tpchSchema).getOnlyValue(); + assertThat(showCreateIcebergSchema).isEqualTo(getExpectedIcebergCreateSchema("iceberg")); + String showCreateHiveWithRedirectionsSchema = (String) computeActual("SHOW CREATE SCHEMA hive_with_redirections." + tpchSchema).getOnlyValue(); + assertThat(showCreateHiveWithRedirectionsSchema).isEqualTo(getExpectedHiveCreateSchema("hive_with_redirections")); + String showCreateIcebergWithRedirectionsSchema = (String) computeActual("SHOW CREATE SCHEMA iceberg_with_redirections." + tpchSchema).getOnlyValue(); + assertThat(showCreateIcebergWithRedirectionsSchema).isEqualTo(getExpectedIcebergCreateSchema("iceberg_with_redirections")); + } + + @Test + public void testIcebergTablesSystemTable() + { + assertQuery("SELECT * FROM iceberg.system.iceberg_tables WHERE table_schema = '%s'".formatted(tpchSchema), "VALUES ('%s', 'nation')".formatted(tpchSchema)); + assertQuery("SELECT * FROM iceberg_with_redirections.system.iceberg_tables WHERE table_schema = '%s'".formatted(tpchSchema), "VALUES ('%s', 'nation')".formatted(tpchSchema)); } @Test public void testTimeTravelWithRedirection() throws InterruptedException { - String testLocalSchema = "test_schema_" + randomNameSuffix(); try { - assertUpdate("CREATE SCHEMA iceberg. " + testLocalSchema); - assertUpdate(format("CREATE TABLE iceberg.%s.nation_test AS SELECT * FROM nation", testLocalSchema), 25); - assertQuery("SELECT * FROM hive_with_redirections." 
+ testLocalSchema + ".nation_test", "SELECT * FROM nation"); - long snapshot1 = getLatestSnapshotId(testLocalSchema); - long v1EpochMillis = getCommittedAtInEpochMilliSeconds(snapshot1, testLocalSchema); + assertUpdate(format("CREATE TABLE iceberg.%s.nation_test AS SELECT * FROM nation", testSchema), 25); + assertQuery("SELECT * FROM hive_with_redirections." + testSchema + ".nation_test", "SELECT * FROM nation"); + long snapshot1 = getLatestSnapshotId(testSchema); + long v1EpochMillis = getCommittedAtInEpochMilliSeconds(snapshot1, testSchema); Thread.sleep(1); - assertUpdate(format("INSERT INTO hive_with_redirections.%s.nation_test VALUES(25, 'POLAND', 3, 'test 1')", testLocalSchema), 1); - long snapshot2 = getLatestSnapshotId(testLocalSchema); - long v2EpochMillis = getCommittedAtInEpochMilliSeconds(snapshot2, testLocalSchema); + assertUpdate(format("INSERT INTO hive_with_redirections.%s.nation_test VALUES(25, 'POLAND', 3, 'test 1')", testSchema), 1); + long snapshot2 = getLatestSnapshotId(testSchema); + long v2EpochMillis = getCommittedAtInEpochMilliSeconds(snapshot2, testSchema); Thread.sleep(1); - assertUpdate(format("INSERT INTO hive_with_redirections.%s.nation_test VALUES(26, 'CHILE', 1, 'test 2')", testLocalSchema), 1); - long snapshot3 = getLatestSnapshotId(testLocalSchema); - long v3EpochMillis = getCommittedAtInEpochMilliSeconds(snapshot3, testLocalSchema); + assertUpdate(format("INSERT INTO hive_with_redirections.%s.nation_test VALUES(26, 'CHILE', 1, 'test 2')", testSchema), 1); + long snapshot3 = getLatestSnapshotId(testSchema); + long v3EpochMillis = getCommittedAtInEpochMilliSeconds(snapshot3, testSchema); long incorrectSnapshot = 2324324333L; Thread.sleep(1); - assertQuery(format("SELECT * FROM hive_with_redirections.%s.nation_test FOR VERSION AS OF %d", testLocalSchema, snapshot1), "SELECT * FROM nation"); - assertQuery(format("SELECT * FROM hive_with_redirections.%s.nation_test FOR TIMESTAMP AS OF %s", testLocalSchema, timestampLiteral(v1EpochMillis)), "SELECT * FROM nation"); - assertQuery(format("SELECT count(*) FROM hive_with_redirections.%s.nation_test FOR VERSION AS OF %d", testLocalSchema, snapshot2), "VALUES(26)"); + assertQuery(format("SELECT * FROM hive_with_redirections.%s.nation_test FOR VERSION AS OF %d", testSchema, snapshot1), "SELECT * FROM nation"); + assertQuery(format("SELECT * FROM hive_with_redirections.%s.nation_test FOR TIMESTAMP AS OF %s", testSchema, timestampLiteral(v1EpochMillis)), "SELECT * FROM nation"); + assertQuery(format("SELECT count(*) FROM hive_with_redirections.%s.nation_test FOR VERSION AS OF %d", testSchema, snapshot2), "VALUES(26)"); assertQuery(format( - "SELECT count(*) FROM iceberg_with_redirections.%s.nation_test FOR TIMESTAMP AS OF %s", testLocalSchema, timestampLiteral(v2EpochMillis)), "VALUES(26)"); - assertQuery(format("SELECT count(*) FROM hive_with_redirections.%s.nation_test FOR VERSION AS OF %d", testLocalSchema, snapshot3), "VALUES(27)"); + "SELECT count(*) FROM iceberg_with_redirections.%s.nation_test FOR TIMESTAMP AS OF %s", testSchema, timestampLiteral(v2EpochMillis)), "VALUES(26)"); + assertQuery(format("SELECT count(*) FROM hive_with_redirections.%s.nation_test FOR VERSION AS OF %d", testSchema, snapshot3), "VALUES(27)"); assertQuery(format( - "SELECT count(*) FROM hive_with_redirections.%s.nation_test FOR TIMESTAMP AS OF %s", testLocalSchema, timestampLiteral(v3EpochMillis)), "VALUES(27)"); - assertQueryFails(format("SELECT * FROM hive_with_redirections.%s.nation_test FOR VERSION AS OF %d", testLocalSchema, 
incorrectSnapshot), "Iceberg snapshot ID does not exists: " + incorrectSnapshot); + "SELECT count(*) FROM hive_with_redirections.%s.nation_test FOR TIMESTAMP AS OF %s", testSchema, timestampLiteral(v3EpochMillis)), "VALUES(27)"); + assertQueryFails(format("SELECT * FROM hive_with_redirections.%s.nation_test FOR VERSION AS OF %d", testSchema, incorrectSnapshot), "Iceberg snapshot ID does not exists: " + incorrectSnapshot); assertQueryFails( - format("SELECT * FROM hive_with_redirections.%s.nation_test FOR TIMESTAMP AS OF TIMESTAMP '1970-01-01 00:00:00.001000000 Z'", testLocalSchema), - format("\\QNo version history table \"%s\".\"nation_test\" at or before 1970-01-01T00:00:00.001Z", testLocalSchema)); + format("SELECT * FROM hive_with_redirections.%s.nation_test FOR TIMESTAMP AS OF TIMESTAMP '1970-01-01 00:00:00.001000000 Z'", testSchema), + format("\\QNo version history table \"%s\".\"nation_test\" at or before 1970-01-01T00:00:00.001Z", testSchema)); assertQueryFails( - format("SELECT * FROM iceberg_with_redirections.%s.region FOR TIMESTAMP AS OF TIMESTAMP '1970-01-01 00:00:00.001000000 Z'", schema), + format("SELECT * FROM iceberg_with_redirections.%s.region FOR TIMESTAMP AS OF TIMESTAMP '1970-01-01 00:00:00.001000000 Z'", tpchSchema), "\\QThis connector does not support versioned tables"); } finally { - query("DROP TABLE IF EXISTS iceberg." + testLocalSchema + ".nation_test"); - query("DROP SCHEMA IF EXISTS iceberg." + testLocalSchema); + assertUpdate("DROP TABLE IF EXISTS iceberg." + testSchema + ".nation_test"); } } + @Test + void testIcebergCannotCreateTableNamesakeToHiveTable() + { + String tableName = "test_iceberg_create_namesake_hive_table_" + randomNameSuffix(); + String hiveTableName = "hive.%s.%s".formatted(testSchema, tableName); + String icebergTableName = "iceberg.%s.%s".formatted(testSchema, tableName); + + assertUpdate("CREATE TABLE " + hiveTableName + "(a bigint)"); + assertThat(query("CREATE TABLE " + icebergTableName + "(a bigint)")) + .failure().hasMessageMatching(".* Table .* of unsupported type already exists"); + + assertUpdate("DROP TABLE " + hiveTableName); + } + + @Test + void testHiveCannotCreateTableNamesakeToIcebergTable() + { + String tableName = "test_iceberg_create_namesake_hive_table_" + randomNameSuffix(); + String hiveTableName = "hive.%s.%s".formatted(testSchema, tableName); + String icebergTableName = "iceberg.%s.%s".formatted(testSchema, tableName); + + assertUpdate("CREATE TABLE " + icebergTableName + "(a bigint)"); + assertThat(query("CREATE TABLE " + hiveTableName + "(a bigint)")) + .failure().hasMessageMatching(".* Table .* of unsupported type already exists"); + + assertUpdate("DROP TABLE " + icebergTableName); + } + + @Test + public void testMigrateTable() + { + String tableName = "test_migrate_" + randomNameSuffix(); + String hiveTableName = "hive.%s.%s".formatted(testSchema, tableName); + String icebergTableName = "iceberg.%s.%s".formatted(testSchema, tableName); + + assertUpdate("CREATE TABLE " + hiveTableName + " AS SELECT 1 id", 1); + assertQueryFails("SELECT * FROM " + icebergTableName, "Not an Iceberg table: .*"); + + assertUpdate("CALL iceberg.system.migrate('" + testSchema + "', '" + tableName + "')"); + assertQuery("SELECT * FROM " + icebergTableName, "VALUES 1"); + + assertUpdate("DROP TABLE " + icebergTableName); + } + + @Test + public void testMigratePartitionedTable() + { + String tableName = "test_migrate_" + randomNameSuffix(); + String hiveTableName = "hive.%s.%s".formatted(testSchema, tableName); + String icebergTableName = 
"iceberg.%s.%s".formatted(testSchema, tableName); + + assertUpdate("CREATE TABLE " + hiveTableName + " WITH (partitioned_by = ARRAY['part']) AS SELECT 1 id, 'test' part", 1); + assertQueryFails("SELECT * FROM " + icebergTableName, "Not an Iceberg table: .*"); + + assertUpdate("CALL iceberg.system.migrate('" + testSchema + "', '" + tableName + "')"); + assertQuery("SELECT * FROM " + icebergTableName, "VALUES (1, 'test')"); + + assertUpdate("DROP TABLE " + icebergTableName); + } + private long getLatestSnapshotId(String schema) { return (long) computeScalar(format("SELECT snapshot_id FROM iceberg.%s.\"nation_test$snapshots\" ORDER BY committed_at DESC FETCH FIRST 1 ROW WITH TIES", schema)); diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/DataFileRecord.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/DataFileRecord.java index 249f5e740dba..a5d1cbc701e0 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/DataFileRecord.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/DataFileRecord.java @@ -18,7 +18,7 @@ import java.util.Map; -import static org.testng.Assert.assertEquals; +import static org.assertj.core.api.Assertions.assertThat; public class DataFileRecord { @@ -37,19 +37,19 @@ public class DataFileRecord @SuppressWarnings("unchecked") public static DataFileRecord toDataFileRecord(MaterializedRow row) { - assertEquals(row.getFieldCount(), 14); + assertThat(row.getFieldCount()).isEqualTo(17); return new DataFileRecord( (int) row.getField(0), (String) row.getField(1), (String) row.getField(2), - (long) row.getField(3), (long) row.getField(4), - row.getField(5) != null ? ImmutableMap.copyOf((Map) row.getField(5)) : null, + (long) row.getField(5), row.getField(6) != null ? ImmutableMap.copyOf((Map) row.getField(6)) : null, row.getField(7) != null ? ImmutableMap.copyOf((Map) row.getField(7)) : null, row.getField(8) != null ? ImmutableMap.copyOf((Map) row.getField(8)) : null, - row.getField(9) != null ? ImmutableMap.copyOf((Map) row.getField(9)) : null, - row.getField(10) != null ? ImmutableMap.copyOf((Map) row.getField(10)) : null); + row.getField(9) != null ? ImmutableMap.copyOf((Map) row.getField(9)) : null, + row.getField(10) != null ? ImmutableMap.copyOf((Map) row.getField(10)) : null, + row.getField(11) != null ? 
ImmutableMap.copyOf((Map) row.getField(11)) : null); } private DataFileRecord( diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/IcebergQueryRunner.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/IcebergQueryRunner.java index b75c67b9ed10..3b0e92e9a877 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/IcebergQueryRunner.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/IcebergQueryRunner.java @@ -17,52 +17,57 @@ import com.google.common.collect.ImmutableMap; import com.google.common.io.Resources; import io.airlift.http.server.testing.TestingHttpServer; +import io.airlift.log.Level; import io.airlift.log.Logger; import io.airlift.log.Logging; +import io.trino.plugin.exchange.filesystem.FileSystemExchangePlugin; +import io.trino.plugin.hive.containers.Hive3MinioDataLake; import io.trino.plugin.hive.containers.HiveHadoop; -import io.trino.plugin.hive.containers.HiveMinioDataLake; import io.trino.plugin.iceberg.catalog.jdbc.TestingIcebergJdbcServer; +import io.trino.plugin.iceberg.catalog.rest.TestingPolarisCatalog; +import io.trino.plugin.iceberg.containers.NessieContainer; +import io.trino.plugin.iceberg.containers.UnityCatalogContainer; +import io.trino.plugin.tpcds.TpcdsPlugin; import io.trino.plugin.tpch.TpchPlugin; import io.trino.testing.DistributedQueryRunner; +import io.trino.testing.QueryRunner; import io.trino.testing.containers.Minio; import io.trino.tpch.TpchTable; import org.apache.iceberg.catalog.Catalog; import org.apache.iceberg.rest.DelegatingRestSessionCatalog; -import org.assertj.core.util.Files; import java.io.File; +import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.attribute.FileAttribute; import java.nio.file.attribute.PosixFilePermission; import java.nio.file.attribute.PosixFilePermissions; -import java.util.HashMap; import java.util.Map; import java.util.Optional; import java.util.Set; -import static com.google.common.base.Preconditions.checkState; import static io.airlift.testing.Closeables.closeAllSuppress; import static io.trino.plugin.iceberg.catalog.jdbc.TestingIcebergJdbcServer.PASSWORD; import static io.trino.plugin.iceberg.catalog.jdbc.TestingIcebergJdbcServer.USER; import static io.trino.plugin.iceberg.catalog.rest.RestCatalogTestUtils.backendCatalog; +import static io.trino.testing.TestingProperties.requiredNonEmptySystemProperty; import static io.trino.testing.TestingSession.testSessionBuilder; import static io.trino.testing.containers.Minio.MINIO_ACCESS_KEY; +import static io.trino.testing.containers.Minio.MINIO_REGION; import static io.trino.testing.containers.Minio.MINIO_SECRET_KEY; import static java.nio.charset.StandardCharsets.UTF_8; +import static java.nio.file.Files.createTempDirectory; import static java.util.Objects.requireNonNull; public final class IcebergQueryRunner { - public static final String ICEBERG_CATALOG = "iceberg"; - private IcebergQueryRunner() {} - public static DistributedQueryRunner createIcebergQueryRunner(TpchTable... 
tables) - throws Exception - { - return builder() - .setInitialTables(tables) - .build(); + public static final String ICEBERG_CATALOG = "iceberg"; + + static { + Logging logging = Logging.initialize(); + logging.setLevel("org.apache.iceberg", Level.OFF); } public static Builder builder() @@ -70,12 +75,18 @@ public static Builder builder() return new Builder(); } + public static Builder builder(String schema) + { + return new Builder(schema); + } + public static class Builder extends DistributedQueryRunner.Builder { private Optional metastoreDirectory = Optional.empty(); private ImmutableMap.Builder icebergProperties = ImmutableMap.builder(); - private Optional schemaInitializer = Optional.empty(); + private Optional schemaInitializer = Optional.of(SchemaInitializer.builder().build()); + private boolean tpcdsCatalogEnabled; protected Builder() { @@ -85,6 +96,14 @@ protected Builder() .build()); } + protected Builder(String schema) + { + super(testSessionBuilder() + .setCatalog(ICEBERG_CATALOG) + .setSchema(schema) + .build()); + } + public Builder setMetastoreDirectory(File metastoreDirectory) { this.metastoreDirectory = Optional.of(metastoreDirectory); @@ -117,12 +136,23 @@ public Builder setInitialTables(Iterable> initialTables) public Builder setSchemaInitializer(SchemaInitializer schemaInitializer) { - checkState(this.schemaInitializer.isEmpty(), "schemaInitializer is already set"); this.schemaInitializer = Optional.of(requireNonNull(schemaInitializer, "schemaInitializer is null")); amendSession(sessionBuilder -> sessionBuilder.setSchema(schemaInitializer.getSchemaName())); return self(); } + public Builder disableSchemaInitializer() + { + schemaInitializer = Optional.empty(); + return self(); + } + + public Builder setTpcdsCatalogEnabled(boolean tpcdsCatalogEnabled) + { + this.tpcdsCatalogEnabled = tpcdsCatalogEnabled; + return self(); + } + @Override public DistributedQueryRunner build() throws Exception @@ -132,17 +162,20 @@ public DistributedQueryRunner build() queryRunner.installPlugin(new TpchPlugin()); queryRunner.createCatalog("tpch", "tpch"); - queryRunner.installPlugin(new IcebergPlugin()); - Map icebergProperties = new HashMap<>(this.icebergProperties.buildOrThrow()); - String catalogType = icebergProperties.get("iceberg.catalog.type"); - Path dataDir = metastoreDirectory.map(File::toPath).orElseGet(() -> queryRunner.getCoordinator().getBaseDataDir().resolve("iceberg_data")); - if (catalogType == null) { - icebergProperties.put("iceberg.catalog.type", "TESTING_FILE_METASTORE"); - icebergProperties.put("hive.metastore.catalog.dir", dataDir.toString()); + if (tpcdsCatalogEnabled) { + queryRunner.installPlugin(new TpcdsPlugin()); + queryRunner.createCatalog("tpcds", "tpcds"); } - queryRunner.createCatalog(ICEBERG_CATALOG, "iceberg", icebergProperties); - schemaInitializer.orElseGet(() -> SchemaInitializer.builder().build()).accept(queryRunner); + if (icebergProperties.buildOrThrow().keySet().stream().noneMatch(key -> + key.equals("fs.hadoop.enabled") || key.startsWith("fs.native-"))) { + icebergProperties.put("fs.hadoop.enabled", "true"); + } + + Path dataDir = metastoreDirectory.map(File::toPath).orElseGet(() -> queryRunner.getCoordinator().getBaseDataDir().resolve("iceberg_data")); + queryRunner.installPlugin(new TestingIcebergPlugin(dataDir)); + queryRunner.createCatalog(ICEBERG_CATALOG, "iceberg", icebergProperties.buildOrThrow()); + schemaInitializer.ifPresent(initializer -> initializer.accept(queryRunner)); return queryRunner; } @@ -153,6 +186,13 @@ public 
DistributedQueryRunner build() } } + private static Builder icebergQueryRunnerMainBuilder() + { + return IcebergQueryRunner.builder() + .addCoordinatorProperty("http-server.http.port", "8080") + .setTpcdsCatalogEnabled(true); + } + public static final class IcebergRestQueryRunnerMain { private IcebergRestQueryRunnerMain() {} @@ -160,8 +200,8 @@ private IcebergRestQueryRunnerMain() {} public static void main(String[] args) throws Exception { - File warehouseLocation = Files.newTemporaryFolder(); - warehouseLocation.deleteOnExit(); + Path warehouseLocation = Files.createTempDirectory(null); + warehouseLocation.toFile().deleteOnExit(); Catalog backend = backendCatalog(warehouseLocation); @@ -173,9 +213,8 @@ public static void main(String[] args) testServer.start(); @SuppressWarnings("resource") - DistributedQueryRunner queryRunner = IcebergQueryRunner.builder() - .setExtraProperties(ImmutableMap.of("http-server.http.port", "8080")) - .setBaseDataDir(Optional.of(warehouseLocation.toPath())) + QueryRunner queryRunner = icebergQueryRunnerMainBuilder() + .setBaseDataDir(Optional.of(warehouseLocation)) .setIcebergProperties(ImmutableMap.of( "iceberg.catalog.type", "rest", "iceberg.rest-catalog.uri", testServer.getBaseUrl().toString())) @@ -188,22 +227,84 @@ public static void main(String[] args) } } - public static final class IcebergGlueQueryRunnerMain + public static final class IcebergPolarisQueryRunnerMain + { + private IcebergPolarisQueryRunnerMain() {} + + public static void main(String[] args) + throws Exception + { + Path warehouseLocation = Files.createTempDirectory(null); + warehouseLocation.toFile().deleteOnExit(); + + @SuppressWarnings("resource") + TestingPolarisCatalog polarisCatalog = new TestingPolarisCatalog(warehouseLocation.toString()); + + @SuppressWarnings("resource") + QueryRunner queryRunner = icebergQueryRunnerMainBuilder() + .setBaseDataDir(Optional.of(warehouseLocation)) + .addIcebergProperty("iceberg.catalog.type", "rest") + .addIcebergProperty("iceberg.rest-catalog.uri", polarisCatalog.restUri() + "/api/catalog") + .addIcebergProperty("iceberg.rest-catalog.warehouse", TestingPolarisCatalog.WAREHOUSE) + .addIcebergProperty("iceberg.rest-catalog.security", "OAUTH2") + .addIcebergProperty("iceberg.rest-catalog.oauth2.credential", TestingPolarisCatalog.CREDENTIAL) + .addIcebergProperty("iceberg.rest-catalog.oauth2.scope", "PRINCIPAL_ROLE:ALL") + .setInitialTables(TpchTable.getTables()) + .build(); + + Logger log = Logger.get(IcebergPolarisQueryRunnerMain.class); + log.info("======== SERVER STARTED ========"); + log.info("\n====\n%s\n====", queryRunner.getCoordinator().getBaseUrl()); + } + } + + public static final class IcebergUnityQueryRunnerMain + { + private IcebergUnityQueryRunnerMain() {} + + public static void main(String[] args) + throws Exception + { + Path warehouseLocation = Files.createTempDirectory(null); + warehouseLocation.toFile().deleteOnExit(); + + @SuppressWarnings("resource") + UnityCatalogContainer unityCatalog = new UnityCatalogContainer("unity", "tpch"); + + @SuppressWarnings("resource") + QueryRunner queryRunner = IcebergQueryRunner.builder() + .addCoordinatorProperty("http-server.http.port", "8080") + .setBaseDataDir(Optional.of(warehouseLocation)) + .addIcebergProperty("iceberg.security", "read_only") + .addIcebergProperty("iceberg.catalog.type", "rest") + .addIcebergProperty("iceberg.rest-catalog.uri", unityCatalog.uri() + "/iceberg") + .addIcebergProperty("iceberg.rest-catalog.warehouse", "unity") + 
.addIcebergProperty("iceberg.register-table-procedure.enabled", "true") + .disableSchemaInitializer() + .build(); + + unityCatalog.copyTpchTables(TpchTable.getTables()); + + Logger log = Logger.get(IcebergUnityQueryRunnerMain.class); + log.info("======== SERVER STARTED ========"); + log.info("\n====\n%s\n====", queryRunner.getCoordinator().getBaseUrl()); + } + } + + public static final class IcebergExternalQueryRunnerMain { - private IcebergGlueQueryRunnerMain() {} + private IcebergExternalQueryRunnerMain() {} public static void main(String[] args) throws Exception { - // Requires AWS credentials, which can be provided any way supported by the DefaultProviderChain - // See https://docs.aws.amazon.com/sdk-for-java/v1/developer-guide/credentials.html#credentials-default + // Please set Iceberg connector properties via VM options. e.g. -Diceberg.catalog.type=glue -D.. @SuppressWarnings("resource") - DistributedQueryRunner queryRunner = IcebergQueryRunner.builder() - .setExtraProperties(ImmutableMap.of("http-server.http.port", "8080")) - .setIcebergProperties(ImmutableMap.of("iceberg.catalog.type", "glue")) + QueryRunner queryRunner = icebergQueryRunnerMainBuilder() + .setIcebergProperties(ImmutableMap.of("iceberg.catalog.type", System.getProperty("iceberg.catalog.type"))) .build(); - Logger log = Logger.get(IcebergGlueQueryRunnerMain.class); + Logger log = Logger.get(IcebergExternalQueryRunnerMain.class); log.info("======== SERVER STARTED ========"); log.info("\n====\n%s\n====", queryRunner.getCoordinator().getBaseUrl()); } @@ -218,21 +319,22 @@ public static void main(String[] args) { String bucketName = "test-bucket"; @SuppressWarnings("resource") - HiveMinioDataLake hiveMinioDataLake = new HiveMinioDataLake(bucketName); + Hive3MinioDataLake hiveMinioDataLake = new Hive3MinioDataLake(bucketName); hiveMinioDataLake.start(); @SuppressWarnings("resource") - DistributedQueryRunner queryRunner = IcebergQueryRunner.builder() - .setCoordinatorProperties(Map.of( - "http-server.http.port", "8080")) + QueryRunner queryRunner = IcebergQueryRunner.builder() + .addCoordinatorProperty("http-server.http.port", "8080") .setIcebergProperties(Map.of( "iceberg.catalog.type", "HIVE_METASTORE", - "hive.metastore.uri", "thrift://" + hiveMinioDataLake.getHiveHadoop().getHiveMetastoreEndpoint(), - "hive.s3.aws-access-key", MINIO_ACCESS_KEY, - "hive.s3.aws-secret-key", MINIO_SECRET_KEY, - "hive.s3.endpoint", hiveMinioDataLake.getMinio().getMinioAddress(), - "hive.s3.path-style-access", "true", - "hive.s3.streaming.part-size", "5MB")) + "hive.metastore.uri", hiveMinioDataLake.getHiveHadoop().getHiveMetastoreEndpoint().toString(), + "fs.native-s3.enabled", "true", + "s3.aws-access-key", MINIO_ACCESS_KEY, + "s3.aws-secret-key", MINIO_SECRET_KEY, + "s3.region", MINIO_REGION, + "s3.endpoint", hiveMinioDataLake.getMinio().getMinioAddress(), + "s3.path-style-access", "true", + "s3.streaming.part-size", "5MB")) .setSchemaInitializer( SchemaInitializer.builder() .withSchemaName("tpch") @@ -263,17 +365,17 @@ public static void main(String[] args) minio.createBucket(bucketName); @SuppressWarnings("resource") - DistributedQueryRunner queryRunner = IcebergQueryRunner.builder() - .setCoordinatorProperties(Map.of( - "http-server.http.port", "8080")) + QueryRunner queryRunner = icebergQueryRunnerMainBuilder() .setIcebergProperties(Map.of( "iceberg.catalog.type", "TESTING_FILE_METASTORE", "hive.metastore.catalog.dir", "s3://%s/".formatted(bucketName), - "hive.s3.aws-access-key", MINIO_ACCESS_KEY, - "hive.s3.aws-secret-key", 
MINIO_SECRET_KEY, - "hive.s3.endpoint", "http://" + minio.getMinioApiEndpoint(), - "hive.s3.path-style-access", "true", - "hive.s3.streaming.part-size", "5MB")) + "fs.native-s3.enabled", "true", + "s3.aws-access-key", MINIO_ACCESS_KEY, + "s3.aws-secret-key", MINIO_SECRET_KEY, + "s3.region", MINIO_REGION, + "s3.endpoint", "http://" + minio.getMinioApiEndpoint(), + "s3.path-style-access", "true", + "s3.streaming.part-size", "5MB")) .setSchemaInitializer( SchemaInitializer.builder() .withSchemaName("tpch") @@ -294,15 +396,9 @@ private IcebergAzureQueryRunnerMain() {} public static void main(String[] args) throws Exception { - String azureContainer = requireNonNull( - System.getProperty("hive.hadoop2.azure-abfs-container"), - "System property hive.hadoop2.azure-abfs-container must be provided"); - String azureAccount = requireNonNull( - System.getProperty("hive.hadoop2.azure-abfs-account"), - "System property hive.hadoop2.azure-abfs-account must be provided"); - String azureAccessKey = requireNonNull( - System.getProperty("hive.hadoop2.azure-abfs-access-key"), - "System property hive.hadoop2.azure-abfs-access-key must be provided"); + String azureContainer = requiredNonEmptySystemProperty("testing.azure-abfs-container"); + String azureAccount = requiredNonEmptySystemProperty("testing.azure-abfs-account"); + String azureAccessKey = requiredNonEmptySystemProperty("testing.azure-abfs-access-key"); String abfsSpecificCoreSiteXmlContent = Resources.toString(Resources.getResource("hdp3.1-core-site.xml.abfs-template"), UTF_8) .replace("%ABFS_ACCESS_KEY%", azureAccessKey) @@ -321,14 +417,13 @@ public static void main(String[] args) hiveHadoop.start(); @SuppressWarnings("resource") - DistributedQueryRunner queryRunner = IcebergQueryRunner.builder() - .setCoordinatorProperties(Map.of( - "http-server.http.port", "8080")) + QueryRunner queryRunner = icebergQueryRunnerMainBuilder() .setIcebergProperties(Map.of( "iceberg.catalog.type", "HIVE_METASTORE", - "hive.metastore.uri", "thrift://" + hiveHadoop.getHiveMetastoreEndpoint(), - "hive.azure.abfs-storage-account", azureAccount, - "hive.azure.abfs-access-key", azureAccessKey)) + "hive.metastore.uri", hiveHadoop.getHiveMetastoreEndpoint().toString(), + "fs.native-azure.enabled", "true", + "azure.auth-type", "ACCESS_KEY", + "azure.access-key", azureAccessKey)) .setSchemaInitializer( SchemaInitializer.builder() .withSchemaName("tpch") @@ -350,14 +445,13 @@ private IcebergJdbcQueryRunnerMain() {} public static void main(String[] args) throws Exception { - File warehouseLocation = Files.newTemporaryFolder(); - warehouseLocation.deleteOnExit(); + Path warehouseLocation = Files.createTempDirectory(null); + warehouseLocation.toFile().deleteOnExit(); TestingIcebergJdbcServer server = new TestingIcebergJdbcServer(); @SuppressWarnings("resource") - DistributedQueryRunner queryRunner = IcebergQueryRunner.builder() - .setExtraProperties(ImmutableMap.of("http-server.http.port", "8080")) + QueryRunner queryRunner = icebergQueryRunnerMainBuilder() .setIcebergProperties(ImmutableMap.builder() .put("iceberg.catalog.type", "jdbc") .put("iceberg.jdbc-catalog.driver-class", "org.postgresql.Driver") @@ -365,7 +459,7 @@ public static void main(String[] args) .put("iceberg.jdbc-catalog.connection-user", USER) .put("iceberg.jdbc-catalog.connection-password", PASSWORD) .put("iceberg.jdbc-catalog.catalog-name", "tpch") - .put("iceberg.jdbc-catalog.default-warehouse-dir", warehouseLocation.getAbsolutePath()) + .put("iceberg.jdbc-catalog.default-warehouse-dir", 
warehouseLocation.toAbsolutePath().toString()) .buildOrThrow()) .setInitialTables(TpchTable.getTables()) .build(); @@ -376,6 +470,69 @@ public static void main(String[] args) } } + public static final class IcebergSnowflakeQueryRunnerMain + { + private IcebergSnowflakeQueryRunnerMain() {} + + public static void main(String[] args) + throws Exception + { + @SuppressWarnings("resource") + QueryRunner queryRunner = icebergQueryRunnerMainBuilder() + .setIcebergProperties(ImmutableMap.builder() + .put("iceberg.catalog.type", "snowflake") + .put("fs.native-s3.enabled", "true") + .put("s3.aws-access-key", requiredNonEmptySystemProperty("testing.snowflake.catalog.s3.access-key")) + .put("s3.aws-secret-key", requiredNonEmptySystemProperty("testing.snowflake.catalog.s3.secret-key")) + .put("s3.region", requiredNonEmptySystemProperty("testing.snowflake.catalog.s3.region")) + .put("iceberg.file-format", "PARQUET") + .put("iceberg.snowflake-catalog.account-uri", requiredNonEmptySystemProperty("testing.snowflake.catalog.account-url")) + .put("iceberg.snowflake-catalog.user", requiredNonEmptySystemProperty("testing.snowflake.catalog.user")) + .put("iceberg.snowflake-catalog.password", requiredNonEmptySystemProperty("testing.snowflake.catalog.password")) + .put("iceberg.snowflake-catalog.database", requiredNonEmptySystemProperty("testing.snowflake.catalog.database")) + .buildOrThrow()) + .setSchemaInitializer( + SchemaInitializer.builder() + .withSchemaName("tpch") // Requires schema to pre-exist as Iceberg Snowflake catalog doesn't support creating schemas + .build()) + .build(); + + Logger log = Logger.get(IcebergSnowflakeQueryRunnerMain.class); + log.info("======== SERVER STARTED ========"); + log.info("\n====\n%s\n====", queryRunner.getCoordinator().getBaseUrl()); + } + } + + public static final class IcebergNessieQueryRunnerMain + { + private IcebergNessieQueryRunnerMain() {} + + public static void main(String[] args) + throws Exception + { + NessieContainer nessieContainer = NessieContainer.builder().build(); + nessieContainer.start(); + + Path tempDir = createTempDirectory("trino_nessie_catalog"); + + @SuppressWarnings("resource") + QueryRunner queryRunner = IcebergQueryRunner.builder() + .addCoordinatorProperty("http-server.http.port", "8080") + .setBaseDataDir(Optional.of(tempDir)) + .setIcebergProperties(ImmutableMap.builder() + .put("iceberg.catalog.type", "nessie") + .put("iceberg.nessie-catalog.uri", nessieContainer.getRestApiUri()) + .put("iceberg.nessie-catalog.default-warehouse-dir", tempDir.toString()) + .buildOrThrow()) + .setInitialTables(TpchTable.getTables()) + .build(); + + Logger log = Logger.get(IcebergNessieQueryRunnerMain.class); + log.info("======== SERVER STARTED ========"); + log.info("\n====\n%s\n====", queryRunner.getCoordinator().getBaseUrl()); + } + } + public static final class DefaultIcebergQueryRunnerMain { private DefaultIcebergQueryRunnerMain() {} @@ -384,13 +541,53 @@ public static void main(String[] args) throws Exception { Logger log = Logger.get(DefaultIcebergQueryRunnerMain.class); + File metastoreDir = createTempDirectory("iceberg_query_runner").toFile(); + metastoreDir.deleteOnExit(); + @SuppressWarnings("resource") - DistributedQueryRunner queryRunner = IcebergQueryRunner.builder() - .setExtraProperties(ImmutableMap.of("http-server.http.port", "8080")) + QueryRunner queryRunner = icebergQueryRunnerMainBuilder() + .addIcebergProperty("hive.metastore.catalog.dir", metastoreDir.toURI().toString()) .setInitialTables(TpchTable.getTables()) .build(); 
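// The same builder is what test classes return from AbstractTestQueryFramework#createQueryRunner
// (see TestCloseIdleWriters below). A minimal sketch, using only Builder methods shown in this file
// and no extra catalog or filesystem properties beyond the defaults applied in Builder.build():
//
//   @Override
//   protected QueryRunner createQueryRunner()
//           throws Exception
//   {
//       return IcebergQueryRunner.builder()
//               .setInitialTables(TpchTable.getTables())
//               .build();
//   }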
log.info("======== SERVER STARTED ========"); log.info("\n====\n%s\n====", queryRunner.getCoordinator().getBaseUrl()); } } + + public static final class IcebergQueryRunnerWithTaskRetries + { + private IcebergQueryRunnerWithTaskRetries() {} + + public static void main(String[] args) + throws Exception + { + Logger log = Logger.get(IcebergQueryRunnerWithTaskRetries.class); + + File exchangeManagerDirectory = createTempDirectory("exchange_manager").toFile(); + Map exchangeManagerProperties = ImmutableMap.builder() + .put("exchange.base-directories", exchangeManagerDirectory.getAbsolutePath()) + .buildOrThrow(); + exchangeManagerDirectory.deleteOnExit(); + + File metastoreDir = createTempDirectory("iceberg_query_runner").toFile(); + metastoreDir.deleteOnExit(); + + @SuppressWarnings("resource") + QueryRunner queryRunner = icebergQueryRunnerMainBuilder() + .addIcebergProperty("hive.metastore.catalog.dir", metastoreDir.toURI().toString()) + .setExtraProperties(ImmutableMap.builder() + .put("retry-policy", "TASK") + .put("fault-tolerant-execution-task-memory", "1GB") + .buildOrThrow()) + .setInitialTables(TpchTable.getTables()) + .setAdditionalSetup(runner -> { + runner.installPlugin(new FileSystemExchangePlugin()); + runner.loadExchangeManager("filesystem", exchangeManagerProperties); + }) + .build(); + + log.info("======== SERVER STARTED ========"); + log.info("\n====\n%s\n====", queryRunner.getCoordinator().getBaseUrl()); + } + } } diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/IcebergTestUtils.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/IcebergTestUtils.java index 39199092f2c3..293a923d3b77 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/IcebergTestUtils.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/IcebergTestUtils.java @@ -13,52 +13,99 @@ */ package io.trino.plugin.iceberg; -import io.airlift.slice.Slice; import io.trino.Session; +import io.trino.filesystem.FileEntry; +import io.trino.filesystem.FileIterator; import io.trino.filesystem.Location; import io.trino.filesystem.TrinoFileSystem; import io.trino.filesystem.TrinoFileSystemFactory; import io.trino.filesystem.TrinoInputFile; -import io.trino.filesystem.local.LocalInputFile; +import io.trino.metastore.HiveMetastore; +import io.trino.metastore.HiveMetastoreFactory; +import io.trino.metastore.cache.CachingHiveMetastore; +import io.trino.orc.OrcColumn; import io.trino.orc.OrcDataSource; +import io.trino.orc.OrcPredicate; import io.trino.orc.OrcReader; import io.trino.orc.OrcReaderOptions; -import io.trino.orc.metadata.OrcColumnId; -import io.trino.orc.metadata.statistics.StringStatistics; -import io.trino.orc.metadata.statistics.StripeStatistics; +import io.trino.orc.OrcRecordReader; +import io.trino.orc.metadata.OrcType; import io.trino.parquet.ParquetReaderOptions; +import io.trino.parquet.metadata.BlockMetadata; +import io.trino.parquet.metadata.ColumnChunkMetadata; +import io.trino.parquet.metadata.ParquetMetadata; import io.trino.parquet.reader.MetadataReader; -import io.trino.plugin.hive.FileFormatDataSourceStats; +import io.trino.plugin.base.metrics.FileFormatDataSourceStats; +import io.trino.plugin.hive.TrinoViewHiveMetastore; +import io.trino.plugin.hive.orc.OrcReaderConfig; +import io.trino.plugin.hive.orc.OrcWriterConfig; +import io.trino.plugin.hive.parquet.ParquetReaderConfig; +import io.trino.plugin.hive.parquet.ParquetWriterConfig; import io.trino.plugin.hive.parquet.TrinoParquetDataSource; -import 
io.trino.testing.DistributedQueryRunner; -import org.apache.parquet.hadoop.metadata.BlockMetaData; -import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; -import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import io.trino.plugin.iceberg.catalog.IcebergTableOperationsProvider; +import io.trino.plugin.iceberg.catalog.TrinoCatalog; +import io.trino.plugin.iceberg.catalog.file.FileMetastoreTableOperationsProvider; +import io.trino.plugin.iceberg.catalog.hms.TrinoHiveCatalog; +import io.trino.plugin.iceberg.fileio.ForwardingFileIoFactory; +import io.trino.plugin.iceberg.fileio.ForwardingInputFile; +import io.trino.spi.block.Block; +import io.trino.spi.catalog.CatalogName; +import io.trino.spi.connector.ConnectorSession; +import io.trino.spi.connector.SchemaTableName; +import io.trino.spi.connector.SourcePage; +import io.trino.spi.type.TestingTypeManager; +import io.trino.spi.type.Type; +import io.trino.testing.QueryRunner; +import io.trino.testing.TestingConnectorSession; +import org.apache.iceberg.BaseTable; +import org.apache.iceberg.TableMetadata; +import org.apache.iceberg.TableMetadataParser; -import java.io.File; import java.io.IOException; import java.io.UncheckedIOException; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.Optional; import java.util.function.Supplier; import static com.google.common.base.Verify.verify; -import static com.google.common.collect.ImmutableList.toImmutableList; import static com.google.common.collect.Iterators.getOnlyElement; import static com.google.common.collect.MoreCollectors.onlyElement; +import static com.google.common.util.concurrent.MoreExecutors.directExecutor; +import static com.google.common.util.concurrent.MoreExecutors.newDirectExecutorService; +import static io.trino.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext; +import static io.trino.metastore.cache.CachingHiveMetastore.createPerTransactionCache; +import static io.trino.orc.OrcReader.INITIAL_BATCH_SIZE; import static io.trino.plugin.iceberg.IcebergQueryRunner.ICEBERG_CATALOG; +import static io.trino.plugin.iceberg.IcebergUtil.loadIcebergTable; +import static io.trino.plugin.iceberg.util.FileOperationUtils.FileType.METADATA_JSON; +import static io.trino.plugin.iceberg.util.FileOperationUtils.FileType.fromFilePath; +import static io.trino.spi.type.TypeUtils.readNativeValue; +import static io.trino.spi.type.VarcharType.VARCHAR; +import static org.joda.time.DateTimeZone.UTC; public final class IcebergTestUtils { - private IcebergTestUtils() - { } + public static final ConnectorSession SESSION = TestingConnectorSession.builder() + .setPropertyMetadata(new IcebergSessionProperties( + new IcebergConfig(), + new OrcReaderConfig(), + new OrcWriterConfig(), + new ParquetReaderConfig(), + new ParquetWriterConfig()).getSessionProperties()) + .build(); + + public static final ForwardingFileIoFactory FILE_IO_FACTORY = new ForwardingFileIoFactory(newDirectExecutorService()); + + private IcebergTestUtils() {} public static Session withSmallRowGroups(Session session) { return Session.builder(session) - .setCatalogSessionProperty("iceberg", "orc_writer_max_stripe_rows", "10") + .setCatalogSessionProperty("iceberg", "orc_writer_max_stripe_rows", "20") .setCatalogSessionProperty("iceberg", "parquet_writer_block_size", "1kB") - .setCatalogSessionProperty("iceberg", "parquet_writer_batch_size", "10") + .setCatalogSessionProperty("iceberg", "parquet_writer_batch_size", "20") .build(); } @@ -79,33 +126,31 @@ private static boolean 
checkOrcFileSorting(Supplier dataSourceSup OrcReaderOptions readerOptions = new OrcReaderOptions(); try (OrcDataSource dataSource = dataSourceSupplier.get()) { OrcReader orcReader = OrcReader.createOrcReader(dataSource, readerOptions).orElseThrow(); - String previousMax = null; - OrcColumnId sortColumnId = orcReader.getRootColumn().getNestedColumns().stream() + OrcColumn sortColumn = orcReader.getRootColumn().getNestedColumns().stream() .filter(column -> column.getColumnName().equals(sortColumnName)) - .collect(onlyElement()) - .getColumnId(); - List statistics = orcReader.getMetadata().getStripeStatsList().stream() - .map(Optional::orElseThrow) - .collect(toImmutableList()); - verify(statistics.size() > 1, "Test must produce at least two row groups"); - - for (StripeStatistics stripeStatistics : statistics) { - // TODO: This only works if the sort column is a String - StringStatistics columnStatistics = stripeStatistics.getColumnStatistics().get(sortColumnId).getStringStatistics(); - - Slice minValue = columnStatistics.getMin(); - Slice maxValue = columnStatistics.getMax(); - if (minValue == null || maxValue == null) { - throw new IllegalStateException("ORC files must produce min/max stripe statistics"); - } - - if (previousMax != null && previousMax.compareTo(minValue.toStringUtf8()) > 0) { - return false; + .collect(onlyElement()); + Type sortColumnType = getType(sortColumn.getColumnType().getOrcTypeKind()); + try (OrcRecordReader recordReader = orcReader.createRecordReader( + List.of(sortColumn), + List.of(sortColumnType), + false, + OrcPredicate.TRUE, + UTC, + newSimpleAggregatedMemoryContext(), + INITIAL_BATCH_SIZE, + RuntimeException::new)) { + Comparable previousMax = null; + for (SourcePage page = recordReader.nextPage(); page != null; page = recordReader.nextPage()) { + Block block = page.getBlock(0); + for (int position = 0; position < block.getPositionCount(); position++) { + Comparable current = (Comparable) readNativeValue(sortColumnType, block, position); + if (previousMax != null && previousMax.compareTo(current) > 0) { + return false; + } + previousMax = current; + } } - - previousMax = maxValue.toStringUtf8(); } - return true; } catch (IOException e) { @@ -113,28 +158,30 @@ private static boolean checkOrcFileSorting(Supplier dataSourceSup } } - public static boolean checkParquetFileSorting(String path, String sortColumnName) + private static Type getType(OrcType.OrcTypeKind orcTypeKind) { - return checkParquetFileSorting(new LocalInputFile(new File(path)), sortColumnName); + return switch (orcTypeKind) { + case OrcType.OrcTypeKind.STRING, OrcType.OrcTypeKind.VARCHAR -> VARCHAR; + default -> throw new IllegalArgumentException("Unsupported orc type: " + orcTypeKind); + }; } @SuppressWarnings({"unchecked", "rawtypes"}) public static boolean checkParquetFileSorting(TrinoInputFile inputFile, String sortColumnName) { - ParquetMetadata parquetMetadata; + ParquetMetadata parquetMetadata = getParquetFileMetadata(inputFile); + List blocks; try { - parquetMetadata = MetadataReader.readFooter( - new TrinoParquetDataSource(inputFile, new ParquetReaderOptions(), new FileFormatDataSourceStats()), - Optional.empty()); + blocks = parquetMetadata.getBlocks(); } catch (IOException e) { throw new UncheckedIOException(e); } Comparable previousMax = null; - verify(parquetMetadata.getBlocks().size() > 1, "Test must produce at least two row groups"); - for (BlockMetaData blockMetaData : parquetMetadata.getBlocks()) { - ColumnChunkMetaData columnMetadata = blockMetaData.getColumns().stream() + 
verify(blocks.size() > 1, "Test must produce at least two row groups"); + for (BlockMetadata blockMetaData : blocks) { + ColumnChunkMetadata columnMetadata = blockMetaData.columns().stream() .filter(column -> getOnlyElement(column.getPath().iterator()).equalsIgnoreCase(sortColumnName)) .collect(onlyElement()); if (previousMax != null) { @@ -147,9 +194,74 @@ public static boolean checkParquetFileSorting(TrinoInputFile inputFile, String s return true; } - public static TrinoFileSystemFactory getFileSystemFactory(DistributedQueryRunner queryRunner) + public static TrinoFileSystemFactory getFileSystemFactory(QueryRunner queryRunner) { return ((IcebergConnector) queryRunner.getCoordinator().getConnector(ICEBERG_CATALOG)) .getInjector().getInstance(TrinoFileSystemFactory.class); } + + public static HiveMetastore getHiveMetastore(QueryRunner queryRunner) + { + return ((IcebergConnector) queryRunner.getCoordinator().getConnector(ICEBERG_CATALOG)).getInjector() + .getInstance(HiveMetastoreFactory.class) + .createMetastore(Optional.empty()); + } + + public static BaseTable loadTable(String tableName, + HiveMetastore metastore, + TrinoFileSystemFactory fileSystemFactory, + String catalogName, + String schemaName) + { + IcebergTableOperationsProvider tableOperationsProvider = new FileMetastoreTableOperationsProvider(fileSystemFactory, FILE_IO_FACTORY); + TrinoCatalog catalog = getTrinoCatalog(metastore, fileSystemFactory, catalogName); + return loadIcebergTable(catalog, tableOperationsProvider, SESSION, new SchemaTableName(schemaName, tableName)); + } + + public static TrinoCatalog getTrinoCatalog( + HiveMetastore metastore, + TrinoFileSystemFactory fileSystemFactory, + String catalogName) + { + IcebergTableOperationsProvider tableOperationsProvider = new FileMetastoreTableOperationsProvider(fileSystemFactory, FILE_IO_FACTORY); + CachingHiveMetastore cachingHiveMetastore = createPerTransactionCache(metastore, 1000); + return new TrinoHiveCatalog( + new CatalogName(catalogName), + cachingHiveMetastore, + new TrinoViewHiveMetastore(cachingHiveMetastore, false, "trino-version", "test"), + fileSystemFactory, + FILE_IO_FACTORY, + new TestingTypeManager(), + tableOperationsProvider, + false, + false, + false, + new IcebergConfig().isHideMaterializedViewStorageTable(), + directExecutor()); + } + + public static Map getMetadataFileAndUpdatedMillis(TrinoFileSystem trinoFileSystem, String tableLocation) + throws IOException + { + FileIterator fileIterator = trinoFileSystem.listFiles(Location.of(tableLocation + "/metadata")); + Map metadataFiles = new HashMap<>(); + while (fileIterator.hasNext()) { + FileEntry entry = fileIterator.next(); + if (fromFilePath(entry.location().path()) == METADATA_JSON) { + TableMetadata tableMetadata = TableMetadataParser.read(null, new ForwardingInputFile(trinoFileSystem.newInputFile(entry.location()))); + metadataFiles.put(entry.location().path(), tableMetadata.lastUpdatedMillis()); + } + } + return metadataFiles; + } + + public static ParquetMetadata getParquetFileMetadata(TrinoInputFile inputFile) + { + try (TrinoParquetDataSource dataSource = new TrinoParquetDataSource(inputFile, ParquetReaderOptions.defaultOptions(), new FileFormatDataSourceStats())) { + return MetadataReader.readFooter(dataSource); + } + catch (IOException e) { + throw new UncheckedIOException(e); + } + } } diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/SchemaInitializer.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/SchemaInitializer.java index 
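getMetadataFileAndUpdatedMillis pairs every metadata JSON file under <tableLocation>/metadata with the lastUpdatedMillis recorded inside it. A simplified sketch of the same bookkeeping, assuming the default uncompressed *.metadata.json naming and a caller-supplied reader in place of TableMetadataParser (all names below are illustrative, not Trino or Iceberg APIs):

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.function.ToLongFunction;

// Collect path -> lastUpdatedMillis for the table metadata files only, skipping
// manifests, manifest lists and statistics files that live in the same directory.
final class MetadataTimestampsSketch
{
    private MetadataTimestampsSketch() {}

    static Map<String, Long> metadataFileTimestamps(List<String> metadataDirectoryFiles, ToLongFunction<String> lastUpdatedMillisReader)
    {
        Map<String, Long> result = new HashMap<>();
        for (String path : metadataDirectoryFiles) {
            if (path.endsWith(".metadata.json")) {
                result.put(path, lastUpdatedMillisReader.applyAsLong(path));
            }
        }
        return result;
    }
}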
b153f83b3fc2..0000e931da1c 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/SchemaInitializer.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/SchemaInitializer.java @@ -51,8 +51,8 @@ public void accept(QueryRunner queryRunner) String schemaProperties = this.schemaProperties.entrySet().stream() .map(entry -> entry.getKey() + " = " + entry.getValue()) .collect(Collectors.joining(", ", " WITH ( ", " )")); - queryRunner.execute("CREATE SCHEMA IF NOT EXISTS " + schemaName + (this.schemaProperties.size() > 0 ? schemaProperties : "")); - copyTpchTables(queryRunner, "tpch", TINY_SCHEMA_NAME, queryRunner.getDefaultSession(), clonedTpchTables); + queryRunner.execute("CREATE SCHEMA IF NOT EXISTS \"" + schemaName + "\"" + (this.schemaProperties.size() > 0 ? schemaProperties : "")); + copyTpchTables(queryRunner, "tpch", TINY_SCHEMA_NAME, clonedTpchTables); } public static Builder builder() diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestCloseIdleWriters.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestCloseIdleWriters.java new file mode 100644 index 000000000000..9377c0ca19ea --- /dev/null +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestCloseIdleWriters.java @@ -0,0 +1,99 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.iceberg; + +import com.google.common.collect.ImmutableMap; +import io.trino.Session; +import io.trino.testing.AbstractTestQueryFramework; +import io.trino.testing.QueryRunner; +import org.intellij.lang.annotations.Language; +import org.junit.jupiter.api.Test; + +import static io.trino.SystemSessionProperties.IDLE_WRITER_MIN_DATA_SIZE_THRESHOLD; +import static io.trino.SystemSessionProperties.SCALE_WRITERS; +import static io.trino.SystemSessionProperties.TASK_MAX_WRITER_COUNT; +import static io.trino.SystemSessionProperties.TASK_MIN_WRITER_COUNT; +import static io.trino.SystemSessionProperties.TASK_SCALE_WRITERS_ENABLED; +import static io.trino.testing.TestingNames.randomNameSuffix; +import static org.assertj.core.api.Assertions.assertThat; + +public class TestCloseIdleWriters + extends AbstractTestQueryFramework +{ + @Override + protected QueryRunner createQueryRunner() + throws Exception + { + return IcebergQueryRunner.builder() + .setWorkerCount(0) + // Set the target max file size to 100GB so that we don't close writers due to file size in append + // page. 
+ .setIcebergProperties(ImmutableMap.of( + "iceberg.target-max-file-size", "100GB", + "iceberg.idle-writer-min-file-size", "0.1MB")) + .build(); + } + + @Test + public void testCloseIdleWriters() + { + String sourceTable = "tpch.\"sf0.1\".lineitem"; + String targetTable = "task_close_idle_writers_" + randomNameSuffix(); + try { + String zeroShipModes = "'AIR', 'FOB', 'SHIP', 'TRUCK'"; + String oneShipModes = "'MAIL', 'RAIL', 'REG AIR'"; + String bothShipModes = zeroShipModes + ", " + oneShipModes; + + long expectedCount = (long) computeScalar("SELECT count (*) FROM %s WHERE shipmode IN (%s)".formatted(sourceTable, bothShipModes)); + + // Create a table with two partitions (0 and 1). Using the order by trick we will write the partitions in + // this order 0, 1, and then again 0. This way we are sure that during partition 1 write there will + // be an idle writer for partition 0. Additionally, during second partition 0 write, there will be an idle + // writer for partition 1. + @Language("SQL") String createTableSql = + """ + CREATE TABLE %s WITH (format = 'ORC', partitioning = ARRAY['shipmodeVal']) + AS SELECT orderkey, partkey, suppkey, linenumber, quantity, extendedprice, + discount, tax, returnflag, linestatus, commitdate, receiptdate, shipinstruct, + comment, shipdate, + CASE + WHEN shipmode IN (%s) THEN 0 + WHEN shipmode IN (%s) THEN 1 + END AS shipmodeVal + FROM %s + WHERE shipmode IN (%s) + ORDER BY shipmode + LIMIT %s + """.formatted(targetTable, zeroShipModes, oneShipModes, sourceTable, bothShipModes, expectedCount); + + // Disable all kind of scaling and set low idle writer threshold + assertUpdate( + Session.builder(getSession()) + .setSystemProperty(SCALE_WRITERS, "false") + .setSystemProperty(TASK_SCALE_WRITERS_ENABLED, "false") + .setSystemProperty(TASK_MAX_WRITER_COUNT, "1") + .setSystemProperty(TASK_MIN_WRITER_COUNT, "1") + .setSystemProperty(IDLE_WRITER_MIN_DATA_SIZE_THRESHOLD, "0.1MB") + .build(), + createTableSql, + expectedCount); + long files = (long) computeScalar("SELECT count(DISTINCT \"$path\") FROM " + targetTable); + // There should more than 2 files since we triggered close idle writers. + assertThat(files).isGreaterThan(2); + } + finally { + assertUpdate("DROP TABLE IF EXISTS " + targetTable); + } + } +} diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestFileBasedConflictDetection.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestFileBasedConflictDetection.java new file mode 100644 index 000000000000..d78343ba7d3e --- /dev/null +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestFileBasedConflictDetection.java @@ -0,0 +1,292 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.iceberg; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import io.trino.spi.connector.CatalogHandle; +import io.trino.spi.predicate.Domain; +import io.trino.spi.predicate.TupleDomain; +import io.trino.spi.type.RowType; +import org.apache.hadoop.conf.Configuration; +import org.apache.iceberg.Metrics; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.PartitionSpecParser; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SchemaParser; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.Table; +import org.apache.iceberg.hadoop.HadoopTables; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; + +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.OptionalLong; +import java.util.stream.Stream; + +import static com.google.common.collect.ImmutableList.toImmutableList; +import static io.trino.plugin.iceberg.ColumnIdentity.TypeCategory.PRIMITIVE; +import static io.trino.plugin.iceberg.ColumnIdentity.TypeCategory.STRUCT; +import static io.trino.plugin.iceberg.IcebergMetadata.extractTupleDomainsFromCommitTasks; +import static io.trino.spi.type.IntegerType.INTEGER; +import static io.trino.testing.TestingNames.randomNameSuffix; +import static org.apache.iceberg.FileContent.DATA; +import static org.apache.iceberg.FileContent.POSITION_DELETES; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.assertj.core.api.Assertions.assertThat; + +class TestFileBasedConflictDetection +{ + private static final HadoopTables HADOOP_TABLES = new HadoopTables(new Configuration(false)); + private static final String COLUMN_1_NAME = "col1"; + private static final ColumnIdentity COLUMN_1_IDENTITY = new ColumnIdentity(1, COLUMN_1_NAME, PRIMITIVE, ImmutableList.of()); + private static final IcebergColumnHandle COLUMN_1_HANDLE = IcebergColumnHandle.optional(COLUMN_1_IDENTITY).columnType(INTEGER).build(); + private static final String COLUMN_2_NAME = "part"; + private static final ColumnIdentity COLUMN_2_IDENTITY = new ColumnIdentity(2, COLUMN_2_NAME, PRIMITIVE, ImmutableList.of()); + private static final IcebergColumnHandle COLUMN_2_HANDLE = IcebergColumnHandle.optional(COLUMN_2_IDENTITY).columnType(INTEGER).build(); + private static final String CHILD_COLUMN_NAME = "child"; + private static final ColumnIdentity CHILD_COLUMN_IDENTITY = new ColumnIdentity(4, CHILD_COLUMN_NAME, PRIMITIVE, ImmutableList.of()); + private static final String PARENT_COLUMN_NAME = "parent"; + private static final ColumnIdentity PARENT_COLUMN_IDENTITY = new ColumnIdentity(3, PARENT_COLUMN_NAME, STRUCT, ImmutableList.of(CHILD_COLUMN_IDENTITY)); + private static final IcebergColumnHandle CHILD_COLUMN_HANDLE = IcebergColumnHandle.optional(PARENT_COLUMN_IDENTITY) + .fieldType(RowType.rowType(new RowType.Field(Optional.of(CHILD_COLUMN_NAME), INTEGER)), INTEGER) + .path(CHILD_COLUMN_IDENTITY.getId()) + .build(); + + private static final Schema TABLE_SCHEMA = new Schema( + optional(COLUMN_1_IDENTITY.getId(), COLUMN_1_NAME, Types.IntegerType.get()), + optional(COLUMN_2_IDENTITY.getId(), COLUMN_2_NAME, Types.IntegerType.get()), + optional( + PARENT_COLUMN_IDENTITY.getId(), + PARENT_COLUMN_NAME, + Types.StructType.of(optional(CHILD_COLUMN_IDENTITY.getId(), CHILD_COLUMN_NAME, Types.IntegerType.get())))); + + @Test + void testConflictDetectionOnNonPartitionedTable() + { + PartitionSpec 
partitionSpec = PartitionSpec.unpartitioned(); + Table icebergTable = createIcebergTable(partitionSpec); + + List commitTasks = getCommitTaskDataForUpdate(partitionSpec, Optional.empty()); + TupleDomain icebergColumnHandleTupleDomain = extractTupleDomainsFromCommitTasks(getIcebergTableHandle(partitionSpec), icebergTable, commitTasks, null); + assertThat(icebergColumnHandleTupleDomain.getDomains().orElseThrow()).isEmpty(); + + dropIcebergTable(icebergTable); + } + + @Test + void testConflictDetectionOnPartitionedTable() + { + PartitionSpec partitionSpec = PartitionSpec.builderFor(TABLE_SCHEMA) + .identity(COLUMN_2_NAME) + .build(); + Table icebergTable = createIcebergTable(partitionSpec); + + String partitionDataJson = + """ + {"partitionValues":[40]} + """; + Map expectedDomains = Map.of(COLUMN_2_HANDLE, Domain.singleValue(INTEGER, 40L)); + List commitTasks = getCommitTaskDataForUpdate(partitionSpec, Optional.of(partitionDataJson)); + TupleDomain icebergColumnHandleTupleDomain = extractTupleDomainsFromCommitTasks(getIcebergTableHandle(partitionSpec), icebergTable, commitTasks, null); + assertThat(icebergColumnHandleTupleDomain.getDomains().orElseThrow()).isEqualTo(expectedDomains); + + dropIcebergTable(icebergTable); + } + + @Test + void testConflictDetectionOnPartitionedTableWithMultiplePartitionValues() + { + PartitionSpec partitionSpec = PartitionSpec.builderFor(TABLE_SCHEMA) + .identity(COLUMN_2_NAME) + .build(); + Table icebergTable = createIcebergTable(partitionSpec); + + String partitionDataJson1 = + """ + {"partitionValues":[40]} + """; + String partitionDataJson2 = + """ + {"partitionValues":[50]} + """; + Map expectedDomains = Map.of(COLUMN_2_HANDLE, Domain.multipleValues(INTEGER, ImmutableList.of(40L, 50L))); + // Create commit tasks for updates in two partitions, with values 40 and 50 + List commitTasks = Stream.concat(getCommitTaskDataForUpdate(partitionSpec, Optional.of(partitionDataJson1)).stream(), + getCommitTaskDataForUpdate(partitionSpec, Optional.of(partitionDataJson2)).stream()).collect(toImmutableList()); + TupleDomain icebergColumnHandleTupleDomain = extractTupleDomainsFromCommitTasks(getIcebergTableHandle(partitionSpec), icebergTable, commitTasks, null); + assertThat(icebergColumnHandleTupleDomain.getDomains().orElseThrow()).isEqualTo(expectedDomains); + + dropIcebergTable(icebergTable); + } + + @Test + void testConflictDetectionOnNestedPartitionedTable() + { + PartitionSpec partitionSpec = PartitionSpec.builderFor(TABLE_SCHEMA) + .identity(PARENT_COLUMN_NAME + "." 
+ CHILD_COLUMN_NAME) + .build(); + Table icebergTable = createIcebergTable(partitionSpec); + + String partitionDataJson = + """ + {"partitionValues":[40]} + """; + Map expectedDomains = Map.of(CHILD_COLUMN_HANDLE, Domain.singleValue(INTEGER, 40L)); + List commitTasks = getCommitTaskDataForUpdate(partitionSpec, Optional.of(partitionDataJson)); + TupleDomain icebergColumnHandleTupleDomain = extractTupleDomainsFromCommitTasks(getIcebergTableHandle(partitionSpec), icebergTable, commitTasks, null); + assertThat(icebergColumnHandleTupleDomain.getDomains().orElseThrow()).isEqualTo(expectedDomains); + + dropIcebergTable(icebergTable); + } + + @Test + void testConflictDetectionOnTableWithTwoPartitions() + { + PartitionSpec partitionSpec = PartitionSpec.builderFor(TABLE_SCHEMA) + .identity(COLUMN_2_NAME) + .identity(COLUMN_1_NAME) + .build(); + Table icebergTable = createIcebergTable(partitionSpec); + + String partitionDataJson = + """ + {"partitionValues":[40, 12]} + """; + Map expectedDomains = Map.of(COLUMN_2_HANDLE, Domain.singleValue(INTEGER, 40L), COLUMN_1_HANDLE, Domain.singleValue(INTEGER, 12L)); + List commitTasks = getCommitTaskDataForUpdate(partitionSpec, Optional.of(partitionDataJson)); + TupleDomain icebergColumnHandleTupleDomain = extractTupleDomainsFromCommitTasks(getIcebergTableHandle(partitionSpec), icebergTable, commitTasks, null); + assertThat(icebergColumnHandleTupleDomain.getDomains().orElseThrow()).isEqualTo(expectedDomains); + + dropIcebergTable(icebergTable); + } + + @Test + void testConflictDetectionOnTableWithTwoPartitionsAndMissingPartitionData() + { + PartitionSpec partitionSpec = PartitionSpec.builderFor(TABLE_SCHEMA) + .identity(COLUMN_2_NAME) + .identity(COLUMN_1_NAME) + .build(); + Table icebergTable = createIcebergTable(partitionSpec); + + String partitionDataJson = + """ + {"partitionValues":[40]} + """; + Map expectedDomains = Map.of(COLUMN_2_HANDLE, Domain.singleValue(INTEGER, 40L), COLUMN_1_HANDLE, Domain.onlyNull(INTEGER)); + List commitTasks = getCommitTaskDataForUpdate(partitionSpec, Optional.of(partitionDataJson)); + TupleDomain icebergColumnHandleTupleDomain = extractTupleDomainsFromCommitTasks(getIcebergTableHandle(partitionSpec), icebergTable, commitTasks, null); + assertThat(icebergColumnHandleTupleDomain.getDomains().orElseThrow()).isEqualTo(expectedDomains); + + dropIcebergTable(icebergTable); + } + + @Test + void testConflictDetectionOnEvolvedTable() + { + PartitionSpec previousPartitionSpec = PartitionSpec.builderFor(TABLE_SCHEMA) + .identity(COLUMN_1_NAME) + .build(); + PartitionSpec currentPartitionSpec = PartitionSpec.builderFor(TABLE_SCHEMA) + .identity(COLUMN_2_NAME) + .build(); + Table icebergTable = createIcebergTable(currentPartitionSpec); + + String partitionDataJson = + """ + {"partitionValues":[40]} + """; + CommitTaskData commitTaskData1 = new CommitTaskData("test_location/data/new.parquet", IcebergFileFormat.PARQUET, 0, new MetricsWrapper(new Metrics()), PartitionSpecParser.toJson(currentPartitionSpec), + Optional.of(partitionDataJson), DATA, Optional.empty(), Optional.empty()); + // Remove file from version with previous partition specification + CommitTaskData commitTaskData2 = new CommitTaskData("test_location/data/old.parquet", IcebergFileFormat.PARQUET, 0, new MetricsWrapper(new Metrics()), PartitionSpecParser.toJson(previousPartitionSpec), + Optional.of(partitionDataJson), POSITION_DELETES, Optional.empty(), Optional.empty()); + TupleDomain icebergColumnHandleTupleDomain = 
extractTupleDomainsFromCommitTasks(getIcebergTableHandle(currentPartitionSpec), icebergTable, List.of(commitTaskData1, commitTaskData2), null); + assertThat(icebergColumnHandleTupleDomain.getDomains().orElseThrow()).isEmpty(); + + dropIcebergTable(icebergTable); + } + + private static List getCommitTaskDataForUpdate(PartitionSpec partitionSpec, Optional partitionDataJson) + { + // Update operation contains two commit tasks + CommitTaskData commitTaskData1 = new CommitTaskData( + "test_location/data/new.parquet", + IcebergFileFormat.PARQUET, + 0, + new MetricsWrapper(new Metrics()), + PartitionSpecParser.toJson(partitionSpec), + partitionDataJson, + DATA, + Optional.empty(), + Optional.empty()); + CommitTaskData commitTaskData2 = new CommitTaskData( + "test_location/data/old.parquet", + IcebergFileFormat.PARQUET, + 0, + new MetricsWrapper(new Metrics()), + PartitionSpecParser.toJson(partitionSpec), + partitionDataJson, + POSITION_DELETES, + Optional.empty(), + Optional.empty()); + + return List.of(commitTaskData1, commitTaskData2); + } + + private static IcebergTableHandle getIcebergTableHandle(PartitionSpec partitionSpec) + { + String partitionSpecJson = PartitionSpecParser.toJson(partitionSpec); + return new IcebergTableHandle( + CatalogHandle.fromId("iceberg:NORMAL:v12345"), + "schemaName", + "tableName", + TableType.DATA, + Optional.empty(), + SchemaParser.toJson(TABLE_SCHEMA), + Optional.of(partitionSpecJson), + 1, + TupleDomain.all(), + TupleDomain.all(), + OptionalLong.empty(), + ImmutableSet.of(), + Optional.empty(), + "dummy_table_location", + ImmutableMap.of(), + Optional.empty(), + false, + Optional.empty(), + ImmutableSet.of(), + Optional.of(false)); + } + + private static Table createIcebergTable(PartitionSpec partitionSpec) + { + return HADOOP_TABLES.create( + TABLE_SCHEMA, + partitionSpec, + SortOrder.unsorted(), + ImmutableMap.of("write.format.default", "ORC"), + "table_location" + randomNameSuffix()); + } + + private static void dropIcebergTable(Table icebergTable) + { + HADOOP_TABLES.dropTable(icebergTable.location()); + } +} diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergAbfsConnectorSmokeTest.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergAbfsConnectorSmokeTest.java index 73498030c5bd..a25412152ae3 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergAbfsConnectorSmokeTest.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergAbfsConnectorSmokeTest.java @@ -16,12 +16,11 @@ import com.google.common.collect.ImmutableMap; import com.google.common.io.Resources; import io.trino.filesystem.Location; +import io.trino.metastore.HiveMetastore; import io.trino.plugin.hive.containers.HiveHadoop; -import io.trino.plugin.hive.metastore.HiveMetastore; import io.trino.plugin.hive.metastore.thrift.BridgingHiveMetastore; import io.trino.testing.QueryRunner; -import org.testng.annotations.Parameters; -import org.testng.annotations.Test; +import org.junit.jupiter.api.Test; import java.nio.file.Path; import java.nio.file.attribute.FileAttribute; @@ -33,10 +32,10 @@ import static io.trino.plugin.hive.TestingThriftHiveMetastoreBuilder.testingThriftHiveMetastoreBuilder; import static io.trino.plugin.iceberg.IcebergTestUtils.checkOrcFileSorting; import static io.trino.testing.TestingNames.randomNameSuffix; +import static io.trino.testing.TestingProperties.requiredNonEmptySystemProperty; import static java.lang.String.format; import static 
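These cases pin down what extractTupleDomainsFromCommitTasks is expected to return for identity partitioning: a single-value domain per partition column when one partition value is touched, a multi-value domain when several are, and an empty domain map when the table is unpartitioned or a commit task references an older partition spec. A rough sketch of the expected shapes, keyed by column name for brevity (the real code keys by IcebergColumnHandle):

import io.trino.spi.predicate.Domain;
import io.trino.spi.predicate.TupleDomain;

import java.util.List;
import java.util.Map;

import static io.trino.spi.type.IntegerType.INTEGER;

// Expected conflict-detection domains for identity-partitioned writes (illustrative keys).
final class ExpectedDomainsSketch
{
    private ExpectedDomainsSketch() {}

    // One commit task wrote partition part = 40.
    static TupleDomain<String> singlePartition()
    {
        return TupleDomain.withColumnDomains(Map.of("part", Domain.singleValue(INTEGER, 40L)));
    }

    // Commit tasks wrote partitions part = 40 and part = 50.
    static TupleDomain<String> twoPartitions()
    {
        return TupleDomain.withColumnDomains(Map.of("part", Domain.multipleValues(INTEGER, List.of(40L, 50L))));
    }
}

An empty domain map, as in the unpartitioned and spec-evolution cases, means no partition-level narrowing can be derived from the commit tasks.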
java.nio.charset.StandardCharsets.UTF_8; import static java.util.Locale.ENGLISH; -import static java.util.Objects.requireNonNull; import static org.apache.iceberg.FileFormat.ORC; import static org.assertj.core.api.Assertions.assertThat; @@ -51,16 +50,12 @@ public class TestIcebergAbfsConnectorSmokeTest private HiveHadoop hiveHadoop; - @Parameters({ - "hive.hadoop2.azure-abfs-container", - "hive.hadoop2.azure-abfs-account", - "hive.hadoop2.azure-abfs-access-key"}) - public TestIcebergAbfsConnectorSmokeTest(String container, String account, String accessKey) + public TestIcebergAbfsConnectorSmokeTest() { super(ORC); - this.container = requireNonNull(container, "container is null"); - this.account = requireNonNull(account, "account is null"); - this.accessKey = requireNonNull(accessKey, "accessKey is null"); + this.container = requiredNonEmptySystemProperty("testing.azure-abfs-container"); + this.account = requiredNonEmptySystemProperty("testing.azure-abfs-account"); + this.accessKey = requiredNonEmptySystemProperty("testing.azure-abfs-access-key"); this.schemaName = "tpch_" + format.name().toLowerCase(ENGLISH); this.bucketName = "test-iceberg-smoke-test-" + randomNameSuffix(); } @@ -89,12 +84,14 @@ protected QueryRunner createQueryRunner() ImmutableMap.builder() .put("iceberg.file-format", format.name()) .put("iceberg.catalog.type", "HIVE_METASTORE") - .put("hive.metastore.uri", "thrift://" + hiveHadoop.getHiveMetastoreEndpoint()) - .put("hive.metastore-timeout", "1m") // read timed out sometimes happens with the default timeout - .put("hive.azure.abfs-storage-account", account) - .put("hive.azure.abfs-access-key", accessKey) + .put("hive.metastore.uri", hiveHadoop.getHiveMetastoreEndpoint().toString()) + .put("hive.metastore.thrift.client.read-timeout", "1m") // read timed out sometimes happens with the default timeout + .put("fs.native-azure.enabled", "true") + .put("azure.auth-type", "ACCESS_KEY") + .put("azure.access-key", accessKey) .put("iceberg.register-table-procedure.enabled", "true") .put("iceberg.writer-sort-buffer-size", "1MB") + .put("iceberg.allowed-extra-properties", "write.metadata.delete-after-commit.enabled,write.metadata.previous-versions-max") .buildOrThrow()) .setSchemaInitializer( SchemaInitializer.builder() @@ -126,7 +123,7 @@ protected void dropTableFromMetastore(String tableName) HiveMetastore metastore = new BridgingHiveMetastore( testingThriftHiveMetastoreBuilder() .metastoreClient(hiveHadoop.getHiveMetastoreEndpoint()) - .build()); + .build(this::closeAfterClass)); metastore.dropTable(schemaName, tableName, false); assertThat(metastore.getTable(schemaName, tableName)).isEmpty(); } @@ -137,7 +134,7 @@ protected String getMetadataLocation(String tableName) HiveMetastore metastore = new BridgingHiveMetastore( testingThriftHiveMetastoreBuilder() .metastoreClient(hiveHadoop.getHiveMetastoreEndpoint()) - .build()); + .build(this::closeAfterClass)); return metastore .getTable(schemaName, tableName).orElseThrow() .getParameters().get("metadata_location"); diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergAlluxioCacheFileOperations.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergAlluxioCacheFileOperations.java new file mode 100644 index 000000000000..b22217da6ed1 --- /dev/null +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergAlluxioCacheFileOperations.java @@ -0,0 +1,172 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in 
compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.iceberg; + +import com.google.common.collect.HashMultiset; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableMultiset; +import com.google.common.collect.Multiset; +import io.opentelemetry.sdk.trace.data.SpanData; +import io.trino.plugin.iceberg.util.FileOperationUtils.FileType; +import io.trino.testing.AbstractTestQueryFramework; +import io.trino.testing.DistributedQueryRunner; +import org.intellij.lang.annotations.Language; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.parallel.Execution; +import org.junit.jupiter.api.parallel.ExecutionMode; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Map; + +import static com.google.common.io.MoreFiles.deleteRecursively; +import static com.google.common.io.RecursiveDeleteOption.ALLOW_INSECURE; +import static io.trino.filesystem.tracing.CacheFileSystemTraceUtils.getCacheOperationSpans; +import static io.trino.filesystem.tracing.CacheFileSystemTraceUtils.getFileLocation; +import static io.trino.plugin.iceberg.IcebergQueryRunner.ICEBERG_CATALOG; +import static io.trino.plugin.iceberg.util.FileOperationUtils.FileType.DATA; +import static io.trino.plugin.iceberg.util.FileOperationUtils.FileType.MANIFEST; +import static io.trino.plugin.iceberg.util.FileOperationUtils.FileType.METADATA_JSON; +import static io.trino.plugin.iceberg.util.FileOperationUtils.FileType.SNAPSHOT; +import static io.trino.testing.MultisetAssertions.assertMultisetsEqual; +import static java.util.stream.Collectors.toCollection; + +// single-threaded as DistributedQueryRunner.spans is shared mutable state +@Execution(ExecutionMode.SAME_THREAD) +public class TestIcebergAlluxioCacheFileOperations + extends AbstractTestQueryFramework +{ + public static final String TEST_SCHEMA = "test_alluxio_schema"; + private Path cacheDirectory; + + @Override + protected DistributedQueryRunner createQueryRunner() + throws Exception + { + cacheDirectory = Files.createTempDirectory("cache"); + closeAfterClass(() -> deleteRecursively(cacheDirectory, ALLOW_INSECURE)); + Path metastoreDirectory = Files.createTempDirectory(ICEBERG_CATALOG); + closeAfterClass(() -> deleteRecursively(metastoreDirectory, ALLOW_INSECURE)); + + Map icebergProperties = ImmutableMap.builder() + .put("fs.cache.enabled", "true") + .put("fs.cache.directories", cacheDirectory.toAbsolutePath().toString()) + .put("fs.cache.max-sizes", "100MB") + .put("iceberg.metadata-cache.enabled", "false") + .put("hive.metastore.catalog.dir", metastoreDirectory.toUri().toString()) + .buildOrThrow(); + + DistributedQueryRunner queryRunner = IcebergQueryRunner.builder() + .setSchemaInitializer(SchemaInitializer.builder() + .withSchemaName(TEST_SCHEMA) + .build()) + .setIcebergProperties(icebergProperties) + .setWorkerCount(0) + .build(); + queryRunner.execute("CREATE SCHEMA IF NOT EXISTS " + TEST_SCHEMA); + return queryRunner; + } + + @Test + public void testCacheFileOperations() + { + assertUpdate("DROP TABLE IF EXISTS test_cache_file_operations"); + assertUpdate("CREATE TABLE 
test_cache_file_operations(key varchar, data varchar) with (partitioning=ARRAY['key'])"); + assertUpdate("INSERT INTO test_cache_file_operations VALUES ('p1', '1-abc')", 1); + assertUpdate("INSERT INTO test_cache_file_operations VALUES ('p2', '2-xyz')", 1); + assertFileSystemAccesses( + "SELECT * FROM test_cache_file_operations", + ImmutableMultiset.builder() + .addCopies(new CacheOperation("Input.readFully", DATA), 2) + .addCopies(new CacheOperation("Alluxio.readCached", DATA), 2) + .addCopies(new CacheOperation("Alluxio.writeCache", DATA), 2) + .add(new CacheOperation("Alluxio.readExternalStream", METADATA_JSON)) + .add(new CacheOperation("InputFile.length", METADATA_JSON)) + .add(new CacheOperation("Alluxio.readCached", METADATA_JSON)) + .add(new CacheOperation("Alluxio.writeCache", METADATA_JSON)) + .addCopies(new CacheOperation("Alluxio.readCached", SNAPSHOT), 2) + .add(new CacheOperation("InputFile.length", SNAPSHOT)) + .addCopies(new CacheOperation("Alluxio.readExternalStream", MANIFEST), 2) + .addCopies(new CacheOperation("Alluxio.readCached", MANIFEST), 4) + .addCopies(new CacheOperation("Alluxio.writeCache", MANIFEST), 2) + .build()); + + assertFileSystemAccesses( + "SELECT * FROM test_cache_file_operations", + ImmutableMultiset.builder() + .addCopies(new CacheOperation("Alluxio.readCached", DATA), 2) + .add(new CacheOperation("Alluxio.readCached", METADATA_JSON)) + .add(new CacheOperation("InputFile.length", METADATA_JSON)) + .addCopies(new CacheOperation("Alluxio.readCached", SNAPSHOT), 2) + .add(new CacheOperation("InputFile.length", SNAPSHOT)) + .addCopies(new CacheOperation("Alluxio.readCached", MANIFEST), 4) + .build()); + + assertUpdate("INSERT INTO test_cache_file_operations VALUES ('p3', '3-xyz')", 1); + assertUpdate("INSERT INTO test_cache_file_operations VALUES ('p4', '4-xyz')", 1); + assertUpdate("INSERT INTO test_cache_file_operations VALUES ('p5', '5-xyz')", 1); + + assertFileSystemAccesses( + "SELECT * FROM test_cache_file_operations", + ImmutableMultiset.builder() + .addCopies(new CacheOperation("Input.readFully", DATA), 3) + .addCopies(new CacheOperation("Alluxio.readCached", DATA), 5) + .addCopies(new CacheOperation("Alluxio.writeCache", DATA), 3) + .add(new CacheOperation("Alluxio.readExternalStream", METADATA_JSON)) + .add(new CacheOperation("InputFile.length", METADATA_JSON)) + .addCopies(new CacheOperation("Alluxio.readCached", METADATA_JSON), 2) + .add(new CacheOperation("Alluxio.writeCache", METADATA_JSON)) + .addCopies(new CacheOperation("Alluxio.readCached", SNAPSHOT), 2) + .add(new CacheOperation("InputFile.length", SNAPSHOT)) + .addCopies(new CacheOperation("Alluxio.readExternalStream", MANIFEST), 3) + .addCopies(new CacheOperation("Alluxio.readCached", MANIFEST), 10) + .addCopies(new CacheOperation("Alluxio.writeCache", MANIFEST), 3) + .build()); + assertFileSystemAccesses( + "SELECT * FROM test_cache_file_operations", + ImmutableMultiset.builder() + .addCopies(new CacheOperation("Alluxio.readCached", DATA), 5) + .addCopies(new CacheOperation("Alluxio.readCached", METADATA_JSON), 2) + .addCopies(new CacheOperation("Alluxio.readCached", SNAPSHOT), 2) + .addCopies(new CacheOperation("Alluxio.readCached", MANIFEST), 10) + .add(new CacheOperation("InputFile.length", METADATA_JSON)) + .add(new CacheOperation("InputFile.length", SNAPSHOT)) + .build()); + } + + private void assertFileSystemAccesses(@Language("SQL") String query, Multiset expectedCacheAccesses) + { + DistributedQueryRunner queryRunner = getDistributedQueryRunner(); + 
queryRunner.executeWithPlan(queryRunner.getDefaultSession(), query); + assertMultisetsEqual(getCacheOperations(), expectedCacheAccesses); + } + + private Multiset getCacheOperations() + { + return getCacheOperationSpans(getQueryRunner()) + .stream() + .filter(span -> !span.getName().startsWith("InputFile.newStream")) + .map(CacheOperation::create) + .collect(toCollection(HashMultiset::create)); + } + + private record CacheOperation(String operationName, FileType fileType) + { + public static CacheOperation create(SpanData span) + { + String path = getFileLocation(span); + return new CacheOperation(span.getName(), FileType.fromFilePath(path)); + } + } +} diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergAvroConnectorTest.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergAvroConnectorTest.java index 1c9d7538b960..d85a8fe78043 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergAvroConnectorTest.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergAvroConnectorTest.java @@ -13,9 +13,10 @@ */ package io.trino.plugin.iceberg; -import org.testng.SkipException; +import org.junit.jupiter.api.Test; import static io.trino.plugin.iceberg.IcebergFileFormat.AVRO; +import static org.junit.jupiter.api.Assumptions.abort; public class TestIcebergAvroConnectorTest extends BaseIcebergConnectorTest @@ -37,15 +38,22 @@ protected boolean supportsRowGroupStatistics(String typeName) return false; } + @Test @Override public void testIncorrectIcebergFileSizes() { - throw new SkipException("Avro does not do tail reads"); + abort("Avro does not do tail reads"); } @Override protected boolean isFileSorted(String path, String sortColumnName) { - throw new SkipException("Unimplemented"); + return abort("Unimplemented"); + } + + @Override + protected boolean supportsPhysicalPushdown() + { + return false; } } diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergBucketing.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergBucketing.java index b9902e821509..1697b2ffdbff 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergBucketing.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergBucketing.java @@ -30,8 +30,9 @@ import org.apache.iceberg.types.Types.DecimalType; import org.apache.iceberg.types.Types.DoubleType; import org.apache.iceberg.types.Types.FloatType; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.MethodSource; import java.math.BigDecimal; import java.nio.ByteBuffer; @@ -69,8 +70,8 @@ import static java.lang.String.format; import static java.time.ZoneOffset.UTC; import static org.apache.iceberg.types.Type.TypeID.DECIMAL; -import static org.assertj.core.api.AssertionsForClassTypes.assertThatThrownBy; -import static org.testng.Assert.assertEquals; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; public class TestIcebergBucketing { @@ -187,7 +188,8 @@ public void testBucketingSpecValues() assertBucketAndHashEquals("binary", ByteBuffer.wrap(new byte[] {0x00, 0x01, 0x02, 0x03}), -188683207 & Integer.MAX_VALUE); } - @Test(dataProvider = "unsupportedBucketingTypes") + @ParameterizedTest + @MethodSource("unsupportedBucketingTypes") public void 
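The cache test asserts on operations as a multiset, so both the kind of access (readCached, writeCache, readExternalStream) and how many times it happens are checked. A small self-contained illustration of that comparison style, independent of the tracing spans (Operation and the helper names below are made up):

import com.google.common.collect.HashMultiset;
import com.google.common.collect.ImmutableMultiset;
import com.google.common.collect.Multiset;

import java.util.List;

import static org.assertj.core.api.Assertions.assertThat;

// Comparing multisets checks operation counts, not just which operations occurred.
record Operation(String name, String fileType) {}

final class MultisetAssertSketch
{
    private MultisetAssertSketch() {}

    static void assertOperations(List<Operation> observed, Multiset<Operation> expected)
    {
        Multiset<Operation> actual = HashMultiset.create(observed);
        assertThat(actual).isEqualTo(expected);
    }

    public static void main(String[] args)
    {
        assertOperations(
                List.of(new Operation("Alluxio.readCached", "DATA"), new Operation("Alluxio.readCached", "DATA")),
                ImmutableMultiset.of(new Operation("Alluxio.readCached", "DATA"), new Operation("Alluxio.readCached", "DATA")));
    }
}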
testUnsupportedTypes(Type type) { assertThatThrownBy(() -> computeIcebergBucket(type, null, 1)) @@ -197,8 +199,7 @@ public void testUnsupportedTypes(Type type) .hasMessage("Unsupported type for 'bucket': %s", toTrinoType(type, TYPE_MANAGER)); } - @DataProvider - public Object[][] unsupportedBucketingTypes() + public static Object[][] unsupportedBucketingTypes() { return new Object[][] { {BooleanType.get()}, @@ -237,10 +238,9 @@ private void assertBucketNumberEquals(Type icebergType, Object icebergValue, int Integer icebergBucket = computeIcebergBucket(icebergType, icebergValue, bucketCount); Integer trinoBucket = computeTrinoBucket(icebergType, icebergValue, bucketCount); - assertEquals( - trinoBucket, - icebergBucket, - format("icebergType=%s, bucketCount=%s, icebergBucket=%d, trinoBucket=%d;", icebergType, bucketCount, icebergBucket, trinoBucket)); + assertThat(trinoBucket) + .describedAs(format("icebergType=%s, bucketCount=%s, icebergBucket=%d, trinoBucket=%d;", icebergType, bucketCount, icebergBucket, trinoBucket)) + .isEqualTo(icebergBucket); } private void assertHashEquals(Type icebergType, Object icebergValue, Integer expectedHash) @@ -252,16 +252,14 @@ private void assertHashEquals(Type icebergType, Object icebergValue, Integer exp Integer trinoBucketHash = computeTrinoBucket(icebergType, icebergValue, Integer.MAX_VALUE); // Ensure hash is stable and does not change - assertEquals( - icebergBucketHash, - expectedHash, - format("expected Iceberg %s(%s) bucket with %sd buckets to be %d, got %d", icebergType, icebergValue, Integer.MAX_VALUE, expectedHash, icebergBucketHash)); + assertThat(icebergBucketHash) + .describedAs(format("expected Iceberg %s(%s) bucket with %sd buckets to be %d, got %d", icebergType, icebergValue, Integer.MAX_VALUE, expectedHash, icebergBucketHash)) + .isEqualTo(expectedHash); // Ensure hash is stable and does not change - assertEquals( - trinoBucketHash, - expectedHash, - format("expected Trino %s(%s) bucket with %sd buckets to be %d, got %d", icebergType, icebergValue, Integer.MAX_VALUE, expectedHash, trinoBucketHash)); + assertThat(trinoBucketHash) + .describedAs(format("expected Trino %s(%s) bucket with %sd buckets to be %d, got %d", icebergType, icebergValue, Integer.MAX_VALUE, expectedHash, trinoBucketHash)) + .isEqualTo(expectedHash); } private Integer computeIcebergBucket(Type type, Object icebergValue, int bucketCount) @@ -274,7 +272,7 @@ private Integer computeTrinoBucket(Type icebergType, Object icebergValue, int bu { io.trino.spi.type.Type trinoType = toTrinoType(icebergType, TYPE_MANAGER); ColumnTransform transform = PartitionTransforms.bucket(trinoType, bucketCount); - Function blockTransform = transform.getBlockTransform(); + Function blockTransform = transform.blockTransform(); BlockBuilder blockBuilder = trinoType.createBlockBuilder(null, 1); @@ -285,11 +283,11 @@ private Integer computeTrinoBucket(Type icebergType, Object icebergValue, int bu Block bucketBlock = blockTransform.apply(block); verify(bucketBlock.getPositionCount() == 1); - Integer trinoBucketWithBlock = bucketBlock.isNull(0) ? null : bucketBlock.getInt(0, 0); + Integer trinoBucketWithBlock = bucketBlock.isNull(0) ? null : INTEGER.getInt(bucketBlock, 0); - Long trinoBucketWithValue = (Long) transform.getValueTransform().apply(block, 0); + Long trinoBucketWithValue = (Long) transform.valueTransform().apply(block, 0); Integer trinoBucketWithValueAsInteger = trinoBucketWithValue == null ? 
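The bucketing test shows the recurring TestNG-to-JUnit 5 conversion in this change: @Test(dataProvider = ...) plus @DataProvider becomes @ParameterizedTest plus a static @MethodSource factory, which may keep returning Object[][]. A generic, self-contained version of that pattern (the class name and data here are placeholders, not Iceberg types):

import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.MethodSource;

import static org.assertj.core.api.Assertions.assertThat;

class ParameterizedMigrationSketch
{
    // Former @DataProvider method: each Object[] supplies the arguments for one invocation
    public static Object[][] unsupportedTypes()
    {
        return new Object[][] {{"boolean"}, {"double"}};
    }

    @ParameterizedTest
    @MethodSource("unsupportedTypes")
    void testUnsupported(String typeName)
    {
        // Stand-in assertion; the real test verifies the bucket transform rejects the type
        assertThat(typeName).isNotBlank();
    }
}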
null : toIntExact(trinoBucketWithValue); - assertEquals(trinoBucketWithValueAsInteger, trinoBucketWithBlock); + assertThat(trinoBucketWithValueAsInteger).isEqualTo(trinoBucketWithBlock); return trinoBucketWithBlock; } diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergColumnHandle.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergColumnHandle.java index 01088be73cba..b09d7bebc406 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergColumnHandle.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergColumnHandle.java @@ -22,9 +22,7 @@ import io.trino.spi.type.RowType; import io.trino.spi.type.Type; import io.trino.type.TypeDeserializer; -import org.testng.annotations.Test; - -import java.util.Optional; +import org.junit.jupiter.api.Test; import static io.trino.plugin.iceberg.ColumnIdentity.TypeCategory.ARRAY; import static io.trino.plugin.iceberg.ColumnIdentity.TypeCategory.PRIMITIVE; @@ -32,14 +30,14 @@ import static io.trino.plugin.iceberg.ColumnIdentity.primitiveColumnIdentity; import static io.trino.spi.type.BigintType.BIGINT; import static io.trino.type.InternalTypeManager.TESTING_TYPE_MANAGER; -import static org.testng.Assert.assertEquals; +import static org.assertj.core.api.Assertions.assertThat; public class TestIcebergColumnHandle { @Test public void testRoundTrip() { - testRoundTrip(new IcebergColumnHandle(primitiveColumnIdentity(12, "blah"), BIGINT, ImmutableList.of(), BIGINT, Optional.of("this is a comment"))); + testRoundTrip(IcebergColumnHandle.optional(primitiveColumnIdentity(12, "blah")).columnType(BIGINT).comment("this is a comment").build()); // Nested column ColumnIdentity foo1 = new ColumnIdentity(1, "foo1", PRIMITIVE, ImmutableList.of()); @@ -48,28 +46,23 @@ public void testRoundTrip() Type nestedColumnType = RowType.from(ImmutableList.of( RowType.field("foo2", BIGINT), RowType.field("foo3", new ArrayType(BIGINT)))); - IcebergColumnHandle nestedColumn = new IcebergColumnHandle( - new ColumnIdentity( + IcebergColumnHandle nestedColumn = IcebergColumnHandle.optional(new ColumnIdentity( 5, "foo5", STRUCT, - ImmutableList.of(foo2, foo3)), - nestedColumnType, - ImmutableList.of(), - nestedColumnType, - Optional.empty()); + ImmutableList.of(foo2, foo3))) + .columnType(nestedColumnType) + .build(); testRoundTrip(nestedColumn); - IcebergColumnHandle partialColumn = new IcebergColumnHandle( - new ColumnIdentity( + IcebergColumnHandle partialColumn = IcebergColumnHandle.optional(new ColumnIdentity( 5, "foo5", STRUCT, - ImmutableList.of(foo2, foo3)), - nestedColumnType, - ImmutableList.of(2), - BIGINT, - Optional.empty()); + ImmutableList.of(foo2, foo3))) + .fieldType(nestedColumnType, BIGINT) + .path(2) + .build(); testRoundTrip(partialColumn); } @@ -82,14 +75,14 @@ private void testRoundTrip(IcebergColumnHandle expected) String json = codec.toJson(expected); IcebergColumnHandle actual = codec.fromJson(json); - assertEquals(actual, expected); - assertEquals(actual.getBaseColumnIdentity(), expected.getBaseColumnIdentity()); - assertEquals(actual.getBaseType(), expected.getBaseType()); - assertEquals(actual.getQualifiedName(), expected.getQualifiedName()); - assertEquals(actual.getName(), expected.getName()); - assertEquals(actual.getColumnIdentity(), expected.getColumnIdentity()); - assertEquals(actual.getId(), actual.getId()); - assertEquals(actual.getType(), expected.getType()); - assertEquals(actual.getComment(), expected.getComment()); + 
assertThat(actual).isEqualTo(expected); + assertThat(actual.getBaseColumnIdentity()).isEqualTo(expected.getBaseColumnIdentity()); + assertThat(actual.getBaseType()).isEqualTo(expected.getBaseType()); + assertThat(actual.getQualifiedName()).isEqualTo(expected.getQualifiedName()); + assertThat(actual.getName()).isEqualTo(expected.getName()); + assertThat(actual.getColumnIdentity()).isEqualTo(expected.getColumnIdentity()); + assertThat(actual.getId()).isEqualTo(expected.getId()); + assertThat(actual.getType()).isEqualTo(expected.getType()); + assertThat(actual.getComment()).isEqualTo(expected.getComment()); } } diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergConfig.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergConfig.java index 5711641500e5..d7410a93f069 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergConfig.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergConfig.java @@ -13,27 +13,31 @@ */ package io.trino.plugin.iceberg; +import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; import io.airlift.units.DataSize; import io.airlift.units.Duration; -import io.trino.plugin.hive.HiveCompressionCodec; -import org.testng.annotations.Test; +import io.trino.plugin.hive.HiveCompressionOption; +import jakarta.validation.constraints.AssertFalse; +import org.junit.jupiter.api.Test; import java.util.Map; import static io.airlift.configuration.testing.ConfigAssertions.assertFullMapping; import static io.airlift.configuration.testing.ConfigAssertions.assertRecordedDefaults; import static io.airlift.configuration.testing.ConfigAssertions.recordDefaults; +import static io.airlift.testing.ValidationAssertions.assertFailsValidation; import static io.airlift.units.DataSize.Unit.GIGABYTE; import static io.airlift.units.DataSize.Unit.MEGABYTE; -import static io.trino.plugin.hive.HiveCompressionCodec.ZSTD; +import static io.trino.plugin.hive.HiveCompressionOption.ZSTD; import static io.trino.plugin.iceberg.CatalogType.GLUE; import static io.trino.plugin.iceberg.CatalogType.HIVE_METASTORE; import static io.trino.plugin.iceberg.IcebergFileFormat.ORC; import static io.trino.plugin.iceberg.IcebergFileFormat.PARQUET; import static java.util.concurrent.TimeUnit.DAYS; import static java.util.concurrent.TimeUnit.HOURS; -import static java.util.concurrent.TimeUnit.MINUTES; +import static java.util.concurrent.TimeUnit.SECONDS; public class TestIcebergConfig { @@ -43,11 +47,12 @@ public void testDefaults() assertRecordedDefaults(recordDefaults(IcebergConfig.class) .setFileFormat(PARQUET) .setCompressionCodec(ZSTD) + .setMaxCommitRetry(null) .setUseFileSizeFromMetadata(true) .setMaxPartitionsPerWriter(100) .setUniqueTableLocation(true) .setCatalogType(HIVE_METASTORE) - .setDynamicFilteringWaitTimeout(new Duration(0, MINUTES)) + .setDynamicFilteringWaitTimeout(new Duration(1, SECONDS)) .setTableStatisticsEnabled(true) .setExtendedStatisticsEnabled(true) .setCollectExtendedStatisticsOnWrite(true) @@ -58,10 +63,26 @@ public void testDefaults() .setRemoveOrphanFilesMinRetention(new Duration(7, DAYS)) .setDeleteSchemaLocationsFallback(false) .setTargetMaxFileSize(DataSize.of(1, GIGABYTE)) + .setIdleWriterMinFileSize(DataSize.of(16, MEGABYTE)) .setMinimumAssignedSplitWeight(0.05) + .setHideMaterializedViewStorageTable(true) .setMaterializedViewsStorageSchema(null) .setRegisterTableProcedureEnabled(false) - 
.setSortedWritingEnabled(true)); + .setAddFilesProcedureEnabled(false) + .setSortedWritingEnabled(true) + .setQueryPartitionFilterRequired(false) + .setQueryPartitionFilterRequiredSchemas(ImmutableSet.of()) + .setSplitManagerThreads(Integer.toString(Math.min(Runtime.getRuntime().availableProcessors() * 2, 32))) + .setPlanningThreads(Integer.toString(Math.min(Runtime.getRuntime().availableProcessors(), 16))) + .setFileDeleteThreads(Integer.toString(Runtime.getRuntime().availableProcessors() * 2)) + .setAllowedExtraProperties(ImmutableList.of()) + .setIncrementalRefreshEnabled(true) + .setMetadataCacheEnabled(true) + .setIncrementalRefreshEnabled(true) + .setObjectStoreLayoutEnabled(false) + .setMetadataParallelism(8) + .setBucketExecutionEnabled(true) + .setFileBasedConflictDetectionEnabled(true)); } @Test @@ -70,6 +91,7 @@ public void testExplicitPropertyMappings() Map properties = ImmutableMap.builder() .put("iceberg.file-format", "ORC") .put("iceberg.compression-codec", "NONE") + .put("iceberg.max-commit-retry", "100") .put("iceberg.use-file-size-from-metadata", "false") .put("iceberg.max-partitions-per-writer", "222") .put("iceberg.unique-table-location", "false") @@ -81,19 +103,35 @@ public void testExplicitPropertyMappings() .put("iceberg.projection-pushdown-enabled", "false") .put("iceberg.hive-catalog-name", "hive") .put("iceberg.format-version", "1") - .put("iceberg.expire_snapshots.min-retention", "13h") - .put("iceberg.remove_orphan_files.min-retention", "14h") + .put("iceberg.expire-snapshots.min-retention", "13h") + .put("iceberg.remove-orphan-files.min-retention", "14h") .put("iceberg.delete-schema-locations-fallback", "true") .put("iceberg.target-max-file-size", "1MB") + .put("iceberg.idle-writer-min-file-size", "1MB") .put("iceberg.minimum-assigned-split-weight", "0.01") + .put("iceberg.materialized-views.hide-storage-table", "false") .put("iceberg.materialized-views.storage-schema", "mv_storage_schema") .put("iceberg.register-table-procedure.enabled", "true") + .put("iceberg.add-files-procedure.enabled", "true") .put("iceberg.sorted-writing-enabled", "false") + .put("iceberg.query-partition-filter-required", "true") + .put("iceberg.query-partition-filter-required-schemas", "bronze,silver") + .put("iceberg.split-manager-threads", "42") + .put("iceberg.planning-threads", "42") + .put("iceberg.file-delete-threads", "42") + .put("iceberg.allowed-extra-properties", "propX,propY") + .put("iceberg.incremental-refresh-enabled", "false") + .put("iceberg.metadata-cache.enabled", "false") + .put("iceberg.object-store-layout.enabled", "true") + .put("iceberg.metadata.parallelism", "10") + .put("iceberg.bucket-execution", "false") + .put("iceberg.file-based-conflict-detection", "false") .buildOrThrow(); IcebergConfig expected = new IcebergConfig() .setFileFormat(ORC) - .setCompressionCodec(HiveCompressionCodec.NONE) + .setCompressionCodec(HiveCompressionOption.NONE) + .setMaxCommitRetry(100) .setUseFileSizeFromMetadata(false) .setMaxPartitionsPerWriter(222) .setUniqueTableLocation(false) @@ -109,11 +147,39 @@ public void testExplicitPropertyMappings() .setRemoveOrphanFilesMinRetention(new Duration(14, HOURS)) .setDeleteSchemaLocationsFallback(true) .setTargetMaxFileSize(DataSize.of(1, MEGABYTE)) + .setIdleWriterMinFileSize(DataSize.of(1, MEGABYTE)) .setMinimumAssignedSplitWeight(0.01) + .setHideMaterializedViewStorageTable(false) .setMaterializedViewsStorageSchema("mv_storage_schema") .setRegisterTableProcedureEnabled(true) - .setSortedWritingEnabled(false); + 
.setAddFilesProcedureEnabled(true) + .setSortedWritingEnabled(false) + .setQueryPartitionFilterRequired(true) + .setQueryPartitionFilterRequiredSchemas(ImmutableSet.of("bronze", "silver")) + .setSplitManagerThreads("42") + .setPlanningThreads("42") + .setFileDeleteThreads("42") + .setAllowedExtraProperties(ImmutableList.of("propX", "propY")) + .setIncrementalRefreshEnabled(false) + .setMetadataCacheEnabled(false) + .setIncrementalRefreshEnabled(false) + .setObjectStoreLayoutEnabled(true) + .setMetadataParallelism(10) + .setBucketExecutionEnabled(false) + .setFileBasedConflictDetectionEnabled(false); assertFullMapping(properties, expected); } + + @Test + public void testValidation() + { + assertFailsValidation( + new IcebergConfig() + .setHideMaterializedViewStorageTable(true) + .setMaterializedViewsStorageSchema("storage_schema"), + "storageSchemaSetWhenHidingIsEnabled", + "iceberg.materialized-views.storage-schema may only be set when iceberg.materialized-views.hide-storage-table is set to false", + AssertFalse.class); + } } diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergConnectorFactory.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergConnectorFactory.java index 21efb5afb94d..de9461e920d0 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergConnectorFactory.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergConnectorFactory.java @@ -16,35 +16,25 @@ import com.google.common.collect.ImmutableMap; import io.trino.spi.connector.ConnectorFactory; import io.trino.testing.TestingConnectorContext; -import org.testng.annotations.Test; +import org.junit.jupiter.api.Test; import java.util.Map; -import static org.assertj.core.api.Assertions.assertThatThrownBy; - public class TestIcebergConnectorFactory { @Test public void testBasicConfig() { - Map config = ImmutableMap.of("hive.metastore.uri", "thrift://localhost:1234"); + Map config = ImmutableMap.of( + "hive.metastore.uri", "thrift://localhost:1234", + "bootstrap.quiet", "true"); createConnector(config); } - @Test - public void testCachingHiveMetastore() - { - Map config = ImmutableMap.builder() - .put("hive.metastore.uri", "thrift://localhost:1234") - .put("hive.metastore-cache-ttl", "5m") - .buildOrThrow(); - assertThatThrownBy(() -> createConnector(config)) - .hasMessageContaining("Hive metastore caching must not be enabled for Iceberg"); - } - private static void createConnector(Map config) { ConnectorFactory factory = new IcebergConnectorFactory(); - factory.create("test", config, new TestingConnectorContext()); + factory.create("test", config, new TestingConnectorContext()) + .shutdown(); } } diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergConnectorSmokeTest.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergConnectorSmokeTest.java index 99a4c1d96595..c698ba1434a4 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergConnectorSmokeTest.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergConnectorSmokeTest.java @@ -15,31 +15,31 @@ import com.google.common.collect.ImmutableMap; import io.trino.filesystem.Location; -import io.trino.plugin.hive.metastore.HiveMetastore; +import io.trino.metastore.HiveMetastore; import io.trino.testing.QueryRunner; -import org.testng.annotations.AfterClass; +import io.trino.testing.sql.TestTable; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.TestInstance; 
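TestIcebergConfig follows airlift's ConfigAssertions pattern, which is why every new property (for example iceberg.idle-writer-min-file-size or iceberg.max-commit-retry) shows up twice: once in the recorded defaults and once in the explicit property mapping. A condensed, self-contained sketch of that pattern with a hypothetical ExampleConfig (the property name and default below are invented for illustration):

import com.google.common.collect.ImmutableMap;
import io.airlift.configuration.Config;
import io.airlift.units.DataSize;
import org.junit.jupiter.api.Test;

import java.util.Map;

import static io.airlift.configuration.testing.ConfigAssertions.assertFullMapping;
import static io.airlift.configuration.testing.ConfigAssertions.assertRecordedDefaults;
import static io.airlift.configuration.testing.ConfigAssertions.recordDefaults;
import static io.airlift.units.DataSize.Unit.MEGABYTE;

class TestExampleConfig
{
    public static class ExampleConfig
    {
        private DataSize idleWriterMinFileSize = DataSize.of(16, MEGABYTE);

        public DataSize getIdleWriterMinFileSize()
        {
            return idleWriterMinFileSize;
        }

        @Config("example.idle-writer-min-file-size")
        public ExampleConfig setIdleWriterMinFileSize(DataSize idleWriterMinFileSize)
        {
            this.idleWriterMinFileSize = idleWriterMinFileSize;
            return this;
        }
    }

    @Test
    void testDefaults()
    {
        // Recorded defaults must match a freshly constructed config object
        assertRecordedDefaults(recordDefaults(ExampleConfig.class)
                .setIdleWriterMinFileSize(DataSize.of(16, MEGABYTE)));
    }

    @Test
    void testExplicitPropertyMappings()
    {
        // Values are deliberately different from the defaults so the mapping is actually exercised
        Map<String, String> properties = ImmutableMap.of("example.idle-writer-min-file-size", "1MB");
        ExampleConfig expected = new ExampleConfig()
                .setIdleWriterMinFileSize(DataSize.of(1, MEGABYTE));
        assertFullMapping(properties, expected);
    }
}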
-import java.io.File; import java.io.IOException; import java.io.UncheckedIOException; -import java.nio.file.Files; -import java.nio.file.Path; -import static com.google.common.io.MoreFiles.deleteRecursively; -import static com.google.common.io.RecursiveDeleteOption.ALLOW_INSECURE; -import static io.trino.plugin.hive.metastore.file.TestingFileHiveMetastore.createTestingFileHiveMetastore; import static io.trino.plugin.iceberg.IcebergTestUtils.checkOrcFileSorting; -import static java.lang.String.format; +import static io.trino.plugin.iceberg.IcebergTestUtils.getHiveMetastore; +import static io.trino.tpch.TpchTable.NATION; +import static io.trino.tpch.TpchTable.ORDERS; +import static io.trino.tpch.TpchTable.REGION; import static org.apache.iceberg.FileFormat.ORC; import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.jupiter.api.TestInstance.Lifecycle.PER_CLASS; // Redundant over TestIcebergOrcConnectorTest, but exists to exercise BaseConnectorSmokeTest // Some features like materialized views may be supported by Iceberg only. +@TestInstance(PER_CLASS) public class TestIcebergConnectorSmokeTest extends BaseIcebergConnectorSmokeTest { private HiveMetastore metastore; - private File metastoreDir; public TestIcebergConnectorSmokeTest() { @@ -50,24 +50,16 @@ public TestIcebergConnectorSmokeTest() protected QueryRunner createQueryRunner() throws Exception { - this.metastoreDir = Files.createTempDirectory("test_iceberg_table_smoke_test").toFile(); - this.metastoreDir.deleteOnExit(); - this.metastore = createTestingFileHiveMetastore(metastoreDir); - return IcebergQueryRunner.builder() - .setInitialTables(REQUIRED_TPCH_TABLES) - .setMetastoreDirectory(metastoreDir) + QueryRunner queryRunner = IcebergQueryRunner.builder() + .setInitialTables(NATION, ORDERS, REGION) .setIcebergProperties(ImmutableMap.of( "iceberg.file-format", format.name(), "iceberg.register-table-procedure.enabled", "true", - "iceberg.writer-sort-buffer-size", "1MB")) + "iceberg.writer-sort-buffer-size", "1MB", + "iceberg.allowed-extra-properties", "write.metadata.delete-after-commit.enabled,write.metadata.previous-versions-max")) .build(); - } - - @AfterClass(alwaysRun = true) - public void tearDown() - throws IOException - { - deleteRecursively(metastoreDir.toPath(), ALLOW_INSECURE); + metastore = getHiveMetastore(queryRunner); + return queryRunner; } @Override @@ -88,20 +80,25 @@ protected String getMetadataLocation(String tableName) @Override protected String schemaPath() { - return format("%s/%s", metastoreDir, getSession().getSchema().orElseThrow()); + return "local:///%s".formatted(getSession().getSchema().orElseThrow()); } @Override protected boolean locationExists(String location) { - return Files.exists(Path.of(location)); + try { + return fileSystem.newInputFile(Location.of(location)).exists(); + } + catch (IOException e) { + throw new UncheckedIOException(e); + } } @Override protected void deleteDirectory(String location) { try { - deleteRecursively(Path.of(location), ALLOW_INSECURE); + fileSystem.deleteDirectory(Location.of(location)); } catch (IOException e) { throw new UncheckedIOException(e); @@ -113,4 +110,46 @@ protected boolean isFileSorted(Location path, String sortColumnName) { return checkOrcFileSorting(fileSystem, path, sortColumnName); } + + @Test + public void testRowConstructorColumnLimitForMergeQuery() + { + String[] colNames = {"orderkey", "custkey", "orderstatus", "totalprice", "orderpriority", "clerk", "shippriority", "comment", "orderdate"}; + String[] colTypes = {"bigint", 
"bigint", "varchar", "decimal(12,2)", "varchar", "varchar", "int", "varchar", "date"}; + String tableDefinition = "("; + String columns = "("; + String selectQuery = "select "; + String notMatchedClause = ""; + String matchedClause = ""; + // Creating merge query with 325 columns + for (int i = 0; i < 36; i++) { + for (int j = 0; j < 9; j++) { + String columnName = colNames[j]; + String columnType = colTypes[j]; + tableDefinition += columnName + "_" + i + " " + columnType + ","; + selectQuery += columnName + " " + columnName + "_" + i + ","; + columns += columnName + "_" + i + ","; + notMatchedClause += "s." + columnName + "_" + i + ","; + matchedClause += columnName + "_" + i + " = s." + columnName + "_" + i + ","; + } + } + tableDefinition += "orderkey bigint, custkey bigint, orderstatus varchar, totalprice decimal(12,2), orderpriority varchar) "; + selectQuery += "orderkey, custkey, orderstatus, totalprice, orderpriority from orders limit 1 "; + columns += "orderkey, custkey, orderstatus, totalprice, orderpriority) "; + notMatchedClause += "s.orderkey, s.custkey, s.orderstatus, s.totalprice, s.orderpriority "; + matchedClause += "orderkey = s.orderkey, custkey = s.custkey, orderstatus = s.orderstatus, totalprice = t.totalprice, orderpriority = s.orderpriority "; + TestTable table = newTrinoTable("test_merge_", tableDefinition); + assertUpdate("INSERT INTO " + table.getName() + " " + columns + " " + selectQuery, 1); + TestTable mergeTable = newTrinoTable("test_table_", tableDefinition); + assertUpdate("INSERT INTO " + mergeTable.getName() + " " + columns + " " + selectQuery, 1); + assertUpdate( + """ + MERGE INTO %s t + USING (select * from %s ) s + ON (t.orderkey = s.orderkey) + WHEN MATCHED THEN UPDATE SET %s + WHEN NOT MATCHED THEN INSERT VALUES (%s) + """.formatted(mergeTable.getName(), table.getName(), matchedClause, notMatchedClause), + 1); + } } diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergDisabledRegisterTableProcedure.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergDisabledRegisterTableProcedure.java index ecdc95bdc158..502a248d612a 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergDisabledRegisterTableProcedure.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergDisabledRegisterTableProcedure.java @@ -15,7 +15,7 @@ import io.trino.testing.AbstractTestQueryFramework; import io.trino.testing.QueryRunner; -import org.testng.annotations.Test; +import org.junit.jupiter.api.Test; public class TestIcebergDisabledRegisterTableProcedure extends AbstractTestQueryFramework diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergDynamicPartitionPruningTest.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergDynamicPartitionPruningTest.java index 7d7b7f05f9ca..dc43ba7d0f0c 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergDynamicPartitionPruningTest.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergDynamicPartitionPruningTest.java @@ -13,10 +13,10 @@ */ package io.trino.plugin.iceberg; +import com.google.common.collect.ImmutableList; import io.trino.testing.BaseDynamicPartitionPruningTest; import io.trino.testing.QueryRunner; import org.intellij.lang.annotations.Language; -import org.testng.SkipException; import java.util.List; import java.util.Map; @@ -38,12 +38,6 @@ protected QueryRunner createQueryRunner() .build(); } - @Override - public void 
testJoinDynamicFilteringMultiJoinOnBucketedTables() - { - throw new SkipException("Iceberg does not support bucketing"); - } - @Override protected void createLineitemTable(String tableName, List columns, List partitionColumns) { @@ -69,6 +63,15 @@ protected void createPartitionedTable(String tableName, List columns, Li @Override protected void createPartitionedAndBucketedTable(String tableName, List columns, List partitionColumns, List bucketColumns) { - throw new UnsupportedOperationException(); + ImmutableList.Builder partitioning = ImmutableList.builder(); + partitionColumns.forEach(partitioning::add); + bucketColumns.forEach(column -> partitioning.add("bucket(%s,10)".formatted(column))); + + String sql = format( + "CREATE TABLE %s (%s) WITH (partitioning=ARRAY[%s])", + tableName, + String.join(",", columns), + String.join(",", partitioning.build().stream().map("'%s'"::formatted).toList())); + getQueryRunner().execute(sql); } } diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergFileFormat.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergFileFormat.java new file mode 100644 index 000000000000..c9905802151f --- /dev/null +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergFileFormat.java @@ -0,0 +1,31 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.iceberg; + +import io.trino.plugin.hive.HiveStorageFormat; +import org.junit.jupiter.api.Test; + +import static org.assertj.core.api.Assertions.assertThat; + +public class TestIcebergFileFormat +{ + @Test + public void testHumanName() + { + for (IcebergFileFormat icebergFileFormat : IcebergFileFormat.values()) { + assertThat(icebergFileFormat.humanName()) + .isEqualTo(HiveStorageFormat.valueOf(icebergFileFormat.name()).humanName()); + } + } +} diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergFileOperations.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergFileOperations.java new file mode 100644 index 000000000000..c87fb97bb09d --- /dev/null +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergFileOperations.java @@ -0,0 +1,995 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.iceberg; + +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableMultiset; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Multiset; +import io.trino.Session; +import io.trino.SystemSessionProperties; +import io.trino.filesystem.TrinoFileSystemFactory; +import io.trino.metastore.HiveMetastore; +import io.trino.plugin.iceberg.util.FileOperationUtils.Scope; +import io.trino.plugin.tpch.TpchPlugin; +import io.trino.sql.planner.plan.FilterNode; +import io.trino.testing.AbstractTestQueryFramework; +import io.trino.testing.DistributedQueryRunner; +import io.trino.testing.QueryRunner; +import org.apache.iceberg.Table; +import org.apache.iceberg.util.ThreadPools; +import org.intellij.lang.annotations.Language; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.parallel.Execution; +import org.junit.jupiter.api.parallel.ExecutionMode; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.MethodSource; + +import java.nio.file.Path; +import java.util.Optional; + +import static com.google.common.collect.ImmutableMultiset.toImmutableMultiset; +import static io.trino.SystemSessionProperties.MIN_INPUT_SIZE_PER_TASK; +import static io.trino.plugin.iceberg.IcebergQueryRunner.ICEBERG_CATALOG; +import static io.trino.plugin.iceberg.IcebergSessionProperties.COLLECT_EXTENDED_STATISTICS_ON_WRITE; +import static io.trino.plugin.iceberg.IcebergTestUtils.getFileSystemFactory; +import static io.trino.plugin.iceberg.IcebergTestUtils.getHiveMetastore; +import static io.trino.plugin.iceberg.util.EqualityDeleteUtils.writeEqualityDeleteForTable; +import static io.trino.plugin.iceberg.util.FileOperationUtils.FileOperation; +import static io.trino.plugin.iceberg.util.FileOperationUtils.FileType.DATA; +import static io.trino.plugin.iceberg.util.FileOperationUtils.FileType.DELETE; +import static io.trino.plugin.iceberg.util.FileOperationUtils.FileType.MANIFEST; +import static io.trino.plugin.iceberg.util.FileOperationUtils.FileType.METADATA_JSON; +import static io.trino.plugin.iceberg.util.FileOperationUtils.FileType.SNAPSHOT; +import static io.trino.plugin.iceberg.util.FileOperationUtils.FileType.STATS; +import static io.trino.plugin.iceberg.util.FileOperationUtils.Scope.ALL_FILES; +import static io.trino.plugin.iceberg.util.FileOperationUtils.Scope.METADATA_FILES; +import static io.trino.plugin.iceberg.util.FileOperationUtils.getOperations; +import static io.trino.testing.MultisetAssertions.assertMultisetsEqual; +import static io.trino.testing.TestingNames.randomNameSuffix; +import static io.trino.testing.TestingSession.testSessionBuilder; +import static java.lang.Math.min; +import static java.lang.String.format; +import static org.assertj.core.api.Assertions.assertThat; + +@Execution(ExecutionMode.SAME_THREAD) +public class TestIcebergFileOperations + extends AbstractTestQueryFramework +{ + private static final int MAX_PREFIXES_COUNT = 10; + + private HiveMetastore metastore; + private TrinoFileSystemFactory fileSystemFactory; + + @Override + protected QueryRunner createQueryRunner() + throws Exception + { + Session session = testSessionBuilder() + .setCatalog("iceberg") + .setSchema("test_schema") + // It is essential to disable DeterminePartitionCount rule since all queries in this test scans small + // amount of data which makes them run with single hash partition count. 
However, this test requires them + // to run over multiple nodes. + .setSystemProperty(MIN_INPUT_SIZE_PER_TASK, "0MB") + .build(); + + QueryRunner queryRunner = DistributedQueryRunner.builder(session) + // the delete test must run with a single task, so we can verify the delete file is read once per task + // currently the only way to achieve this is to set worker count to 0 + .setWorkerCount(0) + .addCoordinatorProperty("optimizer.experimental-max-prefetched-information-schema-prefixes", Integer.toString(MAX_PREFIXES_COUNT)) + .build(); + + Path dataDirectory = queryRunner.getCoordinator().getBaseDataDir().resolve("iceberg_data"); + dataDirectory.toFile().mkdirs(); + queryRunner.installPlugin(new TestingIcebergPlugin(dataDirectory)); + queryRunner.createCatalog(ICEBERG_CATALOG, "iceberg", ImmutableMap.builder() + .put("iceberg.split-manager-threads", "0") + // FS accesses with metadata cache are tested separately in io.trino.plugin.iceberg.TestIcebergMemoryCacheFileOperations + .put("iceberg.metadata-cache.enabled", "false") + .buildOrThrow()); + + metastore = getHiveMetastore(queryRunner); + + queryRunner.installPlugin(new TpchPlugin()); + queryRunner.createCatalog("tpch", "tpch"); + + queryRunner.execute("CREATE SCHEMA test_schema"); + + return queryRunner; + } + + @BeforeAll + public void initFileSystemFactory() + { + fileSystemFactory = getFileSystemFactory(getDistributedQueryRunner()); + } + + @Test + public void testCreateTable() + { + assertFileSystemAccesses("CREATE TABLE test_create (id VARCHAR, age INT)", + ImmutableMultiset.builder() + .add(new FileOperation(METADATA_JSON, "OutputFile.create")) + .add(new FileOperation(SNAPSHOT, "OutputFile.create")) + .build()); + } + + @Test + public void testCreateOrReplaceTable() + { + assertFileSystemAccesses("CREATE OR REPLACE TABLE test_create_or_replace (id VARCHAR, age INT)", + ImmutableMultiset.builder() + .add(new FileOperation(METADATA_JSON, "OutputFile.create")) + .add(new FileOperation(SNAPSHOT, "OutputFile.create")) + .build()); + assertFileSystemAccesses("CREATE OR REPLACE TABLE test_create_or_replace (id VARCHAR, age INT)", + ImmutableMultiset.builder() + .add(new FileOperation(METADATA_JSON, "OutputFile.create")) + .add(new FileOperation(METADATA_JSON, "InputFile.newStream")) + .add(new FileOperation(SNAPSHOT, "OutputFile.create")) + .build()); + } + + @Test + public void testCreateTableAsSelect() + { + assertFileSystemAccesses( + withStatsOnWrite(getSession(), false), + "CREATE TABLE test_create_as_select AS SELECT 1 col_name", + ImmutableMultiset.builder() + .add(new FileOperation(METADATA_JSON, "OutputFile.create")) + .add(new FileOperation(SNAPSHOT, "InputFile.length")) + .add(new FileOperation(SNAPSHOT, "InputFile.newStream")) + .add(new FileOperation(SNAPSHOT, "OutputFile.create")) + .add(new FileOperation(MANIFEST, "OutputFile.create")) + .build()); + + assertFileSystemAccesses( + withStatsOnWrite(getSession(), true), + "CREATE TABLE test_create_as_select_with_stats AS SELECT 1 col_name", + ImmutableMultiset.builder() + .add(new FileOperation(METADATA_JSON, "InputFile.newStream")) + .addCopies(new FileOperation(METADATA_JSON, "OutputFile.create"), 2) // TODO (https://github.com/trinodb/trino/issues/15439): it would be good to publish data and stats in one commit + .add(new FileOperation(SNAPSHOT, "InputFile.length")) + .add(new FileOperation(SNAPSHOT, "InputFile.newStream")) + .add(new FileOperation(SNAPSHOT, "OutputFile.create")) + .add(new FileOperation(MANIFEST, "OutputFile.create")) + .add(new FileOperation(STATS, 
"OutputFile.create")) + .build()); + } + + @Test + public void testCreateOrReplaceTableAsSelect() + { + assertFileSystemAccesses( + "CREATE OR REPLACE TABLE test_create_or_replace_as_select AS SELECT 1 col_name", + ImmutableMultiset.builder() + .addCopies(new FileOperation(METADATA_JSON, "OutputFile.create"), 2) + .add(new FileOperation(METADATA_JSON, "InputFile.newStream")) + .add(new FileOperation(SNAPSHOT, "InputFile.length")) + .add(new FileOperation(SNAPSHOT, "InputFile.newStream")) + .add(new FileOperation(SNAPSHOT, "OutputFile.create")) + .add(new FileOperation(MANIFEST, "OutputFile.create")) + .add(new FileOperation(STATS, "OutputFile.create")) + .build()); + + assertFileSystemAccesses( + "CREATE OR REPLACE TABLE test_create_or_replace_as_select AS SELECT 1 col_name", + ImmutableMultiset.builder() + .addCopies(new FileOperation(METADATA_JSON, "OutputFile.create"), 2) + .addCopies(new FileOperation(METADATA_JSON, "InputFile.newStream"), 2) + .add(new FileOperation(SNAPSHOT, "InputFile.newStream")) + .add(new FileOperation(SNAPSHOT, "InputFile.length")) + .add(new FileOperation(SNAPSHOT, "OutputFile.create")) + .add(new FileOperation(MANIFEST, "OutputFile.create")) + .add(new FileOperation(STATS, "OutputFile.create")) + .build()); + } + + @Test + public void testInsert() + { + assertUpdate("CREATE TABLE test_insert (id VARCHAR, age INT)"); + + assertFileSystemAccesses( + "INSERT INTO test_insert VALUES('a', 1)", + ImmutableMultiset.builder() + .addCopies(new FileOperation(METADATA_JSON, "OutputFile.create"), 2) + .addCopies(new FileOperation(METADATA_JSON, "InputFile.newStream"), 3) + .addCopies(new FileOperation(SNAPSHOT, "InputFile.length"), 3) + .addCopies(new FileOperation(SNAPSHOT, "InputFile.newStream"), 3) + .add(new FileOperation(SNAPSHOT, "OutputFile.create")) + .add(new FileOperation(MANIFEST, "OutputFile.create")) + .add(new FileOperation(STATS, "OutputFile.create")) + .build()); + + assertFileSystemAccesses( + "INSERT INTO test_insert VALUES('b', 2)", + ImmutableMultiset.builder() + .addCopies(new FileOperation(METADATA_JSON, "OutputFile.create"), 2) + .addCopies(new FileOperation(METADATA_JSON, "InputFile.newStream"), 3) + .addCopies(new FileOperation(SNAPSHOT, "InputFile.newStream"), 3) + .addCopies(new FileOperation(SNAPSHOT, "InputFile.length"), 3) + .add(new FileOperation(STATS, "InputFile.newStream")) + .add(new FileOperation(SNAPSHOT, "OutputFile.create")) + .add(new FileOperation(MANIFEST, "OutputFile.create")) + .add(new FileOperation(STATS, "OutputFile.create")) + .build()); + } + + @Test + public void testSelect() + { + assertUpdate("CREATE TABLE test_select AS SELECT 1 col_name", 1); + assertFileSystemAccesses("SELECT * FROM test_select", + ImmutableMultiset.builder() + .add(new FileOperation(METADATA_JSON, "InputFile.newStream")) + .add(new FileOperation(SNAPSHOT, "InputFile.length")) + .add(new FileOperation(SNAPSHOT, "InputFile.newStream")) + .add(new FileOperation(MANIFEST, "InputFile.newStream")) + .build()); + } + + @ParameterizedTest + @MethodSource("testSelectWithLimitDataProvider") + public void testSelectWithLimit(int numberOfFiles) + { + assertUpdate("DROP TABLE IF EXISTS test_select_with_limit"); // test is parameterized + + // Create table with multiple files + assertUpdate("CREATE TABLE test_select_with_limit(k varchar, v integer) WITH (partitioning=ARRAY['truncate(k, 1)'])"); + // 2 files per partition, numberOfFiles files in total, in numberOfFiles separate manifests (due to fastAppend) + for (int i = 0; i < numberOfFiles; i++) { + String k = 
Integer.toString(10 + i * 5); + assertUpdate("INSERT INTO test_select_with_limit VALUES ('" + k + "', " + i + ")", 1); + } + + // org.apache.iceberg.util.ParallelIterable, even if used with a direct executor, schedules 2 * ThreadPools.WORKER_THREAD_POOL_SIZE upfront + int icebergManifestPrefetching = 2 * ThreadPools.WORKER_THREAD_POOL_SIZE; + + assertFileSystemAccesses("SELECT * FROM test_select_with_limit LIMIT 3", + ImmutableMultiset.builder() + .add(new FileOperation(METADATA_JSON, "InputFile.newStream")) + .add(new FileOperation(SNAPSHOT, "InputFile.length")) + .add(new FileOperation(SNAPSHOT, "InputFile.newStream")) + .addCopies(new FileOperation(MANIFEST, "InputFile.newStream"), min(icebergManifestPrefetching, numberOfFiles)) + .build()); + + assertFileSystemAccesses("EXPLAIN SELECT * FROM test_select_with_limit LIMIT 3", + ImmutableMultiset.builder() + .add(new FileOperation(METADATA_JSON, "InputFile.newStream")) + .add(new FileOperation(SNAPSHOT, "InputFile.length")) + .add(new FileOperation(SNAPSHOT, "InputFile.newStream")) + .addCopies(new FileOperation(MANIFEST, "InputFile.newStream"), numberOfFiles) + .build()); + + assertFileSystemAccesses("EXPLAIN ANALYZE SELECT * FROM test_select_with_limit LIMIT 3", + ImmutableMultiset.builder() + .add(new FileOperation(METADATA_JSON, "InputFile.newStream")) + .add(new FileOperation(SNAPSHOT, "InputFile.length")) + .add(new FileOperation(SNAPSHOT, "InputFile.newStream")) + .addCopies(new FileOperation(MANIFEST, "InputFile.newStream"), numberOfFiles + min(icebergManifestPrefetching, numberOfFiles)) + .build()); + + assertUpdate("DROP TABLE test_select_with_limit"); + } + + public Object[][] testSelectWithLimitDataProvider() + { + return new Object[][] { + {10}, + {50}, + // 2 * ThreadPools.WORKER_THREAD_POOL_SIZE manifest is always read, so include one more data point to show this is a constant number + {2 * 2 * ThreadPools.WORKER_THREAD_POOL_SIZE + 6}, + }; + } + + @Test + public void testReadWholePartition() + { + assertUpdate("DROP TABLE IF EXISTS test_read_part_key"); + + assertUpdate("CREATE TABLE test_read_part_key(key varchar, data varchar) WITH (partitioning=ARRAY['key'])"); + + // Create multiple files per partition + assertUpdate("INSERT INTO test_read_part_key(key, data) VALUES ('p1', '1-abc'), ('p1', '1-def'), ('p2', '2-abc'), ('p2', '2-def')", 4); + assertUpdate("INSERT INTO test_read_part_key(key, data) VALUES ('p1', '1-baz'), ('p2', '2-baz')", 2); + + // Read partition and data columns + assertFileSystemAccesses( + "SELECT key, max(data) FROM test_read_part_key GROUP BY key", + ALL_FILES, + ImmutableMultiset.builder() + .addCopies(new FileOperation(MANIFEST, "InputFile.newStream"), 2) + .add(new FileOperation(METADATA_JSON, "InputFile.newStream")) + .add(new FileOperation(SNAPSHOT, "InputFile.length")) + .add(new FileOperation(SNAPSHOT, "InputFile.newStream")) + .addCopies(new FileOperation(DATA, "InputFile.newInput"), 4) + .build()); + + // Read partition column only + assertFileSystemAccesses( + "SELECT key, count(*) FROM test_read_part_key GROUP BY key", + ALL_FILES, + ImmutableMultiset.builder() + .addCopies(new FileOperation(MANIFEST, "InputFile.newStream"), 2) + .add(new FileOperation(METADATA_JSON, "InputFile.newStream")) + .add(new FileOperation(SNAPSHOT, "InputFile.length")) + .add(new FileOperation(SNAPSHOT, "InputFile.newStream")) + .build()); + + // Read partition column only, one partition only + assertFileSystemAccesses( + "SELECT count(*) FROM test_read_part_key WHERE key = 'p1'", + ALL_FILES, + 
ImmutableMultiset.builder() + .addCopies(new FileOperation(MANIFEST, "InputFile.newStream"), 2) + .add(new FileOperation(METADATA_JSON, "InputFile.newStream")) + .add(new FileOperation(SNAPSHOT, "InputFile.length")) + .add(new FileOperation(SNAPSHOT, "InputFile.newStream")) + .build()); + + // Read partition and synthetic columns + assertFileSystemAccesses( + "SELECT count(*), array_agg(\"$path\"), max(\"$file_modified_time\") FROM test_read_part_key GROUP BY key", + ALL_FILES, + ImmutableMultiset.builder() + .addCopies(new FileOperation(MANIFEST, "InputFile.newStream"), 2) + .add(new FileOperation(METADATA_JSON, "InputFile.newStream")) + .add(new FileOperation(SNAPSHOT, "InputFile.length")) + .add(new FileOperation(SNAPSHOT, "InputFile.newStream")) + // TODO return synthetic columns without opening the data files + .addCopies(new FileOperation(DATA, "InputFile.newInput"), 4) + .addCopies(new FileOperation(DATA, "InputFile.lastModified"), 4) + .build()); + + // Read only row count + assertFileSystemAccesses( + "SELECT count(*) FROM test_read_part_key", + ALL_FILES, + ImmutableMultiset.builder() + .addCopies(new FileOperation(MANIFEST, "InputFile.newStream"), 2) + .add(new FileOperation(METADATA_JSON, "InputFile.newStream")) + .add(new FileOperation(SNAPSHOT, "InputFile.length")) + .add(new FileOperation(SNAPSHOT, "InputFile.newStream")) + .build()); + + assertUpdate("DROP TABLE test_read_part_key"); + } + + @Test + public void testReadWholePartitionSplittableFile() + { + String catalog = getSession().getCatalog().orElseThrow(); + + assertUpdate("DROP TABLE IF EXISTS test_read_whole_splittable_file"); + assertUpdate("CREATE TABLE test_read_whole_splittable_file(key varchar, data varchar) WITH (partitioning=ARRAY['key'])"); + + assertUpdate( + Session.builder(getSession()) + .setSystemProperty(SystemSessionProperties.WRITER_SCALING_MIN_DATA_PROCESSED, "1PB") + .setCatalogSessionProperty(catalog, "parquet_writer_block_size", "1kB") + .setCatalogSessionProperty(catalog, "orc_writer_max_stripe_size", "1kB") + .setCatalogSessionProperty(catalog, "orc_writer_max_stripe_rows", "1000") + .build(), + "INSERT INTO test_read_whole_splittable_file SELECT 'single partition', comment FROM tpch.tiny.orders", 15000); + + Session session = Session.builder(getSession()) + .setCatalogSessionProperty(catalog, IcebergSessionProperties.SPLIT_SIZE, "1kB") + .build(); + + // Read partition column only + assertFileSystemAccesses( + session, + "SELECT key, count(*) FROM test_read_whole_splittable_file GROUP BY key", + ALL_FILES, + ImmutableMultiset.builder() + .add(new FileOperation(MANIFEST, "InputFile.newStream")) + .add(new FileOperation(METADATA_JSON, "InputFile.newStream")) + .add(new FileOperation(SNAPSHOT, "InputFile.length")) + .add(new FileOperation(SNAPSHOT, "InputFile.newStream")) + .build()); + + // Read only row count + assertFileSystemAccesses( + session, + "SELECT count(*) FROM test_read_whole_splittable_file", + ALL_FILES, + ImmutableMultiset.builder() + .add(new FileOperation(MANIFEST, "InputFile.newStream")) + .add(new FileOperation(METADATA_JSON, "InputFile.newStream")) + .add(new FileOperation(SNAPSHOT, "InputFile.length")) + .add(new FileOperation(SNAPSHOT, "InputFile.newStream")) + .build()); + + assertUpdate("DROP TABLE test_read_whole_splittable_file"); + } + + @Test + public void testSelectFromVersionedTable() + { + String tableName = "test_select_from_versioned_table"; + assertUpdate("CREATE TABLE " + tableName + " (id int, age int)"); + long v1SnapshotId = getLatestSnapshotId(tableName); + 
assertUpdate("INSERT INTO " + tableName + " VALUES (2, 20)", 1); + long v2SnapshotId = getLatestSnapshotId(tableName); + assertUpdate("INSERT INTO " + tableName + " VALUES (3, 30)", 1); + long v3SnapshotId = getLatestSnapshotId(tableName); + assertFileSystemAccesses("SELECT * FROM " + tableName + " FOR VERSION AS OF " + v1SnapshotId, + ImmutableMultiset.builder() + .add(new FileOperation(METADATA_JSON, "InputFile.newStream")) + .add(new FileOperation(SNAPSHOT, "InputFile.length")) + .add(new FileOperation(SNAPSHOT, "InputFile.newStream")) + .build()); + assertFileSystemAccesses("SELECT * FROM " + tableName + " FOR VERSION AS OF " + v2SnapshotId, + ImmutableMultiset.builder() + .add(new FileOperation(METADATA_JSON, "InputFile.newStream")) + .add(new FileOperation(SNAPSHOT, "InputFile.length")) + .add(new FileOperation(SNAPSHOT, "InputFile.newStream")) + .add(new FileOperation(MANIFEST, "InputFile.newStream")) + .build()); + assertFileSystemAccesses("SELECT * FROM " + tableName + " FOR VERSION AS OF " + v3SnapshotId, + ImmutableMultiset.builder() + .add(new FileOperation(METADATA_JSON, "InputFile.newStream")) + .add(new FileOperation(SNAPSHOT, "InputFile.length")) + .add(new FileOperation(SNAPSHOT, "InputFile.newStream")) + .addCopies(new FileOperation(MANIFEST, "InputFile.newStream"), 2) + .build()); + assertFileSystemAccesses("SELECT * FROM " + tableName, + ImmutableMultiset.builder() + .add(new FileOperation(METADATA_JSON, "InputFile.newStream")) + .add(new FileOperation(SNAPSHOT, "InputFile.length")) + .add(new FileOperation(SNAPSHOT, "InputFile.newStream")) + .addCopies(new FileOperation(MANIFEST, "InputFile.newStream"), 2) + .build()); + } + + @Test + public void testSelectFromVersionedTableWithSchemaEvolution() + { + String tableName = "test_select_from_versioned_table_with_schema_evolution"; + assertUpdate("CREATE TABLE " + tableName + " (id int, age int)"); + long v1SnapshotId = getLatestSnapshotId(tableName); + assertUpdate("INSERT INTO " + tableName + " VALUES (2, 20)", 1); + long v2SnapshotId = getLatestSnapshotId(tableName); + assertUpdate("ALTER TABLE " + tableName + " ADD COLUMN address varchar"); + assertUpdate("INSERT INTO " + tableName + " VALUES (3, 30, 'London')", 1); + long v3SnapshotId = getLatestSnapshotId(tableName); + assertFileSystemAccesses("SELECT * FROM " + tableName + " FOR VERSION AS OF " + v1SnapshotId, + ImmutableMultiset.builder() + .add(new FileOperation(METADATA_JSON, "InputFile.newStream")) + .add(new FileOperation(SNAPSHOT, "InputFile.length")) + .add(new FileOperation(SNAPSHOT, "InputFile.newStream")) + .build()); + assertFileSystemAccesses("SELECT * FROM " + tableName + " FOR VERSION AS OF " + v2SnapshotId, + ImmutableMultiset.builder() + .add(new FileOperation(METADATA_JSON, "InputFile.newStream")) + .add(new FileOperation(SNAPSHOT, "InputFile.length")) + .add(new FileOperation(SNAPSHOT, "InputFile.newStream")) + .add(new FileOperation(MANIFEST, "InputFile.newStream")) + .build()); + assertFileSystemAccesses("SELECT * FROM " + tableName + " FOR VERSION AS OF " + v3SnapshotId, + ImmutableMultiset.builder() + .add(new FileOperation(METADATA_JSON, "InputFile.newStream")) + .add(new FileOperation(SNAPSHOT, "InputFile.length")) + .add(new FileOperation(SNAPSHOT, "InputFile.newStream")) + .addCopies(new FileOperation(MANIFEST, "InputFile.newStream"), 2) + .build()); + assertFileSystemAccesses("SELECT * FROM " + tableName, + ImmutableMultiset.builder() + .add(new FileOperation(METADATA_JSON, "InputFile.newStream")) + .add(new FileOperation(SNAPSHOT, 
"InputFile.length")) + .add(new FileOperation(SNAPSHOT, "InputFile.newStream")) + .addCopies(new FileOperation(MANIFEST, "InputFile.newStream"), 2) + .build()); + } + + @Test + public void testSelectWithFilter() + { + assertUpdate("CREATE TABLE test_select_with_filter AS SELECT 1 col_name", 1); + assertFileSystemAccesses("SELECT * FROM test_select_with_filter WHERE col_name = 1", + ImmutableMultiset.builder() + .add(new FileOperation(METADATA_JSON, "InputFile.newStream")) + .add(new FileOperation(SNAPSHOT, "InputFile.length")) + .add(new FileOperation(SNAPSHOT, "InputFile.newStream")) + .add(new FileOperation(MANIFEST, "InputFile.newStream")) + .build()); + } + + @Test + public void testPartialTimestampPartitionPruningEffectivenessWithPartitionTransform() + { + testPartialTimestampPartitionPruningEffectivenessWithPartitionTransform("hour(d)", 4); + testPartialTimestampPartitionPruningEffectivenessWithPartitionTransform("day(d)", 2); + testPartialTimestampPartitionPruningEffectivenessWithPartitionTransform("month(d)", 2); + testPartialTimestampPartitionPruningEffectivenessWithPartitionTransform("year(d)", 2); + testPartialTimestampPartitionPruningEffectivenessWithPartitionTransform("bucket(d, 4)", 2); + } + + private void testPartialTimestampPartitionPruningEffectivenessWithPartitionTransform(String partitionTransform, int expectedDataFileOperations) + { + String tableName = "test_transform_timestamp" + randomNameSuffix(); + assertUpdate(format("CREATE TABLE %s (d TIMESTAMP(6), b BIGINT) WITH (partitioning = ARRAY['%s'])", tableName, partitionTransform)); + + @Language("SQL") String values = + """ + VALUES + (NULL, 101), + (TIMESTAMP '1969-12-25 15:13:12.876543', 8), + (TIMESTAMP '1969-12-30 18:47:33.345678', 9), + (TIMESTAMP '1969-12-31 00:00:00.000000', 10), + (TIMESTAMP '1969-12-31 05:06:07.234567', 11), + (TIMESTAMP '1970-01-01 12:03:08.456789', 12), + (TIMESTAMP '2015-01-01 10:01:23.123456', 1), + (TIMESTAMP '2015-01-01 11:10:02.987654', 2), + (TIMESTAMP '2015-01-01 12:55:00.456789', 3), + (TIMESTAMP '2015-05-15 13:05:01.234567', 4), + (TIMESTAMP '2015-05-15 14:21:02.345678', 5), + (TIMESTAMP '2020-02-21 15:11:11.876543', 6), + (TIMESTAMP '2020-02-21 16:12:12.654321', 7) + """; + assertUpdate("INSERT INTO " + tableName + " " + values, 13); + assertQuery("SELECT * FROM " + tableName, values); + + @Language("SQL") String selectQuery = "SELECT * FROM " + tableName + " WHERE d >= TIMESTAMP '2015-05-15 01:23:45.678901'"; + assertThat(query(selectQuery)).isNotFullyPushedDown(FilterNode.class); + + assertFileSystemAccesses( + getSession(), + selectQuery, + ALL_FILES, + ImmutableMultiset.builder() + .add(new FileOperation(METADATA_JSON, "InputFile.newStream")) + .add(new FileOperation(SNAPSHOT, "InputFile.newStream")) + .add(new FileOperation(MANIFEST, "InputFile.newStream")) + .add(new FileOperation(SNAPSHOT, "InputFile.length")) + .addCopies(new FileOperation(DATA, "InputFile.newInput"), expectedDataFileOperations) + .build()); + + assertThat((long) computeScalar("SELECT COUNT(DISTINCT file_path) FROM \"" + tableName + "$files\"")) + .isGreaterThan(expectedDataFileOperations); + + assertUpdate("DROP TABLE " + tableName); + } + + @Test + public void testJoin() + { + assertUpdate("CREATE TABLE test_join_t1 AS SELECT 2 AS age, 'id1' AS id", 1); + assertUpdate("CREATE TABLE test_join_t2 AS SELECT 'name1' AS name, 'id1' AS id", 1); + + assertFileSystemAccesses("SELECT name, age FROM test_join_t1 JOIN test_join_t2 ON test_join_t2.id = test_join_t1.id", + ImmutableMultiset.builder() + 
.addCopies(new FileOperation(METADATA_JSON, "InputFile.newStream"), 2) + .addCopies(new FileOperation(SNAPSHOT, "InputFile.length"), 2) + .addCopies(new FileOperation(SNAPSHOT, "InputFile.newStream"), 2) + .addCopies(new FileOperation(MANIFEST, "InputFile.newStream"), 4) + .build()); + } + + @Test + public void testSelfJoinStatistics() + { + assertUpdate("CREATE TABLE test_self_join AS SELECT 'name1' AS name, 2 AS age, 'id1' AS id", 1); + + // We use column statistics for all three columns from t1 and single column from t2. + // IcebergMetadata#tableStatisticsCache should be able to avoid multiple reads by re-using stats from t1. + assertFileSystemAccesses("EXPLAIN SELECT t1.name, t1.age FROM test_self_join t1 JOIN test_self_join t2 ON t1.id = t2.id", + ImmutableMultiset.builder() + .add(new FileOperation(METADATA_JSON, "InputFile.newStream")) + .add(new FileOperation(SNAPSHOT, "InputFile.length")) + .add(new FileOperation(SNAPSHOT, "InputFile.newStream")) + .add(new FileOperation(MANIFEST, "InputFile.newStream")) + .build()); + + // Same columns projected from both t1 and t2, but with different predicate which prevents reuse of statistics from IcebergMetadata#tableStatisticsCache + assertFileSystemAccesses("EXPLAIN SELECT t1.age FROM test_self_join t1 JOIN test_self_join t2 ON t1.id = t2.id WHERE t2.age > 0", + ImmutableMultiset.builder() + .add(new FileOperation(METADATA_JSON, "InputFile.newStream")) + .add(new FileOperation(SNAPSHOT, "InputFile.length")) + .add(new FileOperation(SNAPSHOT, "InputFile.newStream")) + .addCopies(new FileOperation(MANIFEST, "InputFile.newStream"), 2) + .build()); + + // Different columns projected from t1 and t2 prevents reuse of statistics from IcebergMetadata#tableStatisticsCache + assertFileSystemAccesses("EXPLAIN SELECT t1.name FROM test_self_join t1 JOIN test_self_join t2 ON t1.name = t2.id", + ImmutableMultiset.builder() + .add(new FileOperation(METADATA_JSON, "InputFile.newStream")) + .add(new FileOperation(SNAPSHOT, "InputFile.length")) + .add(new FileOperation(SNAPSHOT, "InputFile.newStream")) + .addCopies(new FileOperation(MANIFEST, "InputFile.newStream"), 2) + .build()); + } + + @Test + public void testJoinWithPartitionedTable() + { + assertUpdate("CREATE TABLE test_join_partitioned_t1 (a BIGINT, b TIMESTAMP(6) with time zone) WITH (partitioning = ARRAY['a', 'day(b)'])"); + assertUpdate("CREATE TABLE test_join_partitioned_t2 (foo BIGINT)"); + assertUpdate("INSERT INTO test_join_partitioned_t2 VALUES(123)", 1); + assertUpdate("INSERT INTO test_join_partitioned_t1 VALUES(123, current_date)", 1); + + assertFileSystemAccesses("SELECT count(*) FROM test_join_partitioned_t1 t1 join test_join_partitioned_t2 t2 on t1.a = t2.foo", + ImmutableMultiset.builder() + .addCopies(new FileOperation(METADATA_JSON, "InputFile.newStream"), 2) + .addCopies(new FileOperation(SNAPSHOT, "InputFile.length"), 2) + .addCopies(new FileOperation(SNAPSHOT, "InputFile.newStream"), 2) + .addCopies(new FileOperation(MANIFEST, "InputFile.newStream"), 4) + .build()); + } + + @Test + public void testExplainSelect() + { + assertUpdate("CREATE TABLE test_explain AS SELECT 2 AS age", 1); + + assertFileSystemAccesses("EXPLAIN SELECT * FROM test_explain", + ImmutableMultiset.builder() + .add(new FileOperation(METADATA_JSON, "InputFile.newStream")) + .add(new FileOperation(SNAPSHOT, "InputFile.length")) + .add(new FileOperation(SNAPSHOT, "InputFile.newStream")) + .add(new FileOperation(MANIFEST, "InputFile.newStream")) + .build()); + } + + @Test + public void testShowStatsForTable() + { + 
assertUpdate("CREATE TABLE test_show_stats AS SELECT 2 AS age", 1); + + assertFileSystemAccesses("SHOW STATS FOR test_show_stats", + ImmutableMultiset.builder() + .add(new FileOperation(METADATA_JSON, "InputFile.newStream")) + .add(new FileOperation(SNAPSHOT, "InputFile.length")) + .add(new FileOperation(SNAPSHOT, "InputFile.newStream")) + .add(new FileOperation(MANIFEST, "InputFile.newStream")) + .build()); + } + + @Test + public void testShowStatsForPartitionedTable() + { + assertUpdate("CREATE TABLE test_show_stats_partitioned " + + "WITH (partitioning = ARRAY['regionkey']) " + + "AS SELECT * FROM tpch.tiny.nation", 25); + + assertFileSystemAccesses("SHOW STATS FOR test_show_stats_partitioned", + ImmutableMultiset.builder() + .add(new FileOperation(METADATA_JSON, "InputFile.newStream")) + .add(new FileOperation(SNAPSHOT, "InputFile.length")) + .add(new FileOperation(SNAPSHOT, "InputFile.newStream")) + .add(new FileOperation(MANIFEST, "InputFile.newStream")) + .build()); + } + + @Test + public void testShowStatsForTableWithFilter() + { + assertUpdate("CREATE TABLE test_show_stats_with_filter AS SELECT 2 AS age", 1); + + assertFileSystemAccesses("SHOW STATS FOR (SELECT * FROM test_show_stats_with_filter WHERE age >= 2)", + ImmutableMultiset.builder() + .add(new FileOperation(METADATA_JSON, "InputFile.newStream")) + .add(new FileOperation(SNAPSHOT, "InputFile.length")) + .add(new FileOperation(SNAPSHOT, "InputFile.newStream")) + .add(new FileOperation(MANIFEST, "InputFile.newStream")) + .build()); + } + + @Test + public void testPredicateWithVarcharCastToDate() + { + assertUpdate("CREATE TABLE test_varchar_as_date_predicate(a varchar) WITH (partitioning=ARRAY['truncate(a, 4)'])"); + assertUpdate("INSERT INTO test_varchar_as_date_predicate VALUES '2001-01-31'", 1); + assertUpdate("INSERT INTO test_varchar_as_date_predicate VALUES '2005-09-10'", 1); + + assertFileSystemAccesses("SELECT * FROM test_varchar_as_date_predicate", + ImmutableMultiset.builder() + .add(new FileOperation(METADATA_JSON, "InputFile.newStream")) + .add(new FileOperation(SNAPSHOT, "InputFile.length")) + .add(new FileOperation(SNAPSHOT, "InputFile.newStream")) + .addCopies(new FileOperation(MANIFEST, "InputFile.newStream"), 2) + .build()); + + // CAST to date and comparison + assertFileSystemAccesses("SELECT * FROM test_varchar_as_date_predicate WHERE CAST(a AS date) >= DATE '2005-01-01'", + ImmutableMultiset.builder() + .add(new FileOperation(METADATA_JSON, "InputFile.newStream")) + .add(new FileOperation(SNAPSHOT, "InputFile.length")) + .add(new FileOperation(SNAPSHOT, "InputFile.newStream")) + .add(new FileOperation(MANIFEST, "InputFile.newStream")) // fewer than without filter + .build()); + + // CAST to date and BETWEEN + assertFileSystemAccesses("SELECT * FROM test_varchar_as_date_predicate WHERE CAST(a AS date) BETWEEN DATE '2005-01-01' AND DATE '2005-12-31'", + ImmutableMultiset.builder() + .add(new FileOperation(METADATA_JSON, "InputFile.newStream")) + .add(new FileOperation(SNAPSHOT, "InputFile.length")) + .add(new FileOperation(SNAPSHOT, "InputFile.newStream")) + .add(new FileOperation(MANIFEST, "InputFile.newStream")) // fewer than without filter + .build()); + + // conversion to date as a date function + assertFileSystemAccesses("SELECT * FROM test_varchar_as_date_predicate WHERE date(a) >= DATE '2005-01-01'", + ImmutableMultiset.builder() + .add(new FileOperation(METADATA_JSON, "InputFile.newStream")) + .add(new FileOperation(SNAPSHOT, "InputFile.length")) + .add(new FileOperation(SNAPSHOT, 
"InputFile.newStream")) + .add(new FileOperation(MANIFEST, "InputFile.newStream")) // fewer than without filter + .build()); + + assertUpdate("DROP TABLE test_varchar_as_date_predicate"); + } + + @Test + public void testRemoveOrphanFiles() + { + String tableName = "test_remove_orphan_files_" + randomNameSuffix(); + Session sessionWithShortRetentionUnlocked = Session.builder(getSession()) + .setCatalogSessionProperty("iceberg", "remove_orphan_files_min_retention", "0s") + .build(); + assertUpdate("CREATE TABLE " + tableName + " (key varchar, value integer)"); + assertUpdate("INSERT INTO " + tableName + " VALUES ('one', 1)", 1); + assertUpdate("INSERT INTO " + tableName + " VALUES ('two', 2), ('three', 3)", 2); + assertUpdate("DELETE FROM " + tableName + " WHERE key = 'two'", 1); + + assertFileSystemAccesses( + sessionWithShortRetentionUnlocked, + "ALTER TABLE " + tableName + " EXECUTE REMOVE_ORPHAN_FILES (retention_threshold => '0s')", + ImmutableMultiset.builder() + .add(new FileOperation(METADATA_JSON, "InputFile.newStream")) + .addCopies(new FileOperation(SNAPSHOT, "InputFile.length"), 4) + .addCopies(new FileOperation(SNAPSHOT, "InputFile.newStream"), 4) + .addCopies(new FileOperation(MANIFEST, "InputFile.newStream"), 5) + .build()); + + assertUpdate("DROP TABLE " + tableName); + } + + @ParameterizedTest + @MethodSource("metadataQueriesTestTableCountDataProvider") + public void testInformationSchemaColumns(int tables) + { + String schemaName = "test_i_s_columns_schema" + randomNameSuffix(); + assertUpdate("CREATE SCHEMA " + schemaName); + Session session = Session.builder(getSession()) + .setSchema(schemaName) + .build(); + + for (int i = 0; i < tables; i++) { + assertUpdate(session, "CREATE TABLE test_select_i_s_columns" + i + "(id varchar, age integer)"); + // Produce multiple snapshots and metadata files + assertUpdate(session, "INSERT INTO test_select_i_s_columns" + i + " VALUES ('abc', 11)", 1); + assertUpdate(session, "INSERT INTO test_select_i_s_columns" + i + " VALUES ('xyz', 12)", 1); + + assertUpdate(session, "CREATE TABLE test_other_select_i_s_columns" + i + "(id varchar, age integer)"); // won't match the filter + } + + // Bulk retrieval + assertFileSystemAccesses(session, "SELECT * FROM information_schema.columns WHERE table_schema = CURRENT_SCHEMA AND table_name LIKE 'test_select_i_s_columns%'", + ImmutableMultiset.builder() + .addCopies(new FileOperation(METADATA_JSON, "InputFile.newStream"), tables * 2) + .build()); + + // Pointed lookup + assertFileSystemAccesses(session, "SELECT * FROM information_schema.columns WHERE table_schema = CURRENT_SCHEMA AND table_name = 'test_select_i_s_columns0'", + ImmutableMultiset.builder() + .add(new FileOperation(METADATA_JSON, "InputFile.newStream")) + .build()); + + // Pointed lookup via DESCRIBE (which does some additional things before delegating to information_schema.columns) + assertFileSystemAccesses(session, "DESCRIBE test_select_i_s_columns0", + ImmutableMultiset.builder() + .add(new FileOperation(METADATA_JSON, "InputFile.newStream")) + .build()); + + for (int i = 0; i < tables; i++) { + assertUpdate(session, "DROP TABLE test_select_i_s_columns" + i); + assertUpdate(session, "DROP TABLE test_other_select_i_s_columns" + i); + } + } + + @ParameterizedTest + @MethodSource("metadataQueriesTestTableCountDataProvider") + public void testSystemMetadataTableComments(int tables) + { + String schemaName = "test_s_m_table_comments" + randomNameSuffix(); + assertUpdate("CREATE SCHEMA " + schemaName); + Session session = 
Session.builder(getSession()) + .setSchema(schemaName) + .build(); + + for (int i = 0; i < tables; i++) { + assertUpdate(session, "CREATE TABLE test_select_s_m_t_comments" + i + "(id varchar, age integer)"); + // Produce multiple snapshots and metadata files + assertUpdate(session, "INSERT INTO test_select_s_m_t_comments" + i + " VALUES ('abc', 11)", 1); + assertUpdate(session, "INSERT INTO test_select_s_m_t_comments" + i + " VALUES ('xyz', 12)", 1); + + assertUpdate(session, "CREATE TABLE test_other_select_s_m_t_comments" + i + "(id varchar, age integer)"); // won't match the filter + } + + // Bulk retrieval + assertFileSystemAccesses(session, "SELECT * FROM system.metadata.table_comments WHERE schema_name = CURRENT_SCHEMA AND table_name LIKE 'test_select_s_m_t_comments%'", + ImmutableMultiset.builder() + .addCopies(new FileOperation(METADATA_JSON, "InputFile.newStream"), tables * 2) + .build()); + + // Bulk retrieval for two schemas + assertFileSystemAccesses(session, "SELECT * FROM system.metadata.table_comments WHERE schema_name IN (CURRENT_SCHEMA, 'non_existent') AND table_name LIKE 'test_select_s_m_t_comments%'", + ImmutableMultiset.builder() + .addCopies(new FileOperation(METADATA_JSON, "InputFile.newStream"), tables * 2) + .build()); + + // Pointed lookup + assertFileSystemAccesses(session, "SELECT * FROM system.metadata.table_comments WHERE schema_name = CURRENT_SCHEMA AND table_name = 'test_select_s_m_t_comments0'", + ImmutableMultiset.builder() + .add(new FileOperation(METADATA_JSON, "InputFile.newStream")) + .build()); + + for (int i = 0; i < tables; i++) { + assertUpdate(session, "DROP TABLE test_select_s_m_t_comments" + i); + assertUpdate(session, "DROP TABLE test_other_select_s_m_t_comments" + i); + } + } + + public Object[][] metadataQueriesTestTableCountDataProvider() + { + return new Object[][] { + {3}, + {MAX_PREFIXES_COUNT}, + {MAX_PREFIXES_COUNT + 3}, + }; + } + + @Test + public void testSystemMetadataMaterializedViews() + { + String schemaName = "test_materialized_views_" + randomNameSuffix(); + assertUpdate("CREATE SCHEMA " + schemaName); + Session session = Session.builder(getSession()) + .setSchema(schemaName) + .build(); + + assertUpdate(session, "CREATE TABLE test_table1 AS SELECT 1 a", 1); + assertUpdate(session, "CREATE TABLE test_table2 AS SELECT 1 a", 1); + + assertUpdate(session, "CREATE MATERIALIZED VIEW mv1 AS SELECT * FROM test_table1 JOIN test_table2 USING (a)"); + assertUpdate(session, "REFRESH MATERIALIZED VIEW mv1", 1); + + assertUpdate(session, "CREATE MATERIALIZED VIEW mv2 AS SELECT count(*) c FROM test_table1 JOIN test_table2 USING (a)"); + assertUpdate(session, "REFRESH MATERIALIZED VIEW mv2", 1); + + // Bulk retrieval + assertFileSystemAccesses(session, "SELECT * FROM system.metadata.materialized_views WHERE schema_name = CURRENT_SCHEMA", + ImmutableMultiset.builder() + .addCopies(new FileOperation(METADATA_JSON, "InputFile.newStream"), 4) + .build()); + + // Bulk retrieval without selecting freshness + assertFileSystemAccesses( + session, + "SELECT schema_name, name FROM system.metadata.materialized_views WHERE schema_name = CURRENT_SCHEMA", + ImmutableMultiset.of()); + + // Bulk retrieval for two schemas + assertFileSystemAccesses(session, "SELECT * FROM system.metadata.materialized_views WHERE schema_name IN (CURRENT_SCHEMA, 'non_existent')", + ImmutableMultiset.builder() + .addCopies(new FileOperation(METADATA_JSON, "InputFile.newStream"), 4) + .build()); + + // Pointed lookup + assertFileSystemAccesses(session, "SELECT * FROM 
system.metadata.materialized_views WHERE schema_name = CURRENT_SCHEMA AND name = 'mv1'", + ImmutableMultiset.builder() + .addCopies(new FileOperation(METADATA_JSON, "InputFile.newStream"), 3) + .build()); + + // Pointed lookup without selecting freshness + assertFileSystemAccesses( + session, + "SELECT schema_name, name FROM system.metadata.materialized_views WHERE schema_name = CURRENT_SCHEMA AND name = 'mv1'", + ImmutableMultiset.of()); + + assertFileSystemAccesses( + session, + "SELECT * FROM iceberg.information_schema.columns WHERE table_schema = CURRENT_SCHEMA AND table_name = 'mv1'", + ImmutableMultiset.of()); + + assertUpdate("DROP SCHEMA " + schemaName + " CASCADE"); + } + + @Test + public void testV2TableEnsureEqualityDeleteFilesAreReadOnce() + throws Exception + { + String tableName = "test_equality_deletes_ensure_delete_read_count" + randomNameSuffix(); + assertUpdate("CREATE TABLE " + tableName + " (id INT, age INT)"); + assertUpdate("INSERT INTO " + tableName + " VALUES (2, 20), (3, 30)", 2); + // change the schema and do another insert to force at least 2 splits + // use the same ID in both files so the delete file doesn't get optimized away by statistics + assertUpdate("INSERT INTO " + tableName + " VALUES (2, 22)", 1); + Table icebergTable = IcebergTestUtils.loadTable(tableName, metastore, fileSystemFactory, "iceberg", "test_schema"); + + // Delete only 1 row in the file so the data file is not pruned completely + writeEqualityDeleteForTable(icebergTable, + fileSystemFactory, + Optional.of(icebergTable.spec()), + Optional.empty(), + ImmutableMap.of("id", 2), + Optional.empty()); + + ImmutableMultiset expectedAccesses = ImmutableMultiset.builder() + .addCopies(new FileOperation(DATA, "InputFile.newInput"), 2) + .addCopies(new FileOperation(DELETE, "InputFile.newInput"), 1) + .build(); + + QueryRunner.MaterializedResultWithPlan queryResult = getDistributedQueryRunner().executeWithPlan(getSession(), "SELECT * FROM " + tableName); + assertThat(queryResult.result().getRowCount()) + .describedAs("query result row count") + .isEqualTo(1); + assertMultisetsEqual( + getOperations(getDistributedQueryRunner().getSpans()).stream() + .filter(operation -> ImmutableSet.of(DATA, DELETE).contains(operation.fileType())) + .collect(toImmutableMultiset()), + expectedAccesses); + assertUpdate("DROP TABLE " + tableName); + } + + @Test + public void testShowTables() + { + assertFileSystemAccesses("SHOW TABLES", ImmutableMultiset.of()); + } + + private void assertFileSystemAccesses(@Language("SQL") String query, Multiset expectedAccesses) + { + assertFileSystemAccesses(query, METADATA_FILES, expectedAccesses); + } + + private void assertFileSystemAccesses(@Language("SQL") String query, Scope scope, Multiset expectedAccesses) + { + assertFileSystemAccesses(getSession(), query, scope, expectedAccesses); + } + + private void assertFileSystemAccesses(Session session, @Language("SQL") String query, Multiset expectedAccesses) + { + assertFileSystemAccesses(session, query, METADATA_FILES, expectedAccesses); + } + + private synchronized void assertFileSystemAccesses(Session session, @Language("SQL") String query, Scope scope, Multiset expectedAccesses) + { + getDistributedQueryRunner().executeWithPlan(session, query); + assertMultisetsEqual( + getOperations(getDistributedQueryRunner().getSpans()).stream() + .filter(scope) + .collect(toImmutableMultiset()), + expectedAccesses); + } + + private long getLatestSnapshotId(String tableName) + { + return (long) computeScalar(format("SELECT snapshot_id FROM
\"%s$snapshots\" ORDER BY committed_at DESC FETCH FIRST 1 ROW WITH TIES", tableName)); + } + + private static Session withStatsOnWrite(Session session, boolean enabled) + { + String catalog = session.getCatalog().orElseThrow(); + return Session.builder(session) + .setCatalogSessionProperty(catalog, COLLECT_EXTENDED_STATISTICS_ON_WRITE, Boolean.toString(enabled)) + .build(); + } +} diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergGcsConnectorSmokeTest.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergGcsConnectorSmokeTest.java index 2c92f12b9d96..98ee8a30ada2 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergGcsConnectorSmokeTest.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergGcsConnectorSmokeTest.java @@ -17,14 +17,14 @@ import com.google.common.io.Resources; import io.airlift.log.Logger; import io.trino.filesystem.Location; +import io.trino.metastore.HiveMetastore; import io.trino.plugin.hive.containers.HiveHadoop; -import io.trino.plugin.hive.metastore.HiveMetastore; import io.trino.plugin.hive.metastore.thrift.BridgingHiveMetastore; import io.trino.testing.QueryRunner; import io.trino.testing.TestingConnectorBehavior; -import org.testng.annotations.AfterClass; -import org.testng.annotations.Parameters; -import org.testng.annotations.Test; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.TestInstance; import java.io.IOException; import java.io.UncheckedIOException; @@ -38,12 +38,14 @@ import static io.trino.plugin.hive.containers.HiveHadoop.HIVE3_IMAGE; import static io.trino.plugin.iceberg.IcebergTestUtils.checkOrcFileSorting; import static io.trino.testing.TestingNames.randomNameSuffix; +import static io.trino.testing.TestingProperties.requiredNonEmptySystemProperty; import static java.lang.String.format; import static java.nio.charset.StandardCharsets.UTF_8; -import static java.util.Objects.requireNonNull; import static org.apache.iceberg.FileFormat.ORC; import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.jupiter.api.TestInstance.Lifecycle.PER_CLASS; +@TestInstance(PER_CLASS) public class TestIcebergGcsConnectorSmokeTest extends BaseIcebergConnectorSmokeTest { @@ -56,12 +58,11 @@ public class TestIcebergGcsConnectorSmokeTest private HiveHadoop hiveHadoop; - @Parameters({"testing.gcp-storage-bucket", "testing.gcp-credentials-key"}) - public TestIcebergGcsConnectorSmokeTest(String gcpStorageBucket, String gcpCredentialKey) + public TestIcebergGcsConnectorSmokeTest() { super(ORC); - this.gcpStorageBucket = requireNonNull(gcpStorageBucket, "gcpStorageBucket is null"); - this.gcpCredentialKey = requireNonNull(gcpCredentialKey, "gcpCredentialKey is null"); + this.gcpStorageBucket = requiredNonEmptySystemProperty("testing.gcp-storage-bucket"); + this.gcpCredentialKey = requiredNonEmptySystemProperty("testing.gcp-credentials-key"); this.schema = "test_iceberg_gcs_connector_smoke_test_" + randomNameSuffix(); } @@ -93,11 +94,13 @@ protected QueryRunner createQueryRunner() return IcebergQueryRunner.builder() .setIcebergProperties(ImmutableMap.builder() .put("iceberg.catalog.type", "hive_metastore") - .put("hive.gcs.json-key", gcpCredentials) - .put("hive.metastore.uri", "thrift://" + hiveHadoop.getHiveMetastoreEndpoint()) + .put("fs.native-gcs.enabled", "true") + .put("gcs.json-key", gcpCredentials) + .put("hive.metastore.uri", hiveHadoop.getHiveMetastoreEndpoint().toString()) 
.put("iceberg.file-format", format.name()) .put("iceberg.register-table-procedure.enabled", "true") .put("iceberg.writer-sort-buffer-size", "1MB") + .put("iceberg.allowed-extra-properties", "write.metadata.delete-after-commit.enabled,write.metadata.previous-versions-max") .buildOrThrow()) .setSchemaInitializer( SchemaInitializer.builder() @@ -108,7 +111,7 @@ protected QueryRunner createQueryRunner() .build(); } - @AfterClass(alwaysRun = true) + @AfterAll public void removeTestData() { try { @@ -172,7 +175,7 @@ protected void dropTableFromMetastore(String tableName) HiveMetastore metastore = new BridgingHiveMetastore( testingThriftHiveMetastoreBuilder() .metastoreClient(hiveHadoop.getHiveMetastoreEndpoint()) - .build()); + .build(this::closeAfterClass)); metastore.dropTable(schema, tableName, false); assertThat(metastore.getTable(schema, tableName)).isEmpty(); } @@ -183,7 +186,7 @@ protected String getMetadataLocation(String tableName) HiveMetastore metastore = new BridgingHiveMetastore( testingThriftHiveMetastoreBuilder() .metastoreClient(hiveHadoop.getHiveMetastoreEndpoint()) - .build()); + .build(this::closeAfterClass)); return metastore .getTable(schema, tableName).orElseThrow() .getParameters().get("metadata_location"); diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergGetTableStatisticsOperations.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergGetTableStatisticsOperations.java index 08e56a4e8b3c..d7be5d85dc59 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergGetTableStatisticsOperations.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergGetTableStatisticsOperations.java @@ -14,79 +14,55 @@ package io.trino.plugin.iceberg; import com.google.common.collect.ImmutableMap; -import io.opentelemetry.sdk.testing.exporter.InMemorySpanExporter; -import io.opentelemetry.sdk.trace.SdkTracerProvider; import io.opentelemetry.sdk.trace.data.SpanData; -import io.opentelemetry.sdk.trace.export.SimpleSpanProcessor; -import io.trino.execution.warnings.WarningCollector; -import io.trino.metadata.InternalFunctionBundle; -import io.trino.metadata.MetadataManager; -import io.trino.plugin.hive.metastore.Database; -import io.trino.plugin.hive.metastore.HiveMetastore; -import io.trino.plugin.iceberg.catalog.file.TestingIcebergFileMetastoreCatalogModule; +import io.trino.metastore.Database; +import io.trino.metastore.HiveMetastore; import io.trino.plugin.tpch.TpchPlugin; import io.trino.spi.security.PrincipalType; import io.trino.testing.AbstractTestQueryFramework; -import io.trino.testing.LocalQueryRunner; +import io.trino.testing.DistributedQueryRunner; import io.trino.testing.QueryRunner; -import io.trino.tracing.TracingMetadata; import org.intellij.lang.annotations.Language; -import org.testng.annotations.AfterClass; -import org.testng.annotations.BeforeMethod; -import org.testng.annotations.Test; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.TestInstance; +import org.junit.jupiter.api.parallel.Execution; -import java.io.File; -import java.io.IOException; import java.nio.file.Files; +import java.nio.file.Path; import java.util.Optional; -import static com.google.common.io.MoreFiles.deleteRecursively; -import static com.google.common.io.RecursiveDeleteOption.ALLOW_INSECURE; -import static com.google.inject.util.Modules.EMPTY_MODULE; -import static io.trino.execution.querystats.PlanOptimizersStatsCollector.createPlanOptimizersStatsCollector; -import static 
io.trino.plugin.hive.metastore.file.TestingFileHiveMetastore.createTestingFileHiveMetastore; -import static io.trino.sql.planner.LogicalPlanner.Stage.OPTIMIZED_AND_VALIDATED; -import static io.trino.testing.TestingSession.testSessionBuilder; -import static io.trino.transaction.TransactionBuilder.transaction; +import static io.trino.plugin.iceberg.IcebergTestUtils.getHiveMetastore; +import static io.trino.testing.TestingSession.testSession; import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.jupiter.api.TestInstance.Lifecycle.PER_CLASS; +import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD; // Cost-based optimizers' behaviors are affected by the statistics returned by the Connectors. Here is to count the getTableStatistics calls // when CBOs work with Iceberg Connector. -@Test(singleThreaded = true) // counting metadata is a shared mutable state +@TestInstance(PER_CLASS) +@Execution(SAME_THREAD) public class TestIcebergGetTableStatisticsOperations extends AbstractTestQueryFramework { - private LocalQueryRunner localQueryRunner; - private InMemorySpanExporter spanExporter; - private File metastoreDir; + private QueryRunner queryRunner; + private Path metastoreDir; @Override protected QueryRunner createQueryRunner() throws Exception { - spanExporter = closeAfterClass(InMemorySpanExporter.create()); - - SdkTracerProvider tracerProvider = SdkTracerProvider.builder() - .addSpanProcessor(SimpleSpanProcessor.create(spanExporter)) + queryRunner = DistributedQueryRunner.builder(testSession()) + .setWorkerCount(0) .build(); + queryRunner.installPlugin(new TpchPlugin()); + queryRunner.createCatalog("tpch", "tpch", ImmutableMap.of()); - localQueryRunner = LocalQueryRunner.builder(testSessionBuilder().build()) - .withMetadataProvider((systemSecurityMetadata, transactionManager, globalFunctionCatalog, typeManager) - -> new TracingMetadata(tracerProvider.get("test"), new MetadataManager(systemSecurityMetadata, transactionManager, globalFunctionCatalog, typeManager))) - .build(); - localQueryRunner.installPlugin(new TpchPlugin()); - localQueryRunner.createCatalog("tpch", "tpch", ImmutableMap.of()); + metastoreDir = Files.createTempDirectory("test_iceberg_get_table_statistics_operations"); + queryRunner.installPlugin(new TestingIcebergPlugin(metastoreDir)); + queryRunner.createCatalog("iceberg", "iceberg", ImmutableMap.of()); - InternalFunctionBundle.InternalFunctionBundleBuilder functions = InternalFunctionBundle.builder(); - new IcebergPlugin().getFunctions().forEach(functions::functions); - localQueryRunner.addFunctions(functions.build()); + HiveMetastore metastore = getHiveMetastore(queryRunner); - metastoreDir = Files.createTempDirectory("test_iceberg_get_table_statistics_operations").toFile(); - HiveMetastore metastore = createTestingFileHiveMetastore(metastoreDir); - localQueryRunner.createCatalog( - "iceberg", - new TestingIcebergConnectorFactory(Optional.of(new TestingIcebergFileMetastoreCatalogModule(metastore)), Optional.empty(), EMPTY_MODULE), - ImmutableMap.of()); Database database = Database.builder() .setDatabaseName("tiny") .setOwnerName(Optional.of("public")) @@ -94,27 +70,11 @@ protected QueryRunner createQueryRunner() .build(); metastore.createDatabase(database); - localQueryRunner.execute("CREATE TABLE iceberg.tiny.orders AS SELECT * FROM tpch.tiny.orders"); - localQueryRunner.execute("CREATE TABLE iceberg.tiny.lineitem AS SELECT * FROM tpch.tiny.lineitem"); - localQueryRunner.execute("CREATE TABLE iceberg.tiny.customer AS SELECT * FROM 
tpch.tiny.customer"); - - return localQueryRunner; - } - - @AfterClass(alwaysRun = true) - public void tearDown() - throws IOException - { - deleteRecursively(metastoreDir.toPath(), ALLOW_INSECURE); - localQueryRunner.close(); - localQueryRunner = null; - spanExporter = null; - } + queryRunner.execute("CREATE TABLE iceberg.tiny.orders AS SELECT * FROM tpch.tiny.orders"); + queryRunner.execute("CREATE TABLE iceberg.tiny.lineitem AS SELECT * FROM tpch.tiny.lineitem"); + queryRunner.execute("CREATE TABLE iceberg.tiny.customer AS SELECT * FROM tpch.tiny.customer"); - @BeforeMethod - public void resetCounters() - { - spanExporter.reset(); + return queryRunner; } @Test @@ -137,15 +97,12 @@ public void testThreeWayJoin() private void planDistributedQuery(@Language("SQL") String sql) { - transaction(localQueryRunner.getTransactionManager(), localQueryRunner.getAccessControl()) - .execute(localQueryRunner.getDefaultSession(), session -> { - localQueryRunner.createPlan(session, sql, OPTIMIZED_AND_VALIDATED, false, WarningCollector.NOOP, createPlanOptimizersStatsCollector()); - }); + queryRunner.inTransaction(transactionSession -> queryRunner.createPlan(transactionSession, sql)); } private long getTableStatisticsMethodInvocations() { - return spanExporter.getFinishedSpanItems().stream() + return queryRunner.getSpans().stream() .map(SpanData::getName) .filter(name -> name.equals("Metadata.getTableStatistics")) .count(); diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergInputInfo.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergInputInfo.java index 0281aa736dd2..0b1394f3b1a4 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergInputInfo.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergInputInfo.java @@ -20,8 +20,9 @@ import io.trino.testing.AbstractTestQueryFramework; import io.trino.testing.QueryRunner; import io.trino.tpch.TpchTable; -import org.testng.annotations.Test; +import org.junit.jupiter.api.Test; +import java.util.List; import java.util.Optional; import static io.trino.plugin.iceberg.IcebergQueryRunner.ICEBERG_CATALOG; @@ -45,7 +46,7 @@ public void testInputWithPartitioning() { String tableName = "test_input_info_with_part_" + randomNameSuffix(); assertUpdate("CREATE TABLE " + tableName + " WITH (partitioning = ARRAY['regionkey', 'truncate(name, 1)']) AS SELECT * FROM nation WHERE nationkey < 10", 10); - assertInputInfo(tableName, true, "PARQUET"); + assertInputInfo(tableName, ImmutableList.of("regionkey: identity", "name_trunc: truncate[1]"), "PARQUET", 9); assertUpdate("DROP TABLE " + tableName); } @@ -54,7 +55,7 @@ public void testInputWithoutPartitioning() { String tableName = "test_input_info_without_part_" + randomNameSuffix(); assertUpdate("CREATE TABLE " + tableName + " AS SELECT * FROM nation WHERE nationkey < 10", 10); - assertInputInfo(tableName, false, "PARQUET"); + assertInputInfo(tableName, ImmutableList.of(), "PARQUET", 1); assertUpdate("DROP TABLE " + tableName); } @@ -63,14 +64,14 @@ public void testInputWithOrcFileFormat() { String tableName = "test_input_info_with_orc_file_format_" + randomNameSuffix(); assertUpdate("CREATE TABLE " + tableName + " WITH (format = 'ORC') AS SELECT * FROM nation WHERE nationkey < 10", 10); - assertInputInfo(tableName, false, "ORC"); + assertInputInfo(tableName, ImmutableList.of(), "ORC", 1); assertUpdate("DROP TABLE " + tableName); } - private void assertInputInfo(String tableName, boolean expectedPartition, String 
expectedFileFormat) + private void assertInputInfo(String tableName, List<String> partitionFields, String expectedFileFormat, long dataFiles) { inTransaction(session -> { - Metadata metadata = getQueryRunner().getMetadata(); + Metadata metadata = getQueryRunner().getPlannerContext().getMetadata(); QualifiedObjectName qualifiedObjectName = new QualifiedObjectName( session.getCatalog().orElse(ICEBERG_CATALOG), session.getSchema().orElse("tpch"), @@ -81,9 +82,13 @@ private void assertInputInfo(String tableName, boolean expectedPartition, String assertThat(tableInfo).isPresent(); IcebergInputInfo icebergInputInfo = (IcebergInputInfo) tableInfo.get(); assertThat(icebergInputInfo).isEqualTo(new IcebergInputInfo( - icebergInputInfo.getSnapshotId(), - Optional.of(expectedPartition), - expectedFileFormat)); + icebergInputInfo.snapshotId(), + partitionFields, + expectedFileFormat, + Optional.of("10"), + Optional.empty(), + Optional.of(String.valueOf(dataFiles)), + Optional.of("0"))); }); } } diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergLocalConcurrentWrites.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergLocalConcurrentWrites.java new file mode 100644 index 000000000000..10ea8caa59dc --- /dev/null +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergLocalConcurrentWrites.java @@ -0,0 +1,1413 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package io.trino.plugin.iceberg; + +import com.google.common.collect.ImmutableList; +import io.airlift.concurrent.MoreFutures; +import io.trino.Session; +import io.trino.plugin.blackhole.BlackHolePlugin; +import io.trino.testing.AbstractTestQueryFramework; +import io.trino.testing.DistributedQueryRunner; +import io.trino.testing.MaterializedResult; +import io.trino.testing.QueryRunner; +import io.trino.testing.sql.TestTable; +import org.junit.jupiter.api.RepeatedTest; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.TestInstance; + +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; +import java.util.concurrent.Callable; +import java.util.concurrent.CyclicBarrier; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Future; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +import static com.google.common.base.Preconditions.checkState; +import static com.google.common.base.Verify.verify; +import static com.google.common.collect.ImmutableList.toImmutableList; +import static io.airlift.concurrent.MoreFutures.tryGetFutureValue; +import static io.trino.plugin.iceberg.IcebergSessionProperties.FILE_BASED_CONFLICT_DETECTION_ENABLED; +import static io.trino.testing.QueryAssertions.getTrinoExceptionCause; +import static io.trino.testing.TestingNames.randomNameSuffix; +import static java.lang.String.format; +import static java.util.concurrent.Executors.newFixedThreadPool; +import static java.util.concurrent.TimeUnit.SECONDS; +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.jupiter.api.TestInstance.Lifecycle.PER_CLASS; + +@TestInstance(PER_CLASS) +final class TestIcebergLocalConcurrentWrites + extends AbstractTestQueryFramework +{ + @Override + protected QueryRunner createQueryRunner() + throws Exception + { + DistributedQueryRunner queryRunner = IcebergQueryRunner.builder().build(); + queryRunner.installPlugin(new BlackHolePlugin()); + queryRunner.createCatalog("blackhole", "blackhole"); + return queryRunner; + } + + // Repeat test with invocationCount for better test coverage, since the tested aspect is inherently non-deterministic. + @RepeatedTest(3) + void testConcurrentInserts() + throws Exception + { + testConcurrentInserts(false); + testConcurrentInserts(true); + } + + private void testConcurrentInserts(boolean partitioned) + throws Exception + { + int threads = 3; + CyclicBarrier barrier = new CyclicBarrier(threads); + ExecutorService executor = newFixedThreadPool(threads); + String tableName = "test_concurrent_inserts_table_" + randomNameSuffix(); + + assertUpdate("CREATE TABLE " + tableName + " (a INT, part INT) " + + (partitioned ? 
" WITH (partitioning = ARRAY['part'])" : "")); + + try { + // insert data concurrently + executor.invokeAll(ImmutableList.>builder() + .add(() -> { + barrier.await(10, SECONDS); + getQueryRunner().execute("INSERT INTO " + tableName + " VALUES (1, 10)"); + return null; + }) + .add(() -> { + barrier.await(10, SECONDS); + getQueryRunner().execute("INSERT INTO " + tableName + " VALUES (11, 20)"); + return null; + }) + .add(() -> { + barrier.await(10, SECONDS); + getQueryRunner().execute("INSERT INTO " + tableName + " VALUES (21, 30)"); + return null; + }) + .build()) + .forEach(MoreFutures::getDone); + + assertThat(query("SELECT * FROM " + tableName)).matches("VALUES (1, 10), (11, 20), (21, 30)"); + } + finally { + assertUpdate("DROP TABLE " + tableName); + executor.shutdownNow(); + assertThat(executor.awaitTermination(10, SECONDS)).isTrue(); + } + } + + // Repeat test with invocationCount for better test coverage, since the tested aspect is inherently non-deterministic. + @RepeatedTest(3) + void testConcurrentInsertsSelectingFromTheSameTable() + throws Exception + { + testConcurrentInsertsSelectingFromTheSameTable(true); + testConcurrentInsertsSelectingFromTheSameTable(false); + } + + private void testConcurrentInsertsSelectingFromTheSameTable(boolean partitioned) + throws Exception + { + int threads = 3; + CyclicBarrier barrier = new CyclicBarrier(threads); + ExecutorService executor = newFixedThreadPool(threads); + String tableName = "test_concurrent_inserts_select_from_same_table_" + randomNameSuffix(); + + assertUpdate( + "CREATE TABLE " + tableName + " (a, part) " + (partitioned ? " WITH (partitioning = ARRAY['part'])" : "") + " AS VALUES (0, 10)", + 1); + + try { + List> futures = IntStream.range(0, threads) + .mapToObj(_ -> executor.submit(() -> { + barrier.await(10, SECONDS); + getQueryRunner().execute("INSERT INTO " + tableName + " SELECT COUNT(*), 10 AS part FROM " + tableName); + return true; + })) + .collect(toImmutableList()); + + long successfulInsertsCount = futures.stream() + .map(MoreFutures::getFutureValue) + .filter(success -> success) + .count(); + + assertThat(successfulInsertsCount).isEqualTo(3); + // Queries in Iceberg have snapshot isolation, so all writes are done with data available at beginning of transaction + assertQuery( + "SELECT * FROM " + tableName, + "VALUES (0, 10), (1, 10), (1, 10), (1, 10)"); + } + finally { + assertUpdate("DROP TABLE " + tableName); + executor.shutdownNow(); + assertThat(executor.awaitTermination(10, SECONDS)).isTrue(); + } + } + + // Repeat test with invocationCount for better test coverage, since the tested aspect is inherently non-deterministic. + @RepeatedTest(3) + void testConcurrentInsertsSelectingFromTheSameVersionedTable() + throws Exception + { + testConcurrentInsertsSelectingFromTheSameVersionedTable(true); + testConcurrentInsertsSelectingFromTheSameVersionedTable(false); + } + + private void testConcurrentInsertsSelectingFromTheSameVersionedTable(boolean partitioned) + throws Exception + { + int threads = 3; + CyclicBarrier barrier = new CyclicBarrier(threads); + ExecutorService executor = newFixedThreadPool(threads); + String tableName = "test_concurrent_inserts_select_from_same_versioned_table_" + randomNameSuffix(); + + assertUpdate("CREATE TABLE " + tableName + " (a, part) " + (partitioned ? 
" WITH (partitioning = ARRAY['part'])" : "") + " AS VALUES (0, 'a')", 1); + + long currentSnapshotId = getCurrentSnapshotId(tableName); + + try { + executor.invokeAll(ImmutableList.>builder() + .add(() -> { + barrier.await(10, SECONDS); + getQueryRunner().execute("INSERT INTO " + tableName + " SELECT 1, 'b' AS part FROM " + tableName + " FOR VERSION AS OF " + currentSnapshotId); + return null; + }) + .add(() -> { + barrier.await(10, SECONDS); + getQueryRunner().execute("INSERT INTO " + tableName + " SELECT 2, 'c' AS part FROM " + tableName + " FOR VERSION AS OF " + currentSnapshotId); + return null; + }) + .add(() -> { + barrier.await(10, SECONDS); + getQueryRunner().execute("INSERT INTO " + tableName + " SELECT 3, 'd' AS part FROM " + tableName + " FOR VERSION AS OF " + currentSnapshotId); + return null; + }) + .build()) + .forEach(MoreFutures::getDone); + + assertQuery("SELECT * FROM " + tableName, "VALUES (0, 'a'), (1, 'b'), (2, 'c'), (3, 'd')"); + } + finally { + assertUpdate("DROP TABLE " + tableName); + executor.shutdownNow(); + assertThat(executor.awaitTermination(10, SECONDS)).isTrue(); + } + } + + // Repeat test with invocationCount for better test coverage, since the tested aspect is inherently non-deterministic. + @RepeatedTest(3) + void testConcurrentDelete() + throws Exception + { + int threads = 3; + CyclicBarrier barrier = new CyclicBarrier(threads); + ExecutorService executor = newFixedThreadPool(threads); + String tableName = "test_concurrent_deletes_table_" + randomNameSuffix(); + + assertUpdate("CREATE TABLE " + tableName + " (a, part) WITH (partitioning = ARRAY['part']) AS VALUES (1, 10), (11, 20), (21, 30), (31, 40)", 4); + + try { + // delete data concurrently by using non-overlapping partition predicate + executor.invokeAll(ImmutableList.>builder() + .add(() -> { + barrier.await(10, SECONDS); + getQueryRunner().execute("DELETE FROM " + tableName + " WHERE part = 10"); + return null; + }) + .add(() -> { + barrier.await(10, SECONDS); + getQueryRunner().execute("DELETE FROM " + tableName + " WHERE part = 20"); + return null; + }) + .add(() -> { + barrier.await(10, SECONDS); + getQueryRunner().execute("DELETE FROM " + tableName + " WHERE part = 30"); + return null; + }) + .build()) + .forEach(MoreFutures::getDone); + + assertThat(query("SELECT * FROM " + tableName)).matches("VALUES (31, 40)"); + } + finally { + assertUpdate("DROP TABLE " + tableName); + executor.shutdownNow(); + assertThat(executor.awaitTermination(10, SECONDS)).isTrue(); + } + } + + // Repeat test with invocationCount for better test coverage, since the tested aspect is inherently non-deterministic. 
+ @RepeatedTest(3) + void testConcurrentDeleteFromTheSamePartition() + throws Exception + { + int threads = 3; + CyclicBarrier barrier = new CyclicBarrier(threads); + ExecutorService executor = newFixedThreadPool(threads); + String tableName = "test_concurrent_delete_from_same_partition_" + randomNameSuffix(); + + assertUpdate("CREATE TABLE " + tableName + " (a, part) WITH (partitioning = ARRAY['part']) AS VALUES (0, 10), (11, 20), (22, 30)", 3); + + try { + List> futures = IntStream.range(0, threads) + .mapToObj(threadNumber -> executor.submit(() -> { + barrier.await(10, SECONDS); + getQueryRunner().execute("DELETE FROM " + tableName + " WHERE part = 10"); + return true; + })) + .collect(toImmutableList()); + + long successfulDeletesCount = futures.stream() + .map(MoreFutures::getFutureValue) + .filter(success -> success) + .count(); + + assertThat(successfulDeletesCount).isEqualTo(3); + assertQuery("SELECT * FROM " + tableName, "VALUES (11, 20), (22, 30)"); + } + finally { + assertUpdate("DROP TABLE " + tableName); + executor.shutdownNow(); + assertThat(executor.awaitTermination(10, SECONDS)).isTrue(); + } + } + + // Repeat test with invocationCount for better test coverage, since the tested aspect is inherently non-deterministic. + @RepeatedTest(3) + void testConcurrentTruncate() + throws Exception + { + int threads = 3; + CyclicBarrier barrier = new CyclicBarrier(threads); + ExecutorService executor = newFixedThreadPool(threads); + String tableName = "test_concurrent_truncate_table_" + randomNameSuffix(); + + assertUpdate("CREATE TABLE " + tableName + " (a, part) WITH (partitioning = ARRAY['part']) AS VALUES (0, 10), (11, 20), (22, 30)", 3); + + try { + List> futures = IntStream.range(0, threads) + .mapToObj(_ -> executor.submit(() -> { + barrier.await(10, SECONDS); + getQueryRunner().execute("TRUNCATE TABLE " + tableName); + return true; + })) + .collect(toImmutableList()); + + long successfulTruncatesCount = futures.stream() + .map(MoreFutures::getFutureValue) + .filter(success -> success) + .count(); + + assertThat(successfulTruncatesCount).isEqualTo(3); + assertThat(query("SELECT * FROM " + tableName)).returnsEmptyResult(); + } + finally { + assertUpdate("DROP TABLE " + tableName); + executor.shutdownNow(); + assertThat(executor.awaitTermination(10, SECONDS)).isTrue(); + } + } + + // Repeat test with invocationCount for better test coverage, since the tested aspect is inherently non-deterministic. 
+ @RepeatedTest(3) + void testConcurrentTruncateAndInserts() + throws Exception + { + int threads = 3; + CyclicBarrier barrier = new CyclicBarrier(threads); + ExecutorService executor = newFixedThreadPool(threads); + String tableName = "test_concurrent_truncate_and_inserts_table_" + randomNameSuffix(); + + assertUpdate("CREATE TABLE " + tableName + " (a, part) WITH (partitioning = ARRAY['part']) AS VALUES (1, 10), (11, 20)", 2); + + try { + // truncate data while concurrently adding new inserts + executor.invokeAll(ImmutableList.>builder() + .add(() -> { + barrier.await(10, SECONDS); + getQueryRunner().execute("TRUNCATE TABLE " + tableName); + return null; + }) + .add(() -> { + barrier.await(10, SECONDS); + getQueryRunner().execute("INSERT INTO " + tableName + " VALUES (21, 30)"); + return null; + }) + .add(() -> { + barrier.await(10, SECONDS); + getQueryRunner().execute("INSERT INTO " + tableName + " VALUES (31, 40)"); + return null; + }) + .build()) + .forEach(MoreFutures::getDone); + + assertQuery("SELECT * FROM " + tableName, "VALUES (21, 30), (31, 40)"); + } + finally { + assertUpdate("DROP TABLE " + tableName); + executor.shutdownNow(); + assertThat(executor.awaitTermination(10, SECONDS)).isTrue(); + } + } + + // Repeat test with invocationCount for better test coverage, since the tested aspect is inherently non-deterministic. + @RepeatedTest(3) + void testConcurrentNonOverlappingUpdate() + throws Exception + { + testConcurrentNonOverlappingUpdate(getSession()); + testConcurrentNonOverlappingUpdate(withFileBasedConflictDetectionDisabledSession()); + } + + private void testConcurrentNonOverlappingUpdate(Session session) + throws InterruptedException + { + int threads = 3; + CyclicBarrier barrier = new CyclicBarrier(threads); + ExecutorService executor = newFixedThreadPool(threads); + String tableName = "test_concurrent_non_overlapping_updates_table_" + randomNameSuffix(); + + assertUpdate("CREATE TABLE " + tableName + " (a, part) WITH (partitioning = ARRAY['part']) AS VALUES (1, 10), (11, 20), (21, NULL), (31, 40)", 4); + + try { + // update data concurrently by using non-overlapping partition predicate + executor.invokeAll(ImmutableList.>builder() + .add(() -> { + barrier.await(10, SECONDS); + getQueryRunner().execute(session, "UPDATE " + tableName + " SET a = a + 1 WHERE part = 10"); + return null; + }) + .add(() -> { + barrier.await(10, SECONDS); + getQueryRunner().execute(session, "UPDATE " + tableName + " SET a = a + 1 WHERE part = 20"); + return null; + }) + .add(() -> { + barrier.await(10, SECONDS); + getQueryRunner().execute(session, "UPDATE " + tableName + " SET a = a + 1 WHERE part IS NULL"); + return null; + }) + .build()) + .forEach(MoreFutures::getDone); + + assertThat(query("SELECT * FROM " + tableName)).matches("VALUES (2, 10), (12, 20), (22, NULL), (31, 40)"); + } + finally { + assertUpdate("DROP TABLE " + tableName); + executor.shutdownNow(); + assertThat(executor.awaitTermination(10, SECONDS)).isTrue(); + } + } + + // Repeat test with invocationCount for better test coverage, since the tested aspect is inherently non-deterministic. 
+ @RepeatedTest(3) + void testConcurrentNonOverlappingUpdateMultipleDataFiles() + throws Exception + { + int threads = 3; + CyclicBarrier barrier = new CyclicBarrier(threads); + ExecutorService executor = newFixedThreadPool(threads); + String tableName = "test_concurrent_non_overlapping_updates_table_" + randomNameSuffix(); + // Force creating more parquet files + Session session = Session.builder(getSession()) + .setCatalogSessionProperty("iceberg", "target_max_file_size", "1kB") + .build(); + + assertUpdate("CREATE TABLE " + tableName + " (a BIGINT, part BIGINT) WITH (partitioning = ARRAY['part'])"); + assertUpdate(session, " INSERT INTO " + tableName + " SELECT * FROM " + + "(select * from UNNEST(SEQUENCE(1, 10000)) AS t(a)) CROSS JOIN (select * from UNNEST(SEQUENCE(1, 3)) AS t(part))", 30000); + + // UPDATE will increase every value by 1 + long expectedDataSum = (long) computeScalar("SELECT sum(a + 1) FROM " + tableName); + + try { + // update data concurrently by using non-overlapping partition predicate + executor.invokeAll(ImmutableList.>builder() + .add(() -> { + barrier.await(10, SECONDS); + getQueryRunner().execute(session, "UPDATE " + tableName + " SET a = a + 1 WHERE part = 1"); + return null; + }) + .add(() -> { + barrier.await(10, SECONDS); + getQueryRunner().execute(session, "UPDATE " + tableName + " SET a = a + 1 WHERE part = 2"); + return null; + }) + .add(() -> { + barrier.await(10, SECONDS); + getQueryRunner().execute(session, "UPDATE " + tableName + " SET a = a + 1 WHERE part = 3"); + return null; + }) + .build()) + .forEach(MoreFutures::getDone); + + assertThat((long) computeScalar("SELECT SUM(a) FROM " + tableName)).isEqualTo(expectedDataSum); + } + finally { + assertUpdate("DROP TABLE " + tableName); + executor.shutdownNow(); + assertThat(executor.awaitTermination(10, SECONDS)).isTrue(); + } + } + + // Repeat test with invocationCount for better test coverage, since the tested aspect is inherently non-deterministic. + @RepeatedTest(3) + void testConcurrentOverlappingUpdate() + throws Exception + { + testConcurrentOverlappingUpdate(false); + testConcurrentOverlappingUpdate(true); + } + + private void testConcurrentOverlappingUpdate(boolean partitioned) + throws Exception + { + int threads = 3; + CyclicBarrier barrier = new CyclicBarrier(threads); + ExecutorService executor = newFixedThreadPool(threads); + String tableName = "test_concurrent_overlapping_updates_table_" + randomNameSuffix(); + + assertUpdate("CREATE TABLE " + tableName + " (a, part) " + + (partitioned ? 
" WITH (partitioning = ARRAY['part'])" : "") + + " AS VALUES (1, 10), (11, 20), (21, NULL), (31, 40)", 4); + + try { + List> futures = IntStream.range(0, threads) + .mapToObj(_ -> executor.submit(() -> { + barrier.await(10, SECONDS); + try { + getQueryRunner().execute("UPDATE " + tableName + " SET a = a + 1 WHERE a > 11"); + return true; + } + catch (Exception e) { + RuntimeException trinoException = getTrinoExceptionCause(e); + try { + assertThat(trinoException).hasMessageMatching("Failed to commit the transaction during write.*|" + + "Failed to commit during write.*"); + } + catch (Throwable verifyFailure) { + if (verifyFailure != e) { + verifyFailure.addSuppressed(e); + } + throw verifyFailure; + } + return false; + } + })) + .collect(toImmutableList()); + + long successes = futures.stream() + .map(future -> tryGetFutureValue(future, 10, SECONDS).orElseThrow(() -> new RuntimeException("Wait timed out"))) + .filter(success -> success) + .count(); + + assertThat(successes).isGreaterThanOrEqualTo(1); + //There can be different possible results depending on query order execution. + switch ((int) successes) { + case 1 -> assertThat(query("SELECT * FROM " + tableName)).matches("VALUES (1, 10), (11, 20), (22, NULL), (32, 40)"); + case 2 -> assertThat(query("SELECT * FROM " + tableName)).matches("VALUES (1, 10), (11, 20), (23, NULL), (33, 40)"); + case 3 -> assertThat(query("SELECT * FROM " + tableName)).matches("VALUES (1, 10), (11, 20), (24, NULL), (34, 40)"); + } + } + finally { + assertUpdate("DROP TABLE " + tableName); + executor.shutdownNow(); + assertThat(executor.awaitTermination(10, SECONDS)).isTrue(); + } + } + + // Repeat test with invocationCount for better test coverage, since the tested aspect is inherently non-deterministic. + @RepeatedTest(3) + void testConcurrentNonOverlappingUpdateOnNestedPartition() + throws Exception + { + testConcurrentNonOverlappingUpdateOnNestedPartition(getSession()); + testConcurrentNonOverlappingUpdateOnNestedPartition(withFileBasedConflictDetectionDisabledSession()); + } + + private void testConcurrentNonOverlappingUpdateOnNestedPartition(Session session) + throws Exception + { + int threads = 3; + CyclicBarrier barrier = new CyclicBarrier(threads); + ExecutorService executor = newFixedThreadPool(threads); + String tableName = "test_concurrent_non_overlapping_updates_table_" + randomNameSuffix(); + + assertUpdate("CREATE TABLE " + tableName + " (a int, parent ROW(child int)) WITH (partitioning = ARRAY['\"parent.child\"'])"); + assertUpdate( + "INSERT INTO " + tableName + " VALUES " + + "(1, ROW(10)), " + + "(11, ROW(20)), " + + "(21, ROW(NULL)), " + + "(31, ROW(40))", + 4); + try { + // update data concurrently by using non-overlapping partition predicate + executor.invokeAll(ImmutableList.>builder() + .add(() -> { + barrier.await(10, SECONDS); + getQueryRunner().execute(session, "UPDATE " + tableName + " SET a = a + 1 WHERE parent.child = 10"); + return null; + }) + .add(() -> { + barrier.await(10, SECONDS); + getQueryRunner().execute(session, "UPDATE " + tableName + " SET a = a + 1 WHERE parent.child = 20"); + return null; + }) + .add(() -> { + barrier.await(10, SECONDS); + getQueryRunner().execute(session, "UPDATE " + tableName + " SET a = a + 1 WHERE parent.child IS NULL"); + return null; + }) + .build()) + .forEach(MoreFutures::getDone); + + assertThat(query("SELECT a, parent.child FROM " + tableName)).matches("VALUES (2, 10), (12, 20), (22, NULL), (31, 40)"); + } + finally { + assertUpdate("DROP TABLE " + tableName); + executor.shutdownNow(); + 
assertThat(executor.awaitTermination(10, SECONDS)).isTrue(); + } + } + + // Repeat test with invocationCount for better test coverage, since the tested aspect is inherently non-deterministic. + @RepeatedTest(3) + void testConcurrentDeleteAndInserts() + throws Exception + { + int threads = 3; + CyclicBarrier barrier = new CyclicBarrier(threads); + ExecutorService executor = newFixedThreadPool(threads); + String tableName = "test_concurrent_delete_and_inserts_table_" + randomNameSuffix(); + + assertUpdate("CREATE TABLE " + tableName + " (a, part) WITH (partitioning = ARRAY['part']) AS VALUES (1, 10), (11, 20)", 2); + + try { + // Use a WHERE predicate in the DELETE statement which involves scanning the whole table while concurrently adding new inserts + List> futures = executor.invokeAll(ImmutableList.>builder() + .add(() -> { + barrier.await(10, SECONDS); + // DELETE will in most cases conflict with (21, 30) insert + try { + getQueryRunner().execute("DELETE FROM " + tableName + " WHERE a > 10"); + return true; + } + catch (Exception e) { + RuntimeException trinoException = getTrinoExceptionCause(e); + try { + assertThat(trinoException).hasMessageMatching("Failed to commit the transaction during write.*|" + + "Failed to commit during write.*"); + } + catch (Throwable verifyFailure) { + if (verifyFailure != e) { + verifyFailure.addSuppressed(e); + } + throw verifyFailure; + } + return false; + } + }) + .add(() -> { + barrier.await(10, SECONDS); + getQueryRunner().execute("INSERT INTO " + tableName + " VALUES (8, 10)"); + return true; + }) + .add(() -> { + barrier.await(10, SECONDS); + getQueryRunner().execute("INSERT INTO " + tableName + " VALUES (21, 30)"); + return true; + }) + .build()) + .stream().collect(toImmutableList()); + + long successfulWrites = futures.stream() + .map(future -> tryGetFutureValue(future, 10, SECONDS).orElseThrow(() -> new RuntimeException("Wait timed out"))) + .filter(success -> success) + .count(); + + assertThat(successfulWrites).isGreaterThanOrEqualTo(2); + + //There can be different possible results depending on query order execution. + if (successfulWrites == 2) { + // If all queries starts at the same time DELETE will fail and results are: + assertThat(query("SELECT * FROM " + tableName)).matches("VALUES (1, 10), (8, 10), (11, 20), (21, 30)"); + } + else { + // If DELETE is executed after INSERTS: + MaterializedResult expected1 = computeActual("VALUES (1, 10), (8, 10)"); + // If DELETE is executed before INSERTS: + MaterializedResult expected2 = computeActual("VALUES (1, 10), (8, 10), (21, 30)"); + assertThat(computeActual("SELECT * FROM " + tableName + " ORDER BY a")) + .isIn(expected1, expected2); + } + } + finally { + assertUpdate("DROP TABLE " + tableName); + executor.shutdownNow(); + assertThat(executor.awaitTermination(10, SECONDS)).isTrue(); + } + } + + // Repeat test with invocationCount for better test coverage, since the tested aspect is inherently non-deterministic. 
+ @RepeatedTest(3) + void testConcurrentUpdateAndInserts() + throws Exception + { + int threads = 3; + CyclicBarrier barrier = new CyclicBarrier(threads); + ExecutorService executor = newFixedThreadPool(threads); + String tableName = "test_concurrent_update_and_inserts_table_" + randomNameSuffix(); + + assertUpdate("CREATE TABLE " + tableName + " (a, part) WITH (partitioning = ARRAY['part']) AS VALUES (1, 10), (11, 20)", 2); + + try { + // Use a WHERE predicate in the UPDATE statement which involves scanning the whole table while concurrently adding new inserts + List> futures = executor.invokeAll(ImmutableList.>builder() + .add(() -> { + barrier.await(10, SECONDS); + try { + getQueryRunner().execute("UPDATE " + tableName + " SET a = a + 1"); + return true; + } + catch (Exception e) { + RuntimeException trinoException = getTrinoExceptionCause(e); + try { + assertThat(trinoException).hasMessageMatching("Failed to commit the transaction during write.*|" + + "Failed to commit during write.*"); + } + catch (Throwable verifyFailure) { + if (verifyFailure != e) { + verifyFailure.addSuppressed(e); + } + throw verifyFailure; + } + return false; + } + }) + .add(() -> { + barrier.await(10, SECONDS); + Thread.sleep(1000); + getQueryRunner().execute("INSERT INTO " + tableName + " VALUES (13, 20)"); + return true; + }) + .add(() -> { + barrier.await(10, SECONDS); + Thread.sleep(1000); + getQueryRunner().execute("INSERT INTO " + tableName + " VALUES (21, 30)"); + return true; + }) + .build()) + .stream().collect(toImmutableList()); + + long successfulWrites = futures.stream() + .map(future -> tryGetFutureValue(future, 10, SECONDS).orElseThrow(() -> new RuntimeException("Wait timed out"))) + .filter(success -> success) + .count(); + + assertThat(successfulWrites).isGreaterThanOrEqualTo(2); + + //There can be different possible results depending on query order execution. + if (successfulWrites == 2) { + // If all queries starts at the same time UPDATE will fail and results are: + assertThat(query("SELECT * FROM " + tableName)).matches("VALUES (1, 10), (11, 20), (13, 20), (21, 30)"); + } + else { + // If UPDATE is executed after INSERTS: + MaterializedResult expected1 = computeActual("VALUES (2, 10), (12, 20), (13, 20), (21, 30)"); + // If UPDATE is executed before INSERTS: + MaterializedResult expected2 = computeActual("VALUES (2, 10), (12, 20), (14, 20), (22, 30)"); + assertThat(computeActual("SELECT * FROM " + tableName + " ORDER BY a")) + .isIn(expected1, expected2); + } + } + finally { + assertUpdate("DROP TABLE " + tableName); + executor.shutdownNow(); + assertThat(executor.awaitTermination(10, SECONDS)).isTrue(); + } + } + + // Repeat test with invocationCount for better test coverage, since the tested aspect is inherently non-deterministic. 
+ @RepeatedTest(3) + public void testConcurrentMerge() + throws Exception + { + int threads = 4; + CyclicBarrier barrier = new CyclicBarrier(threads); + ExecutorService executor = newFixedThreadPool(threads); + String tableName = "test_concurrent_merges_table_" + randomNameSuffix(); + String sourceTableName = "test_concurrent_merges_source_table_" + randomNameSuffix(); + + // Helper table to simulate longer query time during MERGE + assertUpdate("CREATE TABLE " + sourceTableName + " (a, part, string_rep) AS SELECT *, format('a%spart%s', a, part) FROM " + + "(select * from UNNEST(SEQUENCE(1, 2000)) AS t(a)) CROSS JOIN (select * from UNNEST(SEQUENCE(1, 2000)) AS t(part))", 4000000); + assertUpdate("INSERT INTO " + sourceTableName + " VALUES (42, NULL, 'a42partNULL')", 1); + + assertUpdate("CREATE TABLE " + tableName + " (a, part) WITH (partitioning = ARRAY['part']) AS VALUES (1, 10), (11, 20), (21, 30), (31, 40), (41, NULL)", 5); + // Add more files in the partition 30 + assertUpdate("INSERT INTO " + tableName + " VALUES (22, 30)", 1); + try { + // merge data concurrently by using non-overlapping partition predicate + executor.invokeAll(ImmutableList.>builder() + .add(() -> { + barrier.await(10, SECONDS); + getQueryRunner().execute( + """ + MERGE INTO %s t USING (select a, part from %s where string_rep LIKE '%%a12part20') AS s + ON (FALSE) + WHEN NOT MATCHED THEN INSERT (a, part) VALUES(s.a, s.part) + """.formatted(tableName, sourceTableName)); + return null; + }) + .add(() -> { + barrier.await(10, SECONDS); + getQueryRunner().execute( + """ + MERGE INTO %s t USING (select a, part from %s where string_rep LIKE '%%a42partNULL') AS s + ON (FALSE) + WHEN NOT MATCHED THEN INSERT (a, part) VALUES(s.a, s.part) + """.formatted(tableName, sourceTableName)); + return null; + }) + .add(() -> { + barrier.await(10, SECONDS); + getQueryRunner().execute( + """ + MERGE INTO %s t USING (VALUES (21, 30)) AS s(a, part) + ON (t.part = s.part) + WHEN MATCHED THEN DELETE + """.formatted(tableName)); + return null; + }) + .add(() -> { + barrier.await(10, SECONDS); + getQueryRunner().execute( + """ + MERGE INTO %s t USING (VALUES (32, 40)) AS s(a, part) + ON (t.part = s.part) + WHEN MATCHED THEN UPDATE SET a = s.a + """.formatted(tableName)); + return null; + }) + .build()) + .forEach(MoreFutures::getDone); + + assertThat(query("SELECT * FROM " + tableName)).matches("VALUES (1, 10), (11, 20), (12, 20), (32, 40), (41, NULL), (42, NULL)"); + } + finally { + assertUpdate("DROP TABLE " + tableName); + executor.shutdownNow(); + assertThat(executor.awaitTermination(10, SECONDS)).isTrue(); + } + } + + // Repeat test with invocationCount for better test coverage, since the tested aspect is inherently non-deterministic. 
+ @RepeatedTest(3) + void testConcurrentMergeAndInserts() + throws Exception + { + int threads = 3; + CyclicBarrier barrier = new CyclicBarrier(threads); + ExecutorService executor = newFixedThreadPool(threads); + String tableName = "test_concurrent_merge_and_inserts_table_" + randomNameSuffix(); + + assertUpdate("CREATE TABLE " + tableName + " (a, part) WITH (partitioning = ARRAY['part']) AS VALUES (1, 10), (11, 20)", 2); + + try { + List> futures = executor.invokeAll(ImmutableList.>builder() + .add(() -> { + barrier.await(10, SECONDS); + try { + getQueryRunner().execute( + """ + MERGE INTO %s t USING (VALUES (11, 20), (8, 10), (21, 30)) AS s(a, part) + ON (t.a = s.a AND t.part = s.part) + WHEN MATCHED THEN DELETE + """.formatted(tableName)); + return true; + } + catch (Exception e) { + RuntimeException trinoException = getTrinoExceptionCause(e); + try { + assertThat(trinoException).hasMessageMatching("Failed to commit the transaction during write.*|" + + "Failed to commit during write.*"); + } + catch (Throwable verifyFailure) { + if (verifyFailure != e) { + verifyFailure.addSuppressed(e); + } + throw verifyFailure; + } + return false; + } + }) + .add(() -> { + barrier.await(10, SECONDS); + getQueryRunner().execute("INSERT INTO " + tableName + " VALUES (8, 10)"); + return true; + }) + .add(() -> { + barrier.await(10, SECONDS); + getQueryRunner().execute("INSERT INTO " + tableName + " VALUES (21, 30)"); + return true; + }) + .build()) + .stream().collect(toImmutableList()); + + long successfulWrites = futures.stream() + .map(future -> tryGetFutureValue(future, 10, SECONDS).orElseThrow(() -> new RuntimeException("Wait timed out"))) + .filter(success -> success) + .count(); + + assertThat(successfulWrites).isGreaterThanOrEqualTo(2); + + //There can be different possible results depending on query order execution. + if (successfulWrites == 2) { + // If all queries starts at the same time MERGE will fail and results are: + assertThat(query("SELECT * FROM " + tableName)).matches("VALUES (1, 10), (11, 20), (8, 10), (21, 30)"); + } + else { + // If MERGE is executed after INSERTS: + MaterializedResult expected1 = computeActual("VALUES (1, 10)"); + // If MERGE is executed before INSERTS: + MaterializedResult expected2 = computeActual("VALUES (1, 10), (8, 10), (21, 30)"); + assertThat(computeActual("SELECT * FROM " + tableName + " ORDER BY a")) + .isIn(expected1, expected2); + } + } + finally { + assertUpdate("DROP TABLE " + tableName); + executor.shutdownNow(); + assertThat(executor.awaitTermination(10, SECONDS)).isTrue(); + } + } + + // Repeat test with invocationCount for better test coverage, since the tested aspect is inherently non-deterministic. 
+ @RepeatedTest(3) + void testConcurrentDeleteAndDeletePushdownAndInsert() + throws Exception + { + int threads = 3; + CyclicBarrier barrier = new CyclicBarrier(threads); + ExecutorService executor = newFixedThreadPool(threads); + String tableName = "test_concurrent_delete_and_inserts_table_" + randomNameSuffix(); + + assertUpdate("CREATE TABLE " + tableName + " (a, part) WITH (partitioning = ARRAY['part']) AS VALUES (1, 10), (11, 20), (21, 30)", 3); + // Add more files in the partition 10 + assertUpdate("INSERT INTO " + tableName + " VALUES (2, 10)", 1); + + try { + // The DELETE and INSERT operation operate on non-overlapping partitions + executor.invokeAll(ImmutableList.>builder() + .add(() -> { + barrier.await(10, SECONDS); + // Use a non-partition filter as well to ensure the DELETE operation is not being pushed down + getQueryRunner().execute("DELETE FROM " + tableName + " WHERE part = 10 AND a IN (1, 2)"); + return null; + }) + .add(() -> { + barrier.await(10, SECONDS); + getQueryRunner().execute("INSERT INTO " + tableName + " SELECT a + 1, part FROM " + tableName + " WHERE part = 20"); + return null; + }) + .add(() -> { + barrier.await(10, SECONDS); + getQueryRunner().execute("DELETE FROM " + tableName + " WHERE part = 30"); + return null; + }) + .build()) + .forEach(MoreFutures::getDone); + + assertThat(query("SELECT * FROM " + tableName)).matches("VALUES (11, 20), (12, 20)"); + } + finally { + assertUpdate("DROP TABLE " + tableName); + executor.shutdownNow(); + assertThat(executor.awaitTermination(10, SECONDS)).isTrue(); + } + } + + // Repeat test with invocationCount for better test coverage, since the tested aspect is inherently non-deterministic. + @RepeatedTest(3) + void testConcurrentUpdateWithPartitionTransformation() + throws Exception + { + int threads = 4; + CyclicBarrier barrier = new CyclicBarrier(threads); + ExecutorService executor = newFixedThreadPool(threads); + List rows = ImmutableList.of("('A', DATE '2024-01-01')", "('B', DATE '2024-02-02')", "('C', DATE '2024-03-03')", "('D', DATE '2024-04-04')"); + List partitions = ImmutableList.of("DATE '2024-01-01'", "DATE '2024-02-02'", "DATE '2024-03-03'", "DATE '2024-04-04'"); + + try (TestTable table = newTrinoTable( + "test_concurrent_update_partition_transform_table_", + "(data varchar, part date) with (partitioning = array['month(part)'])")) { + String tableName = table.getName(); + + assertUpdate("INSERT INTO " + tableName + " VALUES " + String.join(", ", rows), 4); + + List> futures = IntStream.range(0, threads) + .mapToObj(threadNumber -> executor.submit(() -> { + barrier.await(10, SECONDS); + getQueryRunner().execute(format("UPDATE %s SET data = data || data WHERE part = %s", tableName, partitions.get(threadNumber))); + return true; + })) + .collect(toImmutableList()); + + futures.forEach(future -> { + Optional value = tryGetFutureValue(future, 20, SECONDS); + checkState(value.isPresent(), "Task did not complete in time"); + boolean updateSuccessful = value.get(); + checkState(updateSuccessful, "Task did not complete successfully"); + }); + + assertThat(query("SELECT data, part FROM " + tableName)) + .skippingTypesCheck() + .matches("VALUES ('AA', DATE '2024-01-01'), ('BB', DATE '2024-02-02'), ('CC', DATE '2024-03-03'), ('DD', DATE '2024-04-04')"); + } + finally { + executor.shutdownNow(); + assertThat(executor.awaitTermination(10, SECONDS)).isTrue(); + } + } + + // Repeat test with invocationCount for better test coverage, since the tested aspect is inherently non-deterministic. 
+ @RepeatedTest(3) + void testConcurrentUpdateWithNestedPartitionTransformation() + throws Exception + { + int threads = 4; + CyclicBarrier barrier = new CyclicBarrier(threads); + ExecutorService executor = newFixedThreadPool(threads); + List<String> rows = ImmutableList.of("('A', ROW(DATE '2024-01-01'))", "('B', ROW(DATE '2024-02-02'))", "('C', ROW(DATE '2024-03-03'))", "('D', ROW(DATE '2024-04-04'))"); + List<String> partitions = ImmutableList.of("DATE '2024-01-01'", "DATE '2024-02-02'", "DATE '2024-03-03'", "DATE '2024-04-04'"); + + try (TestTable table = newTrinoTable( + "test_concurrent_update_partition_transform_table_", + "(data varchar, parent ROW (part date)) with (partitioning = array['month(\"parent.part\")'])")) { + String tableName = table.getName(); + + assertUpdate("INSERT INTO " + tableName + " VALUES " + String.join(", ", rows), 4); + + List<Future<Boolean>> futures = IntStream.range(0, threads) + .mapToObj(threadNumber -> executor.submit(() -> { + barrier.await(10, SECONDS); + getQueryRunner().execute(format("UPDATE %s SET data = data || data WHERE parent.part = %s", tableName, partitions.get(threadNumber))); + return true; + })) + .collect(toImmutableList()); + + futures.forEach(future -> { + Optional<Boolean> value = tryGetFutureValue(future, 20, SECONDS); + checkState(value.isPresent(), "Task did not complete in time"); + boolean updateSuccessful = value.get(); + checkState(updateSuccessful, "Task did not complete successfully"); + }); + + assertThat(query("SELECT data, parent.part FROM " + tableName)) + .skippingTypesCheck() + .matches("VALUES ('AA', DATE '2024-01-01'), ('BB', DATE '2024-02-02'), ('CC', DATE '2024-03-03'), ('DD', DATE '2024-04-04')"); + } + finally { + executor.shutdownNow(); + assertThat(executor.awaitTermination(10, SECONDS)).isTrue(); + } + } + + // Repeat test with invocationCount for better test coverage, since the tested aspect is inherently non-deterministic.
+ @RepeatedTest(3) + void testConcurrentUpdateWithMultiplePartitionTransformation() + throws Exception + { + int threads = 4; + CyclicBarrier barrier = new CyclicBarrier(threads); + ExecutorService executor = newFixedThreadPool(threads); + List rows = ImmutableList.of("('A', TIMESTAMP '2024-01-01 01:01', 1, 'aaa')", + "('B', TIMESTAMP '2024-01-01 02:02', 1, 'aab')", + "('C', TIMESTAMP '2024-01-01 03:03', 1, 'aac')", + "('D', TIMESTAMP '2024-01-01 04:04', 1, 'aad')"); + // Only hour partition is not-overlapping + List partitions1 = ImmutableList.of("TIMESTAMP '2024-01-01 01:01'", "TIMESTAMP '2024-01-01 02:02'", "TIMESTAMP '2024-01-01 03:03'", "TIMESTAMP '2024-01-01 04:04'"); + List partitions2 = ImmutableList.of("1", "1", "1", "1"); + List partitions3 = ImmutableList.of("'aaa'", "'aab'", "'aac'", "'aad'"); + + try (TestTable table = newTrinoTable( + "test_concurrent_update_multiple_partition_transform_table_", + "(data varchar, part1 timestamp, part2 int, part3 varchar) with (partitioning = array['hour(part1)', 'bucket(part2, 10)', 'truncate(part3, 2)'])")) { + String tableName = table.getName(); + + assertUpdate("INSERT INTO " + tableName + " VALUES " + String.join(", ", rows), 4); + + List> futures = IntStream.range(0, threads) + .mapToObj(threadNumber -> executor.submit(() -> { + barrier.await(10, SECONDS); + getQueryRunner().execute(format( + "UPDATE %s SET data = data || data WHERE part1 = %s AND part2 = %s AND part3 = %s", + tableName, + partitions1.get(threadNumber), + partitions2.get(threadNumber), + partitions3.get(threadNumber))); + return true; + })) + .collect(toImmutableList()); + + futures.forEach(future -> { + Optional value = tryGetFutureValue(future, 20, SECONDS); + checkState(value.isPresent(), "Task did not complete in time"); + boolean updateSuccessful = value.get(); + checkState(updateSuccessful, "Task did not complete successfully"); + }); + + assertThat(query("SELECT data, part1, part2, part3 FROM " + tableName)) + .skippingTypesCheck() + .matches("VALUES ('AA', TIMESTAMP '2024-01-01 01:01', 1, 'aaa'), " + + "('BB', TIMESTAMP '2024-01-01 02:02', 1, 'aab')," + + " ('CC', TIMESTAMP '2024-01-01 03:03', 1, 'aac'), " + + "('DD', TIMESTAMP '2024-01-01 04:04', 1, 'aad')"); + } + finally { + executor.shutdownNow(); + assertThat(executor.awaitTermination(10, SECONDS)).isTrue(); + } + } + + // Repeat test with invocationCount for better test coverage, since the tested aspect is inherently non-deterministic. 
+ @RepeatedTest(3) + void testConcurrentUpdateWithOverlappingPartitionTransformation() + throws Exception + { + int threads = 4; + CyclicBarrier barrier = new CyclicBarrier(threads); + ExecutorService executor = newFixedThreadPool(threads); + List rows = ImmutableList.of("('A', DATE '2024-01-01')", "('B', DATE '2024-01-02')", "('C', DATE '2024-03-03')", "('D', DATE '2024-04-04')"); + List partitions = ImmutableList.of("DATE '2024-01-01'", "DATE '2024-01-02'", "DATE '2024-03-03'", "DATE '2024-04-04'"); + + try (TestTable table = newTrinoTable( + "test_concurrent_update_overlapping_partition_transform_table_", + "(data varchar, part date) with (partitioning = array['month(part)'])")) { + String tableName = table.getName(); + + assertUpdate("INSERT INTO " + tableName + " VALUES " + String.join(", ", rows), 4); + + List> futures = IntStream.range(0, threads) + .mapToObj(threadNumber -> executor.submit(() -> { + barrier.await(10, SECONDS); + try { + getQueryRunner().execute(format("UPDATE %s SET data = data || data WHERE part = %s", tableName, partitions.get(threadNumber))); + return true; + } + catch (Exception e) { + RuntimeException trinoException = getTrinoExceptionCause(e); + try { + assertThat(trinoException).hasMessageMatching("Failed to commit the transaction during write.*|" + + "Failed to commit during write.*"); + } + catch (Throwable verifyFailure) { + if (verifyFailure != e) { + verifyFailure.addSuppressed(e); + } + throw verifyFailure; + } + return false; + } + })) + .collect(toImmutableList()); + + long successfulWrites = futures.stream() + .map(future -> tryGetFutureValue(future, 10, SECONDS).orElseThrow(() -> new RuntimeException("Wait timed out"))) + .filter(success -> success) + .count(); + + assertThat(successfulWrites).isEqualTo(3); + + //There can be two possible results depended on which thread fails + MaterializedResult expected1 = computeActual("VALUES (VARCHAR 'AA', DATE '2024-01-01'), ('B', DATE '2024-01-02'), ('CC', DATE '2024-03-03'), ('DD', DATE '2024-04-04')"); + MaterializedResult expected2 = computeActual("VALUES (VARCHAR 'A', DATE '2024-01-01'), ('BB', DATE '2024-01-02'), ('CC', DATE '2024-03-03'), ('DD', DATE '2024-04-04')"); + assertThat(computeActual("SELECT data, part FROM " + tableName + " ORDER BY data")) + .isIn(expected1, expected2); + } + finally { + executor.shutdownNow(); + assertThat(executor.awaitTermination(10, SECONDS)).isTrue(); + } + } + + // Repeat test with invocationCount for better test coverage, since the tested aspect is inherently non-deterministic. 
+ @RepeatedTest(3) + void testConcurrentUpdateWithEnforcedAndUnenforcedPartitions() + throws Exception + { + int threads = 4; + CyclicBarrier barrier = new CyclicBarrier(threads); + ExecutorService executor = newFixedThreadPool(threads); + List rows = ImmutableList.of("('A', 'a', DATE '2024-01-01')", "('B', 'b', DATE '2024-02-02')", "('C', 'c', DATE '2024-03-03')", "('D', 'd', DATE '2024-04-04')"); + List partitions1 = ImmutableList.of("'a'", "'b'", "'c'", "'d'"); + List partitions2 = ImmutableList.of("DATE '2024-01-01'", "DATE '2024-02-02'", "DATE '2024-03-03'", "DATE '2024-04-04'"); + + try (TestTable table = newTrinoTable( + "test_concurrent_update_enforced_unenforced_partition_transform_table_", + // part1 is enforced and part2 is unenforced as it has transformation + "(data varchar, part1 varchar, part2 date) with (partitioning = array['part1', 'month(part2)'])")) { + String tableName = table.getName(); + + assertUpdate("INSERT INTO " + tableName + " VALUES " + String.join(", ", rows), 4); + + List> futures = IntStream.range(0, threads) + .mapToObj(threadNumber -> executor.submit(() -> { + barrier.await(10, SECONDS); + getQueryRunner().execute(format("UPDATE %s SET data = data || data WHERE part1 = %s AND part2 = %s", tableName, partitions1.get(threadNumber), partitions2.get(threadNumber))); + return true; + })) + .collect(toImmutableList()); + + futures.forEach(future -> { + Optional value = tryGetFutureValue(future, 20, SECONDS); + checkState(value.isPresent(), "Task did not complete in time"); + boolean updateSuccessful = value.get(); + checkState(updateSuccessful, "Task did not complete successfully"); + }); + + assertThat(query("SELECT data, part1, part2 FROM " + tableName)) + .skippingTypesCheck() + .matches("VALUES ('AA', 'a', DATE '2024-01-01'), ('BB', 'b', DATE '2024-02-02'), ('CC', 'c', DATE '2024-03-03'), ('DD', 'd', DATE '2024-04-04')"); + } + finally { + executor.shutdownNow(); + assertThat(executor.awaitTermination(10, SECONDS)).isTrue(); + } + } + + @Test + public void testOptimizeDuringWriteOperations() + throws Exception + { + runOptimizeDuringWriteOperations(true); + runOptimizeDuringWriteOperations(false); + } + + private void runOptimizeDuringWriteOperations(boolean useSmallFiles) + throws Exception + { + int threads = 5; + int deletionThreads = threads - 1; + int rows = 12; + int rowsPerThread = rows / deletionThreads; + + CyclicBarrier barrier = new CyclicBarrier(threads); + ExecutorService executor = newFixedThreadPool(threads); + + // Slow down the delete operations so optimize is more likely to complete + String blackholeTable = "blackhole_table_" + randomNameSuffix(); + assertUpdate("CREATE TABLE blackhole.default.%s (a INT, b INT) WITH (split_count = 1, pages_per_split = 1, rows_per_page = 1, page_processing_delay = '3s')".formatted(blackholeTable)); + + try (TestTable table = newTrinoTable( + "test_optimize_during_write_operations", + "(int_col INT)")) { + String tableName = table.getName(); + + // Testing both situations where a file is fully removed by the delete operation and when a row level delete is required. 
+ if (useSmallFiles) { + for (int i = 0; i < rows; i++) { + assertUpdate(format("INSERT INTO %s VALUES %s", tableName, i), 1); + } + } + else { + String values = IntStream.range(0, rows).mapToObj(String::valueOf).collect(Collectors.joining(", ")); + assertUpdate(format("INSERT INTO %s VALUES %s", tableName, values), rows); + } + + List<Future<List<Boolean>>> deletionFutures = IntStream.range(0, deletionThreads) + .mapToObj(threadNumber -> executor.submit(() -> { + barrier.await(10, SECONDS); + List<Boolean> successfulDeletes = new ArrayList<>(); + for (int i = 0; i < rowsPerThread; i++) { + try { + int rowNumber = threadNumber * rowsPerThread + i; + getQueryRunner().execute(format("DELETE FROM %s WHERE int_col = %s OR ((SELECT count(*) FROM blackhole.default.%s) > 42)", tableName, rowNumber, blackholeTable)); + successfulDeletes.add(true); + } + catch (RuntimeException e) { + successfulDeletes.add(false); + } + } + return successfulDeletes; + })) + .collect(toImmutableList()); + + Future<?> optimizeFuture = executor.submit(() -> { + try { + barrier.await(10, SECONDS); + // Allow for some deletes to start before running optimize + Thread.sleep(50); + assertUpdate("ALTER TABLE %s EXECUTE optimize".formatted(tableName)); + } + catch (Exception e) { + throw new RuntimeException(e); + } + }); + + List<String> expectedValues = new ArrayList<>(); + for (int threadNumber = 0; threadNumber < deletionThreads; threadNumber++) { + List<Boolean> deleteOutcomes = deletionFutures.get(threadNumber).get(); + verify(deleteOutcomes.size() == rowsPerThread); + for (int rowNumber = 0; rowNumber < rowsPerThread; rowNumber++) { + boolean successfulDelete = deleteOutcomes.get(rowNumber); + if (!successfulDelete) { + expectedValues.add(String.valueOf(threadNumber * rowsPerThread + rowNumber)); + } + } + } + + optimizeFuture.get(); + assertThat(expectedValues.size()).isGreaterThan(0).isLessThan(rows); + assertQuery("SELECT * FROM " + tableName, "VALUES " + String.join(", ", expectedValues)); + } + finally { + executor.shutdownNow(); + executor.awaitTermination(10, SECONDS); + } + } + + // Repeat test with invocationCount for better test coverage, since the tested aspect is inherently non-deterministic. + @RepeatedTest(3) + void testConcurrentOverlappingOptimize() + throws Exception + { + testConcurrentOverlappingOptimize(true); + testConcurrentOverlappingOptimize(false); + } + + private void testConcurrentOverlappingOptimize(boolean partitioned) + throws Exception + { + int threads = 3; + CyclicBarrier barrier = new CyclicBarrier(threads); + ExecutorService executor = newFixedThreadPool(threads); + + try (TestTable table = newTrinoTable( + "test_concurrent_non_overlapping_optimize_table_", + "(a INT, part INT) " + (partitioned ?
" WITH (partitioning = ARRAY['part'])" : ""))) { + ImmutableList.Builder expectedValues = ImmutableList.builder(); + // Add 10 files to each partition + for (int i = 0; i < 10; i++) { + String values = format("(%1$d, 10), (%1$d, 20), (%1$d, NULL), (%1$d, 40)", i); + expectedValues.add(values); + assertUpdate(format("INSERT INTO %s VALUES %s", table.getName(), values), 4); + } + + List> futures = IntStream.range(0, threads) + .mapToObj(_ -> executor.submit(() -> { + barrier.await(10, SECONDS); + try { + getQueryRunner().execute("ALTER TABLE %s EXECUTE optimize".formatted(table.getName())); + return true; + } + catch (Exception e) { + RuntimeException trinoException = getTrinoExceptionCause(e); + try { + assertThat(trinoException).hasMessageMatching("Failed to commit the transaction during optimize.*|" + + "Failed to commit during optimize.*"); + } + catch (Throwable verifyFailure) { + if (verifyFailure != e) { + verifyFailure.addSuppressed(e); + } + throw verifyFailure; + } + return false; + } + })) + .collect(toImmutableList()); + + long successes = futures.stream() + .map(future -> tryGetFutureValue(future, 10, SECONDS).orElseThrow(() -> new RuntimeException("Wait timed out"))) + .filter(success -> success) + .count(); + + assertThat(successes).isGreaterThanOrEqualTo(1); + + assertThat(query("SELECT * FROM " + table.getName())).matches("VALUES " + String.join(", ", expectedValues.build())); + } + finally { + executor.shutdownNow(); + assertThat(executor.awaitTermination(10, SECONDS)).isTrue(); + } + } + + // Repeat test with invocationCount for better test coverage, since the tested aspect is inherently non-deterministic. + @RepeatedTest(3) + void testConcurrentNonOverlappingOptimize() + throws Exception + { + int threads = 3; + CyclicBarrier barrier = new CyclicBarrier(threads); + ExecutorService executor = newFixedThreadPool(threads); + + try (TestTable table = newTrinoTable( + "test_concurrent_non_overlapping_optimize_table_", + "(a INT, part INT) WITH (partitioning = ARRAY['part']) ")) { + ImmutableList.Builder expectedValues = ImmutableList.builder(); + // Add 10 files to each partition + for (int i = 0; i < 10; i++) { + String values = format("(%1$d, 10), (%1$d, 20), (%1$d, NULL), (%1$d, 40)", i); + expectedValues.add(values); + assertUpdate(format("INSERT INTO %s VALUES %s", table.getName(), values), 4); + } + + // optimize concurrently by using non-overlapping partition predicate + executor.invokeAll(ImmutableList.>builder() + .add(() -> { + barrier.await(10, SECONDS); + assertUpdate("ALTER TABLE %s EXECUTE optimize WHERE part = 10".formatted(table.getName())); + return null; + }) + .add(() -> { + barrier.await(10, SECONDS); + assertUpdate("ALTER TABLE %s EXECUTE optimize WHERE part = 20".formatted(table.getName())); + return null; + }) + .add(() -> { + barrier.await(10, SECONDS); + assertUpdate("ALTER TABLE %s EXECUTE optimize WHERE part IS NULL".formatted(table.getName())); + return null; + }) + .build()) + .forEach(MoreFutures::getDone); + + assertThat(query("SELECT * FROM " + table.getName())).matches("VALUES " + String.join(", ", expectedValues.build())); + } + finally { + executor.shutdownNow(); + assertThat(executor.awaitTermination(10, SECONDS)).isTrue(); + } + } + + private long getCurrentSnapshotId(String tableName) + { + return (long) computeScalar("SELECT snapshot_id FROM \"" + tableName + "$snapshots\" ORDER BY committed_at DESC FETCH FIRST 1 ROW WITH TIES"); + } + + private Session withFileBasedConflictDetectionDisabledSession() + { + return 
Session.builder(getSession()) + .setCatalogSessionProperty(getSession().getCatalog().orElseThrow(), FILE_BASED_CONFLICT_DETECTION_ENABLED, "false") + .build(); + } +} diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergMaterializedView.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergMaterializedView.java index 59ba415c1dab..b7cb33d278ac 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergMaterializedView.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergMaterializedView.java @@ -14,37 +14,55 @@ package io.trino.plugin.iceberg; import io.trino.Session; +import io.trino.metastore.HiveMetastore; +import io.trino.metastore.Table; import io.trino.sql.tree.ExplainType; -import io.trino.testing.DistributedQueryRunner; -import org.testng.annotations.Test; +import io.trino.testing.QueryRunner; +import org.junit.jupiter.api.Test; import java.util.Map; import static io.trino.plugin.base.util.Closables.closeAllSuppress; -import static io.trino.plugin.iceberg.IcebergQueryRunner.createIcebergQueryRunner; +import static io.trino.plugin.iceberg.IcebergTestUtils.getHiveMetastore; +import static org.apache.iceberg.BaseMetastoreTableOperations.METADATA_LOCATION_PROP; import static org.assertj.core.api.Assertions.assertThat; public class TestIcebergMaterializedView extends BaseIcebergMaterializedViewTest { private Session secondIceberg; + private HiveMetastore metastore; @Override - protected DistributedQueryRunner createQueryRunner() + protected QueryRunner createQueryRunner() throws Exception { - DistributedQueryRunner queryRunner = createIcebergQueryRunner(); + QueryRunner queryRunner = IcebergQueryRunner.builder() + .build(); try { + metastore = getHiveMetastore(queryRunner); + queryRunner.createCatalog("iceberg2", "iceberg", Map.of( "iceberg.catalog.type", "TESTING_FILE_METASTORE", "hive.metastore.catalog.dir", queryRunner.getCoordinator().getBaseDataDir().resolve("iceberg2-catalog").toString(), - "iceberg.hive-catalog-name", "hive")); + "iceberg.hive-catalog-name", "hive", + "fs.hadoop.enabled", "true")); secondIceberg = Session.builder(queryRunner.getDefaultSession()) .setCatalog("iceberg2") .build(); + queryRunner.createCatalog("iceberg_legacy_mv", "iceberg", Map.of( + "iceberg.catalog.type", "TESTING_FILE_METASTORE", + "hive.metastore.catalog.dir", queryRunner.getCoordinator().getBaseDataDir().resolve("iceberg_data").toString(), + "iceberg.hive-catalog-name", "hive", + "iceberg.materialized-views.hide-storage-table", "false", + "fs.hadoop.enabled", "true")); + queryRunner.execute(secondIceberg, "CREATE SCHEMA " + secondIceberg.getSchema().orElseThrow()); + + queryRunner.installPlugin(createMockConnectorPlugin()); + queryRunner.createCatalog("mock", "mock"); } catch (Throwable e) { closeAllSuppress(e, queryRunner); @@ -56,7 +74,14 @@ protected DistributedQueryRunner createQueryRunner() @Override protected String getSchemaDirectory() { - return getDistributedQueryRunner().getCoordinator().getBaseDataDir().resolve("iceberg_data/tpch").toString(); + return "local:///tpch"; + } + + @Override + protected String getStorageMetadataLocation(String materializedViewName) + { + Table table = metastore.getTable("tpch", materializedViewName).orElseThrow(); + return table.getParameters().get(METADATA_LOCATION_PROP); } @Test @@ -69,9 +94,10 @@ public void testTwoIcebergCatalogs() assertUpdate(secondIceberg, createTable, 1); // this one will be used by MV assertUpdate(defaultIceberg, createTable, 
1); // this one exists so that it can be mistakenly treated as the base table - assertUpdate(defaultIceberg, """ - CREATE MATERIALIZED VIEW iceberg.tpch.mv_on_iceberg2 - AS SELECT sum(value) AS s FROM iceberg2.tpch.common_base_table + assertUpdate(defaultIceberg, + """ + CREATE MATERIALIZED VIEW iceberg.tpch.mv_on_iceberg2 + AS SELECT sum(value) AS s FROM iceberg2.tpch.common_base_table """); // The MV is initially stale @@ -83,7 +109,7 @@ AS SELECT sum(value) AS s FROM iceberg2.tpch.common_base_table // After REFRESH, the MV is fresh assertUpdate(defaultIceberg, "REFRESH MATERIALIZED VIEW mv_on_iceberg2", 1); assertThat(getExplainPlan("TABLE mv_on_iceberg2", ExplainType.Type.IO)) - .contains("\"table\" : \"st_") + .contains("\"table\" : \"mv_on_iceberg2$materialized_view_storage") .doesNotContain("common_base_table"); assertThat(query("TABLE mv_on_iceberg2")) .matches("VALUES BIGINT '10'"); @@ -91,7 +117,7 @@ AS SELECT sum(value) AS s FROM iceberg2.tpch.common_base_table // After INSERT to the base table, the MV is still fresh, because it currently does not detect changes to tables in other catalog. assertUpdate(secondIceberg, "INSERT INTO common_base_table VALUES 7", 1); assertThat(getExplainPlan("TABLE mv_on_iceberg2", ExplainType.Type.IO)) - .contains("\"table\" : \"st_") + .contains("\"table\" : \"mv_on_iceberg2$materialized_view_storage") .doesNotContain("common_base_table"); assertThat(query("TABLE mv_on_iceberg2")) .matches("VALUES BIGINT '10'"); @@ -99,7 +125,7 @@ AS SELECT sum(value) AS s FROM iceberg2.tpch.common_base_table // After REFRESH, the MV is fresh again assertUpdate(defaultIceberg, "REFRESH MATERIALIZED VIEW mv_on_iceberg2", 1); assertThat(getExplainPlan("TABLE mv_on_iceberg2", ExplainType.Type.IO)) - .contains("\"table\" : \"st_") + .contains("\"table\" : \"mv_on_iceberg2$materialized_view_storage") .doesNotContain("common_base_table"); assertThat(query("TABLE mv_on_iceberg2")) .matches("VALUES BIGINT '17'"); diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergMemoryCacheFileOperations.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergMemoryCacheFileOperations.java new file mode 100644 index 000000000000..51a18ee943cb --- /dev/null +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergMemoryCacheFileOperations.java @@ -0,0 +1,217 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.iceberg; + +import com.google.common.collect.HashMultiset; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableMultiset; +import com.google.common.collect.Multiset; +import io.opentelemetry.sdk.trace.data.SpanData; +import io.trino.plugin.iceberg.util.FileOperationUtils.FileType; +import io.trino.testing.AbstractTestQueryFramework; +import io.trino.testing.DistributedQueryRunner; +import org.intellij.lang.annotations.Language; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.parallel.Execution; +import org.junit.jupiter.api.parallel.ExecutionMode; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Map; + +import static com.google.common.io.MoreFiles.deleteRecursively; +import static com.google.common.io.RecursiveDeleteOption.ALLOW_INSECURE; +import static io.trino.filesystem.tracing.CacheFileSystemTraceUtils.getFileLocation; +import static io.trino.filesystem.tracing.CacheFileSystemTraceUtils.isTrinoSchemaOrPermissions; +import static io.trino.plugin.iceberg.IcebergQueryRunner.ICEBERG_CATALOG; +import static io.trino.plugin.iceberg.util.FileOperationUtils.FileType.DATA; +import static io.trino.plugin.iceberg.util.FileOperationUtils.FileType.MANIFEST; +import static io.trino.plugin.iceberg.util.FileOperationUtils.FileType.METADATA_JSON; +import static io.trino.plugin.iceberg.util.FileOperationUtils.FileType.SNAPSHOT; +import static io.trino.testing.MultisetAssertions.assertMultisetsEqual; +import static java.util.stream.Collectors.toCollection; + +@Execution(ExecutionMode.SAME_THREAD) +public class TestIcebergMemoryCacheFileOperations + extends AbstractTestQueryFramework +{ + private static final String TEST_SCHEMA = "test_memory_schema"; + + @Override + protected DistributedQueryRunner createQueryRunner() + throws Exception + { + Path metastoreDirectory = Files.createTempDirectory(ICEBERG_CATALOG); + closeAfterClass(() -> deleteRecursively(metastoreDirectory, ALLOW_INSECURE)); + + Map icebergProperties = ImmutableMap.builder() + .put("iceberg.metadata-cache.enabled", "true") + .put("hive.metastore.catalog.dir", metastoreDirectory.toUri().toString()) + .buildOrThrow(); + + DistributedQueryRunner queryRunner = IcebergQueryRunner.builder() + .setSchemaInitializer(SchemaInitializer.builder() + .withSchemaName(TEST_SCHEMA) + .build()) + .setIcebergProperties(icebergProperties) + .setWorkerCount(0) + .build(); + queryRunner.execute("CREATE SCHEMA IF NOT EXISTS " + TEST_SCHEMA); + return queryRunner; + } + + @Test + public void testCacheFileOperations() + { + assertUpdate("DROP TABLE IF EXISTS test_cache_file_operations"); + assertUpdate("CREATE TABLE test_cache_file_operations(key varchar, data varchar) with (partitioning=ARRAY['key'])"); + assertUpdate("INSERT INTO test_cache_file_operations VALUES ('p1', '1-abc')", 1); + assertUpdate("INSERT INTO test_cache_file_operations VALUES ('p2', '2-xyz')", 1); + assertFileSystemAccesses( + "SELECT * FROM test_cache_file_operations", + ImmutableMultiset.builder() + .addCopies(new CacheOperation("Input.readTail", DATA), 2) + .addCopies(new CacheOperation("FileSystemCache.cacheInput", DATA), 2) + .add(new CacheOperation("Input.readTail", METADATA_JSON)) + .add(new CacheOperation("InputFile.length", METADATA_JSON)) + .add(new CacheOperation("FileSystemCache.cacheStream", METADATA_JSON)) + .add(new CacheOperation("FileSystemCache.cacheLength", SNAPSHOT)) + .add(new CacheOperation("FileSystemCache.cacheStream", SNAPSHOT)) + .addCopies(new 
CacheOperation("Input.readTail", MANIFEST), 2) + .addCopies(new CacheOperation("FileSystemCache.cacheStream", MANIFEST), 2) + .build()); + + assertFileSystemAccesses( + "SELECT * FROM test_cache_file_operations", + ImmutableMultiset.builder() + .addCopies(new CacheOperation("FileSystemCache.cacheInput", DATA), 2) + .add(new CacheOperation("FileSystemCache.cacheStream", METADATA_JSON)) + .add(new CacheOperation("FileSystemCache.cacheLength", SNAPSHOT)) + .add(new CacheOperation("FileSystemCache.cacheStream", SNAPSHOT)) + .addCopies(new CacheOperation("FileSystemCache.cacheStream", MANIFEST), 2) + .build()); + + assertUpdate("INSERT INTO test_cache_file_operations VALUES ('p3', '3-xyz')", 1); + assertUpdate("INSERT INTO test_cache_file_operations VALUES ('p4', '4-xyz')", 1); + assertUpdate("INSERT INTO test_cache_file_operations VALUES ('p5', '5-xyz')", 1); + + assertFileSystemAccesses( + "SELECT * FROM test_cache_file_operations", + ImmutableMultiset.builder() + .addCopies(new CacheOperation("Input.readTail", DATA), 3) + .addCopies(new CacheOperation("FileSystemCache.cacheInput", DATA), 5) + .add(new CacheOperation("Input.readTail", METADATA_JSON)) + .add(new CacheOperation("InputFile.length", METADATA_JSON)) + .add(new CacheOperation("FileSystemCache.cacheStream", METADATA_JSON)) + .add(new CacheOperation("FileSystemCache.cacheLength", SNAPSHOT)) + .add(new CacheOperation("FileSystemCache.cacheStream", SNAPSHOT)) + .addCopies(new CacheOperation("Input.readTail", MANIFEST), 3) + .addCopies(new CacheOperation("FileSystemCache.cacheStream", MANIFEST), 5) + .build()); + + assertFileSystemAccesses( + "SELECT * FROM test_cache_file_operations", + ImmutableMultiset.builder() + .addCopies(new CacheOperation("FileSystemCache.cacheInput", DATA), 5) + .add(new CacheOperation("FileSystemCache.cacheStream", METADATA_JSON)) + .add(new CacheOperation("FileSystemCache.cacheLength", SNAPSHOT)) + .add(new CacheOperation("FileSystemCache.cacheStream", SNAPSHOT)) + .addCopies(new CacheOperation("FileSystemCache.cacheStream", MANIFEST), 5) + .build()); + } + + @Test + public void testSelectWithFilter() + { + assertUpdate("CREATE TABLE test_select_with_filter AS SELECT 1 col_name", 1); + assertFileSystemAccesses( + "SELECT * FROM test_select_with_filter WHERE col_name = 1", + ImmutableMultiset.builder() + .add(new CacheOperation("FileSystemCache.cacheStream", METADATA_JSON)) + .add(new CacheOperation("Input.readTail", METADATA_JSON)) + .add(new CacheOperation("InputFile.length", METADATA_JSON)) + .add(new CacheOperation("FileSystemCache.cacheStream", SNAPSHOT)) + .add(new CacheOperation("FileSystemCache.cacheLength", SNAPSHOT)) + .add(new CacheOperation("FileSystemCache.cacheStream", MANIFEST)) + .add(new CacheOperation("Input.readTail", MANIFEST)) + .add(new CacheOperation("FileSystemCache.cacheInput", DATA)) + .add(new CacheOperation("Input.readTail", DATA)) + .build()); + + assertFileSystemAccesses( + "SELECT * FROM test_select_with_filter WHERE col_name = 1", + ImmutableMultiset.builder() + .add(new CacheOperation("FileSystemCache.cacheStream", METADATA_JSON)) + .add(new CacheOperation("FileSystemCache.cacheStream", SNAPSHOT)) + .add(new CacheOperation("FileSystemCache.cacheLength", SNAPSHOT)) + .add(new CacheOperation("FileSystemCache.cacheStream", MANIFEST)) + .add(new CacheOperation("FileSystemCache.cacheInput", DATA)) + .build()); + } + + @Test + public void testJoin() + { + assertUpdate("CREATE TABLE test_join_t1 AS SELECT 2 AS age, 'id1' AS id", 1); + assertUpdate("CREATE TABLE test_join_t2 AS SELECT 
'name1' AS name, 'id1' AS id", 1); + + assertFileSystemAccesses("SELECT name, age FROM test_join_t1 JOIN test_join_t2 ON test_join_t2.id = test_join_t1.id", + ImmutableMultiset.builder() + .addCopies(new CacheOperation("Input.readTail", METADATA_JSON), 2) + .addCopies(new CacheOperation("InputFile.length", METADATA_JSON), 2) + .addCopies(new CacheOperation("FileSystemCache.cacheStream", METADATA_JSON), 2) + .addCopies(new CacheOperation("FileSystemCache.cacheStream", SNAPSHOT), 2) + .addCopies(new CacheOperation("FileSystemCache.cacheLength", SNAPSHOT), 2) + .addCopies(new CacheOperation("Input.readTail", MANIFEST), 2) + .addCopies(new CacheOperation("FileSystemCache.cacheStream", MANIFEST), 4) + .addCopies(new CacheOperation("Input.readTail", DATA), 2) + .addCopies(new CacheOperation("FileSystemCache.cacheInput", DATA), 2) + .build()); + + assertFileSystemAccesses("SELECT name, age FROM test_join_t1 JOIN test_join_t2 ON test_join_t2.id = test_join_t1.id", + ImmutableMultiset.builder() + .addCopies(new CacheOperation("FileSystemCache.cacheStream", METADATA_JSON), 2) + .addCopies(new CacheOperation("FileSystemCache.cacheStream", SNAPSHOT), 2) + .addCopies(new CacheOperation("FileSystemCache.cacheLength", SNAPSHOT), 2) + .addCopies(new CacheOperation("FileSystemCache.cacheStream", MANIFEST), 4) + .addCopies(new CacheOperation("FileSystemCache.cacheInput", DATA), 2) + .build()); + } + + private void assertFileSystemAccesses(@Language("SQL") String query, Multiset expectedCacheAccesses) + { + DistributedQueryRunner queryRunner = getDistributedQueryRunner(); + queryRunner.executeWithPlan(queryRunner.getDefaultSession(), query); + assertMultisetsEqual(expectedCacheAccesses, getCacheOperations()); + } + + private Multiset getCacheOperations() + { + return getQueryRunner().getSpans().stream() + .filter(span -> span.getName().startsWith("Input.") || span.getName().startsWith("InputFile.") || span.getName().startsWith("FileSystemCache.")) + .filter(span -> !span.getName().startsWith("InputFile.newInput")) + .filter(span -> !isTrinoSchemaOrPermissions(getFileLocation(span))) + .map(CacheOperation::create) + .collect(toCollection(HashMultiset::create)); + } + + private record CacheOperation(String operationName, FileType fileType) + { + public static CacheOperation create(SpanData span) + { + String path = getFileLocation(span); + return new CacheOperation(span.getName(), FileType.fromFilePath(path)); + } + } +} diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergMergeAppend.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergMergeAppend.java index 9c7435958f5a..763e6cf08391 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergMergeAppend.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergMergeAppend.java @@ -14,30 +14,28 @@ package io.trino.plugin.iceberg; import io.trino.filesystem.TrinoFileSystemFactory; -import io.trino.plugin.base.CatalogName; +import io.trino.metastore.HiveMetastore; +import io.trino.metastore.cache.CachingHiveMetastore; import io.trino.plugin.hive.TrinoViewHiveMetastore; -import io.trino.plugin.hive.metastore.HiveMetastore; -import io.trino.plugin.hive.metastore.cache.CachingHiveMetastore; import io.trino.plugin.iceberg.catalog.IcebergTableOperationsProvider; import io.trino.plugin.iceberg.catalog.TrinoCatalog; import io.trino.plugin.iceberg.catalog.file.FileMetastoreTableOperationsProvider; import io.trino.plugin.iceberg.catalog.hms.TrinoHiveCatalog; +import 
io.trino.spi.catalog.CatalogName; import io.trino.spi.connector.SchemaTableName; import io.trino.spi.type.TestingTypeManager; import io.trino.testing.AbstractTestQueryFramework; -import io.trino.testing.DistributedQueryRunner; import io.trino.testing.MaterializedResult; import io.trino.testing.QueryRunner; -import io.trino.testing.TestingConnectorSession; import org.apache.iceberg.Table; -import org.testng.annotations.Test; +import org.junit.jupiter.api.Test; -import java.io.File; - -import static io.trino.plugin.hive.metastore.cache.CachingHiveMetastore.memoizeMetastore; -import static io.trino.plugin.hive.metastore.file.TestingFileHiveMetastore.createTestingFileHiveMetastore; +import static com.google.common.util.concurrent.MoreExecutors.directExecutor; +import static io.trino.metastore.cache.CachingHiveMetastore.createPerTransactionCache; +import static io.trino.plugin.iceberg.IcebergTestUtils.FILE_IO_FACTORY; import static io.trino.plugin.iceberg.IcebergTestUtils.getFileSystemFactory; -import static org.testng.Assert.assertEquals; +import static io.trino.plugin.iceberg.IcebergTestUtils.getHiveMetastore; +import static org.assertj.core.api.Assertions.assertThat; public class TestIcebergMergeAppend extends AbstractTestQueryFramework @@ -49,22 +47,24 @@ public class TestIcebergMergeAppend protected QueryRunner createQueryRunner() throws Exception { - DistributedQueryRunner queryRunner = IcebergQueryRunner.createIcebergQueryRunner(); - File baseDir = queryRunner.getCoordinator().getBaseDataDir().resolve("iceberg_data").toFile(); - HiveMetastore metastore = createTestingFileHiveMetastore(baseDir); + QueryRunner queryRunner = IcebergQueryRunner.builder().build(); + HiveMetastore metastore = getHiveMetastore(queryRunner); + CachingHiveMetastore cachingHiveMetastore = createPerTransactionCache(metastore, 1000); TrinoFileSystemFactory fileSystemFactory = getFileSystemFactory(queryRunner); - tableOperationsProvider = new FileMetastoreTableOperationsProvider(fileSystemFactory); - CachingHiveMetastore cachingHiveMetastore = memoizeMetastore(metastore, 1000); + tableOperationsProvider = new FileMetastoreTableOperationsProvider(fileSystemFactory, FILE_IO_FACTORY); trinoCatalog = new TrinoHiveCatalog( new CatalogName("catalog"), cachingHiveMetastore, new TrinoViewHiveMetastore(cachingHiveMetastore, false, "trino-version", "test"), fileSystemFactory, + FILE_IO_FACTORY, new TestingTypeManager(), tableOperationsProvider, false, false, - false); + false, + new IcebergConfig().isHideMaterializedViewStorageTable(), + directExecutor()); return queryRunner; } @@ -73,16 +73,16 @@ protected QueryRunner createQueryRunner() public void testInsertWithAppend() { assertUpdate("CREATE TABLE table_to_insert (_bigint BIGINT, _varchar VARCHAR)"); - Table table = IcebergUtil.loadIcebergTable(trinoCatalog, tableOperationsProvider, TestingConnectorSession.SESSION, + Table table = IcebergUtil.loadIcebergTable(trinoCatalog, tableOperationsProvider, IcebergTestUtils.SESSION, new SchemaTableName("tpch", "table_to_insert")); table.updateProperties() .set("commit.manifest.min-count-to-merge", "2") .commit(); assertUpdate("INSERT INTO table_to_insert VALUES (1, 'a'), (2, 'b'), (3, 'c')", 3); MaterializedResult result = computeActual("select * from \"table_to_insert$manifests\""); - assertEquals(result.getRowCount(), 1); + assertThat(result.getRowCount()).isEqualTo(1); assertUpdate("INSERT INTO table_to_insert VALUES (4, 'd')", 1); result = computeActual("select * from \"table_to_insert$manifests\""); - 
assertEquals(result.getRowCount(), 1); + assertThat(result.getRowCount()).isEqualTo(1); } } diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergMetadataListing.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergMetadataListing.java index 5ca8525d54d3..b1d8ae8afce6 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergMetadataListing.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergMetadataListing.java @@ -15,39 +15,36 @@ import com.google.common.collect.ImmutableMap; import io.trino.Session; -import io.trino.metadata.MaterializedViewDefinition; -import io.trino.metadata.QualifiedObjectName; +import io.trino.metastore.HiveMetastore; +import io.trino.metastore.HiveMetastoreFactory; import io.trino.plugin.hive.TestingHivePlugin; -import io.trino.plugin.hive.metastore.file.FileHiveMetastore; -import io.trino.plugin.iceberg.catalog.file.TestingIcebergFileMetastoreCatalogModule; -import io.trino.spi.connector.SchemaTableName; import io.trino.spi.security.Identity; import io.trino.spi.security.SelectedRole; import io.trino.testing.AbstractTestQueryFramework; import io.trino.testing.DistributedQueryRunner; -import io.trino.transaction.TransactionId; -import io.trino.transaction.TransactionManager; -import org.testng.annotations.AfterClass; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; +import io.trino.testing.QueryRunner; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.TestInstance; import java.io.File; import java.util.Optional; -import static com.google.inject.util.Modules.EMPTY_MODULE; -import static io.trino.plugin.hive.metastore.file.TestingFileHiveMetastore.createTestingFileHiveMetastore; +import static io.trino.plugin.hive.TestingHiveUtils.getConnectorService; import static io.trino.spi.security.SelectedRole.Type.ROLE; import static io.trino.testing.TestingSession.testSessionBuilder; import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.jupiter.api.TestInstance.Lifecycle.PER_CLASS; +@TestInstance(PER_CLASS) public class TestIcebergMetadataListing extends AbstractTestQueryFramework { - private FileHiveMetastore metastore; - private SchemaTableName storageTable; + private HiveMetastore metastore; @Override - protected DistributedQueryRunner createQueryRunner() + protected QueryRunner createQueryRunner() throws Exception { Session session = testSessionBuilder() @@ -55,21 +52,22 @@ protected DistributedQueryRunner createQueryRunner() .withConnectorRole("hive", new SelectedRole(ROLE, Optional.of("admin"))) .build()) .build(); - DistributedQueryRunner queryRunner = DistributedQueryRunner.builder(session).build(); + QueryRunner queryRunner = DistributedQueryRunner.builder(session).build(); File baseDir = queryRunner.getCoordinator().getBaseDataDir().resolve("iceberg_data").toFile(); - metastore = createTestingFileHiveMetastore(baseDir); - - queryRunner.installPlugin(new TestingIcebergPlugin(Optional.of(new TestingIcebergFileMetastoreCatalogModule(metastore)), Optional.empty(), EMPTY_MODULE)); + queryRunner.installPlugin(new TestingIcebergPlugin(baseDir.toPath())); queryRunner.createCatalog("iceberg", "iceberg"); - queryRunner.installPlugin(new TestingHivePlugin(metastore)); + queryRunner.installPlugin(new TestingHivePlugin(baseDir.toPath())); queryRunner.createCatalog("hive", "hive", ImmutableMap.of("hive.security", 
"sql-standard")); + metastore = getConnectorService(queryRunner, HiveMetastoreFactory.class) + .createMetastore(Optional.empty()); + return queryRunner; } - @BeforeClass + @BeforeAll public void setUp() { assertQuerySucceeds("CREATE SCHEMA hive.test_schema"); @@ -77,14 +75,13 @@ public void setUp() assertQuerySucceeds("CREATE TABLE iceberg.test_schema.iceberg_table2 (_double DOUBLE) WITH (partitioning = ARRAY['_double'])"); assertQuerySucceeds("CREATE MATERIALIZED VIEW iceberg.test_schema.iceberg_materialized_view AS " + "SELECT * FROM iceberg.test_schema.iceberg_table1"); - storageTable = getStorageTable("iceberg", "test_schema", "iceberg_materialized_view"); assertQuerySucceeds("CREATE VIEW iceberg.test_schema.iceberg_view AS SELECT * FROM iceberg.test_schema.iceberg_table1"); assertQuerySucceeds("CREATE TABLE hive.test_schema.hive_table (_double DOUBLE)"); assertQuerySucceeds("CREATE VIEW hive.test_schema.hive_view AS SELECT * FROM hive.test_schema.hive_table"); } - @AfterClass(alwaysRun = true) + @AfterAll public void tearDown() { assertQuerySucceeds("DROP TABLE IF EXISTS hive.test_schema.hive_table"); @@ -99,12 +96,12 @@ public void tearDown() @Test public void testTableListing() { - assertThat(metastore.getAllTables("test_schema")) + assertThat(metastore.getTables("test_schema")) + .extracting(table -> table.tableName().getTableName()) .containsExactlyInAnyOrder( "iceberg_table1", "iceberg_table2", "iceberg_materialized_view", - storageTable.getTableName(), "iceberg_view", "hive_table", "hive_view"); @@ -115,7 +112,6 @@ public void testTableListing() "'iceberg_table1', " + "'iceberg_table2', " + "'iceberg_materialized_view', " + - "'" + storageTable.getTableName() + "', " + "'iceberg_view', " + "'hive_table', " + "'hive_view'"); @@ -133,8 +129,6 @@ public void testTableColumnListing() "('iceberg_table2', '_double'), " + "('iceberg_materialized_view', '_string'), " + "('iceberg_materialized_view', '_integer'), " + - "('" + storageTable.getTableName() + "', '_string'), " + - "('" + storageTable.getTableName() + "', '_integer'), " + "('iceberg_view', '_string'), " + "('iceberg_view', '_integer'), " + "('hive_view', '_double')"); @@ -152,15 +146,4 @@ public void testTableValidation() assertQuerySucceeds("SELECT * FROM iceberg.test_schema.iceberg_table1"); assertQueryFails("SELECT * FROM iceberg.test_schema.hive_table", "Not an Iceberg table: test_schema.hive_table"); } - - private SchemaTableName getStorageTable(String catalogName, String schemaName, String objectName) - { - TransactionManager transactionManager = getQueryRunner().getTransactionManager(); - TransactionId transactionId = transactionManager.beginTransaction(false); - Session session = getSession().beginTransactionId(transactionId, transactionManager, getQueryRunner().getAccessControl()); - Optional materializedView = getQueryRunner().getMetadata() - .getMaterializedView(session, new QualifiedObjectName(catalogName, schemaName, objectName)); - assertThat(materializedView).isPresent(); - return materializedView.get().getStorageTable().get().getSchemaTableName(); - } } diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergMetastoreAccessOperations.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergMetastoreAccessOperations.java index 0706f08e4e7b..c9ffe5d571bf 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergMetastoreAccessOperations.java +++ 
b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergMetastoreAccessOperations.java @@ -16,67 +16,55 @@ import com.google.common.collect.ImmutableMultiset; import com.google.common.collect.Multiset; import io.trino.Session; -import io.trino.plugin.hive.metastore.CountingAccessHiveMetastore; -import io.trino.plugin.hive.metastore.CountingAccessHiveMetastoreUtil; -import io.trino.plugin.iceberg.catalog.file.TestingIcebergFileMetastoreCatalogModule; +import io.trino.plugin.hive.metastore.MetastoreMethod; import io.trino.testing.AbstractTestQueryFramework; -import io.trino.testing.DistributedQueryRunner; +import io.trino.testing.QueryRunner; import org.intellij.lang.annotations.Language; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.parallel.Execution; +import org.junit.jupiter.api.parallel.ExecutionMode; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.MethodSource; -import java.io.File; import java.util.Optional; -import static com.google.inject.util.Modules.EMPTY_MODULE; -import static io.trino.plugin.hive.metastore.CountingAccessHiveMetastore.Method.CREATE_TABLE; -import static io.trino.plugin.hive.metastore.CountingAccessHiveMetastore.Method.DROP_TABLE; -import static io.trino.plugin.hive.metastore.CountingAccessHiveMetastore.Method.GET_ALL_TABLES_FROM_DATABASE; -import static io.trino.plugin.hive.metastore.CountingAccessHiveMetastore.Method.GET_DATABASE; -import static io.trino.plugin.hive.metastore.CountingAccessHiveMetastore.Method.GET_TABLE; -import static io.trino.plugin.hive.metastore.CountingAccessHiveMetastore.Method.GET_TABLE_WITH_PARAMETER; -import static io.trino.plugin.hive.metastore.CountingAccessHiveMetastore.Method.REPLACE_TABLE; -import static io.trino.plugin.hive.metastore.file.TestingFileHiveMetastore.createTestingFileHiveMetastore; +import static io.trino.plugin.hive.metastore.MetastoreInvocations.assertMetastoreInvocationsForQuery; +import static io.trino.plugin.hive.metastore.MetastoreMethod.CREATE_TABLE; +import static io.trino.plugin.hive.metastore.MetastoreMethod.DROP_TABLE; +import static io.trino.plugin.hive.metastore.MetastoreMethod.GET_DATABASE; +import static io.trino.plugin.hive.metastore.MetastoreMethod.GET_TABLE; +import static io.trino.plugin.hive.metastore.MetastoreMethod.GET_TABLES; +import static io.trino.plugin.hive.metastore.MetastoreMethod.REPLACE_TABLE; import static io.trino.plugin.iceberg.IcebergSessionProperties.COLLECT_EXTENDED_STATISTICS_ON_WRITE; +import static io.trino.plugin.iceberg.TableType.ALL_ENTRIES; +import static io.trino.plugin.iceberg.TableType.ALL_MANIFESTS; import static io.trino.plugin.iceberg.TableType.DATA; +import static io.trino.plugin.iceberg.TableType.ENTRIES; import static io.trino.plugin.iceberg.TableType.FILES; import static io.trino.plugin.iceberg.TableType.HISTORY; import static io.trino.plugin.iceberg.TableType.MANIFESTS; +import static io.trino.plugin.iceberg.TableType.MATERIALIZED_VIEW_STORAGE; +import static io.trino.plugin.iceberg.TableType.METADATA_LOG_ENTRIES; import static io.trino.plugin.iceberg.TableType.PARTITIONS; import static io.trino.plugin.iceberg.TableType.PROPERTIES; import static io.trino.plugin.iceberg.TableType.REFS; import static io.trino.plugin.iceberg.TableType.SNAPSHOTS; import static io.trino.testing.TestingNames.randomNameSuffix; -import static io.trino.testing.TestingSession.testSessionBuilder; import static 
org.assertj.core.api.Assertions.assertThat; -@Test(singleThreaded = true) // metastore invocation counters shares mutable state so can't be run from many threads simultaneously +@Execution(ExecutionMode.SAME_THREAD) // metastore invocation counters shares mutable state so can't be run from many threads simultaneously public class TestIcebergMetastoreAccessOperations extends AbstractTestQueryFramework { private static final int MAX_PREFIXES_COUNT = 10; - private static final Session TEST_SESSION = testSessionBuilder() - .setCatalog("iceberg") - .setSchema("test_schema") - .build(); - - private CountingAccessHiveMetastore metastore; @Override - protected DistributedQueryRunner createQueryRunner() + protected QueryRunner createQueryRunner() throws Exception { - DistributedQueryRunner queryRunner = DistributedQueryRunner.builder(TEST_SESSION) + return IcebergQueryRunner.builder() .addCoordinatorProperty("optimizer.experimental-max-prefetched-information-schema-prefixes", Integer.toString(MAX_PREFIXES_COUNT)) .build(); - - File baseDir = queryRunner.getCoordinator().getBaseDataDir().resolve("iceberg_data").toFile(); - metastore = new CountingAccessHiveMetastore(createTestingFileHiveMetastore(baseDir)); - queryRunner.installPlugin(new TestingIcebergPlugin(Optional.of(new TestingIcebergFileMetastoreCatalogModule(metastore)), Optional.empty(), EMPTY_MODULE)); - queryRunner.createCatalog("iceberg", "iceberg"); - - queryRunner.execute("CREATE SCHEMA test_schema"); - return queryRunner; } @Test @@ -89,7 +77,7 @@ public void testUse() .setSchema(Optional.empty()) .build(); assertMetastoreInvocations(session, "USE %s.%s".formatted(catalog, schema), - ImmutableMultiset.builder() + ImmutableMultiset.builder() .add(GET_DATABASE) .build()); } @@ -98,11 +86,28 @@ public void testUse() public void testCreateTable() { assertMetastoreInvocations("CREATE TABLE test_create (id VARCHAR, age INT)", - ImmutableMultiset.builder() + ImmutableMultiset.builder() + .add(CREATE_TABLE) + .add(GET_DATABASE) + .add(GET_TABLE) + .build()); + } + + @Test + public void testCreateOrReplaceTable() + { + assertMetastoreInvocations("CREATE OR REPLACE TABLE test_create_or_replace (id VARCHAR, age INT)", + ImmutableMultiset.builder() .add(CREATE_TABLE) .add(GET_DATABASE) .add(GET_TABLE) .build()); + assertMetastoreInvocations("CREATE OR REPLACE TABLE test_create_or_replace (id VARCHAR, age INT)", + ImmutableMultiset.builder() + .add(GET_DATABASE) + .add(REPLACE_TABLE) + .add(GET_TABLE) + .build()); } @Test @@ -111,7 +116,7 @@ public void testCreateTableAsSelect() assertMetastoreInvocations( withStatsOnWrite(getSession(), false), "CREATE TABLE test_ctas AS SELECT 1 AS age", - ImmutableMultiset.builder() + ImmutableMultiset.builder() .add(GET_DATABASE) .add(CREATE_TABLE) .add(GET_TABLE) @@ -120,7 +125,7 @@ public void testCreateTableAsSelect() assertMetastoreInvocations( withStatsOnWrite(getSession(), true), "CREATE TABLE test_ctas_with_stats AS SELECT 1 AS age", - ImmutableMultiset.builder() + ImmutableMultiset.builder() .add(GET_DATABASE) .add(CREATE_TABLE) .addCopies(GET_TABLE, 4) @@ -128,13 +133,34 @@ public void testCreateTableAsSelect() .build()); } + @Test + public void testCreateOrReplaceTableAsSelect() + { + assertMetastoreInvocations( + "CREATE OR REPLACE TABLE test_cortas AS SELECT 1 AS age", + ImmutableMultiset.builder() + .add(GET_DATABASE) + .add(CREATE_TABLE) + .addCopies(GET_TABLE, 4) + .add(REPLACE_TABLE) + .build()); + + assertMetastoreInvocations( + "CREATE OR REPLACE TABLE test_cortas AS SELECT 1 AS age", + 
ImmutableMultiset.builder() + .add(GET_DATABASE) + .addCopies(GET_TABLE, 3) + .addCopies(REPLACE_TABLE, 2) + .build()); + } + @Test public void testSelect() { assertUpdate("CREATE TABLE test_select_from (id VARCHAR, age INT)"); assertMetastoreInvocations("SELECT * FROM test_select_from", - ImmutableMultiset.builder() + ImmutableMultiset.builder() .add(GET_TABLE) .build()); } @@ -145,7 +171,7 @@ public void testSelectWithFilter() assertUpdate("CREATE TABLE test_select_from_where AS SELECT 2 as age", 1); assertMetastoreInvocations("SELECT * FROM test_select_from_where WHERE age = 2", - ImmutableMultiset.builder() + ImmutableMultiset.builder() .add(GET_TABLE) .build()); } @@ -157,7 +183,7 @@ public void testSelectFromView() assertUpdate("CREATE VIEW test_select_view_view AS SELECT id, age FROM test_select_view_table"); assertMetastoreInvocations("SELECT * FROM test_select_view_view", - ImmutableMultiset.builder() + ImmutableMultiset.builder() .addCopies(GET_TABLE, 2) .build()); } @@ -169,7 +195,7 @@ public void testSelectFromViewWithFilter() assertUpdate("CREATE VIEW test_select_view_where_view AS SELECT age FROM test_select_view_where_table"); assertMetastoreInvocations("SELECT * FROM test_select_view_where_view WHERE age = 2", - ImmutableMultiset.builder() + ImmutableMultiset.builder() .addCopies(GET_TABLE, 2) .build()); } @@ -181,8 +207,8 @@ public void testSelectFromMaterializedView() assertUpdate("CREATE MATERIALIZED VIEW test_select_mview_view AS SELECT id, age FROM test_select_mview_table"); assertMetastoreInvocations("SELECT * FROM test_select_mview_view", - ImmutableMultiset.builder() - .addCopies(GET_TABLE, 3) + ImmutableMultiset.builder() + .addCopies(GET_TABLE, 2) .build()); } @@ -193,8 +219,8 @@ public void testSelectFromMaterializedViewWithFilter() assertUpdate("CREATE MATERIALIZED VIEW test_select_mview_where_view AS SELECT age FROM test_select_mview_where_table"); assertMetastoreInvocations("SELECT * FROM test_select_mview_where_view WHERE age = 2", - ImmutableMultiset.builder() - .addCopies(GET_TABLE, 3) + ImmutableMultiset.builder() + .addCopies(GET_TABLE, 2) .build()); } @@ -205,8 +231,8 @@ public void testRefreshMaterializedView() assertUpdate("CREATE MATERIALIZED VIEW test_refresh_mview_view AS SELECT id, age FROM test_refresh_mview_table"); assertMetastoreInvocations("REFRESH MATERIALIZED VIEW test_refresh_mview_view", - ImmutableMultiset.builder() - .addCopies(GET_TABLE, 6) + ImmutableMultiset.builder() + .addCopies(GET_TABLE, 2) .addCopies(REPLACE_TABLE, 1) .build()); } @@ -218,7 +244,7 @@ public void testJoin() assertUpdate("CREATE TABLE test_join_t2 AS SELECT 'name1' as name, 'id1' AS id", 1); assertMetastoreInvocations("SELECT name, age FROM test_join_t1 JOIN test_join_t2 ON test_join_t2.id = test_join_t1.id", - ImmutableMultiset.builder() + ImmutableMultiset.builder() .addCopies(GET_TABLE, 2) .build()); } @@ -229,7 +255,7 @@ public void testSelfJoin() assertUpdate("CREATE TABLE test_self_join_table AS SELECT 2 as age, 0 parent, 3 AS id", 1); assertMetastoreInvocations("SELECT child.age, parent.age FROM test_self_join_table child JOIN test_self_join_table parent ON child.parent = parent.id", - ImmutableMultiset.builder() + ImmutableMultiset.builder() .add(GET_TABLE) .build()); } @@ -240,7 +266,7 @@ public void testExplainSelect() assertUpdate("CREATE TABLE test_explain AS SELECT 2 as age", 1); assertMetastoreInvocations("EXPLAIN SELECT * FROM test_explain", - ImmutableMultiset.builder() + ImmutableMultiset.builder() .add(GET_TABLE) .build()); } @@ -251,7 +277,7 @@ 
public void testShowStatsForTable() assertUpdate("CREATE TABLE test_show_stats AS SELECT 2 as age", 1); assertMetastoreInvocations("SHOW STATS FOR test_show_stats", - ImmutableMultiset.builder() + ImmutableMultiset.builder() .add(GET_TABLE) .build()); } @@ -262,7 +288,7 @@ public void testShowStatsForTableWithFilter() assertUpdate("CREATE TABLE test_show_stats_with_filter AS SELECT 2 as age", 1); assertMetastoreInvocations("SHOW STATS FOR (SELECT * FROM test_show_stats_with_filter where age >= 2)", - ImmutableMultiset.builder() + ImmutableMultiset.builder() .add(GET_TABLE) .build()); } @@ -274,43 +300,70 @@ public void testSelectSystemTable() // select from $history assertMetastoreInvocations("SELECT * FROM \"test_select_snapshots$history\"", - ImmutableMultiset.builder() + ImmutableMultiset.builder() .addCopies(GET_TABLE, 1) .build()); + // select from $metadata_log_entries + assertMetastoreInvocations("SELECT * FROM \"test_select_snapshots$metadata_log_entries\"", + ImmutableMultiset.builder() + .add(GET_TABLE) + .build()); + // select from $snapshots assertMetastoreInvocations("SELECT * FROM \"test_select_snapshots$snapshots\"", - ImmutableMultiset.builder() + ImmutableMultiset.builder() + .addCopies(GET_TABLE, 1) + .build()); + + // select from $all_manifests + assertMetastoreInvocations("SELECT * FROM \"test_select_snapshots$all_manifests\"", + ImmutableMultiset.builder() .addCopies(GET_TABLE, 1) .build()); // select from $manifests assertMetastoreInvocations("SELECT * FROM \"test_select_snapshots$manifests\"", - ImmutableMultiset.builder() + ImmutableMultiset.builder() .addCopies(GET_TABLE, 1) .build()); // select from $partitions assertMetastoreInvocations("SELECT * FROM \"test_select_snapshots$partitions\"", - ImmutableMultiset.builder() + ImmutableMultiset.builder() .addCopies(GET_TABLE, 1) .build()); // select from $files assertMetastoreInvocations("SELECT * FROM \"test_select_snapshots$files\"", - ImmutableMultiset.builder() + ImmutableMultiset.builder() + .addCopies(GET_TABLE, 1) + .build()); + + // select from $all_entries + assertMetastoreInvocations("SELECT * FROM \"test_select_snapshots$all_entries\"", + ImmutableMultiset.builder() + .addCopies(GET_TABLE, 1) + .build()); + + // select from $entries + assertMetastoreInvocations("SELECT * FROM \"test_select_snapshots$entries\"", + ImmutableMultiset.builder() .addCopies(GET_TABLE, 1) .build()); // select from $properties assertMetastoreInvocations("SELECT * FROM \"test_select_snapshots$properties\"", - ImmutableMultiset.builder() + ImmutableMultiset.builder() .addCopies(GET_TABLE, 1) .build()); + assertQueryFails("SELECT * FROM \"test_select_snapshots$materialized_view_storage\"", + "Table 'tpch.test_select_snapshots\\$materialized_view_storage' not found"); + // This test should get updated if a new system table is added. 
assertThat(TableType.values()) - .containsExactly(DATA, HISTORY, SNAPSHOTS, MANIFESTS, PARTITIONS, FILES, PROPERTIES, REFS); + .containsExactly(DATA, HISTORY, METADATA_LOG_ENTRIES, SNAPSHOTS, ALL_MANIFESTS, MANIFESTS, PARTITIONS, FILES, ALL_ENTRIES, ENTRIES, PROPERTIES, REFS, MATERIALIZED_VIEW_STORAGE); } @Test @@ -319,14 +372,15 @@ public void testUnregisterTable() assertUpdate("CREATE TABLE test_unregister_table AS SELECT 2 as age", 1); assertMetastoreInvocations("CALL system.unregister_table(CURRENT_SCHEMA, 'test_unregister_table')", - ImmutableMultiset.builder() + ImmutableMultiset.builder() .add(GET_DATABASE) .add(GET_TABLE) .add(DROP_TABLE) .build()); } - @Test(dataProvider = "metadataQueriesTestTableCountDataProvider") + @ParameterizedTest + @MethodSource("metadataQueriesTestTableCountDataProvider") public void testInformationSchemaColumns(int tables) { String schemaName = "test_i_s_columns_schema" + randomNameSuffix(); @@ -344,11 +398,24 @@ public void testInformationSchemaColumns(int tables) assertUpdate(session, "CREATE TABLE test_other_select_i_s_columns" + i + "(id varchar, age integer)"); // won't match the filter } + // Bulk retrieval assertMetastoreInvocations(session, "SELECT * FROM information_schema.columns WHERE table_schema = CURRENT_SCHEMA AND table_name LIKE 'test_select_i_s_columns%'", - ImmutableMultiset.builder() - .add(GET_ALL_TABLES_FROM_DATABASE) + ImmutableMultiset.builder() + .add(GET_TABLES) .addCopies(GET_TABLE, tables * 2) - .addCopies(GET_TABLE_WITH_PARAMETER, 2) + .build()); + + // Pointed lookup + assertMetastoreInvocations(session, "SELECT * FROM information_schema.columns WHERE table_schema = CURRENT_SCHEMA AND table_name = 'test_select_i_s_columns0'", + ImmutableMultiset.builder() + .add(GET_TABLE) + .build()); + + // Pointed lookup via DESCRIBE (which does some additional things before delegating to information_schema.columns) + assertMetastoreInvocations(session, "DESCRIBE test_select_i_s_columns0", + ImmutableMultiset.builder() + .add(GET_DATABASE) + .add(GET_TABLE) .build()); for (int i = 0; i < tables; i++) { @@ -357,7 +424,8 @@ public void testInformationSchemaColumns(int tables) } } - @Test(dataProvider = "metadataQueriesTestTableCountDataProvider") + @ParameterizedTest + @MethodSource("metadataQueriesTestTableCountDataProvider") public void testSystemMetadataTableComments(int tables) { String schemaName = "test_s_m_table_comments" + randomNameSuffix(); @@ -377,15 +445,21 @@ public void testSystemMetadataTableComments(int tables) // Bulk retrieval assertMetastoreInvocations(session, "SELECT * FROM system.metadata.table_comments WHERE schema_name = CURRENT_SCHEMA AND table_name LIKE 'test_select_s_m_t_comments%'", - ImmutableMultiset.builder() - .add(GET_ALL_TABLES_FROM_DATABASE) + ImmutableMultiset.builder() + .add(GET_TABLES) + .addCopies(GET_TABLE, tables * 2) + .build()); + + // Bulk retrieval for two schemas + assertMetastoreInvocations(session, "SELECT * FROM system.metadata.table_comments WHERE schema_name IN (CURRENT_SCHEMA, 'non_existent') AND table_name LIKE 'test_select_s_m_t_comments%'", + ImmutableMultiset.builder() + .addCopies(GET_TABLES, 2) .addCopies(GET_TABLE, tables * 2) - .addCopies(GET_TABLE_WITH_PARAMETER, 2) .build()); // Pointed lookup assertMetastoreInvocations(session, "SELECT * FROM system.metadata.table_comments WHERE schema_name = CURRENT_SCHEMA AND table_name = 'test_select_s_m_t_comments0'", - ImmutableMultiset.builder() + ImmutableMultiset.builder() .addCopies(GET_TABLE, 1) .build()); @@ -395,7 +469,6 @@ public 
void testSystemMetadataTableComments(int tables) } } - @DataProvider public Object[][] metadataQueriesTestTableCountDataProvider() { return new Object[][] { @@ -405,14 +478,77 @@ public Object[][] metadataQueriesTestTableCountDataProvider() }; } - private void assertMetastoreInvocations(@Language("SQL") String query, Multiset expectedInvocations) + @Test + public void testSystemMetadataMaterializedViews() + { + String schemaName = "test_materialized_views_" + randomNameSuffix(); + assertUpdate("CREATE SCHEMA " + schemaName); + Session session = Session.builder(getSession()) + .setSchema(schemaName) + .build(); + + assertUpdate(session, "CREATE TABLE test_table1 AS SELECT 1 a", 1); + assertUpdate(session, "CREATE TABLE test_table2 AS SELECT 1 a", 1); + + assertUpdate(session, "CREATE MATERIALIZED VIEW mv1 AS SELECT * FROM test_table1 JOIN test_table2 USING (a)"); + assertUpdate(session, "REFRESH MATERIALIZED VIEW mv1", 1); + + assertUpdate(session, "CREATE MATERIALIZED VIEW mv2 AS SELECT count(*) c FROM test_table1 JOIN test_table2 USING (a)"); + assertUpdate(session, "REFRESH MATERIALIZED VIEW mv2", 1); + + // Bulk retrieval + assertMetastoreInvocations(session, "SELECT * FROM system.metadata.materialized_views WHERE schema_name = CURRENT_SCHEMA", + ImmutableMultiset.builder() + .add(GET_TABLES) + .addCopies(GET_TABLE, 4) + .build()); + + // Bulk retrieval without selecting freshness + assertMetastoreInvocations(session, "SELECT schema_name, name FROM system.metadata.materialized_views WHERE schema_name = CURRENT_SCHEMA", + ImmutableMultiset.builder() + .add(GET_TABLES) + .build()); + + // Bulk retrieval for two schemas + assertMetastoreInvocations(session, "SELECT * FROM system.metadata.materialized_views WHERE schema_name IN (CURRENT_SCHEMA, 'non_existent')", + ImmutableMultiset.builder() + .addCopies(GET_TABLES, 2) + .addCopies(GET_TABLE, 4) + .build()); + + // Pointed lookup + assertMetastoreInvocations(session, "SELECT * FROM system.metadata.materialized_views WHERE schema_name = CURRENT_SCHEMA AND name = 'mv1'", + ImmutableMultiset.builder() + .addCopies(GET_TABLE, 3) + .build()); + + // Pointed lookup without selecting freshness + assertMetastoreInvocations(session, "SELECT schema_name, name FROM system.metadata.materialized_views WHERE schema_name = CURRENT_SCHEMA AND name = 'mv1'", + ImmutableMultiset.builder() + .add(GET_TABLE) + .build()); + + assertUpdate("DROP SCHEMA " + schemaName + " CASCADE"); + } + + @Test + public void testShowTables() + { + assertMetastoreInvocations("SHOW TABLES", + ImmutableMultiset.builder() + .add(GET_DATABASE) + .add(GET_TABLES) + .build()); + } + + private void assertMetastoreInvocations(@Language("SQL") String query, Multiset expectedInvocations) { assertMetastoreInvocations(getSession(), query, expectedInvocations); } - private void assertMetastoreInvocations(Session session, @Language("SQL") String query, Multiset expectedInvocations) + private void assertMetastoreInvocations(Session session, @Language("SQL") String query, Multiset expectedInvocations) { - CountingAccessHiveMetastoreUtil.assertMetastoreInvocations(metastore, getQueryRunner(), session, query, expectedInvocations); + assertMetastoreInvocationsForQuery(getDistributedQueryRunner(), session, query, expectedInvocations); } private static Session withStatsOnWrite(Session session, boolean enabled) diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergMinioAvroConnectorSmokeTest.java 
b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergMinioAvroConnectorSmokeTest.java index dcfbdfd2e7ea..98fb44016c7f 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergMinioAvroConnectorSmokeTest.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergMinioAvroConnectorSmokeTest.java @@ -14,9 +14,10 @@ package io.trino.plugin.iceberg; import io.trino.filesystem.Location; -import org.testng.SkipException; +import org.junit.jupiter.api.Test; import static org.apache.iceberg.FileFormat.AVRO; +import static org.junit.jupiter.api.Assumptions.abort; public class TestIcebergMinioAvroConnectorSmokeTest extends BaseIcebergMinioConnectorSmokeTest @@ -26,16 +27,18 @@ public TestIcebergMinioAvroConnectorSmokeTest() super(AVRO); } + @Test @Override public void testSortedNationTable() { - throw new SkipException("Avro does not support file sorting"); + abort("Avro does not support file sorting"); } + @Test @Override public void testFileSortingWithLargerTable() { - throw new SkipException("Avro does not support file sorting"); + abort("Avro does not support file sorting"); } @Override diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergMinioOrcConnectorTest.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergMinioOrcConnectorTest.java index daca5365c03b..7f6a39290266 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergMinioOrcConnectorTest.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergMinioOrcConnectorTest.java @@ -14,18 +14,18 @@ package io.trino.plugin.iceberg; import com.google.common.collect.ImmutableMap; +import com.google.common.io.Resources; import io.trino.Session; import io.trino.filesystem.Location; import io.trino.testing.QueryRunner; import io.trino.testing.containers.Minio; import io.trino.testing.sql.TestTable; -import org.testng.annotations.Test; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.parallel.Execution; -import java.io.File; -import java.io.OutputStream; -import java.nio.file.Files; import java.util.List; import java.util.Map; +import java.util.Optional; import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.io.Resources.getResource; @@ -33,14 +33,16 @@ import static io.trino.plugin.iceberg.IcebergTestUtils.checkOrcFileSorting; import static io.trino.testing.TestingNames.randomNameSuffix; import static io.trino.testing.containers.Minio.MINIO_ACCESS_KEY; +import static io.trino.testing.containers.Minio.MINIO_REGION; import static io.trino.testing.containers.Minio.MINIO_SECRET_KEY; import static java.lang.String.format; import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD; /** * Iceberg connector test ORC and with S3-compatible storage (but without real metastore). 
*/ +@Execution(SAME_THREAD) public class TestIcebergMinioOrcConnectorTest extends BaseIcebergConnectorTest { @@ -63,13 +65,17 @@ protected QueryRunner createQueryRunner() .setIcebergProperties( ImmutableMap.builder() .put("iceberg.file-format", format.name()) - .put("hive.s3.aws-access-key", MINIO_ACCESS_KEY) - .put("hive.s3.aws-secret-key", MINIO_SECRET_KEY) - .put("hive.s3.endpoint", minio.getMinioAddress()) - .put("hive.s3.path-style-access", "true") - .put("hive.s3.streaming.part-size", "5MB") // minimize memory usage - .put("hive.s3.max-connections", "8") // verify no leaks + .put("fs.hadoop.enabled", "true") + .put("fs.native-s3.enabled", "true") + .put("s3.aws-access-key", MINIO_ACCESS_KEY) + .put("s3.aws-secret-key", MINIO_SECRET_KEY) + .put("s3.region", MINIO_REGION) + .put("s3.endpoint", minio.getMinioAddress()) + .put("s3.path-style-access", "true") + .put("s3.streaming.part-size", "5MB") // minimize memory usage + .put("s3.max-connections", "8") // verify no leaks .put("iceberg.register-table-procedure.enabled", "true") + .put("iceberg.allowed-extra-properties", "extra.property.one,extra.property.two,extra.property.three") // Allows testing the sorting writer flushing to the file system with smaller tables .put("iceberg.writer-sort-buffer-size", "1MB") .buildOrThrow()) @@ -101,6 +107,13 @@ protected boolean isFileSorted(String path, String sortColumnName) return checkOrcFileSorting(fileSystem, Location.of(path), sortColumnName); } + @Override + protected boolean supportsPhysicalPushdown() + { + // TODO https://github.com/trinodb/trino/issues/17156 + return false; + } + @Test public void testTinyintType() throws Exception @@ -119,11 +132,10 @@ private void testReadSingleIntegerColumnOrcFile(String orcFileResourceName, int throws Exception { checkArgument(expectedValue != 0); - try (TestTable table = new TestTable(getQueryRunner()::execute, "test_read_as_integer", "(\"_col0\") AS VALUES 0, NULL")) { + try (TestTable table = newTrinoTable("test_read_as_integer", "(\"_col0\") AS VALUES 0, NULL")) { String orcFilePath = (String) computeScalar(format("SELECT DISTINCT file_path FROM \"%s$files\"", table.getName())); - try (OutputStream outputStream = fileSystem.newOutputFile(Location.of(orcFilePath)).createOrOverwrite()) { - Files.copy(new File(getResource(orcFileResourceName).toURI()).toPath(), outputStream); - } + byte[] orcFileData = Resources.toByteArray(getResource(orcFileResourceName)); + fileSystem.newOutputFile(Location.of(orcFilePath)).createOrOverwrite(orcFileData); fileSystem.deleteFiles(List.of(Location.of(orcFilePath.replaceAll("/([^/]*)$", ".$1.crc")))); Session ignoreFileSizeFromMetadata = Session.builder(getSession()) @@ -139,7 +151,7 @@ private void testReadSingleIntegerColumnOrcFile(String orcFileResourceName, int public void testTimeType() { // Regression test for https://github.com/trinodb/trino/issues/15603 - try (TestTable table = new TestTable(getQueryRunner()::execute, "test_time", "(col time(6))")) { + try (TestTable table = newTrinoTable("test_time", "(col time(6))")) { assertUpdate("INSERT INTO " + table.getName() + " VALUES (TIME '13:30:00'), (TIME '14:30:00'), (NULL)", 3); assertQuery("SELECT * FROM " + table.getName(), "VALUES '13:30:00', '14:30:00', NULL"); assertQuery( @@ -153,11 +165,14 @@ public void testTimeType() } @Override - public void testDropAmbiguousRowFieldCaseSensitivity() + protected Optional filterTypeCoercionOnCreateTableAsSelectProvider(TypeCoercionTestSetup setup) { - // TODO https://github.com/trinodb/trino/issues/16273 The connector 
can't read row types having ambiguous field names in ORC files. e.g. row(X int, x int) - assertThatThrownBy(super::testDropAmbiguousRowFieldCaseSensitivity) - .hasMessageContaining("Error opening Iceberg split") - .hasStackTraceContaining("Multiple entries with same key"); + if (setup.sourceValueLiteral().equals("TIMESTAMP '1969-12-31 23:59:59.999999499999'")) { + return Optional.of(setup.withNewValueLiteral("TIMESTAMP '1970-01-01 00:00:00.999999'")); + } + if (setup.sourceValueLiteral().equals("TIMESTAMP '1969-12-31 23:59:59.9999994'")) { + return Optional.of(setup.withNewValueLiteral("TIMESTAMP '1970-01-01 00:00:00.999999'")); + } + return Optional.of(setup); } } diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergMinioParquetCachingConnectorSmokeTest.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergMinioParquetCachingConnectorSmokeTest.java new file mode 100644 index 000000000000..71d721b7f018 --- /dev/null +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergMinioParquetCachingConnectorSmokeTest.java @@ -0,0 +1,67 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.iceberg; + +import com.google.common.collect.ImmutableMap; +import com.google.common.io.Closer; +import io.trino.filesystem.Location; +import org.apache.iceberg.FileFormat; +import org.junit.jupiter.api.AfterAll; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Map; + +import static com.google.common.io.MoreFiles.deleteRecursively; +import static com.google.common.io.RecursiveDeleteOption.ALLOW_INSECURE; +import static io.trino.plugin.iceberg.IcebergTestUtils.checkParquetFileSorting; + +public class TestIcebergMinioParquetCachingConnectorSmokeTest + extends BaseIcebergMinioConnectorSmokeTest +{ + private final Path cacheDirectory; + private final Closer closer = Closer.create(); + + TestIcebergMinioParquetCachingConnectorSmokeTest() + throws IOException + { + super(FileFormat.PARQUET); + cacheDirectory = Files.createTempDirectory("cache"); + closer.register(() -> deleteRecursively(cacheDirectory, ALLOW_INSECURE)); + } + + @AfterAll + public void tearDown() + throws Exception + { + closer.close(); + } + + @Override + public Map getAdditionalIcebergProperties() + { + return ImmutableMap.builder() + .put("fs.cache.enabled", "true") + .put("fs.cache.directories", cacheDirectory.toAbsolutePath().toString()) + .put("fs.cache.max-sizes", "100MB") + .buildOrThrow(); + } + + @Override + protected boolean isFileSorted(Location path, String sortColumnName) + { + return checkParquetFileSorting(fileSystem.newInputFile(path), sortColumnName); + } +} diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergMotoConnectorSmokeTest.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergMotoConnectorSmokeTest.java new file mode 100644 index 000000000000..0cd04a3758c8 --- /dev/null +++ 
b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergMotoConnectorSmokeTest.java @@ -0,0 +1,146 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.iceberg; + +import com.google.common.collect.ImmutableMap; +import io.trino.filesystem.Location; +import io.trino.testing.QueryRunner; +import io.trino.testing.containers.MotoContainer; +import org.apache.iceberg.FileFormat; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.parallel.Execution; +import software.amazon.awssdk.services.glue.GlueClient; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.Map; + +import static io.trino.plugin.iceberg.IcebergTestUtils.checkParquetFileSorting; +import static io.trino.testing.TestingNames.randomNameSuffix; +import static io.trino.testing.containers.MotoContainer.MOTO_ACCESS_KEY; +import static io.trino.testing.containers.MotoContainer.MOTO_REGION; +import static io.trino.testing.containers.MotoContainer.MOTO_SECRET_KEY; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD; + +@Execution(SAME_THREAD) // Moto is not concurrency safe +public class TestIcebergMotoConnectorSmokeTest + extends BaseIcebergConnectorSmokeTest +{ + private final String bucketName = "test-iceberg-" + randomNameSuffix(); + private final String schemaName = "test_iceberg_smoke_" + randomNameSuffix(); + private GlueClient glueClient; + + public TestIcebergMotoConnectorSmokeTest() + { + super(FileFormat.PARQUET); + } + + @Override + protected QueryRunner createQueryRunner() + throws Exception + { + MotoContainer moto = closeAfterClass(new MotoContainer()); + moto.start(); + moto.createBucket(bucketName); + + glueClient = closeAfterClass(GlueClient.builder().applyMutation(moto::updateClient).build()); + + return IcebergQueryRunner.builder() + .setIcebergProperties(ImmutableMap.builder() + .put("iceberg.file-format", format.name()) + .put("iceberg.catalog.type", "glue") + .put("hive.metastore.glue.region", MOTO_REGION) + .put("hive.metastore.glue.endpoint-url", moto.getEndpoint().toString()) + .put("hive.metastore.glue.aws-access-key", MOTO_ACCESS_KEY) + .put("hive.metastore.glue.aws-secret-key", MOTO_SECRET_KEY) + .put("hive.metastore.glue.default-warehouse-dir", "s3://%s/".formatted(bucketName)) + .put("fs.native-s3.enabled", "true") + .put("s3.region", MOTO_REGION) + .put("s3.endpoint", moto.getEndpoint().toString()) + .put("s3.aws-access-key", MOTO_ACCESS_KEY) + .put("s3.aws-secret-key", MOTO_SECRET_KEY) + .put("s3.path-style-access", "true") + .put("iceberg.register-table-procedure.enabled", "true") + .put("iceberg.allowed-extra-properties", "write.metadata.delete-after-commit.enabled,write.metadata.previous-versions-max") + .buildOrThrow()) + .setSchemaInitializer(SchemaInitializer.builder() + .withSchemaName(schemaName) + .withClonedTpchTables(REQUIRED_TPCH_TABLES) + 
.withSchemaProperties(Map.of("location", "'s3://%s/%s/'".formatted(bucketName, schemaName))) + .build()) + .build(); + } + + @Override + protected void deleteDirectory(String location) + { + try { + fileSystem.deleteDirectory(Location.of(location)); + } + catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + @Override + protected boolean isFileSorted(Location path, String sortColumnName) + { + return checkParquetFileSorting(fileSystem.newInputFile(path), sortColumnName); + } + + @Override + protected void dropTableFromMetastore(String tableName) + { + glueClient.deleteTable(x -> x.databaseName(schemaName).name(tableName)); + } + + @Override + protected String getMetadataLocation(String tableName) + { + return glueClient.getTable(x -> x.databaseName(schemaName).name(tableName)) + .table().parameters().get("metadata_location"); + } + + @Override + protected String schemaPath() + { + return "s3://%s/%s".formatted(bucketName, schemaName); + } + + @Override + protected boolean locationExists(String location) + { + try { + return fileSystem.directoryExists(Location.of(location)).orElse(false); + } + catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + @Test + @Disabled("Moto is not concurrency safe") + @Override + public void testDeleteRowsConcurrently() {} + + @Test + @Override + public void testRenameSchema() + { + assertThatThrownBy(super::testRenameSchema) + .hasStackTraceContaining("renameNamespace is not supported for Iceberg Glue catalogs"); + } +} diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergNodeLocalDynamicSplitPruning.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergNodeLocalDynamicSplitPruning.java index 019732d1af00..676b6fe4d5e8 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergNodeLocalDynamicSplitPruning.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergNodeLocalDynamicSplitPruning.java @@ -28,13 +28,13 @@ import io.trino.orc.OrcWriterOptions; import io.trino.orc.OrcWriterStats; import io.trino.orc.OutputStreamOrcDataSink; -import io.trino.plugin.hive.FileFormatDataSourceStats; +import io.trino.plugin.base.metrics.FileFormatDataSourceStats; import io.trino.plugin.hive.HiveTransactionHandle; -import io.trino.plugin.hive.metastore.Column; import io.trino.plugin.hive.orc.OrcReaderConfig; import io.trino.plugin.hive.orc.OrcWriterConfig; import io.trino.plugin.hive.parquet.ParquetReaderConfig; import io.trino.plugin.hive.parquet.ParquetWriterConfig; +import io.trino.plugin.iceberg.catalog.rest.DefaultIcebergFileSystemFactory; import io.trino.spi.Page; import io.trino.spi.SplitWeight; import io.trino.spi.block.BlockBuilder; @@ -42,8 +42,13 @@ import io.trino.spi.connector.ColumnHandle; import io.trino.spi.connector.ConnectorPageSource; import io.trino.spi.connector.DynamicFilter; +import io.trino.spi.connector.SourcePage; import io.trino.spi.predicate.Domain; +import io.trino.spi.predicate.Range; import io.trino.spi.predicate.TupleDomain; +import io.trino.spi.predicate.ValueSet; +import io.trino.spi.type.DecimalType; +import io.trino.spi.type.SqlDecimal; import io.trino.spi.type.Type; import io.trino.testing.TestingConnectorSession; import org.apache.iceberg.PartitionSpec; @@ -51,10 +56,12 @@ import org.apache.iceberg.Schema; import org.apache.iceberg.SchemaParser; import org.apache.iceberg.types.Types; -import org.testng.annotations.Test; +import org.junit.jupiter.api.Test; import java.io.IOException; +import 
java.math.BigDecimal; import java.nio.file.Files; +import java.time.LocalDate; import java.util.List; import java.util.Map; import java.util.Optional; @@ -65,44 +72,44 @@ import static io.trino.orc.metadata.CompressionKind.NONE; import static io.trino.plugin.hive.HiveTestUtils.HDFS_ENVIRONMENT; import static io.trino.plugin.hive.HiveTestUtils.HDFS_FILE_SYSTEM_STATS; -import static io.trino.plugin.hive.HiveType.HIVE_INT; -import static io.trino.plugin.hive.HiveType.HIVE_STRING; import static io.trino.plugin.iceberg.ColumnIdentity.TypeCategory.PRIMITIVE; import static io.trino.plugin.iceberg.IcebergFileFormat.ORC; +import static io.trino.plugin.iceberg.IcebergTestUtils.FILE_IO_FACTORY; +import static io.trino.plugin.iceberg.util.OrcTypeConverter.toOrcType; +import static io.trino.spi.type.DateType.DATE; +import static io.trino.spi.type.Decimals.writeShortDecimal; import static io.trino.spi.type.IntegerType.INTEGER; import static io.trino.spi.type.VarcharType.VARCHAR; import static io.trino.testing.TestingHandles.TEST_CATALOG_HANDLE; import static io.trino.type.InternalTypeManager.TESTING_TYPE_MANAGER; import static java.util.concurrent.CompletableFuture.completedFuture; import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.testng.Assert.assertEquals; -import static org.testng.Assert.assertNotNull; -import static org.testng.Assert.assertNull; +import static org.assertj.core.api.Assertions.assertThat; public class TestIcebergNodeLocalDynamicSplitPruning { - private static final String SCHEMA_NAME = "test"; - private static final String TABLE_NAME = "test"; - private static final Column KEY_COLUMN = new Column("a_integer", HIVE_INT, Optional.empty()); - private static final ColumnIdentity KEY_COLUMN_IDENTITY = new ColumnIdentity(1, KEY_COLUMN.getName(), PRIMITIVE, ImmutableList.of()); - private static final IcebergColumnHandle KEY_ICEBERG_COLUMN_HANDLE = new IcebergColumnHandle(KEY_COLUMN_IDENTITY, INTEGER, ImmutableList.of(), INTEGER, Optional.empty()); - private static final int KEY_COLUMN_VALUE = 42; - private static final Column DATA_COLUMN = new Column("a_varchar", HIVE_STRING, Optional.empty()); - private static final ColumnIdentity DATA_COLUMN_IDENTITY = new ColumnIdentity(2, DATA_COLUMN.getName(), PRIMITIVE, ImmutableList.of()); - private static final IcebergColumnHandle DATA_ICEBERG_COLUMN_HANDLE = new IcebergColumnHandle(DATA_COLUMN_IDENTITY, VARCHAR, ImmutableList.of(), VARCHAR, Optional.empty()); - private static final String DATA_COLUMN_VALUE = "hello world"; - private static final Schema TABLE_SCHEMA = new Schema( - optional(KEY_COLUMN_IDENTITY.getId(), KEY_COLUMN.getName(), Types.IntegerType.get()), - optional(DATA_COLUMN_IDENTITY.getId(), DATA_COLUMN.getName(), Types.StringType.get())); private static final OrcReaderConfig ORC_READER_CONFIG = new OrcReaderConfig(); private static final OrcWriterConfig ORC_WRITER_CONFIG = new OrcWriterConfig(); private static final ParquetReaderConfig PARQUET_READER_CONFIG = new ParquetReaderConfig(); private static final ParquetWriterConfig PARQUET_WRITER_CONFIG = new ParquetWriterConfig(); @Test - public void testDynamicSplitPruning() + public void testDynamicSplitPruningOnUnpartitionedTable() throws IOException { + String tableName = "unpartitioned_table"; + String keyColumnName = "a_integer"; + ColumnIdentity keyColumnIdentity = new ColumnIdentity(1, keyColumnName, PRIMITIVE, ImmutableList.of()); + IcebergColumnHandle keyColumnHandle = IcebergColumnHandle.optional(keyColumnIdentity).columnType(INTEGER).build(); 
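// The pruning exercised in this test reduces to a domain-overlap question: a split only
// produces pages when the dynamic filter's TupleDomain intersects the values present in the
// file. A minimal sketch of that idea, assuming the INTEGER key column handle built above;
// fileDomain, pruningFilter and splitCanMatch are illustrative names, and the real decision
// is made inside the page source provider rather than by this standalone check:
//
//   TupleDomain<IcebergColumnHandle> fileDomain = TupleDomain.withColumnDomains(
//           ImmutableMap.of(keyColumnHandle, Domain.singleValue(INTEGER, 42L)));
//   TupleDomain<IcebergColumnHandle> pruningFilter = TupleDomain.withColumnDomains(
//           ImmutableMap.of(keyColumnHandle, Domain.singleValue(INTEGER, 1L)));
//   boolean splitCanMatch = !fileDomain.intersect(pruningFilter).isNone(); // false: expect no pages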
+ int keyColumnValue = 42; + String dataColumnName = "a_varchar"; + ColumnIdentity dataColumnIdentity = new ColumnIdentity(2, dataColumnName, PRIMITIVE, ImmutableList.of()); + IcebergColumnHandle dataColumnHandle = IcebergColumnHandle.optional(dataColumnIdentity).columnType(VARCHAR).build(); + String dataColumnValue = "hello world"; + Schema tableSchema = new Schema( + optional(keyColumnIdentity.getId(), keyColumnName, Types.IntegerType.get()), + optional(dataColumnIdentity.getId(), dataColumnName, Types.StringType.get())); + IcebergConfig icebergConfig = new IcebergConfig(); HiveTransactionHandle transaction = new HiveTransactionHandle(false); try (TempFile file = new TempFile()) { @@ -110,118 +117,482 @@ public void testDynamicSplitPruning() TrinoOutputFile outputFile = new LocalOutputFile(file.file()); TrinoInputFile inputFile = new LocalInputFile(file.file()); - writeOrcContent(outputFile); + List columnNames = ImmutableList.of(keyColumnName, dataColumnName); + List types = ImmutableList.of(INTEGER, VARCHAR); + + try (OrcWriter writer = new OrcWriter( + OutputStreamOrcDataSink.create(outputFile), + columnNames, + types, + toOrcType(tableSchema), + NONE, + new OrcWriterOptions(), + ImmutableMap.of(), + true, + OrcWriteValidation.OrcWriteValidationMode.BOTH, + new OrcWriterStats())) { + BlockBuilder keyBuilder = INTEGER.createFixedSizeBlockBuilder(1); + INTEGER.writeLong(keyBuilder, keyColumnValue); + BlockBuilder dataBuilder = VARCHAR.createBlockBuilder(null, 1); + VARCHAR.writeString(dataBuilder, dataColumnValue); + writer.write(new Page(keyBuilder.build(), dataBuilder.build())); + } + + // Pruning due to IcebergTableHandle#unenforcedPredicate + IcebergSplit split = new IcebergSplit( + inputFile.toString(), + 0, + inputFile.length(), + inputFile.length(), + -1, // invalid; normally known + ORC, + PartitionSpecParser.toJson(PartitionSpec.unpartitioned()), + PartitionData.toJson(new PartitionData(new Object[] {})), + ImmutableList.of(), + SplitWeight.standard(), + TupleDomain.all(), + ImmutableMap.of(), + 0); - try (ConnectorPageSource emptyPageSource = createTestingPageSource(transaction, icebergConfig, inputFile, getDynamicFilter(getTupleDomainForSplitPruning()))) { - assertNull(emptyPageSource.getNextPage()); + String tablePath = inputFile.location().fileName(); + TableHandle tableHandle = new TableHandle( + TEST_CATALOG_HANDLE, + new IcebergTableHandle( + CatalogHandle.fromId("iceberg:NORMAL:v12345"), + "test_schema", + tableName, + TableType.DATA, + Optional.empty(), + SchemaParser.toJson(tableSchema), + Optional.of(PartitionSpecParser.toJson(PartitionSpec.unpartitioned())), + 2, + TupleDomain.withColumnDomains(ImmutableMap.of(keyColumnHandle, Domain.singleValue(INTEGER, (long) keyColumnValue))), + TupleDomain.all(), + OptionalLong.empty(), + ImmutableSet.of(keyColumnHandle), + Optional.empty(), + tablePath, + ImmutableMap.of(), + Optional.empty(), + false, + Optional.empty(), + ImmutableSet.of(), + Optional.of(false)), + transaction); + + TupleDomain splitPruningPredicate = TupleDomain.withColumnDomains( + ImmutableMap.of( + keyColumnHandle, + Domain.singleValue(INTEGER, 1L))); + try (ConnectorPageSource emptyPageSource = createTestingPageSource(transaction, icebergConfig, split, tableHandle, ImmutableList.of(keyColumnHandle, dataColumnHandle), getDynamicFilter(splitPruningPredicate))) { + assertThat(emptyPageSource.getNextSourcePage()).isNull(); } - try (ConnectorPageSource nonEmptyPageSource = createTestingPageSource(transaction, icebergConfig, inputFile, 
getDynamicFilter(getNonSelectiveTupleDomain()))) { - Page page = nonEmptyPageSource.getNextPage(); - assertNotNull(page); - assertEquals(page.getBlock(0).getPositionCount(), 1); - assertEquals(page.getBlock(0).getInt(0, 0), KEY_COLUMN_VALUE); - assertEquals(page.getBlock(1).getPositionCount(), 1); - assertEquals(page.getBlock(1).getSlice(0, 0, page.getBlock(1).getSliceLength(0)).toStringUtf8(), DATA_COLUMN_VALUE); + TupleDomain nonSelectivePredicate = TupleDomain.withColumnDomains( + ImmutableMap.of( + keyColumnHandle, + Domain.singleValue(INTEGER, (long) keyColumnValue))); + try (ConnectorPageSource nonEmptyPageSource = createTestingPageSource(transaction, icebergConfig, split, tableHandle, ImmutableList.of(keyColumnHandle, dataColumnHandle), getDynamicFilter(nonSelectivePredicate))) { + SourcePage page = nonEmptyPageSource.getNextSourcePage(); + assertThat(page).isNotNull(); + assertThat(page.getPositionCount()).isEqualTo(1); + assertThat(INTEGER.getInt(page.getBlock(0), 0)).isEqualTo(keyColumnValue); + assertThat(VARCHAR.getSlice(page.getBlock(1), 0).toStringUtf8()).isEqualTo(dataColumnValue); + } + + // Pruning due to IcebergSplit#fileStatisticsDomain + split = new IcebergSplit( + inputFile.toString(), + 0, + inputFile.length(), + inputFile.length(), + -1, // invalid; normally known + ORC, + PartitionSpecParser.toJson(PartitionSpec.unpartitioned()), + PartitionData.toJson(new PartitionData(new Object[] {})), + ImmutableList.of(), + SplitWeight.standard(), + TupleDomain.withColumnDomains(ImmutableMap.of(keyColumnHandle, Domain.singleValue(INTEGER, (long) keyColumnValue))), + ImmutableMap.of(), + 0); + + tableHandle = new TableHandle( + TEST_CATALOG_HANDLE, + new IcebergTableHandle( + CatalogHandle.fromId("iceberg:NORMAL:v12345"), + "test_schema", + tableName, + TableType.DATA, + Optional.empty(), + SchemaParser.toJson(tableSchema), + Optional.of(PartitionSpecParser.toJson(PartitionSpec.unpartitioned())), + 2, + TupleDomain.all(), + TupleDomain.all(), + OptionalLong.empty(), + ImmutableSet.of(keyColumnHandle), + Optional.empty(), + tablePath, + ImmutableMap.of(), + Optional.empty(), + false, + Optional.empty(), + ImmutableSet.of(), + Optional.of(false)), + transaction); + + try (ConnectorPageSource emptyPageSource = createTestingPageSource(transaction, icebergConfig, split, tableHandle, ImmutableList.of(keyColumnHandle, dataColumnHandle), getDynamicFilter(splitPruningPredicate))) { + assertThat(emptyPageSource.getNextSourcePage()).isNull(); + } + + try (ConnectorPageSource nonEmptyPageSource = createTestingPageSource(transaction, icebergConfig, split, tableHandle, ImmutableList.of(keyColumnHandle, dataColumnHandle), getDynamicFilter(nonSelectivePredicate))) { + SourcePage page = nonEmptyPageSource.getNextSourcePage(); + assertThat(page).isNotNull(); + assertThat(page.getPositionCount()).isEqualTo(1); + assertThat(INTEGER.getInt(page.getBlock(0), 0)).isEqualTo(keyColumnValue); + assertThat(VARCHAR.getSlice(page.getBlock(1), 0).toStringUtf8()).isEqualTo(dataColumnValue); } } } - private static void writeOrcContent(TrinoOutputFile outputFile) + @Test + public void testDynamicSplitPruningWithExplicitPartitionFilter() throws IOException { - List columnNames = ImmutableList.of(KEY_COLUMN.getName(), DATA_COLUMN.getName()); - List types = ImmutableList.of(INTEGER, VARCHAR); - - try (OrcWriter writer = new OrcWriter( - OutputStreamOrcDataSink.create(outputFile), - columnNames, - types, - TypeConverter.toOrcType(TABLE_SCHEMA), - NONE, - new OrcWriterOptions(), - ImmutableMap.of(), - true, - 
OrcWriteValidation.OrcWriteValidationMode.BOTH, - new OrcWriterStats())) { - BlockBuilder keyBuilder = INTEGER.createBlockBuilder(null, 1); - INTEGER.writeLong(keyBuilder, KEY_COLUMN_VALUE); - BlockBuilder dataBuilder = VARCHAR.createBlockBuilder(null, 1); - VARCHAR.writeString(dataBuilder, DATA_COLUMN_VALUE); - writer.write(new Page(keyBuilder.build(), dataBuilder.build())); + String tableName = "sales_table"; + String dateColumnName = "date"; + ColumnIdentity dateColumnIdentity = new ColumnIdentity(1, dateColumnName, PRIMITIVE, ImmutableList.of()); + IcebergColumnHandle dateColumnHandle = IcebergColumnHandle.optional(dateColumnIdentity).columnType(DATE).build(); + long dateColumnValue = LocalDate.of(2023, 1, 10).toEpochDay(); + String receiptColumnName = "receipt"; + ColumnIdentity receiptColumnIdentity = new ColumnIdentity(2, receiptColumnName, PRIMITIVE, ImmutableList.of()); + IcebergColumnHandle receiptColumnHandle = IcebergColumnHandle.optional(receiptColumnIdentity).columnType(VARCHAR).build(); + String receiptColumnValue = "#12345"; + String amountColumnName = "amount"; + ColumnIdentity amountColumnIdentity = new ColumnIdentity(3, amountColumnName, PRIMITIVE, ImmutableList.of()); + DecimalType amountColumnType = DecimalType.createDecimalType(10, 2); + IcebergColumnHandle amountColumnHandle = IcebergColumnHandle.optional(amountColumnIdentity).columnType(amountColumnType).build(); + BigDecimal amountColumnValue = new BigDecimal("1234567.65"); + Schema tableSchema = new Schema( + optional(dateColumnIdentity.getId(), dateColumnName, Types.DateType.get()), + optional(receiptColumnIdentity.getId(), receiptColumnName, Types.StringType.get()), + optional(amountColumnIdentity.getId(), amountColumnName, Types.DecimalType.of(10, 2))); + PartitionSpec partitionSpec = PartitionSpec.builderFor(tableSchema) + .identity(dateColumnName) + .build(); + + IcebergConfig icebergConfig = new IcebergConfig(); + HiveTransactionHandle transaction = new HiveTransactionHandle(false); + try (TempFile file = new TempFile()) { + Files.delete(file.path()); + + TrinoOutputFile outputFile = new LocalOutputFile(file.file()); + TrinoInputFile inputFile = new LocalInputFile(file.file()); + List columnNames = ImmutableList.of(dateColumnName, receiptColumnName, amountColumnName); + List types = ImmutableList.of(DATE, VARCHAR, amountColumnType); + + try (OrcWriter writer = new OrcWriter( + OutputStreamOrcDataSink.create(outputFile), + columnNames, + types, + toOrcType(tableSchema), + NONE, + new OrcWriterOptions(), + ImmutableMap.of(), + true, + OrcWriteValidation.OrcWriteValidationMode.BOTH, + new OrcWriterStats())) { + BlockBuilder dateBuilder = DATE.createFixedSizeBlockBuilder(1); + DATE.writeLong(dateBuilder, dateColumnValue); + BlockBuilder receiptBuilder = VARCHAR.createBlockBuilder(null, 1); + VARCHAR.writeString(receiptBuilder, receiptColumnValue); + BlockBuilder amountBuilder = amountColumnType.createFixedSizeBlockBuilder(1); + writeShortDecimal(amountBuilder, amountColumnValue.unscaledValue().longValueExact()); + writer.write(new Page(dateBuilder.build(), receiptBuilder.build(), amountBuilder.build())); + } + + IcebergSplit split = new IcebergSplit( + inputFile.toString(), + 0, + inputFile.length(), + inputFile.length(), + -1, // invalid; normally known + ORC, + PartitionSpecParser.toJson(partitionSpec), + PartitionData.toJson(new PartitionData(new Object[] {dateColumnValue})), + ImmutableList.of(), + SplitWeight.standard(), + TupleDomain.all(), + ImmutableMap.of(), + 0); + + String tablePath = 
inputFile.location().fileName(); + TableHandle tableHandle = new TableHandle( + TEST_CATALOG_HANDLE, + new IcebergTableHandle( + CatalogHandle.fromId("iceberg:NORMAL:v12345"), + "test_schema", + tableName, + TableType.DATA, + Optional.empty(), + SchemaParser.toJson(tableSchema), + Optional.of(PartitionSpecParser.toJson(partitionSpec)), + 2, + TupleDomain.all(), + TupleDomain.all(), + OptionalLong.empty(), + ImmutableSet.of(dateColumnHandle), + Optional.empty(), + tablePath, + ImmutableMap.of(), + Optional.empty(), + false, + Optional.empty(), + ImmutableSet.of(), + Optional.of(false)), + transaction); + + // Simulate situations where the dynamic filter (e.g.: while performing a JOIN with another table) reduces considerably + // the amount of data to be processed from the current table + + TupleDomain differentDatePredicate = TupleDomain.withColumnDomains( + ImmutableMap.of( + dateColumnHandle, + Domain.singleValue(DATE, LocalDate.of(2023, 2, 2).toEpochDay()))); + TupleDomain nonOverlappingDatePredicate = TupleDomain.withColumnDomains( + ImmutableMap.of( + dateColumnHandle, + Domain.create(ValueSet.ofRanges(Range.greaterThanOrEqual(DATE, LocalDate.of(2023, 2, 2).toEpochDay())), true))); + for (TupleDomain partitionPredicate : List.of(differentDatePredicate, nonOverlappingDatePredicate)) { + try (ConnectorPageSource emptyPageSource = createTestingPageSource( + transaction, + icebergConfig, + split, + tableHandle, + ImmutableList.of(dateColumnHandle, receiptColumnHandle, amountColumnHandle), + getDynamicFilter(partitionPredicate))) { + assertThat(emptyPageSource.getNextSourcePage()).isNull(); + } + } + + TupleDomain sameDatePredicate = TupleDomain.withColumnDomains( + ImmutableMap.of( + dateColumnHandle, + Domain.singleValue(DATE, dateColumnValue))); + TupleDomain overlappingDatePredicate = TupleDomain.withColumnDomains( + ImmutableMap.of( + dateColumnHandle, + Domain.create(ValueSet.ofRanges(Range.range(DATE, LocalDate.of(2023, 1, 1).toEpochDay(), true, LocalDate.of(2023, 2, 1).toEpochDay(), false)), true))); + for (TupleDomain partitionPredicate : List.of(sameDatePredicate, overlappingDatePredicate)) { + try (ConnectorPageSource nonEmptyPageSource = createTestingPageSource( + transaction, + icebergConfig, + split, + tableHandle, + ImmutableList.of(dateColumnHandle, receiptColumnHandle, amountColumnHandle), + getDynamicFilter(partitionPredicate))) { + SourcePage page = nonEmptyPageSource.getNextSourcePage(); + assertThat(page).isNotNull(); + assertThat(page.getPositionCount()).isEqualTo(1); + assertThat(INTEGER.getInt(page.getBlock(0), 0)).isEqualTo(dateColumnValue); + assertThat(VARCHAR.getSlice(page.getBlock(1), 0).toStringUtf8()).isEqualTo(receiptColumnValue); + assertThat(((SqlDecimal) amountColumnType.getObjectValue(page.getBlock(2), 0)).toBigDecimal()).isEqualTo(amountColumnValue); + } + } } } - private static ConnectorPageSource createTestingPageSource(HiveTransactionHandle transaction, IcebergConfig icebergConfig, TrinoInputFile inputFile, DynamicFilter dynamicFilter) + @Test + public void testDynamicSplitPruningWithExplicitPartitionFilterPartitionEvolution() throws IOException { - IcebergSplit split = new IcebergSplit( - inputFile.toString(), - 0, - inputFile.length(), - inputFile.length(), - ORC, - PartitionSpecParser.toJson(PartitionSpec.unpartitioned()), - PartitionData.toJson(new PartitionData(new Object[] {})), - ImmutableList.of(), - SplitWeight.standard()); - - String tablePath = inputFile.location().fileName(); - TableHandle tableHandle = new TableHandle( - 
TEST_CATALOG_HANDLE, - new IcebergTableHandle( - CatalogHandle.fromId("iceberg:NORMAL:v12345"), - SCHEMA_NAME, - TABLE_NAME, - TableType.DATA, - Optional.empty(), - SchemaParser.toJson(TABLE_SCHEMA), - Optional.of(PartitionSpecParser.toJson(PartitionSpec.unpartitioned())), - 2, - TupleDomain.withColumnDomains(ImmutableMap.of(KEY_ICEBERG_COLUMN_HANDLE, Domain.singleValue(INTEGER, (long) KEY_COLUMN_VALUE))), - TupleDomain.all(), - OptionalLong.empty(), - ImmutableSet.of(KEY_ICEBERG_COLUMN_HANDLE), - Optional.empty(), - tablePath, - ImmutableMap.of(), - false, - Optional.empty()), - transaction); + String tableName = "sales_table"; + String yearColumnName = "year"; + ColumnIdentity yearColumnIdentity = new ColumnIdentity(1, yearColumnName, PRIMITIVE, ImmutableList.of()); + IcebergColumnHandle yearColumnHandle = IcebergColumnHandle.optional(yearColumnIdentity).columnType(INTEGER).build(); + long yearColumnValue = 2023L; + String monthColumnName = "month"; + ColumnIdentity monthColumnIdentity = new ColumnIdentity(2, monthColumnName, PRIMITIVE, ImmutableList.of()); + IcebergColumnHandle monthColumnHandle = IcebergColumnHandle.optional(monthColumnIdentity).columnType(INTEGER).build(); + long monthColumnValue = 1L; + String receiptColumnName = "receipt"; + ColumnIdentity receiptColumnIdentity = new ColumnIdentity(3, receiptColumnName, PRIMITIVE, ImmutableList.of()); + IcebergColumnHandle receiptColumnHandle = IcebergColumnHandle.optional(receiptColumnIdentity).columnType(VARCHAR).build(); + String receiptColumnValue = "#12345"; + String amountColumnName = "amount"; + ColumnIdentity amountColumnIdentity = new ColumnIdentity(4, amountColumnName, PRIMITIVE, ImmutableList.of()); + DecimalType amountColumnType = DecimalType.createDecimalType(10, 2); + IcebergColumnHandle amountColumnHandle = IcebergColumnHandle.optional(amountColumnIdentity).columnType(amountColumnType).build(); + BigDecimal amountColumnValue = new BigDecimal("1234567.65"); + Schema tableSchema = new Schema( + optional(yearColumnIdentity.getId(), yearColumnName, Types.IntegerType.get()), + optional(monthColumnIdentity.getId(), monthColumnName, Types.IntegerType.get()), + optional(receiptColumnIdentity.getId(), receiptColumnName, Types.StringType.get()), + optional(amountColumnIdentity.getId(), amountColumnName, Types.DecimalType.of(10, 2))); + PartitionSpec partitionSpec = PartitionSpec.builderFor(tableSchema) + .identity(yearColumnName) + .build(); + IcebergConfig icebergConfig = new IcebergConfig(); + HiveTransactionHandle transaction = new HiveTransactionHandle(false); + try (TempFile file = new TempFile()) { + Files.delete(file.path()); + + TrinoOutputFile outputFile = new LocalOutputFile(file.file()); + TrinoInputFile inputFile = new LocalInputFile(file.file()); + List columnNames = ImmutableList.of(yearColumnName, monthColumnName, receiptColumnName, amountColumnName); + List types = ImmutableList.of(INTEGER, INTEGER, VARCHAR, amountColumnType); + + try (OrcWriter writer = new OrcWriter( + OutputStreamOrcDataSink.create(outputFile), + columnNames, + types, + toOrcType(tableSchema), + NONE, + new OrcWriterOptions(), + ImmutableMap.of(), + true, + OrcWriteValidation.OrcWriteValidationMode.BOTH, + new OrcWriterStats())) { + BlockBuilder yearBuilder = INTEGER.createFixedSizeBlockBuilder(1); + INTEGER.writeLong(yearBuilder, yearColumnValue); + BlockBuilder monthBuilder = INTEGER.createFixedSizeBlockBuilder(1); + INTEGER.writeLong(monthBuilder, monthColumnValue); + BlockBuilder receiptBuilder = VARCHAR.createBlockBuilder(null, 1); + 
VARCHAR.writeString(receiptBuilder, receiptColumnValue); + BlockBuilder amountBuilder = amountColumnType.createFixedSizeBlockBuilder(1); + writeShortDecimal(amountBuilder, amountColumnValue.unscaledValue().longValueExact()); + writer.write(new Page(yearBuilder.build(), monthBuilder.build(), receiptBuilder.build(), amountBuilder.build())); + } + + IcebergSplit split = new IcebergSplit( + inputFile.toString(), + 0, + inputFile.length(), + inputFile.length(), + -1, // invalid; normally known + ORC, + PartitionSpecParser.toJson(partitionSpec), + PartitionData.toJson(new PartitionData(new Object[] {yearColumnValue})), + ImmutableList.of(), + SplitWeight.standard(), + TupleDomain.all(), + ImmutableMap.of(), + 0); + + String tablePath = inputFile.location().fileName(); + // Simulate the situation where `month` column is added at a later phase as partitioning column + // in addition to the `year` column, which leads to use it as unenforced predicate in the table handle + // after applying the filter + TableHandle tableHandle = new TableHandle( + TEST_CATALOG_HANDLE, + new IcebergTableHandle( + CatalogHandle.fromId("iceberg:NORMAL:v12345"), + "test_schema", + tableName, + TableType.DATA, + Optional.empty(), + SchemaParser.toJson(tableSchema), + Optional.of(PartitionSpecParser.toJson(partitionSpec)), + 2, + TupleDomain.withColumnDomains( + ImmutableMap.of( + yearColumnHandle, + Domain.create(ValueSet.ofRanges(Range.range(INTEGER, 2023L, true, 2024L, true)), true))), + TupleDomain.withColumnDomains( + ImmutableMap.of( + monthColumnHandle, + Domain.create(ValueSet.ofRanges(Range.range(INTEGER, 1L, true, 12L, true)), true))), + OptionalLong.empty(), + ImmutableSet.of(yearColumnHandle, monthColumnHandle, receiptColumnHandle, amountColumnHandle), + Optional.empty(), + tablePath, + ImmutableMap.of(), + Optional.empty(), + false, + Optional.empty(), + ImmutableSet.of(), + Optional.of(false)), + transaction); + // Simulate situations where the dynamic filter (e.g.: while performing a JOIN with another table) reduces considerably + // the amount of data to be processed from the current table + TupleDomain differentYearPredicate = TupleDomain.withColumnDomains( + ImmutableMap.of( + yearColumnHandle, + Domain.singleValue(INTEGER, 2024L))); + TupleDomain sameYearAndDifferentMonthPredicate = TupleDomain.withColumnDomains( + ImmutableMap.of( + yearColumnHandle, + Domain.singleValue(INTEGER, 2023L), + monthColumnHandle, + Domain.singleValue(INTEGER, 2L))); + for (TupleDomain partitionPredicate : List.of(differentYearPredicate, sameYearAndDifferentMonthPredicate)) { + try (ConnectorPageSource emptyPageSource = createTestingPageSource( + transaction, + icebergConfig, + split, + tableHandle, + ImmutableList.of(yearColumnHandle, monthColumnHandle, receiptColumnHandle, amountColumnHandle), + getDynamicFilter(partitionPredicate))) { + assertThat(emptyPageSource.getNextSourcePage()).isNull(); + } + } + + TupleDomain sameYearPredicate = TupleDomain.withColumnDomains( + ImmutableMap.of( + yearColumnHandle, + Domain.singleValue(INTEGER, 2023L))); + TupleDomain sameYearAndMonthPredicate = TupleDomain.withColumnDomains( + ImmutableMap.of( + yearColumnHandle, + Domain.singleValue(INTEGER, 2023L), + monthColumnHandle, + Domain.singleValue(INTEGER, 1L))); + for (TupleDomain partitionPredicate : List.of(sameYearPredicate, sameYearAndMonthPredicate)) { + try (ConnectorPageSource nonEmptyPageSource = createTestingPageSource( + transaction, + icebergConfig, + split, + tableHandle, + ImmutableList.of(yearColumnHandle, 
monthColumnHandle, receiptColumnHandle, amountColumnHandle), + getDynamicFilter(partitionPredicate))) { + SourcePage page = nonEmptyPageSource.getNextSourcePage(); + assertThat(page).isNotNull(); + assertThat(page.getPositionCount()).isEqualTo(1); + assertThat(INTEGER.getInt(page.getBlock(0), 0)).isEqualTo(2023L); + assertThat(INTEGER.getInt(page.getBlock(1), 0)).isEqualTo(1L); + assertThat(VARCHAR.getSlice(page.getBlock(2), 0).toStringUtf8()).isEqualTo(receiptColumnValue); + assertThat(((SqlDecimal) amountColumnType.getObjectValue(page.getBlock(3), 0)).toBigDecimal()).isEqualTo(amountColumnValue); + } + } + } + } + + private static ConnectorPageSource createTestingPageSource( + HiveTransactionHandle transaction, + IcebergConfig icebergConfig, + IcebergSplit split, + TableHandle tableHandle, + List columns, + DynamicFilter dynamicFilter) + { FileFormatDataSourceStats stats = new FileFormatDataSourceStats(); - IcebergPageSourceProvider provider = new IcebergPageSourceProvider( - new HdfsFileSystemFactory(HDFS_ENVIRONMENT, HDFS_FILE_SYSTEM_STATS), + IcebergPageSourceProviderFactory factory = new IcebergPageSourceProviderFactory( + new DefaultIcebergFileSystemFactory(new HdfsFileSystemFactory(HDFS_ENVIRONMENT, HDFS_FILE_SYSTEM_STATS)), + FILE_IO_FACTORY, stats, ORC_READER_CONFIG, PARQUET_READER_CONFIG, TESTING_TYPE_MANAGER); - - return provider.createPageSource( + return factory.createPageSourceProvider().createPageSource( transaction, getSession(icebergConfig), split, - tableHandle.getConnectorHandle(), - ImmutableList.of(KEY_ICEBERG_COLUMN_HANDLE, DATA_ICEBERG_COLUMN_HANDLE), + tableHandle.connectorHandle(), + columns, dynamicFilter); } - private static TupleDomain getTupleDomainForSplitPruning() - { - return TupleDomain.withColumnDomains( - ImmutableMap.of( - KEY_ICEBERG_COLUMN_HANDLE, - Domain.singleValue(INTEGER, 1L))); - } - - private static TupleDomain getNonSelectiveTupleDomain() - { - return TupleDomain.withColumnDomains( - ImmutableMap.of( - KEY_ICEBERG_COLUMN_HANDLE, - Domain.singleValue(INTEGER, (long) KEY_COLUMN_VALUE))); - } - private static TestingConnectorSession getSession(IcebergConfig icebergConfig) { return TestingConnectorSession.builder() diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergOrcMetricsCollection.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergOrcMetricsCollection.java index 0432d970640c..c80ec99a91ac 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergOrcMetricsCollection.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergOrcMetricsCollection.java @@ -16,16 +16,15 @@ import com.google.common.collect.ImmutableMap; import io.trino.Session; import io.trino.filesystem.TrinoFileSystemFactory; -import io.trino.plugin.base.CatalogName; +import io.trino.metastore.HiveMetastore; +import io.trino.metastore.cache.CachingHiveMetastore; import io.trino.plugin.hive.TrinoViewHiveMetastore; -import io.trino.plugin.hive.metastore.HiveMetastore; -import io.trino.plugin.hive.metastore.cache.CachingHiveMetastore; import io.trino.plugin.iceberg.catalog.IcebergTableOperationsProvider; import io.trino.plugin.iceberg.catalog.TrinoCatalog; import io.trino.plugin.iceberg.catalog.file.FileMetastoreTableOperationsProvider; -import io.trino.plugin.iceberg.catalog.file.TestingIcebergFileMetastoreCatalogModule; import io.trino.plugin.iceberg.catalog.hms.TrinoHiveCatalog; import io.trino.plugin.tpch.TpchPlugin; +import io.trino.spi.catalog.CatalogName; import 
io.trino.spi.connector.SchemaTableName; import io.trino.spi.type.TestingTypeManager; import io.trino.testing.AbstractTestQueryFramework; @@ -33,29 +32,28 @@ import io.trino.testing.MaterializedResult; import io.trino.testing.MaterializedRow; import io.trino.testing.QueryRunner; -import io.trino.testing.TestingConnectorSession; import org.apache.iceberg.FileContent; import org.apache.iceberg.Table; -import org.testng.annotations.Test; +import org.junit.jupiter.api.Test; import java.io.File; import java.util.List; import java.util.Map; -import java.util.Optional; -import static com.google.inject.util.Modules.EMPTY_MODULE; +import static com.google.common.util.concurrent.MoreExecutors.directExecutor; +import static io.trino.SystemSessionProperties.INITIAL_SPLITS_PER_NODE; import static io.trino.SystemSessionProperties.MAX_DRIVERS_PER_TASK; import static io.trino.SystemSessionProperties.TASK_CONCURRENCY; -import static io.trino.SystemSessionProperties.TASK_PARTITIONED_WRITER_COUNT; -import static io.trino.SystemSessionProperties.TASK_WRITER_COUNT; -import static io.trino.plugin.hive.metastore.cache.CachingHiveMetastore.memoizeMetastore; -import static io.trino.plugin.hive.metastore.file.TestingFileHiveMetastore.createTestingFileHiveMetastore; +import static io.trino.SystemSessionProperties.TASK_MAX_WRITER_COUNT; +import static io.trino.SystemSessionProperties.TASK_MIN_WRITER_COUNT; +import static io.trino.metastore.cache.CachingHiveMetastore.createPerTransactionCache; import static io.trino.plugin.iceberg.DataFileRecord.toDataFileRecord; import static io.trino.plugin.iceberg.IcebergQueryRunner.ICEBERG_CATALOG; +import static io.trino.plugin.iceberg.IcebergTestUtils.FILE_IO_FACTORY; import static io.trino.plugin.iceberg.IcebergTestUtils.getFileSystemFactory; +import static io.trino.plugin.iceberg.IcebergTestUtils.getHiveMetastore; import static io.trino.testing.TestingSession.testSessionBuilder; -import static org.testng.Assert.assertEquals; -import static org.testng.Assert.assertNull; +import static org.assertj.core.api.Assertions.assertThat; public class TestIcebergOrcMetricsCollection extends AbstractTestQueryFramework @@ -71,34 +69,40 @@ protected QueryRunner createQueryRunner() .setCatalog("iceberg") .setSchema("test_schema") .setSystemProperty(TASK_CONCURRENCY, "1") - .setSystemProperty(TASK_WRITER_COUNT, "1") - .setSystemProperty(TASK_PARTITIONED_WRITER_COUNT, "1") + .setSystemProperty(TASK_MIN_WRITER_COUNT, "1") + .setSystemProperty(TASK_MAX_WRITER_COUNT, "1") + .setSystemProperty(INITIAL_SPLITS_PER_NODE, "1") .setSystemProperty(MAX_DRIVERS_PER_TASK, "1") .setCatalogSessionProperty("iceberg", "orc_string_statistics_limit", Integer.MAX_VALUE + "B") .build(); - DistributedQueryRunner queryRunner = DistributedQueryRunner.builder(session) - .setNodeCount(1) + QueryRunner queryRunner = DistributedQueryRunner.builder(session) + .setWorkerCount(0) .build(); File baseDir = queryRunner.getCoordinator().getBaseDataDir().resolve("iceberg_data").toFile(); - HiveMetastore metastore = createTestingFileHiveMetastore(baseDir); - queryRunner.installPlugin(new TestingIcebergPlugin(Optional.of(new TestingIcebergFileMetastoreCatalogModule(metastore)), Optional.empty(), EMPTY_MODULE)); + queryRunner.installPlugin(new TestingIcebergPlugin(baseDir.toPath())); queryRunner.createCatalog(ICEBERG_CATALOG, "iceberg", ImmutableMap.of("iceberg.file-format", "ORC")); TrinoFileSystemFactory fileSystemFactory = getFileSystemFactory(queryRunner); - tableOperationsProvider = new 
FileMetastoreTableOperationsProvider(fileSystemFactory); - CachingHiveMetastore cachingHiveMetastore = memoizeMetastore(metastore, 1000); + tableOperationsProvider = new FileMetastoreTableOperationsProvider(fileSystemFactory, FILE_IO_FACTORY); + + HiveMetastore metastore = getHiveMetastore(queryRunner); + + CachingHiveMetastore cachingHiveMetastore = createPerTransactionCache(metastore, 1000); trinoCatalog = new TrinoHiveCatalog( new CatalogName("catalog"), cachingHiveMetastore, new TrinoViewHiveMetastore(cachingHiveMetastore, false, "trino-version", "test"), fileSystemFactory, + FILE_IO_FACTORY, new TestingTypeManager(), tableOperationsProvider, false, false, - false); + false, + new IcebergConfig().isHideMaterializedViewStorageTable(), + directExecutor()); queryRunner.installPlugin(new TpchPlugin()); queryRunner.createCatalog("tpch", "tpch"); @@ -112,7 +116,7 @@ protected QueryRunner createQueryRunner() public void testMetrics() { assertUpdate("create table no_metrics (c1 varchar, c2 varchar)"); - Table table = IcebergUtil.loadIcebergTable(trinoCatalog, tableOperationsProvider, TestingConnectorSession.SESSION, + Table table = IcebergUtil.loadIcebergTable(trinoCatalog, tableOperationsProvider, IcebergTestUtils.SESSION, new SchemaTableName("test_schema", "no_metrics")); // skip metrics for all columns table.updateProperties().set("write.metadata.metrics.default", "none").commit(); @@ -120,16 +124,16 @@ public void testMetrics() assertUpdate("insert into no_metrics values ('abcd', 'a')", 1); List materializedRows = computeActual("select * from \"no_metrics$files\"").getMaterializedRows(); DataFileRecord datafile = toDataFileRecord(materializedRows.get(0)); - assertEquals(datafile.getRecordCount(), 1); - assertNull(datafile.getValueCounts()); - assertNull(datafile.getNullValueCounts()); - assertNull(datafile.getUpperBounds()); - assertNull(datafile.getLowerBounds()); - assertNull(datafile.getColumnSizes()); + assertThat(datafile.getRecordCount()).isEqualTo(1); + assertThat(datafile.getValueCounts()).isNull(); + assertThat(datafile.getNullValueCounts()).isNull(); + assertThat(datafile.getUpperBounds()).isNull(); + assertThat(datafile.getLowerBounds()).isNull(); + assertThat(datafile.getColumnSizes()).isNull(); // keep c1 metrics assertUpdate("create table c1_metrics (c1 varchar, c2 varchar)"); - table = IcebergUtil.loadIcebergTable(trinoCatalog, tableOperationsProvider, TestingConnectorSession.SESSION, + table = IcebergUtil.loadIcebergTable(trinoCatalog, tableOperationsProvider, IcebergTestUtils.SESSION, new SchemaTableName("test_schema", "c1_metrics")); table.updateProperties() .set("write.metadata.metrics.default", "none") @@ -139,15 +143,15 @@ public void testMetrics() assertUpdate("insert into c1_metrics values ('b', 'a')", 1); materializedRows = computeActual("select * from \"c1_metrics$files\"").getMaterializedRows(); datafile = toDataFileRecord(materializedRows.get(0)); - assertEquals(datafile.getRecordCount(), 1); - assertEquals(datafile.getValueCounts().size(), 1); - assertEquals(datafile.getNullValueCounts().size(), 1); - assertEquals(datafile.getUpperBounds().size(), 1); - assertEquals(datafile.getLowerBounds().size(), 1); + assertThat(datafile.getRecordCount()).isEqualTo(1); + assertThat(datafile.getValueCounts()).hasSize(1); + assertThat(datafile.getNullValueCounts()).hasSize(1); + assertThat(datafile.getUpperBounds()).hasSize(1); + assertThat(datafile.getLowerBounds()).hasSize(1); // set c1 metrics mode to count assertUpdate("create table c1_metrics_count (c1 varchar, c2 
varchar)"); - table = IcebergUtil.loadIcebergTable(trinoCatalog, tableOperationsProvider, TestingConnectorSession.SESSION, + table = IcebergUtil.loadIcebergTable(trinoCatalog, tableOperationsProvider, IcebergTestUtils.SESSION, new SchemaTableName("test_schema", "c1_metrics_count")); table.updateProperties() .set("write.metadata.metrics.default", "none") @@ -157,15 +161,15 @@ public void testMetrics() assertUpdate("insert into c1_metrics_count values ('b', 'a')", 1); materializedRows = computeActual("select * from \"c1_metrics_count$files\"").getMaterializedRows(); datafile = toDataFileRecord(materializedRows.get(0)); - assertEquals(datafile.getRecordCount(), 1); - assertEquals(datafile.getValueCounts().size(), 1); - assertEquals(datafile.getNullValueCounts().size(), 1); - assertNull(datafile.getUpperBounds()); - assertNull(datafile.getLowerBounds()); + assertThat(datafile.getRecordCount()).isEqualTo(1); + assertThat(datafile.getValueCounts()).hasSize(1); + assertThat(datafile.getNullValueCounts()).hasSize(1); + assertThat(datafile.getUpperBounds()).isNull(); + assertThat(datafile.getLowerBounds()).isNull(); // set c1 metrics mode to truncate(10) assertUpdate("create table c1_metrics_truncate (c1 varchar, c2 varchar)"); - table = IcebergUtil.loadIcebergTable(trinoCatalog, tableOperationsProvider, TestingConnectorSession.SESSION, + table = IcebergUtil.loadIcebergTable(trinoCatalog, tableOperationsProvider, IcebergTestUtils.SESSION, new SchemaTableName("test_schema", "c1_metrics_truncate")); table.updateProperties() .set("write.metadata.metrics.default", "none") @@ -175,17 +179,15 @@ public void testMetrics() assertUpdate("insert into c1_metrics_truncate values ('abcaabcaabcaabca', 'a')", 1); materializedRows = computeActual("select * from \"c1_metrics_truncate$files\"").getMaterializedRows(); datafile = toDataFileRecord(materializedRows.get(0)); - assertEquals(datafile.getRecordCount(), 1); - assertEquals(datafile.getValueCounts().size(), 1); - assertEquals(datafile.getNullValueCounts().size(), 1); - datafile.getUpperBounds().forEach((k, v) -> { - assertEquals(v.length(), 10); }); - datafile.getLowerBounds().forEach((k, v) -> { - assertEquals(v.length(), 10); }); + assertThat(datafile.getRecordCount()).isEqualTo(1); + assertThat(datafile.getValueCounts()).hasSize(1); + assertThat(datafile.getNullValueCounts()).hasSize(1); + datafile.getUpperBounds().forEach((k, v) -> assertThat(v.length()).isEqualTo(10)); + datafile.getLowerBounds().forEach((k, v) -> assertThat(v.length()).isEqualTo(10)); // keep both c1 and c2 metrics assertUpdate("create table c_metrics (c1 varchar, c2 varchar)"); - table = IcebergUtil.loadIcebergTable(trinoCatalog, tableOperationsProvider, TestingConnectorSession.SESSION, + table = IcebergUtil.loadIcebergTable(trinoCatalog, tableOperationsProvider, IcebergTestUtils.SESSION, new SchemaTableName("test_schema", "c_metrics")); table.updateProperties() .set("write.metadata.metrics.column.c1", "full") @@ -194,15 +196,15 @@ public void testMetrics() assertUpdate("insert into c_metrics values ('b', 'a')", 1); materializedRows = computeActual("select * from \"c_metrics$files\"").getMaterializedRows(); datafile = toDataFileRecord(materializedRows.get(0)); - assertEquals(datafile.getRecordCount(), 1); - assertEquals(datafile.getValueCounts().size(), 2); - assertEquals(datafile.getNullValueCounts().size(), 2); - assertEquals(datafile.getUpperBounds().size(), 2); - assertEquals(datafile.getLowerBounds().size(), 2); + assertThat(datafile.getRecordCount()).isEqualTo(1); + 
assertThat(datafile.getValueCounts()).hasSize(2); + assertThat(datafile.getNullValueCounts()).hasSize(2); + assertThat(datafile.getUpperBounds()).hasSize(2); + assertThat(datafile.getLowerBounds()).hasSize(2); // keep all metrics assertUpdate("create table metrics (c1 varchar, c2 varchar)"); - table = IcebergUtil.loadIcebergTable(trinoCatalog, tableOperationsProvider, TestingConnectorSession.SESSION, + table = IcebergUtil.loadIcebergTable(trinoCatalog, tableOperationsProvider, IcebergTestUtils.SESSION, new SchemaTableName("test_schema", "metrics")); table.updateProperties() .set("write.metadata.metrics.default", "full") @@ -210,11 +212,11 @@ public void testMetrics() assertUpdate("insert into metrics values ('b', 'a')", 1); materializedRows = computeActual("select * from \"metrics$files\"").getMaterializedRows(); datafile = toDataFileRecord(materializedRows.get(0)); - assertEquals(datafile.getRecordCount(), 1); - assertEquals(datafile.getValueCounts().size(), 2); - assertEquals(datafile.getNullValueCounts().size(), 2); - assertEquals(datafile.getUpperBounds().size(), 2); - assertEquals(datafile.getLowerBounds().size(), 2); + assertThat(datafile.getRecordCount()).isEqualTo(1); + assertThat(datafile.getValueCounts()).hasSize(2); + assertThat(datafile.getNullValueCounts()).hasSize(2); + assertThat(datafile.getUpperBounds()).hasSize(2); + assertThat(datafile.getLowerBounds()).hasSize(2); } @Test @@ -222,51 +224,51 @@ public void testBasic() { assertUpdate("CREATE TABLE orders WITH (format = 'ORC') AS SELECT * FROM tpch.tiny.orders", 15000); MaterializedResult materializedResult = computeActual("SELECT * FROM \"orders$files\""); - assertEquals(materializedResult.getRowCount(), 1); + assertThat(materializedResult.getRowCount()).isEqualTo(1); DataFileRecord datafile = toDataFileRecord(materializedResult.getMaterializedRows().get(0)); // check content - assertEquals(datafile.getContent(), FileContent.DATA.id()); + assertThat(datafile.getContent()).isEqualTo(FileContent.DATA.id()); // Check file format - assertEquals(datafile.getFileFormat(), "ORC"); + assertThat(datafile.getFileFormat()).isEqualTo("ORC"); // Check file row count - assertEquals(datafile.getRecordCount(), 15000L); + assertThat(datafile.getRecordCount()).isEqualTo(15000L); // Check per-column value count - datafile.getValueCounts().values().forEach(valueCount -> assertEquals(valueCount, (Long) 15000L)); + datafile.getValueCounts().values().forEach(valueCount -> assertThat(valueCount).isEqualTo((Long) 15000L)); // Check per-column null value count - datafile.getNullValueCounts().values().forEach(nullValueCount -> assertEquals(nullValueCount, (Long) 0L)); + datafile.getNullValueCounts().values().forEach(nullValueCount -> assertThat(nullValueCount).isEqualTo((Long) 0L)); // Check NaN value count // TODO: add more checks after NaN info is collected - assertNull(datafile.getNanValueCounts()); + assertThat(datafile.getNanValueCounts()).isNull(); // Check per-column lower bound Map lowerBounds = datafile.getLowerBounds(); - assertEquals(lowerBounds.get(1), "1"); - assertEquals(lowerBounds.get(2), "1"); - assertEquals(lowerBounds.get(3), "F"); - assertEquals(lowerBounds.get(4), "874.89"); - assertEquals(lowerBounds.get(5), "1992-01-01"); - assertEquals(lowerBounds.get(6), "1-URGENT"); - assertEquals(lowerBounds.get(7), "Clerk#000000001"); - assertEquals(lowerBounds.get(8), "0"); - assertEquals(lowerBounds.get(9), " about the accou"); + assertThat(lowerBounds).containsEntry(1, "1"); + assertThat(lowerBounds).containsEntry(2, "1"); + 
assertThat(lowerBounds).containsEntry(3, "F"); + assertThat(lowerBounds).containsEntry(4, "874.89"); + assertThat(lowerBounds).containsEntry(5, "1992-01-01"); + assertThat(lowerBounds).containsEntry(6, "1-URGENT"); + assertThat(lowerBounds).containsEntry(7, "Clerk#000000001"); + assertThat(lowerBounds).containsEntry(8, "0"); + assertThat(lowerBounds).containsEntry(9, " about the accou"); // Check per-column upper bound Map upperBounds = datafile.getUpperBounds(); - assertEquals(upperBounds.get(1), "60000"); - assertEquals(upperBounds.get(2), "1499"); - assertEquals(upperBounds.get(3), "P"); - assertEquals(upperBounds.get(4), "466001.28"); - assertEquals(upperBounds.get(5), "1998-08-02"); - assertEquals(upperBounds.get(6), "5-LOW"); - assertEquals(upperBounds.get(7), "Clerk#000001000"); - assertEquals(upperBounds.get(8), "0"); - assertEquals(upperBounds.get(9), "zzle. carefully!"); + assertThat(upperBounds).containsEntry(1, "60000"); + assertThat(upperBounds).containsEntry(2, "1499"); + assertThat(upperBounds).containsEntry(3, "P"); + assertThat(upperBounds).containsEntry(4, "466001.28"); + assertThat(upperBounds).containsEntry(5, "1998-08-02"); + assertThat(upperBounds).containsEntry(6, "5-LOW"); + assertThat(upperBounds).containsEntry(7, "Clerk#000001000"); + assertThat(upperBounds).containsEntry(8, "0"); + assertThat(upperBounds).containsEntry(9, "zzle. carefully!"); assertUpdate("DROP TABLE orders"); } @@ -281,41 +283,41 @@ public void testWithNulls() "(4, null, 'ccc', null)," + "(null, null, 'ddd', null)", 4); MaterializedResult materializedResult = computeActual("SELECT * FROM \"test_with_nulls$files\""); - assertEquals(materializedResult.getRowCount(), 1); + assertThat(materializedResult.getRowCount()).isEqualTo(1); DataFileRecord datafile = toDataFileRecord(materializedResult.getMaterializedRows().get(0)); // Check per-column value count - datafile.getValueCounts().values().forEach(valueCount -> assertEquals(valueCount, (Long) 4L)); + datafile.getValueCounts().values().forEach(valueCount -> assertThat(valueCount).isEqualTo((Long) 4L)); // Check per-column null value count - assertEquals(datafile.getNullValueCounts().get(1), (Long) 1L); - assertEquals(datafile.getNullValueCounts().get(2), (Long) 2L); - assertEquals(datafile.getNullValueCounts().get(3), (Long) 0L); - assertEquals(datafile.getNullValueCounts().get(4), (Long) 2L); + assertThat(datafile.getNullValueCounts()).containsEntry(1, (Long) 1L); + assertThat(datafile.getNullValueCounts()).containsEntry(2, (Long) 2L); + assertThat(datafile.getNullValueCounts()).containsEntry(3, (Long) 0L); + assertThat(datafile.getNullValueCounts()).containsEntry(4, (Long) 2L); // Check per-column lower bound - assertEquals(datafile.getLowerBounds().get(1), "3"); - assertEquals(datafile.getLowerBounds().get(2), "3.4"); - assertEquals(datafile.getLowerBounds().get(3), "aaa"); - assertEquals(datafile.getLowerBounds().get(4), "2020-01-01T00:00:00.123"); + assertThat(datafile.getLowerBounds()).containsEntry(1, "3"); + assertThat(datafile.getLowerBounds()).containsEntry(2, "3.4"); + assertThat(datafile.getLowerBounds()).containsEntry(3, "aaa"); + assertThat(datafile.getLowerBounds()).containsEntry(4, "2020-01-01T00:00:00.123"); assertUpdate("DROP TABLE test_with_nulls"); assertUpdate("CREATE TABLE test_all_nulls (_integer INTEGER)"); assertUpdate("INSERT INTO test_all_nulls VALUES null, null, null", 3); materializedResult = computeActual("SELECT * FROM \"test_all_nulls$files\""); - assertEquals(materializedResult.getRowCount(), 1); + 
assertThat(materializedResult.getRowCount()).isEqualTo(1); datafile = toDataFileRecord(materializedResult.getMaterializedRows().get(0)); // Check per-column value count - assertEquals(datafile.getValueCounts().get(1), (Long) 3L); + assertThat(datafile.getValueCounts()).containsEntry(1, (Long) 3L); // Check per-column null value count - assertEquals(datafile.getNullValueCounts().get(1), (Long) 3L); + assertThat(datafile.getNullValueCounts()).containsEntry(1, (Long) 3L); // Check that lower bounds and upper bounds are nulls. (There's no non-null record) - assertNull(datafile.getLowerBounds()); - assertNull(datafile.getUpperBounds()); + assertThat(datafile.getLowerBounds()).isNull(); + assertThat(datafile.getUpperBounds()).isNull(); assertUpdate("DROP TABLE test_all_nulls"); } @@ -326,21 +328,21 @@ public void testWithNaNs() assertUpdate("CREATE TABLE test_with_nans (_int INTEGER, _real REAL, _double DOUBLE)"); assertUpdate("INSERT INTO test_with_nans VALUES (1, 1.1, 1.1), (2, nan(), 4.5), (3, 4.6, -nan())", 3); MaterializedResult materializedResult = computeActual("SELECT * FROM \"test_with_nans$files\""); - assertEquals(materializedResult.getRowCount(), 1); + assertThat(materializedResult.getRowCount()).isEqualTo(1); DataFileRecord datafile = toDataFileRecord(materializedResult.getMaterializedRows().get(0)); // Check per-column value count - datafile.getValueCounts().values().forEach(valueCount -> assertEquals(valueCount, (Long) 3L)); + datafile.getValueCounts().values().forEach(valueCount -> assertThat(valueCount).isEqualTo((Long) 3L)); // Check per-column nan value count - assertEquals(datafile.getNanValueCounts().size(), 2); - assertEquals(datafile.getNanValueCounts().get(2), (Long) 1L); - assertEquals(datafile.getNanValueCounts().get(3), (Long) 1L); + assertThat(datafile.getNanValueCounts()).hasSize(2); + assertThat(datafile.getNanValueCounts()).containsEntry(2, (Long) 1L); + assertThat(datafile.getNanValueCounts()).containsEntry(3, (Long) 1L); - assertNull(datafile.getLowerBounds().get(2)); - assertNull(datafile.getLowerBounds().get(3)); - assertNull(datafile.getUpperBounds().get(2)); - assertNull(datafile.getUpperBounds().get(3)); + assertThat(datafile.getLowerBounds().get(2)).isNull(); + assertThat(datafile.getLowerBounds().get(3)).isNull(); + assertThat(datafile.getUpperBounds().get(2)).isNull(); + assertThat(datafile.getUpperBounds().get(3)).isNull(); assertUpdate("DROP TABLE test_with_nans"); } @@ -355,7 +357,7 @@ public void testNestedTypes() "(8, ROW(0, ARRAY[14, 17, 21], 3.9)), " + "(3, ROW(10, ARRAY[15, 18, 22], 4.9))", 4); MaterializedResult materializedResult = computeActual("SELECT * FROM \"test_nested_types$files\""); - assertEquals(materializedResult.getRowCount(), 1); + assertThat(materializedResult.getRowCount()).isEqualTo(1); DataFileRecord datafile = toDataFileRecord(materializedResult.getMaterializedRows().get(0)); Map lowerBounds = datafile.getLowerBounds(); @@ -365,20 +367,20 @@ public void testNestedTypes() // 1. top-level primitive columns // 2. 
and nested primitive fields that are not descendants of LISTs or MAPs // should appear in lowerBounds or UpperBounds - assertEquals(lowerBounds.size(), 3); - assertEquals(upperBounds.size(), 3); + assertThat(lowerBounds).hasSize(3); + assertThat(upperBounds).hasSize(3); // col1 - assertEquals(lowerBounds.get(1), "-9"); - assertEquals(upperBounds.get(1), "8"); + assertThat(lowerBounds).containsEntry(1, "-9"); + assertThat(upperBounds).containsEntry(1, "8"); // col2.f1 (key in lowerBounds/upperBounds is Iceberg ID) - assertEquals(lowerBounds.get(3), "0"); - assertEquals(upperBounds.get(3), "10"); + assertThat(lowerBounds).containsEntry(3, "0"); + assertThat(upperBounds).containsEntry(3, "10"); // col2.f3 (key in lowerBounds/upperBounds is Iceberg ID) - assertEquals(lowerBounds.get(5), "-2.9"); - assertEquals(upperBounds.get(5), "4.9"); + assertThat(lowerBounds).containsEntry(5, "-2.9"); + assertThat(upperBounds).containsEntry(5, "4.9"); assertUpdate("DROP TABLE test_nested_types"); } @@ -392,27 +394,27 @@ public void testWithTimestamps() "(TIMESTAMP '2021-01-01 00:00:00.222222'), " + "(TIMESTAMP '2021-01-31 00:00:00.333333')", 3); MaterializedResult materializedResult = computeActual("SELECT * FROM \"test_timestamp$files\""); - assertEquals(materializedResult.getRowCount(), 1); + assertThat(materializedResult.getRowCount()).isEqualTo(1); DataFileRecord datafile = toDataFileRecord(materializedResult.getMaterializedRows().get(0)); // Check file format - assertEquals(datafile.getFileFormat(), "ORC"); + assertThat(datafile.getFileFormat()).isEqualTo("ORC"); // Check file row count - assertEquals(datafile.getRecordCount(), 3L); + assertThat(datafile.getRecordCount()).isEqualTo(3L); // Check per-column value count - datafile.getValueCounts().values().forEach(valueCount -> assertEquals(valueCount, (Long) 3L)); + datafile.getValueCounts().values().forEach(valueCount -> assertThat(valueCount).isEqualTo((Long) 3L)); // Check per-column null value count - datafile.getNullValueCounts().values().forEach(nullValueCount -> assertEquals(nullValueCount, (Long) 0L)); + datafile.getNullValueCounts().values().forEach(nullValueCount -> assertThat(nullValueCount).isEqualTo((Long) 0L)); // Check column lower bound. Min timestamp doesn't rely on file-level statistics and will not be truncated to milliseconds. - assertEquals(datafile.getLowerBounds().get(1), "2021-01-01T00:00:00.111"); + assertThat(datafile.getLowerBounds()).containsEntry(1, "2021-01-01T00:00:00.111"); assertQuery("SELECT min(_timestamp) FROM test_timestamp", "VALUES '2021-01-01 00:00:00.111111'"); // Check column upper bound. Max timestamp doesn't rely on file-level statistics and will not be truncated to milliseconds. 
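// Most hunks in this file follow the same mechanical migration from TestNG-style asserts to
// AssertJ; a minimal sketch of the mapping (the local variable name "bounds" is illustrative):
//
//   Map<Integer, String> bounds = datafile.getUpperBounds();
//   assertEquals(bounds.get(1), "2021-01-31T00:00:00.333999");         // before (TestNG)
//   assertThat(bounds).containsEntry(1, "2021-01-31T00:00:00.333999"); // after (AssertJ)
//   assertNull(datafile.getNanValueCounts());                          // before
//   assertThat(datafile.getNanValueCounts()).isNull();                 // after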
- assertEquals(datafile.getUpperBounds().get(1), "2021-01-31T00:00:00.333999"); + assertThat(datafile.getUpperBounds()).containsEntry(1, "2021-01-31T00:00:00.333999"); assertQuery("SELECT max(_timestamp) FROM test_timestamp", "VALUES '2021-01-31 00:00:00.333333'"); assertUpdate("DROP TABLE test_timestamp"); diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergOrcWithBloomFilters.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergOrcWithBloomFilters.java index 38d3652852ef..44f51b4c75e6 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergOrcWithBloomFilters.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergOrcWithBloomFilters.java @@ -15,8 +15,15 @@ import io.trino.testing.BaseOrcWithBloomFiltersTest; import io.trino.testing.QueryRunner; +import io.trino.testing.sql.TestTable; +import org.junit.jupiter.api.Test; +import java.util.Map; + +import static com.google.common.collect.ImmutableMap.toImmutableMap; +import static io.trino.testing.TestingNames.randomNameSuffix; import static java.lang.String.format; +import static org.assertj.core.api.Assertions.assertThat; public class TestIcebergOrcWithBloomFilters extends BaseOrcWithBloomFiltersTest @@ -26,6 +33,7 @@ protected QueryRunner createQueryRunner() throws Exception { return IcebergQueryRunner.builder() + .addIcebergProperty("iceberg.file-format", "ORC") .addIcebergProperty("hive.orc.bloom-filters.enabled", "true") .addIcebergProperty("hive.orc.default-bloom-filter-fpp", "0.001") .build(); @@ -39,4 +47,70 @@ protected String getTableProperties(String bloomFilterColumnName, String bucketi bloomFilterColumnName, bucketingColumnName); } + + @Test + public void testBloomFilterPropertiesArePersistedDuringCreate() + { + String tableName = "test_metadata_write_properties_" + randomNameSuffix(); + assertQuerySucceeds("CREATE TABLE " + tableName + " (a bigint, b bigint, c bigint) WITH (" + + "format = 'orc'," + + "orc_bloom_filter_columns = array['a','b']," + + "orc_bloom_filter_fpp = 0.1)"); + + assertThat(getTableProperties(tableName)) + .containsEntry("write.orc.bloom.filter.columns", "a,b") + .containsEntry("write.orc.bloom.filter.fpp", "0.1"); + + assertThat((String) computeScalar("SHOW CREATE TABLE " + tableName)) + .contains("orc_bloom_filter_columns", "orc_bloom_filter_fpp"); + } + + @Test + void testBloomFilterPropertiesArePersistedDuringSetProperties() + { + String tableName = "test_metadata_write_properties_" + randomNameSuffix(); + assertQuerySucceeds("CREATE TABLE " + tableName + "(A bigint, b bigint, c bigint)"); + + assertUpdate("ALTER TABLE " + tableName + " SET PROPERTIES orc_bloom_filter_columns = ARRAY['a','B']"); + assertThat(getTableProperties(tableName)) + .containsEntry("write.orc.bloom.filter.columns", "a,b"); + + assertUpdate("ALTER TABLE " + tableName + " SET PROPERTIES orc_bloom_filter_columns = ARRAY['a']"); + assertThat(getTableProperties(tableName)) + .containsEntry("write.orc.bloom.filter.columns", "a"); + + assertUpdate("ALTER TABLE " + tableName + " SET PROPERTIES orc_bloom_filter_columns = ARRAY[]"); + assertThat(getTableProperties(tableName)) + .doesNotContainKey("write.orc.bloom.filter.columns"); + } + + @Test + void testInvalidBloomFilterProperties() + { + String tableName = "test_invalid_bloom_filter_properties_" + randomNameSuffix(); + assertQueryFails( + "CREATE TABLE " + tableName + "(x int) WITH (orc_bloom_filter_columns = ARRAY['missing_column'])", + "\\QOrc bloom filter columns 
[missing_column] not present in schema"); + + assertQuerySucceeds("CREATE TABLE " + tableName + "(x array(integer))"); + assertQueryFails( + "ALTER TABLE " + tableName + " SET PROPERTIES orc_bloom_filter_columns = ARRAY['missing_column']", + "\\QOrc bloom filter columns [missing_column] not present in schema"); + } + + @Test + void testInvalidOrcBloomFilterPropertiesOnParquet() + { + try (TestTable table = newTrinoTable("test_orc_bloom_filter", "(x int) WITH (format = 'PARQUET')")) { + assertQueryFails( + "ALTER TABLE " + table.getName() + " SET PROPERTIES orc_bloom_filter_columns = ARRAY['x']", + "Cannot specify orc_bloom_filter_columns table property for storage format: PARQUET"); + } + } + + private Map getTableProperties(String tableName) + { + return computeActual("SELECT key, value FROM \"" + tableName + "$properties\"").getMaterializedRows().stream() + .collect(toImmutableMap(row -> (String) row.getField(0), row -> (String) row.getField(1))); + } } diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergParquetComplexTypesPredicatePushDown.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergParquetComplexTypesPredicatePushDown.java new file mode 100644 index 000000000000..87d916362b4b --- /dev/null +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergParquetComplexTypesPredicatePushDown.java @@ -0,0 +1,62 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.iceberg; + +import io.trino.Session; +import io.trino.testing.BaseComplexTypesPredicatePushDownTest; +import io.trino.testing.QueryRunner; +import org.junit.jupiter.api.Test; + +import static io.trino.plugin.iceberg.IcebergTestUtils.withSmallRowGroups; +import static io.trino.testing.TestingNames.randomNameSuffix; + +public class TestIcebergParquetComplexTypesPredicatePushDown + extends BaseComplexTypesPredicatePushDownTest +{ + @Override + protected QueryRunner createQueryRunner() + throws Exception + { + return IcebergQueryRunner.builder() + .addIcebergProperty("iceberg.file-format", "PARQUET") + .build(); + } + + @Override + protected final Session getSession() + { + return withSmallRowGroups(super.getSession()); + } + + // The Iceberg table scan differs from Hive in that the coordinator also uses file statistics when generating the splits. + // As a result, if the predicates fall outside the bounds of the file statistics, + // the split is not created for the worker, and the worker won't call getParquetTupleDomain(). + // This test increases the number of row groups and introduces predicates that are within the file statistics but outside the statistics of the row groups.
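A hedged sketch of how the pruning described above can be observed directly through operator statistics; it assumes the getDistributedQueryRunner, executeWithPlan and getOperatorStats helpers that TestIcebergParquetConnectorTest uses later in this patch, and is equivalent in spirit to the assertNoDataRead calls in the test that follows (illustration only, not part of the change):

    // Sketch only: relies on helpers available in the connector test hierarchy used elsewhere in this patch.
    private void assertAllRowGroupsPruned(String tableName)
    {
        MaterializedResultWithPlan result = getDistributedQueryRunner()
                .executeWithPlan(getSession(), "SELECT * FROM " + tableName + " WHERE col1Row.a = 101");
        OperatorStats stats = getOperatorStats(result.queryId());
        // The predicate falls inside the file-level bounds, so a split is created, but every
        // Parquet row group is eliminated by its min/max statistics and nothing is read.
        assertThat(stats.getPhysicalInputPositions()).isEqualTo(0);
    }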
+ @Test + public void testIcebergParquetRowTypeRowGroupPruning() + { + String tableName = "test_nested_column_pruning_" + randomNameSuffix(); + assertUpdate("CREATE TABLE " + tableName + " (col1Row ROW(a BIGINT, b BIGINT), col2 BIGINT) WITH (sorted_by=ARRAY['col2'])"); + assertUpdate("INSERT INTO " + tableName + " SELECT * FROM unnest(transform(SEQUENCE(1, 10000), x -> ROW(ROW(x*2, 100), x)))", 10000); + + // col1Row.a only contains even numbers, in the range of [2, 20000]. + // The test has roughly 50 rows per row group due to withSmallRowGroups, [2, 100], [102, 200], ... [19902, 20000] + // 101 is a value between [2, 20000] but is an odd number, so won't be discarded by Iceberg table's statistics. + // At the same time, 101 is not within the bound of any row group. So can be discarded by Parquet's row group statistics. + assertNoDataRead("SELECT * FROM " + tableName + " WHERE col1Row.a = 101"); + assertNoDataRead("SELECT * FROM " + tableName + " WHERE col1Row.a IS NULL"); + + assertUpdate("DROP TABLE " + tableName); + } +} diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergParquetConnectorTest.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergParquetConnectorTest.java index 16633c9f091f..29ed4e0eaa22 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergParquetConnectorTest.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergParquetConnectorTest.java @@ -13,19 +13,31 @@ */ package io.trino.plugin.iceberg; +import io.trino.Session; +import io.trino.execution.QueryManagerConfig; +import io.trino.filesystem.Location; +import io.trino.operator.OperatorStats; +import io.trino.parquet.metadata.ParquetMetadata; import io.trino.testing.MaterializedResult; +import io.trino.testing.QueryRunner; +import io.trino.testing.QueryRunner.MaterializedResultWithPlan; import io.trino.testing.sql.TestTable; -import org.testng.annotations.Test; +import org.intellij.lang.annotations.Language; +import org.junit.jupiter.api.Test; +import java.time.ZonedDateTime; +import java.time.format.DateTimeFormatter; import java.util.Optional; import java.util.stream.Collectors; import java.util.stream.IntStream; import static io.trino.plugin.iceberg.IcebergFileFormat.PARQUET; import static io.trino.plugin.iceberg.IcebergTestUtils.checkParquetFileSorting; +import static io.trino.plugin.iceberg.IcebergTestUtils.getParquetFileMetadata; import static io.trino.plugin.iceberg.IcebergTestUtils.withSmallRowGroups; -import static org.assertj.core.api.Assertions.assertThatThrownBy; -import static org.testng.Assert.assertEquals; +import static io.trino.testing.QueryAssertions.assertEqualsIgnoreOrder; +import static java.time.ZoneOffset.UTC; +import static org.assertj.core.api.Assertions.assertThat; public class TestIcebergParquetConnectorTest extends BaseIcebergConnectorTest @@ -45,21 +57,16 @@ protected boolean supportsIcebergFileStatistics(String typeName) protected boolean supportsRowGroupStatistics(String typeName) { return !(typeName.equalsIgnoreCase("varbinary") || + typeName.equalsIgnoreCase("time") || typeName.equalsIgnoreCase("time(6)") || + typeName.equalsIgnoreCase("timestamp(3) with time zone") || typeName.equalsIgnoreCase("timestamp(6) with time zone")); } - @Override - protected boolean supportsPhysicalPushdown() - { - return true; - } - @Test public void testRowGroupResetDictionary() { - try (TestTable table = new TestTable( - getQueryRunner()::execute, + try (TestTable table = newTrinoTable( 
"test_row_group_reset_dictionary", "(plain_col varchar, dict_col int)")) { String tableName = table.getName(); @@ -68,8 +75,8 @@ public void testRowGroupResetDictionary() .collect(Collectors.joining(", ")); assertUpdate(withSmallRowGroups(getSession()), "INSERT INTO " + tableName + " VALUES " + values, 100); - MaterializedResult result = getDistributedQueryRunner().execute(String.format("SELECT * FROM %s", tableName)); - assertEquals(result.getRowCount(), 100); + MaterializedResult result = getDistributedQueryRunner().execute("SELECT * FROM " + tableName); + assertThat(result.getRowCount()).isEqualTo(100); } } @@ -84,18 +91,118 @@ protected Optional filterSetColumnTypesDataProvider(SetColum return super.filterSetColumnTypesDataProvider(setup); } - @Override - public void testDropAmbiguousRowFieldCaseSensitivity() + @Test + public void testIgnoreParquetStatistics() + { + try (TestTable table = newTrinoTable( + "test_ignore_parquet_statistics", + "WITH (sorted_by = ARRAY['custkey']) AS TABLE tpch.tiny.customer WITH NO DATA")) { + assertUpdate( + withSmallRowGroups(getSession()), + "INSERT INTO " + table.getName() + " TABLE tpch.tiny.customer", + "VALUES 1500"); + + @Language("SQL") String query = "SELECT * FROM " + table.getName() + " WHERE custkey = 100"; + + QueryRunner queryRunner = getDistributedQueryRunner(); + MaterializedResultWithPlan resultWithoutParquetStatistics = queryRunner.executeWithPlan( + Session.builder(getSession()) + .setCatalogSessionProperty(getSession().getCatalog().orElseThrow(), "parquet_ignore_statistics", "true") + .build(), + query); + OperatorStats queryStatsWithoutParquetStatistics = getOperatorStats(resultWithoutParquetStatistics.queryId()); + assertThat(queryStatsWithoutParquetStatistics.getPhysicalInputPositions()).isGreaterThan(0); + + MaterializedResultWithPlan resultWithParquetStatistics = queryRunner.executeWithPlan(getSession(), query); + OperatorStats queryStatsWithParquetStatistics = getOperatorStats(resultWithParquetStatistics.queryId()); + assertThat(queryStatsWithParquetStatistics.getPhysicalInputPositions()).isGreaterThan(0); + assertThat(queryStatsWithParquetStatistics.getPhysicalInputPositions()) + .isLessThan(queryStatsWithoutParquetStatistics.getPhysicalInputPositions()); + + assertEqualsIgnoreOrder(resultWithParquetStatistics.result(), resultWithoutParquetStatistics.result()); + } + } + + @Test + public void testPushdownPredicateToParquetAfterColumnRename() + { + try (TestTable table = newTrinoTable( + "test_pushdown_predicate_statistics", + "WITH (sorted_by = ARRAY['custkey']) AS TABLE tpch.tiny.customer WITH NO DATA")) { + assertUpdate( + withSmallRowGroups(getSession()), + "INSERT INTO " + table.getName() + " TABLE tpch.tiny.customer", + "VALUES 1500"); + + assertUpdate("ALTER TABLE " + table.getName() + " RENAME COLUMN custkey TO custkey1"); + + QueryRunner queryRunner = getDistributedQueryRunner(); + MaterializedResultWithPlan resultWithoutPredicate = queryRunner.executeWithPlan(getSession(), "TABLE " + table.getName()); + OperatorStats queryStatsWithoutPredicate = getOperatorStats(resultWithoutPredicate.queryId()); + assertThat(queryStatsWithoutPredicate.getPhysicalInputPositions()).isGreaterThan(0); + assertThat(resultWithoutPredicate.result()).hasSize(1500); + + @Language("SQL") String selectiveQuery = "SELECT * FROM " + table.getName() + " WHERE custkey1 = 100"; + MaterializedResultWithPlan selectiveQueryResult = queryRunner.executeWithPlan(getSession(), selectiveQuery); + OperatorStats queryStatsSelectiveQuery = 
getOperatorStats(selectiveQueryResult.queryId()); + assertThat(queryStatsSelectiveQuery.getPhysicalInputPositions()).isGreaterThan(0); + assertThat(queryStatsSelectiveQuery.getPhysicalInputPositions()) + .isLessThan(queryStatsWithoutPredicate.getPhysicalInputPositions()); + assertThat(selectiveQueryResult.result()).hasSize(1); + } + } + + @Test + void testTableChangesOnMultiRowGroups() + throws Exception + { + try (TestTable table = newTrinoTable( + "test_table_changes_function_multi_row_groups_", + "AS SELECT orderkey, partkey, suppkey FROM tpch.tiny.lineitem WITH NO DATA")) { + long initialSnapshot = getMostRecentSnapshotId(table.getName()); + assertUpdate( + withSmallRowGroups(getSession()), + "INSERT INTO %s SELECT orderkey, partkey, suppkey FROM tpch.tiny.lineitem".formatted(table.getName()), + 60175L); + long snapshotAfterInsert = getMostRecentSnapshotId(table.getName()); + DateTimeFormatter instantMillisFormatter = DateTimeFormatter.ofPattern("uuuu-MM-dd'T'HH:mm:ss.SSSVV").withZone(UTC); + String snapshotAfterInsertTime = getSnapshotTime(table.getName(), snapshotAfterInsert).format(instantMillisFormatter); + + // make sure splits are processed in more than one batch + // Decrease parquet row groups size or add more columns if this test fails + String filePath = getOnlyTableFilePath(table.getName()); + ParquetMetadata parquetMetadata = getParquetFileMetadata(fileSystem.newInputFile(Location.of(filePath))); + int blocksSize = parquetMetadata.getBlocks().size(); + int splitBatchSize = new QueryManagerConfig().getScheduleSplitBatchSize(); + assertThat(blocksSize > splitBatchSize && blocksSize % splitBatchSize != 0).isTrue(); + + assertQuery( + """ + SELECT orderkey, partkey, suppkey, _change_type, _change_version_id, to_iso8601(_change_timestamp), _change_ordinal + FROM TABLE(system.table_changes(CURRENT_SCHEMA, '%s', %s, %s)) + """.formatted(table.getName(), initialSnapshot, snapshotAfterInsert), + "SELECT orderkey, partkey, suppkey, 'insert', %s, '%s', 0 FROM lineitem".formatted(snapshotAfterInsert, snapshotAfterInsertTime)); + } + } + + private String getOnlyTableFilePath(String tableName) + { + return (String) computeScalar("SELECT file_path FROM \"" + tableName + "$files\""); + } + + private long getMostRecentSnapshotId(String tableName) + { + return (long) computeScalar("SELECT snapshot_id FROM \"" + tableName + "$snapshots\" ORDER BY committed_at DESC LIMIT 1"); + } + + private ZonedDateTime getSnapshotTime(String tableName, long snapshotId) { - // TODO https://github.com/trinodb/trino/issues/16273 The connector can't read row types having ambiguous field names in Parquet files. e.g. 
row(X int, x int) - assertThatThrownBy(super::testDropAmbiguousRowFieldCaseSensitivity) - .hasMessageContaining("Error opening Iceberg split") - .hasStackTraceContaining("Multiple entries with same key"); + return (ZonedDateTime) computeScalar("SELECT committed_at FROM \"" + tableName + "$snapshots\" WHERE snapshot_id = " + snapshotId); } @Override protected boolean isFileSorted(String path, String sortColumnName) { - return checkParquetFileSorting(path, sortColumnName); + return checkParquetFileSorting(fileSystem.newInputFile(Location.of(path)), sortColumnName); } } diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergParquetWithBloomFilters.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergParquetWithBloomFilters.java index 54b3c936b1d1..a3f4648c3a7b 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergParquetWithBloomFilters.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergParquetWithBloomFilters.java @@ -13,20 +13,21 @@ */ package io.trino.plugin.iceberg; -import com.google.common.collect.ImmutableMap; -import io.trino.plugin.hive.TestingHivePlugin; +import com.google.common.base.Joiner; import io.trino.spi.connector.CatalogSchemaTableName; import io.trino.spi.connector.SchemaTableName; import io.trino.testing.BaseTestParquetWithBloomFilters; -import io.trino.testing.DistributedQueryRunner; +import io.trino.testing.MaterializedResult; import io.trino.testing.QueryRunner; +import org.junit.jupiter.api.Test; -import java.nio.file.Path; import java.util.List; -import static io.trino.plugin.hive.parquet.TestHiveParquetWithBloomFilters.writeParquetFileWithBloomFilter; +import static io.trino.testing.MaterializedResult.resultBuilder; +import static io.trino.testing.QueryAssertions.assertContains; import static io.trino.testing.TestingNames.randomNameSuffix; import static java.lang.String.format; +import static org.assertj.core.api.Assertions.assertThat; public class TestIcebergParquetWithBloomFilters extends BaseTestParquetWithBloomFilters @@ -35,18 +36,7 @@ public class TestIcebergParquetWithBloomFilters protected QueryRunner createQueryRunner() throws Exception { - DistributedQueryRunner queryRunner = IcebergQueryRunner.builder().build(); - dataDirectory = queryRunner.getCoordinator().getBaseDataDir().resolve("iceberg_data"); - - // create hive catalog - queryRunner.installPlugin(new TestingHivePlugin()); - queryRunner.createCatalog("hive", "hive", ImmutableMap.builder() - .put("hive.metastore", "file") - .put("hive.metastore.catalog.dir", dataDirectory.toString()) - .put("hive.security", "allow-all") - .buildOrThrow()); - - return queryRunner; + return IcebergQueryRunner.builder().build(); } @Override @@ -54,18 +44,72 @@ protected CatalogSchemaTableName createParquetTableWithBloomFilter(String column { // create the managed table String tableName = "parquet_with_bloom_filters_" + randomNameSuffix(); - CatalogSchemaTableName hiveCatalogSchemaTableName = new CatalogSchemaTableName("hive", new SchemaTableName("tpch", tableName)); - CatalogSchemaTableName icebergCatalogSchemaTableName = new CatalogSchemaTableName("iceberg", new SchemaTableName("tpch", tableName)); - assertUpdate(format("CREATE TABLE %s (%s INT) WITH (format = 'PARQUET')", hiveCatalogSchemaTableName, columnName)); + CatalogSchemaTableName catalogSchemaTableName = new CatalogSchemaTableName("iceberg", new SchemaTableName("tpch", tableName)); + assertUpdate(format("CREATE TABLE %s WITH (format = 'PARQUET', 
parquet_bloom_filter_columns = ARRAY['%s']) AS SELECT * FROM (VALUES %s) t(%s)", catalogSchemaTableName, columnName, Joiner.on(", ").join(testValues), columnName), testValues.size()); - // directly write data to the managed table - Path tableLocation = Path.of("%s/tpch/%s".formatted(dataDirectory, tableName)); - Path fileLocation = tableLocation.resolve("bloomFilterFile.parquet"); - writeParquetFileWithBloomFilter(fileLocation.toFile(), columnName, testValues); + return catalogSchemaTableName; + } + + @Test + public void testBloomFilterPropertiesArePersistedDuringCreate() + { + String tableName = "test_metadata_write_properties_" + randomNameSuffix(); + assertQuerySucceeds("CREATE TABLE " + tableName + " (A bigint, b bigint, c bigint) WITH (" + + "format = 'parquet'," + + "parquet_bloom_filter_columns = array['a','B'])"); + + verifyTableProperties(tableName); + } + + @Test + void testBloomFilterPropertiesArePersistedDuringSetProperties() + { + String tableName = "test_metadata_write_properties_" + randomNameSuffix(); + assertQuerySucceeds("CREATE TABLE " + tableName + "(A bigint, b bigint, c bigint)"); + + assertUpdate("ALTER TABLE " + tableName + " SET PROPERTIES parquet_bloom_filter_columns = ARRAY['a','B']"); + verifyTableProperties(tableName); - // migrate the hive table to the iceberg table - assertUpdate("CALL iceberg.system.migrate('tpch', '" + tableName + "', 'false')"); + assertUpdate("ALTER TABLE " + tableName + " SET PROPERTIES parquet_bloom_filter_columns = ARRAY['a']"); + assertThat((String) computeScalar("SHOW CREATE TABLE " + tableName)) + .contains("parquet_bloom_filter_columns = ARRAY['a']"); + + assertUpdate("ALTER TABLE " + tableName + " SET PROPERTIES parquet_bloom_filter_columns = ARRAY[]"); + assertThat((String) computeScalar("SHOW CREATE TABLE " + tableName)) + .doesNotContain("parquet_bloom_filter_columns"); + } + + @Test + void testInvalidBloomFilterProperties() + { + String tableName = "test_invalid_bloom_filter_properties_" + randomNameSuffix(); + assertQueryFails( + "CREATE TABLE " + tableName + "(x int) WITH (parquet_bloom_filter_columns = ARRAY['missing_column'])", + "Parquet Bloom filter column missing_column not present in schema"); + assertQueryFails( + "CREATE TABLE " + tableName + "(x array(int)) WITH (parquet_bloom_filter_columns = ARRAY['x'])", + "\\QParquet Bloom filter column x has unsupported type array(integer)"); + + assertQuerySucceeds("CREATE TABLE " + tableName + "(x array(integer))"); + assertQueryFails( + "ALTER TABLE " + tableName + " SET PROPERTIES parquet_bloom_filter_columns = ARRAY['missing_column']", + "Parquet Bloom filter column missing_column not present in schema"); + assertQueryFails( + "ALTER TABLE " + tableName + " SET PROPERTIES parquet_bloom_filter_columns = ARRAY['x']", + "\\QParquet Bloom filter column x has unsupported type array(integer)"); + } + + private void verifyTableProperties(String tableName) + { + MaterializedResult actualProperties = computeActual("SELECT * FROM \"" + tableName + "$properties\""); + assertThat(actualProperties).isNotNull(); + MaterializedResult expectedProperties = resultBuilder(getSession()) + .row("write.parquet.bloom-filter-enabled.column.a", "true") + .row("write.parquet.bloom-filter-enabled.column.b", "true") + .build(); + assertContains(actualProperties, expectedProperties); - return icebergCatalogSchemaTableName; + assertThat((String) computeScalar("SHOW CREATE TABLE " + tableName)) + .contains("parquet_bloom_filter_columns"); } } diff --git 
a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergParquetWithBloomFiltersMixedCase.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergParquetWithBloomFiltersMixedCase.java new file mode 100644 index 000000000000..9e2e3196d127 --- /dev/null +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergParquetWithBloomFiltersMixedCase.java @@ -0,0 +1,110 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.iceberg; + +import com.google.common.base.Joiner; +import com.google.common.collect.ImmutableMap; +import io.trino.spi.connector.CatalogSchemaTableName; +import io.trino.spi.connector.SchemaTableName; +import io.trino.testing.BaseTestParquetWithBloomFilters; +import io.trino.testing.MaterializedResult; +import io.trino.testing.QueryRunner; +import io.trino.testing.containers.Minio; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.TestInstance; + +import java.util.List; + +import static io.trino.plugin.iceberg.IcebergQueryRunner.ICEBERG_CATALOG; +import static io.trino.testing.MaterializedResult.resultBuilder; +import static io.trino.testing.QueryAssertions.assertContains; +import static io.trino.testing.TestingNames.randomNameSuffix; +import static io.trino.testing.containers.Minio.MINIO_ACCESS_KEY; +import static io.trino.testing.containers.Minio.MINIO_REGION; +import static io.trino.testing.containers.Minio.MINIO_SECRET_KEY; +import static java.lang.String.format; +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.jupiter.api.TestInstance.Lifecycle.PER_CLASS; + +@TestInstance(PER_CLASS) +public class TestIcebergParquetWithBloomFiltersMixedCase + extends BaseTestParquetWithBloomFilters +{ + private static final String BUCKET_NAME = "test-bucket-mixed-case"; + + private Minio minio; + + @Override + protected QueryRunner createQueryRunner() + throws Exception + { + minio = closeAfterClass(Minio.builder().build()); + minio.start(); + minio.createBucket(BUCKET_NAME); + + QueryRunner queryRunner = IcebergQueryRunner.builder() + .setIcebergProperties( + ImmutableMap.builder() + .put("fs.native-s3.enabled", "true") + .put("s3.aws-access-key", MINIO_ACCESS_KEY) + .put("s3.aws-secret-key", MINIO_SECRET_KEY) + .put("s3.region", MINIO_REGION) + .put("s3.endpoint", minio.getMinioAddress()) + .put("s3.path-style-access", "true") + .put("iceberg.register-table-procedure.enabled", "true") + .buildOrThrow()) + .build(); + + queryRunner.execute("CREATE SCHEMA IF NOT EXISTS " + ICEBERG_CATALOG + ".tpch"); + return queryRunner; + } + + @Override + protected CatalogSchemaTableName createParquetTableWithBloomFilter(String columnName, List testValues) + { + minio.copyResources("iceberg/mixed_case_bloom_filter", BUCKET_NAME, "mixed_case_bloom_filter"); + String tableName = "test_iceberg_write_mixed_case_bloom_filter" + randomNameSuffix(); + assertUpdate(format( + "CALL system.register_table(CURRENT_SCHEMA, '%s', '%s')", + tableName, + 
format("s3://%s/mixed_case_bloom_filter", BUCKET_NAME))); + + CatalogSchemaTableName catalogSchemaTableName = new CatalogSchemaTableName("iceberg", new SchemaTableName("tpch", tableName)); + assertUpdate(format("INSERT INTO %s SELECT * FROM (VALUES %s) t(%s)", catalogSchemaTableName, Joiner.on(", ").join(testValues), columnName), testValues.size()); + + checkTableProperties(tableName); + + return catalogSchemaTableName; + } + + private void checkTableProperties(String tableName) + { + MaterializedResult actualProperties = computeActual("SELECT * FROM \"" + tableName + "$properties\""); + assertThat(actualProperties).isNotNull(); + MaterializedResult expectedProperties = resultBuilder(getSession()) + .row("write.parquet.bloom-filter-enabled.column.dataColumn", "true") + .build(); + assertContains(actualProperties, expectedProperties); + + assertThat((String) computeScalar("SHOW CREATE TABLE " + tableName)) + .contains("parquet_bloom_filter_columns"); + } + + @AfterAll + public void destroy() + throws Exception + { + minio = null; // closed by closeAfterClass + } +} diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergPartitionEvolution.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergPartitionEvolution.java index cef36f670cf0..e6a84857053f 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergPartitionEvolution.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergPartitionEvolution.java @@ -18,7 +18,7 @@ import io.trino.testing.MaterializedRow; import io.trino.testing.QueryRunner; import io.trino.tpch.TpchTable; -import org.testng.annotations.Test; +import org.junit.jupiter.api.Test; import java.util.List; @@ -26,7 +26,6 @@ import static io.trino.testing.TestingNames.randomNameSuffix; import static java.lang.Math.toIntExact; import static org.assertj.core.api.Assertions.assertThat; -import static org.testng.Assert.assertEquals; public class TestIcebergPartitionEvolution extends AbstractTestQueryFramework @@ -59,10 +58,10 @@ public void testRemovePartitioning() int expectedFileCount = computeActual("SELECT DISTINCT regionkey, substring(name, 1, 1) FROM nation WHERE nationkey < 10").getRowCount(); assertThat(partitionedFiles).hasSize(expectedFileCount); - assertEquals(partitionedFiles.stream().mapToLong(row -> (long) row.getField(1)).sum(), 10L); + assertThat(partitionedFiles.stream().mapToLong(row -> (long) row.getField(1)).sum()).isEqualTo(10L); assertThat(unpartitionedFiles).hasSize(1); - assertEquals((long) unpartitionedFiles.get(0).getField(1), 15); + assertThat((long) unpartitionedFiles.get(0).getField(1)).isEqualTo(15); assertQuery("SELECT * FROM " + tableName, "SELECT * FROM nation"); // Most partitions have one record each. 
regionkey=2, trunc_name=I has two records, and 15 records are unpartitioned @@ -90,11 +89,11 @@ public void testAddPartitionColumn() int expectedInitialFiles = toIntExact((long) computeActual("SELECT count(distinct regionkey) FROM nation WHERE nationkey < 10").getOnlyValue()); assertThat(initialFiles).hasSize(expectedInitialFiles); - assertEquals(initialFiles.stream().mapToLong(row -> (long) row.getField(1)).sum(), 10L); + assertThat(initialFiles.stream().mapToLong(row -> (long) row.getField(1)).sum()).isEqualTo(10L); int expectedFinalFileCount = computeActual("SELECT DISTINCT regionkey, substring(name, 1, 1) FROM nation WHERE nationkey >= 10").getRowCount(); assertThat(partitionedFiles).hasSize(expectedFinalFileCount); - assertEquals(partitionedFiles.stream().mapToLong(row -> (long) row.getField(1)).sum(), 15L); + assertThat(partitionedFiles.stream().mapToLong(row -> (long) row.getField(1)).sum()).isEqualTo(15L); assertQuery("SELECT * FROM " + tableName, "SELECT * FROM nation"); assertUpdate("DROP TABLE " + tableName); @@ -115,11 +114,11 @@ public void testAddPartitionColumn() expectedInitialFiles = computeActual("SELECT DISTINCT substring(name, 1, 1) FROM nation WHERE nationkey < 10").getRowCount(); assertThat(initialFiles).hasSize(expectedInitialFiles); - assertEquals(initialFiles.stream().mapToLong(row -> (long) row.getField(1)).sum(), 10L); + assertThat(initialFiles.stream().mapToLong(row -> (long) row.getField(1)).sum()).isEqualTo(10L); expectedFinalFileCount = computeActual("SELECT DISTINCT regionkey, substring(name, 1, 1) FROM nation WHERE nationkey >= 10").getRowCount(); assertThat(partitionedFiles).hasSize(expectedFinalFileCount); - assertEquals(partitionedFiles.stream().mapToLong(row -> (long) row.getField(1)).sum(), 15L); + assertThat(partitionedFiles.stream().mapToLong(row -> (long) row.getField(1)).sum()).isEqualTo(15L); assertQuery("SELECT * FROM " + tableName, "SELECT * FROM nation"); assertUpdate("DROP TABLE " + tableName); @@ -156,13 +155,127 @@ public void testChangePartitionTransform() } @Test - public void testUnsupportedNestedFieldPartition() + public void testAddNestedPartitioning() { - String tableName = "test_unsupported_nested_field_partition_" + randomNameSuffix(); - assertUpdate("CREATE TABLE " + tableName + "(parent ROW(child VARCHAR))"); - assertQueryFails( - "ALTER TABLE " + tableName + " SET PROPERTIES partitioning = ARRAY['\"parent.child\"']", - "Partitioning by nested field is unsupported: parent.child"); + String tableName = "test_add_nested_partition_" + randomNameSuffix(); + assertUpdate("CREATE TABLE " + tableName + " (id INT, district ROW(name VARCHAR), state ROW(name VARCHAR)) WITH (partitioning = ARRAY['\"state.name\"'])"); + assertUpdate( + "INSERT INTO " + tableName + " VALUES " + + "(1, ROW('Patna'), ROW('BH')), " + + "(2, ROW('Gaya'), ROW('BH')), " + + "(3, ROW('Bengaluru'), ROW('KA')), " + + "(4, ROW('Mengaluru'), ROW('KA'))", + 4); + + assertUpdate("ALTER TABLE " + tableName + " SET PROPERTIES partitioning = ARRAY['\"state.name\"', '\"district.name\"']"); + + assertThat((String) computeActual("SHOW CREATE TABLE " + tableName).getOnlyValue()).contains("partitioning = ARRAY['\"state.name\"','\"district.name\"']"); + + assertUpdate( + "INSERT INTO " + tableName + " VALUES " + + "(1, ROW('Patna'), ROW('BH')), " + + "(2, ROW('Patna'), ROW('BH')), " + + "(3, ROW('Bengaluru'), ROW('KA')), " + + "(4, ROW('Mengaluru'), ROW('KA'))", + 4); + + List files = computeActual("SELECT file_path, record_count FROM \"" + tableName + 
"$files\"").getMaterializedRows(); + List initialPartitionedFiles = files.stream() + .filter(file -> !((String) file.getField(0)).contains("district.name=")) + .collect(toImmutableList()); + + List laterPartitionedFiles = files.stream() + .filter(file -> ((String) file.getField(0)).contains("district.name=")) + .collect(toImmutableList()); + + assertThat(initialPartitionedFiles).hasSize(2); + assertThat(initialPartitionedFiles.stream().mapToLong(row -> (long) row.getField(1)).sum()).isEqualTo(4L); + + assertThat(laterPartitionedFiles).hasSize(3); + assertThat(laterPartitionedFiles.stream().mapToLong(row -> (long) row.getField(1)).sum()).isEqualTo(4L); + + assertUpdate("DROP TABLE " + tableName); + } + + @Test + public void testRemoveNestedPartitioning() + { + String tableName = "test_remove_nested_partition_" + randomNameSuffix(); + assertUpdate("CREATE TABLE " + tableName + " (id INT, district ROW(name VARCHAR), state ROW(name VARCHAR)) WITH (partitioning = ARRAY['\"state.name\"'])"); + assertUpdate( + "INSERT INTO " + tableName + " VALUES " + + "(1, ROW('Patna'), ROW('BH')), " + + "(2, ROW('Gaya'), ROW('BH')), " + + "(3, ROW('Bengaluru'), ROW('KA')), " + + "(4, ROW('Mengaluru'), ROW('KA'))", + 4); + + assertUpdate("ALTER TABLE " + tableName + " SET PROPERTIES partitioning = ARRAY[]"); + + assertUpdate( + "INSERT INTO " + tableName + " VALUES " + + "(1, ROW('Patna'), ROW('BH')), " + + "(2, ROW('Gaya'), ROW('BH')), " + + "(3, ROW('Bengaluru'), ROW('KA')), " + + "(4, ROW('Mengaluru'), ROW('KA'))", + 4); + + List files = computeActual("SELECT file_path, record_count FROM \"" + tableName + "$files\"").getMaterializedRows(); + List unpartitionedFiles = files.stream() + .filter(file -> !((String) file.getField(0)).contains("state.name=")) + .collect(toImmutableList()); + + List partitionedFiles = files.stream() + .filter(file -> ((String) file.getField(0)).contains("state.name=")) + .collect(toImmutableList()); + + assertThat(partitionedFiles).hasSize(2); + assertThat(partitionedFiles.stream().mapToLong(row -> (long) row.getField(1)).sum()).isEqualTo(4L); + + assertThat(unpartitionedFiles).hasSize(1); + assertThat((long) unpartitionedFiles.get(0).getField(1)).isEqualTo(4); + + assertUpdate("DROP TABLE " + tableName); + } + + @Test + public void testNestedFieldChangePartitionTransform() + { + String tableName = "test_nested_field_change_partition_transform_" + randomNameSuffix(); + assertUpdate("CREATE TABLE " + tableName + " (grandparent ROW(parent ROW(ts TIMESTAMP(6), a INT), b INT), c INT) " + + "WITH (partitioning = ARRAY['year(\"grandparent.parent.ts\")'])"); + assertUpdate( + "INSERT INTO " + tableName + " VALUES " + + "(ROW(ROW(TIMESTAMP '2021-01-01 01:01:01.111111', 1), 1), 1), " + + "(ROW(ROW(TIMESTAMP '2022-02-02 02:02:02.222222', 2), 2), 2), " + + "(ROW(ROW(TIMESTAMP '2023-03-03 03:03:03.333333', 3), 3), 3)", + 3); + assertUpdate("ALTER TABLE " + tableName + " SET PROPERTIES partitioning = ARRAY['month(\"grandparent.parent.ts\")']"); + assertUpdate( + "INSERT INTO " + tableName + " VALUES " + + "(ROW(ROW(TIMESTAMP '2024-04-04 04:04:04.444444', 4), 4), 4), " + + "(ROW(ROW(TIMESTAMP '2025-05-05 05:05:05.555555', 5), 5), 5)", + 2); + + assertThat((String) computeActual("SHOW CREATE TABLE " + tableName).getOnlyValue()).contains("partitioning = ARRAY['month(\"grandparent.parent.ts\")']"); + + List files = computeActual("SELECT file_path, record_count FROM \"" + tableName + "$files\"").getMaterializedRows(); + List yearPartitionedFiles = files.stream() + .filter(file -> { + String filePath 
= ((String) file.getField(0)); + return filePath.contains("grandparent.parent.ts_year=") && !filePath.contains("grandparent.parent.ts_month="); + }) + .collect(toImmutableList()); + + List monthPartitionedFiles = files.stream() + .filter(file -> { + String filePath = ((String) file.getField(0)); + return !filePath.contains("grandparent.parent.ts_year=") && filePath.contains("grandparent.parent.ts_month="); + }) + .collect(toImmutableList()); + + assertThat(yearPartitionedFiles).hasSize(3); + assertThat(monthPartitionedFiles).hasSize(2); assertUpdate("DROP TABLE " + tableName); } } diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergPlugin.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergPlugin.java index 8395130d2174..f24a0f64c343 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergPlugin.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergPlugin.java @@ -14,18 +14,18 @@ package io.trino.plugin.iceberg; import com.google.common.collect.ImmutableMap; +import com.google.inject.CreationException; import io.airlift.bootstrap.ApplicationConfigurationException; import io.trino.spi.connector.Connector; import io.trino.spi.connector.ConnectorFactory; import io.trino.testing.TestingConnectorContext; -import org.testng.annotations.Test; +import org.junit.jupiter.api.Test; import java.io.File; import java.nio.file.Files; import java.util.Map; import static com.google.common.collect.Iterables.getOnlyElement; -import static io.trino.plugin.hive.HiveConfig.HIVE_VIEWS_ENABLED; import static org.assertj.core.api.Assertions.assertThatThrownBy; public class TestIcebergPlugin @@ -35,7 +35,12 @@ public void testCreateConnector() { ConnectorFactory factory = getConnectorFactory(); // simplest possible configuration - factory.create("test", Map.of("hive.metastore.uri", "thrift://foo:1234"), new TestingConnectorContext()).shutdown(); + factory.create( + "test", + Map.of( + "hive.metastore.uri", "thrift://foo:1234", + "bootstrap.quiet", "true"), + new TestingConnectorContext()).shutdown(); } @Test @@ -46,7 +51,8 @@ public void testTestingFileMetastore() "test", Map.of( "iceberg.catalog.type", "TESTING_FILE_METASTORE", - "hive.metastore.catalog.dir", "/tmp"), + "hive.metastore.catalog.dir", "/tmp", + "bootstrap.quiet", "true"), new TestingConnectorContext()) .shutdown(); } @@ -60,7 +66,8 @@ public void testThriftMetastore() "test", Map.of( "iceberg.catalog.type", "HIVE_METASTORE", - "hive.metastore.uri", "thrift://foo:1234"), + "hive.metastore.uri", "thrift://foo:1234", + "bootstrap.quiet", "true"), new TestingConnectorContext()) .shutdown(); @@ -69,7 +76,8 @@ public void testThriftMetastore() "test", Map.of( "hive.metastore.uri", "thrift://foo:1234", - "hive.metastore.glue.region", "us-east"), + "hive.metastore.glue.region", "us-east", + "bootstrap.quiet", "true"), new TestingConnectorContext())) .hasMessageContaining("Configuration property 'hive.metastore.glue.region' was not used"); } @@ -83,7 +91,8 @@ public void testHiveMetastoreRejected() "test", Map.of( "hive.metastore", "thrift", - "hive.metastore.uri", "thrift://foo:1234"), + "hive.metastore.uri", "thrift://foo:1234", + "bootstrap.quiet", "true"), new TestingConnectorContext())) .hasMessageContaining("Error: Configuration property 'hive.metastore' was not used"); } @@ -97,7 +106,8 @@ public void testGlueMetastore() "test", Map.of( "iceberg.catalog.type", "glue", - "hive.metastore.glue.region", "us-east-1"), + 
"hive.metastore.glue.region", "us-east-1", + "bootstrap.quiet", "true"), new TestingConnectorContext()) .shutdown(); @@ -105,7 +115,8 @@ public void testGlueMetastore() "test", Map.of( "iceberg.catalog.type", "glue", - "hive.metastore.uri", "thrift://foo:1234"), + "hive.metastore.uri", "thrift://foo:1234", + "bootstrap.quiet", "true"), new TestingConnectorContext())) .hasMessageContaining("Error: Configuration property 'hive.metastore.uri' was not used"); @@ -114,47 +125,12 @@ public void testGlueMetastore() Map.of( "iceberg.catalog.type", "glue", "hive.metastore.glue.catalogid", "123", - "hive.metastore.glue.region", "us-east-1"), + "hive.metastore.glue.region", "us-east-1", + "bootstrap.quiet", "true"), new TestingConnectorContext()) .shutdown(); } - @Test - public void testRecordingMetastore() - { - ConnectorFactory factory = getConnectorFactory(); - - // recording with thrift - factory.create( - "test", - Map.of( - "iceberg.catalog.type", "HIVE_METASTORE", - "hive.metastore.uri", "thrift://foo:1234", - "hive.metastore-recording-path", "/tmp"), - new TestingConnectorContext()) - .shutdown(); - - // recording with glue - assertThatThrownBy(() -> factory.create( - "test", - Map.of( - "iceberg.catalog.type", "glue", - "hive.metastore.glue.region", "us-east-2", - "hive.metastore-recording-path", "/tmp"), - new TestingConnectorContext())) - .hasMessageContaining("Configuration property 'hive.metastore-recording-path' was not used"); - - // recording with nessie - assertThatThrownBy(() -> factory.create( - "test", - Map.of( - "iceberg.catalog.type", "nessie", - "hive.metastore.nessie.region", "us-east-2", - "hive.metastore-recording-path", "/tmp"), - new TestingConnectorContext())) - .hasMessageContaining("Configuration property 'hive.metastore-recording-path' was not used"); - } - @Test public void testAllowAllAccessControl() { @@ -166,6 +142,7 @@ public void testAllowAllAccessControl() .put("iceberg.catalog.type", "HIVE_METASTORE") .put("hive.metastore.uri", "thrift://foo:1234") .put("iceberg.security", "allow-all") + .put("bootstrap.quiet", "true") .buildOrThrow(), new TestingConnectorContext()) .shutdown(); @@ -182,6 +159,7 @@ public void testReadOnlyAllAccessControl() .put("iceberg.catalog.type", "HIVE_METASTORE") .put("hive.metastore.uri", "thrift://foo:1234") .put("iceberg.security", "read-only") + .put("bootstrap.quiet", "true") .buildOrThrow(), new TestingConnectorContext()) .shutdown(); @@ -198,6 +176,7 @@ public void testSystemAccessControl() .put("iceberg.catalog.type", "HIVE_METASTORE") .put("hive.metastore.uri", "thrift://foo:1234") .put("iceberg.security", "system") + .put("bootstrap.quiet", "true") .buildOrThrow(), new TestingConnectorContext()); assertThatThrownBy(connector::getAccessControl).isInstanceOf(UnsupportedOperationException.class); @@ -220,6 +199,7 @@ public void testFileBasedAccessControl() .put("hive.metastore.uri", "thrift://foo:1234") .put("iceberg.security", "file") .put("security.config-file", tempFile.getAbsolutePath()) + .put("bootstrap.quiet", "true") .buildOrThrow(), new TestingConnectorContext()) .shutdown(); @@ -234,8 +214,9 @@ public void testIcebergPluginFailsWhenIncorrectPropertyProvided() "test", Map.of( "iceberg.catalog.type", "HIVE_METASTORE", - HIVE_VIEWS_ENABLED, "true", - "hive.metastore.uri", "thrift://foo:1234"), + "hive.hive-views.enabled", "true", + "hive.metastore.uri", "thrift://foo:1234", + "bootstrap.quiet", "true"), new TestingConnectorContext()) .shutdown()) .isInstanceOf(ApplicationConfigurationException.class) @@ -251,11 +232,31 @@ 
public void testRestCatalog() "test", Map.of( "iceberg.catalog.type", "rest", - "iceberg.rest-catalog.uri", "https://foo:1234"), + "iceberg.rest-catalog.uri", "https://foo:1234", + "bootstrap.quiet", "true"), new TestingConnectorContext()) .shutdown(); } + @Test + public void testRestCatalogValidations() + { + ConnectorFactory factory = getConnectorFactory(); + + assertThatThrownBy(() -> factory.create( + "test", + Map.of( + "iceberg.catalog.type", "rest", + "iceberg.register-table-procedure.enabled", "true", + "iceberg.rest-catalog.uri", "https://foo:1234", + "iceberg.rest-catalog.vended-credentials-enabled", "true", + "bootstrap.quiet", "true"), + new TestingConnectorContext()) + .shutdown()) + .isInstanceOf(ApplicationConfigurationException.class) + .hasMessageContaining("Using the `register_table` procedure with vended credentials is currently not supported"); + } + @Test public void testJdbcCatalog() { @@ -268,7 +269,8 @@ public void testJdbcCatalog() "iceberg.jdbc-catalog.driver-class", "org.postgresql.Driver", "iceberg.jdbc-catalog.connection-url", "jdbc:postgresql://localhost:5432/test", "iceberg.jdbc-catalog.catalog-name", "test", - "iceberg.jdbc-catalog.default-warehouse-dir", "s3://bucket"), + "iceberg.jdbc-catalog.default-warehouse-dir", "s3://bucket", + "bootstrap.quiet", "true"), new TestingConnectorContext()) .shutdown(); } @@ -283,7 +285,97 @@ public void testNessieCatalog() Map.of( "iceberg.catalog.type", "nessie", "iceberg.nessie-catalog.default-warehouse-dir", "/tmp", - "iceberg.nessie-catalog.uri", "http://foo:1234"), + "iceberg.nessie-catalog.uri", "http://foo:1234", + "iceberg.nessie-catalog.client-api-version", "V1", + "bootstrap.quiet", "true"), + new TestingConnectorContext()) + .shutdown(); + } + + @Test + public void testNessieCatalogWithBearerAuth() + { + ConnectorFactory factory = getConnectorFactory(); + + factory.create( + "test", + Map.of( + "iceberg.catalog.type", "nessie", + "iceberg.nessie-catalog.default-warehouse-dir", "/tmp", + "iceberg.nessie-catalog.uri", "http://foo:1234", + "iceberg.nessie-catalog.client-api-version", "V2", + "iceberg.nessie-catalog.authentication.type", "BEARER", + "iceberg.nessie-catalog.authentication.token", "someToken"), + new TestingConnectorContext()) + .shutdown(); + } + + @Test + public void testNessieCatalogWithNoAuthAndAccessToken() + { + ConnectorFactory factory = getConnectorFactory(); + + assertThatThrownBy(() -> factory.create( + "test", + Map.of( + "iceberg.catalog.type", "nessie", + "iceberg.nessie-catalog.uri", "nessieUri", + "iceberg.nessie-catalog.default-warehouse-dir", "/tmp", + "iceberg.nessie-catalog.authentication.token", "someToken"), + new TestingConnectorContext()) + .shutdown()) + .isInstanceOf(ApplicationConfigurationException.class) + .hasMessageContaining("'iceberg.nessie-catalog.authentication.token' must be configured only with 'iceberg.nessie-catalog.authentication.type' BEARER"); + } + + @Test + public void testNessieCatalogWithNoAccessToken() + { + ConnectorFactory factory = getConnectorFactory(); + + assertThatThrownBy(() -> factory.create( + "test", + Map.of( + "iceberg.catalog.type", "nessie", + "iceberg.nessie-catalog.uri", "nessieUri", + "iceberg.nessie-catalog.default-warehouse-dir", "/tmp", + "iceberg.nessie-catalog.authentication.type", "BEARER"), + new TestingConnectorContext()) + .shutdown()) + .isInstanceOf(ApplicationConfigurationException.class) + .hasMessageContaining("'iceberg.nessie-catalog.authentication.token' must be configured with 'iceberg.nessie-catalog.authentication.type' 
BEARER"); + } + + @Test + public void testNessieCatalogClientAPIVersion() + { + ConnectorFactory factory = getConnectorFactory(); + + assertThatThrownBy(() -> factory.create( + "test", + Map.of( + "iceberg.catalog.type", "nessie", + "iceberg.nessie-catalog.uri", "http://foo:1234", + "iceberg.nessie-catalog.default-warehouse-dir", "/tmp"), + new TestingConnectorContext()) + .shutdown()) + .isInstanceOf(CreationException.class) + .hasMessageContaining("URI doesn't end with the version: http://foo:1234. Please configure `client-api-version` in the catalog properties explicitly."); + } + + @Test + public void testSnowflakeCatalog() + { + ConnectorFactory factory = getConnectorFactory(); + + factory.create( + "test", + Map.of( + "iceberg.catalog.type", "snowflake", + "iceberg.snowflake-catalog.account-uri", "jdbc:snowflake://sample.url", + "iceberg.snowflake-catalog.user", "user", + "iceberg.snowflake-catalog.password", "password", + "iceberg.snowflake-catalog.database", "database"), new TestingConnectorContext()) .shutdown(); } diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergProjectionPushdownPlans.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergProjectionPushdownPlans.java index 672ae53495eb..e00b9f41f9f1 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergProjectionPushdownPlans.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergProjectionPushdownPlans.java @@ -17,20 +17,30 @@ import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; import io.trino.Session; -import io.trino.metadata.InternalFunctionBundle; import io.trino.metadata.QualifiedObjectName; +import io.trino.metadata.ResolvedFunction; import io.trino.metadata.TableHandle; -import io.trino.plugin.hive.metastore.Database; -import io.trino.plugin.hive.metastore.HiveMetastore; -import io.trino.plugin.iceberg.catalog.file.TestingIcebergFileMetastoreCatalogModule; +import io.trino.metadata.TestingFunctionResolution; +import io.trino.metastore.Database; +import io.trino.metastore.HiveMetastore; +import io.trino.metastore.HiveMetastoreFactory; import io.trino.spi.connector.ColumnHandle; +import io.trino.spi.function.OperatorType; import io.trino.spi.predicate.Domain; import io.trino.spi.predicate.TupleDomain; import io.trino.spi.security.PrincipalType; +import io.trino.spi.type.RowType; +import io.trino.sql.ir.Call; +import io.trino.sql.ir.Cast; +import io.trino.sql.ir.Comparison; +import io.trino.sql.ir.Constant; +import io.trino.sql.ir.FieldReference; +import io.trino.sql.ir.Logical; +import io.trino.sql.ir.Reference; import io.trino.sql.planner.assertions.BasePushdownPlanTest; -import io.trino.testing.LocalQueryRunner; -import org.testng.annotations.AfterClass; -import org.testng.annotations.Test; +import io.trino.testing.PlanTester; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.Test; import java.io.File; import java.io.IOException; @@ -43,9 +53,10 @@ import static com.google.common.base.Predicates.equalTo; import static com.google.common.io.MoreFiles.deleteRecursively; import static com.google.common.io.RecursiveDeleteOption.ALLOW_INSECURE; -import static com.google.inject.util.Modules.EMPTY_MODULE; -import static io.trino.plugin.hive.metastore.file.TestingFileHiveMetastore.createTestingFileHiveMetastore; import static io.trino.spi.type.BigintType.BIGINT; +import static io.trino.spi.type.IntegerType.INTEGER; +import static 
io.trino.sql.ir.Comparison.Operator.EQUAL; +import static io.trino.sql.ir.Logical.Operator.AND; import static io.trino.sql.planner.assertions.PlanMatchPattern.any; import static io.trino.sql.planner.assertions.PlanMatchPattern.anyTree; import static io.trino.sql.planner.assertions.PlanMatchPattern.expression; @@ -53,7 +64,7 @@ import static io.trino.sql.planner.assertions.PlanMatchPattern.join; import static io.trino.sql.planner.assertions.PlanMatchPattern.project; import static io.trino.sql.planner.assertions.PlanMatchPattern.tableScan; -import static io.trino.sql.planner.plan.JoinNode.Type.INNER; +import static io.trino.sql.planner.plan.JoinType.INNER; import static io.trino.testing.TestingNames.randomNameSuffix; import static io.trino.testing.TestingSession.testSessionBuilder; import static java.lang.String.format; @@ -62,12 +73,15 @@ public class TestIcebergProjectionPushdownPlans extends BasePushdownPlanTest { + private static final TestingFunctionResolution FUNCTIONS = new TestingFunctionResolution(); + private static final ResolvedFunction ADD_INTEGER = FUNCTIONS.resolveOperator(OperatorType.ADD, ImmutableList.of(INTEGER, INTEGER)); + private static final String CATALOG = "iceberg"; private static final String SCHEMA = "schema"; private File metastoreDir; @Override - protected LocalQueryRunner createLocalQueryRunner() + protected PlanTester createPlanTester() { Session session = testSessionBuilder() .setCatalog(CATALOG) @@ -80,17 +94,13 @@ protected LocalQueryRunner createLocalQueryRunner() catch (IOException e) { throw new UncheckedIOException(e); } - HiveMetastore metastore = createTestingFileHiveMetastore(metastoreDir); - LocalQueryRunner queryRunner = LocalQueryRunner.create(session); - - InternalFunctionBundle.InternalFunctionBundleBuilder functions = InternalFunctionBundle.builder(); - new IcebergPlugin().getFunctions().forEach(functions::functions); - queryRunner.addFunctions(functions.build()); + PlanTester planTester = PlanTester.create(session); + planTester.installPlugin(new TestingIcebergPlugin(metastoreDir.toPath())); + planTester.createCatalog(CATALOG, "iceberg", ImmutableMap.of()); - queryRunner.createCatalog( - CATALOG, - new TestingIcebergConnectorFactory(Optional.of(new TestingIcebergFileMetastoreCatalogModule(metastore)), Optional.empty(), EMPTY_MODULE), - ImmutableMap.of()); + HiveMetastore metastore = ((IcebergConnector) planTester.getConnector(CATALOG)).getInjector() + .getInstance(HiveMetastoreFactory.class) + .createMetastore(Optional.empty()); Database database = Database.builder() .setDatabaseName(SCHEMA) @@ -99,10 +109,10 @@ protected LocalQueryRunner createLocalQueryRunner() .build(); metastore.createDatabase(database); - return queryRunner; + return planTester; } - @AfterClass(alwaysRun = true) + @AfterAll public void cleanup() throws Exception { @@ -116,11 +126,11 @@ public void testPushdownDisabled() { String testTable = "test_disabled_pushdown" + randomNameSuffix(); - Session session = Session.builder(getQueryRunner().getDefaultSession()) + Session session = Session.builder(getPlanTester().getDefaultSession()) .setCatalogSessionProperty(CATALOG, "projection_pushdown_enabled", "false") .build(); - getQueryRunner().execute(format( + getPlanTester().executeStatement(format( "CREATE TABLE %s (col0) AS SELECT CAST(row(5, 6) AS row(a bigint, b bigint)) AS col0 WHERE false", testTable)); @@ -129,7 +139,7 @@ public void testPushdownDisabled() session, any( project( - ImmutableMap.of("expr", expression("col0[1]"), "expr_2", expression("col0[2]")), + 
ImmutableMap.of("expr", expression(new FieldReference(new Reference(RowType.anonymousRow(BIGINT, BIGINT), "col0"), 0)), "expr_2", expression(new FieldReference(new Reference(RowType.anonymousRow(BIGINT, BIGINT), "col0"), 1))), tableScan(testTable, ImmutableMap.of("col0", "col0"))))); } @@ -139,12 +149,12 @@ public void testDereferencePushdown() String testTable = "test_simple_projection_pushdown" + randomNameSuffix(); QualifiedObjectName completeTableName = new QualifiedObjectName(CATALOG, SCHEMA, testTable); - getQueryRunner().execute(format( + getPlanTester().executeStatement(format( "CREATE TABLE %s (col0, col1) WITH (partitioning = ARRAY['col1']) AS" + " SELECT CAST(row(5, 6) AS row(x bigint, y bigint)) AS col0, 5 AS col1", testTable)); - Session session = getQueryRunner().getDefaultSession(); + Session session = getPlanTester().getDefaultSession(); Optional tableHandle = getTableHandle(session, completeTableName); assertThat(tableHandle).as("expected the table handle to be present").isPresent(); @@ -154,24 +164,20 @@ public void testDereferencePushdown() IcebergColumnHandle column0Handle = (IcebergColumnHandle) columns.get("col0"); IcebergColumnHandle column1Handle = (IcebergColumnHandle) columns.get("col1"); - IcebergColumnHandle columnX = new IcebergColumnHandle( - column0Handle.getColumnIdentity(), - column0Handle.getType(), - ImmutableList.of(column0Handle.getColumnIdentity().getChildren().get(0).getId()), - BIGINT, - Optional.empty()); - IcebergColumnHandle columnY = new IcebergColumnHandle( - column0Handle.getColumnIdentity(), - column0Handle.getType(), - ImmutableList.of(column0Handle.getColumnIdentity().getChildren().get(1).getId()), - BIGINT, - Optional.empty()); + IcebergColumnHandle columnX = IcebergColumnHandle.optional(column0Handle.getColumnIdentity()) + .fieldType(column0Handle.getType(), BIGINT) + .path(column0Handle.getColumnIdentity().getChildren().get(0).getId()) + .build(); + IcebergColumnHandle columnY = IcebergColumnHandle.optional(column0Handle.getColumnIdentity()) + .fieldType(column0Handle.getType(), BIGINT) + .path(column0Handle.getColumnIdentity().getChildren().get(1).getId()) + .build(); // Simple Projection pushdown assertPlan( "SELECT col0.x expr_x, col0.y expr_y FROM " + testTable, any(tableScan( - equalTo(((IcebergTableHandle) tableHandle.get().getConnectorHandle()).withProjectedColumns(Set.of(columnX, columnY))), + equalTo(((IcebergTableHandle) tableHandle.get().connectorHandle()).withProjectedColumns(Set.of(columnX, columnY))), TupleDomain.all(), ImmutableMap.of("col0#x", equalTo(columnX), "col0#y", equalTo(columnY))))); @@ -180,7 +186,7 @@ public void testDereferencePushdown() format("SELECT col0.x FROM %s WHERE col0.x = col1 + 3 and col0.y = 2", testTable), anyTree( filter( - "y = BIGINT '2' AND (x = CAST((col1 + 3) AS BIGINT))", + new Logical(AND, ImmutableList.of(new Comparison(EQUAL, new Reference(BIGINT, "y"), new Constant(BIGINT, 2L)), new Comparison(EQUAL, new Reference(BIGINT, "x"), new Cast(new Call(ADD_INTEGER, ImmutableList.of(new Reference(INTEGER, "col1"), new Constant(INTEGER, 3L))), BIGINT)))), tableScan( table -> { IcebergTableHandle icebergTableHandle = (IcebergTableHandle) table; @@ -196,7 +202,7 @@ public void testDereferencePushdown() format("SELECT col0, col0.y expr_y FROM %s WHERE col0.x = 5", testTable), anyTree( filter( - "x = BIGINT '5'", + new Comparison(EQUAL, new Reference(BIGINT, "x"), new Constant(BIGINT, 5L)), tableScan( table -> { IcebergTableHandle icebergTableHandle = (IcebergTableHandle) table; @@ -213,15 +219,21 @@ 
public void testDereferencePushdown() anyTree( project( ImmutableMap.of( - "expr_0_x", expression("expr_0[1]"), - "expr_0", expression("expr_0"), - "expr_0_y", expression("expr_0[2]")), + "expr_0_x", expression(new FieldReference(new Reference(RowType.anonymousRow(BIGINT, BIGINT), "expr_0"), 0)), + "expr_0", expression(new Reference(RowType.anonymousRow(BIGINT, BIGINT), "expr_0")), + "expr_0_y", expression(new FieldReference(new Reference(RowType.anonymousRow(BIGINT, BIGINT), "expr_0"), 1))), join(INNER, builder -> builder - .equiCriteria("t_expr_1", "s_expr_1") + .equiCriteria("s_expr_1", "t_expr_1") .left( + anyTree( + tableScan( + equalTo(((IcebergTableHandle) tableHandle.get().connectorHandle()).withProjectedColumns(Set.of(column1Handle))), + TupleDomain.all(), + ImmutableMap.of("s_expr_1", equalTo(column1Handle))))) + .right( anyTree( filter( - "x = BIGINT '2'", + new Comparison(EQUAL, new Reference(BIGINT, "x"), new Constant(BIGINT, 2L)), tableScan( table -> { IcebergTableHandle icebergTableHandle = (IcebergTableHandle) table; @@ -233,12 +245,6 @@ public void testDereferencePushdown() unenforcedConstraint.equals(expectedUnenforcedConstraint); }, TupleDomain.all(), - ImmutableMap.of("x", equalTo(columnX), "expr_0", equalTo(column0Handle), "t_expr_1", equalTo(column1Handle)))))) - .right( - anyTree( - tableScan( - equalTo(((IcebergTableHandle) tableHandle.get().getConnectorHandle()).withProjectedColumns(Set.of(column1Handle))), - TupleDomain.all(), - ImmutableMap.of("s_expr_1", equalTo(column1Handle))))))))); + ImmutableMap.of("x", equalTo(columnX), "expr_0", equalTo(column0Handle), "t_expr_1", equalTo(column1Handle)))))))))); } } diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergQueryFailureRecoveryTest.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergQueryFailureRecoveryTest.java index ca2861e3ab6e..90c76cde9c8c 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergQueryFailureRecoveryTest.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergQueryFailureRecoveryTest.java @@ -13,19 +13,26 @@ */ package io.trino.plugin.iceberg; +import com.google.inject.Module; import io.trino.operator.RetryPolicy; import io.trino.plugin.exchange.filesystem.FileSystemExchangePlugin; import io.trino.plugin.exchange.filesystem.containers.MinioStorage; import io.trino.testing.QueryRunner; import io.trino.tpch.TpchTable; -import org.testng.annotations.AfterClass; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.TestInstance; +import org.junit.jupiter.api.parallel.Execution; import java.util.List; import java.util.Map; import static io.trino.plugin.exchange.filesystem.containers.MinioStorage.getExchangeManagerProperties; import static io.trino.testing.TestingNames.randomNameSuffix; +import static org.junit.jupiter.api.TestInstance.Lifecycle.PER_CLASS; +import static org.junit.jupiter.api.parallel.ExecutionMode.CONCURRENT; +@TestInstance(PER_CLASS) +@Execution(CONCURRENT) public class TestIcebergQueryFailureRecoveryTest extends BaseIcebergFailureRecoveryTest { @@ -40,30 +47,29 @@ protected TestIcebergQueryFailureRecoveryTest() protected QueryRunner createQueryRunner( List> requiredTpchTables, Map configProperties, - Map coordinatorProperties) + Map coordinatorProperties, + Module failureInjectionModule) throws Exception { - this.minioStorage = new MinioStorage("test-exchange-spooling-" + randomNameSuffix()); + this.minioStorage = closeAfterClass(new 
MinioStorage("test-exchange-spooling-" + randomNameSuffix())); minioStorage.start(); return IcebergQueryRunner.builder() - .setInitialTables(requiredTpchTables) .setCoordinatorProperties(coordinatorProperties) .setExtraProperties(configProperties) .setAdditionalSetup(runner -> { runner.installPlugin(new FileSystemExchangePlugin()); runner.loadExchangeManager("filesystem", getExchangeManagerProperties(minioStorage)); }) + .setAdditionalModule(failureInjectionModule) + .setInitialTables(requiredTpchTables) .build(); } - @AfterClass(alwaysRun = true) + @AfterAll public void destroy() throws Exception { - if (minioStorage != null) { - minioStorage.close(); - minioStorage = null; - } + minioStorage = null; // closed by closeAfterClass } } diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergReadVersionedTable.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergReadVersionedTable.java index b06c4d03c4a3..be0da7ff6a03 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergReadVersionedTable.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergReadVersionedTable.java @@ -14,18 +14,20 @@ package io.trino.plugin.iceberg; import io.trino.testing.AbstractTestQueryFramework; -import io.trino.testing.DistributedQueryRunner; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; +import io.trino.testing.QueryRunner; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.TestInstance; import java.time.Instant; import java.time.ZonedDateTime; import java.time.format.DateTimeFormatter; -import static io.trino.plugin.iceberg.IcebergQueryRunner.createIcebergQueryRunner; import static java.lang.String.format; import static java.time.ZoneOffset.UTC; +import static org.junit.jupiter.api.TestInstance.Lifecycle.PER_CLASS; +@TestInstance(PER_CLASS) public class TestIcebergReadVersionedTable extends AbstractTestQueryFramework { @@ -36,13 +38,13 @@ public class TestIcebergReadVersionedTable private long incorrectSnapshotId; @Override - protected DistributedQueryRunner createQueryRunner() + protected QueryRunner createQueryRunner() throws Exception { - return createIcebergQueryRunner(); + return IcebergQueryRunner.builder().build(); } - @BeforeClass + @BeforeAll public void setUp() throws InterruptedException { @@ -87,10 +89,10 @@ public void testSelectTableWithEndLongTimestampWithTimezone() public void testEndVersionInTableNameAndForClauseShouldFail() { assertQueryFails("SELECT * FROM \"test_iceberg_read_versioned_table@" + v1SnapshotId + "\" FOR VERSION AS OF " + v1SnapshotId, - "Invalid Iceberg table name: test_iceberg_read_versioned_table@%d".formatted(v1SnapshotId)); + "line 1:15: Table 'iceberg.tpch.\"test_iceberg_read_versioned_table@%d\"' does not exist".formatted(v1SnapshotId)); assertQueryFails("SELECT * FROM \"test_iceberg_read_versioned_table@" + v1SnapshotId + "\" FOR TIMESTAMP AS OF " + timestampLiteral(v1EpochMillis, 9), - "Invalid Iceberg table name: test_iceberg_read_versioned_table@%d".formatted(v1SnapshotId)); + "line 1:15: Table 'iceberg.tpch.\"test_iceberg_read_versioned_table@%d\"' does not exist".formatted(v1SnapshotId)); } @Test diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergReadVersionedTableByTemporal.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergReadVersionedTableByTemporal.java new file mode 100644 index 000000000000..cc5cfa5eb1c9 --- 
/dev/null +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergReadVersionedTableByTemporal.java @@ -0,0 +1,147 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.iceberg; + +import com.google.common.collect.ImmutableMap; +import io.trino.Session; +import io.trino.spi.type.TimeZoneKey; +import io.trino.testing.AbstractTestQueryFramework; +import io.trino.testing.QueryRunner; +import io.trino.testing.containers.Minio; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.TestInstance; + +import static io.trino.plugin.iceberg.IcebergQueryRunner.ICEBERG_CATALOG; +import static io.trino.testing.TestingNames.randomNameSuffix; +import static io.trino.testing.containers.Minio.MINIO_ACCESS_KEY; +import static io.trino.testing.containers.Minio.MINIO_REGION; +import static io.trino.testing.containers.Minio.MINIO_SECRET_KEY; +import static java.lang.String.format; +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.jupiter.api.TestInstance.Lifecycle.PER_CLASS; + +@TestInstance(PER_CLASS) +public class TestIcebergReadVersionedTableByTemporal + extends AbstractTestQueryFramework +{ + private static final String BUCKET_NAME = "test-bucket-time-travel"; + + private Minio minio; + + @Override + protected QueryRunner createQueryRunner() + throws Exception + { + minio = closeAfterClass(Minio.builder().build()); + minio.start(); + minio.createBucket(BUCKET_NAME); + + QueryRunner queryRunner = IcebergQueryRunner.builder() + .setIcebergProperties( + ImmutableMap.builder() + .put("fs.native-s3.enabled", "true") + .put("s3.aws-access-key", MINIO_ACCESS_KEY) + .put("s3.aws-secret-key", MINIO_SECRET_KEY) + .put("s3.region", MINIO_REGION) + .put("s3.endpoint", minio.getMinioAddress()) + .put("s3.path-style-access", "true") + .put("iceberg.register-table-procedure.enabled", "true") + .buildOrThrow()) + .build(); + + queryRunner.execute("CREATE SCHEMA IF NOT EXISTS " + ICEBERG_CATALOG + ".tpch"); + return queryRunner; + } + + @AfterAll + public void destroy() + throws Exception + { + minio = null; // closed by closeAfterClass + } + + @Test + public void testSelectTableWithEndVersionAsTemporal() + { + String tableName = "test_iceberg_read_versioned_table_" + randomNameSuffix(); + + minio.copyResources("iceberg/timetravel", BUCKET_NAME, "timetravel"); + assertUpdate(format( + "CALL system.register_table(CURRENT_SCHEMA, '%s', '%s')", + tableName, + format("s3://%s/timetravel", BUCKET_NAME))); + + assertThat(query("SELECT * FROM " + tableName)) + .matches("VALUES 1, 2, 3"); + + Session utcSession = Session.builder(getSession()).setTimeZoneKey(TimeZoneKey.UTC_KEY).build(); + assertThat(query(utcSession, "SELECT made_current_at FROM \"" + tableName + "$history\"")) + .matches("VALUES" + + " TIMESTAMP '2023-06-30 05:01:46.265 UTC'," + // CREATE TABLE timetravel(data integer) + " TIMESTAMP '2023-07-01 05:02:43.954 UTC'," + // INSERT INTO timetravel VALUES 1 + " TIMESTAMP '2023-07-02 
05:03:39.586 UTC'," + // INSERT INTO timetravel VALUES 2 + " TIMESTAMP '2023-07-03 05:03:42.434 UTC'"); // INSERT INTO timetravel VALUES 3 + + assertUpdate("INSERT INTO " + tableName + " VALUES 4", 1); + + assertThat(query("SELECT * FROM " + tableName)).matches("VALUES 1, 2, 3, 4"); + Session viennaSession = Session.builder(getSession()).setTimeZoneKey(TimeZoneKey.getTimeZoneKey("Europe/Vienna")).build(); + Session losAngelesSession = Session.builder(getSession()).setTimeZoneKey(TimeZoneKey.getTimeZoneKey("America/Los_Angeles")).build(); + + // version as date + assertThat(query(viennaSession, "SELECT * FROM " + tableName + " FOR TIMESTAMP AS OF DATE '2023-07-01'")) + .returnsEmptyResult(); + assertThat(query(losAngelesSession, "SELECT * FROM " + tableName + " FOR TIMESTAMP AS OF DATE '2023-07-01'")) + .matches("VALUES 1"); + assertThat(query(viennaSession, "SELECT * FROM " + tableName + " FOR TIMESTAMP AS OF DATE '2023-07-02'")) + .matches("VALUES 1"); + assertThat(query(losAngelesSession, "SELECT * FROM " + tableName + " FOR TIMESTAMP AS OF DATE '2023-07-02'")) + .matches("VALUES 1, 2"); + assertThat(query(viennaSession, "SELECT * FROM " + tableName + " FOR TIMESTAMP AS OF DATE '2023-07-03'")) + .matches("VALUES 1, 2"); + assertThat(query(losAngelesSession, "SELECT * FROM " + tableName + " FOR TIMESTAMP AS OF DATE '2023-07-03'")) + .matches("VALUES 1, 2, 3"); + assertThat(query(viennaSession, "SELECT * FROM " + tableName + " FOR TIMESTAMP AS OF DATE '2023-07-04'")) + .matches("VALUES 1, 2, 3"); + assertThat(query(losAngelesSession, "SELECT * FROM " + tableName + " FOR TIMESTAMP AS OF DATE '2023-07-04'")) + .matches("VALUES 1, 2, 3"); + + // version as timestamp + assertThat(query(viennaSession, "SELECT * FROM " + tableName + " FOR TIMESTAMP AS OF TIMESTAMP '2023-07-01 00:00:00'")) + .returnsEmptyResult(); + assertThat(query(utcSession, "SELECT * FROM " + tableName + " FOR TIMESTAMP AS OF TIMESTAMP '2023-07-01 05:02:43.953'")) + .returnsEmptyResult(); + assertThat(query(utcSession, "SELECT * FROM " + tableName + " FOR TIMESTAMP AS OF TIMESTAMP '2023-07-01 05:02:43.954'")) + .matches("VALUES 1"); + assertThat(query(viennaSession, "SELECT * FROM " + tableName + " FOR TIMESTAMP AS OF TIMESTAMP '2023-07-01 07:02:43.954'")) + .matches("VALUES 1"); + assertThat(query(losAngelesSession, "SELECT * FROM " + tableName + " FOR TIMESTAMP AS OF TIMESTAMP '2023-07-01 00:00:00.1'")) + .matches("VALUES 1"); + assertThat(query(viennaSession, "SELECT * FROM " + tableName + " FOR TIMESTAMP AS OF TIMESTAMP '2023-07-02 01:00:00.12'")) + .matches("VALUES 1"); + assertThat(query(losAngelesSession, "SELECT * FROM " + tableName + " FOR TIMESTAMP AS OF TIMESTAMP '2023-07-02 01:00:00.123'")) + .matches("VALUES 1, 2"); + assertThat(query(viennaSession, "SELECT * FROM " + tableName + " FOR TIMESTAMP AS OF TIMESTAMP '2023-07-03 02:00:00.123'")) + .matches("VALUES 1, 2"); + assertThat(query(losAngelesSession, "SELECT * FROM " + tableName + " FOR TIMESTAMP AS OF TIMESTAMP '2023-07-03 02:00:00.123456'")) + .matches("VALUES 1, 2, 3"); + assertThat(query(viennaSession, "SELECT * FROM " + tableName + " FOR TIMESTAMP AS OF TIMESTAMP '2023-07-04 03:00:00.123456789'")) + .matches("VALUES 1, 2, 3"); + assertThat(query(losAngelesSession, "SELECT * FROM " + tableName + " FOR TIMESTAMP AS OF TIMESTAMP '2023-07-04 03:00:00.123456789012'")) + .matches("VALUES 1, 2, 3"); + + assertUpdate("DROP TABLE " + tableName); + } +} diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergRegisterTableProcedure.java 
b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergRegisterTableProcedure.java index 701480917dbf..38271aff2090 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergRegisterTableProcedure.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergRegisterTableProcedure.java @@ -19,10 +19,14 @@ import io.trino.filesystem.FileIterator; import io.trino.filesystem.Location; import io.trino.filesystem.TrinoFileSystem; -import io.trino.plugin.hive.metastore.HiveMetastore; +import io.trino.metastore.HiveMetastore; +import io.trino.plugin.iceberg.catalog.file.TestingIcebergFileMetastoreCatalogModule; +import io.trino.plugin.tpch.TpchPlugin; import io.trino.testing.AbstractTestQueryFramework; +import io.trino.testing.DistributedQueryRunner; import io.trino.testing.MaterializedResult; import io.trino.testing.QueryRunner; +import org.apache.hadoop.conf.Configuration; import org.apache.iceberg.DataFile; import org.apache.iceberg.DataFiles; import org.apache.iceberg.FileFormat; @@ -32,29 +36,34 @@ import org.apache.iceberg.Table; import org.apache.iceberg.hadoop.HadoopTables; import org.apache.iceberg.types.Types; -import org.testng.annotations.AfterClass; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; import java.io.File; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; +import java.util.Optional; import java.util.regex.Matcher; import java.util.regex.Pattern; -import java.util.stream.Stream; import static com.google.common.base.Verify.verify; import static com.google.common.io.MoreFiles.deleteRecursively; import static com.google.common.io.RecursiveDeleteOption.ALLOW_INSECURE; -import static io.trino.hadoop.ConfigurationInstantiator.newEmptyConfiguration; +import static io.trino.plugin.hive.HiveTestUtils.HDFS_FILE_SYSTEM_FACTORY; import static io.trino.plugin.hive.metastore.file.TestingFileHiveMetastore.createTestingFileHiveMetastore; +import static io.trino.plugin.iceberg.IcebergQueryRunner.ICEBERG_CATALOG; import static io.trino.plugin.iceberg.IcebergTestUtils.getFileSystemFactory; import static io.trino.plugin.iceberg.IcebergUtil.METADATA_FOLDER_NAME; -import static io.trino.plugin.iceberg.procedure.RegisterTableProcedure.getLatestMetadataLocation; +import static io.trino.plugin.iceberg.IcebergUtil.getLatestMetadataLocation; +import static io.trino.testing.TestingAccessControlManager.TestingPrivilegeType.CREATE_TABLE; +import static io.trino.testing.TestingAccessControlManager.privilege; import static io.trino.testing.TestingConnectorSession.SESSION; import static io.trino.testing.TestingNames.randomNameSuffix; +import static io.trino.testing.TestingSession.testSessionBuilder; import static java.lang.String.format; import static java.util.Locale.ENGLISH; import static org.apache.iceberg.Files.localInput; @@ -66,6 +75,7 @@ public class TestIcebergRegisterTableProcedure private HiveMetastore metastore; private File metastoreDir; private TrinoFileSystem fileSystem; + private Path dataDir; @Override protected QueryRunner createQueryRunner() @@ -73,35 +83,39 @@ protected QueryRunner createQueryRunner() { metastoreDir = Files.createTempDirectory("test_iceberg_register_table").toFile(); 
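The import block above trades TestNG's @BeforeClass, @AfterClass and @DataProvider for their JUnit 5 equivalents; the hunks that follow apply them. JUnit 5 normally requires @BeforeAll/@AfterAll methods to be static, so keeping them as instance methods, as this patch does, relies on the per-class test lifecycle that the other migrated classes in this diff enable with @TestInstance(PER_CLASS). A self-contained sketch of that lifecycle, with illustrative class and field names:

    import org.junit.jupiter.api.AfterAll;
    import org.junit.jupiter.api.BeforeAll;
    import org.junit.jupiter.api.Test;
    import org.junit.jupiter.api.TestInstance;

    import static org.assertj.core.api.Assertions.assertThat;
    import static org.junit.jupiter.api.TestInstance.Lifecycle.PER_CLASS;

    // One test instance is shared by all test methods, so @BeforeAll/@AfterAll
    // may be instance methods and may touch instance fields.
    @TestInstance(PER_CLASS)
    class PerClassLifecycleExample
    {
        private StringBuilder sharedResource;

        @BeforeAll
        void setUp()
        {
            // runs once, before the first test method
            sharedResource = new StringBuilder("ready");
        }

        @Test
        void testUsesSharedResource()
        {
            assertThat(sharedResource.toString()).isEqualTo("ready");
        }

        @AfterAll
        void tearDown()
        {
            // runs once, after the last test method
            sharedResource = null;
        }
    }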
metastoreDir.deleteOnExit(); - metastore = createTestingFileHiveMetastore(metastoreDir); - return IcebergQueryRunner.builder() - .setMetastoreDirectory(metastoreDir) - .setIcebergProperties(ImmutableMap.of("iceberg.register-table-procedure.enabled", "true")) - .build(); + metastore = createTestingFileHiveMetastore(HDFS_FILE_SYSTEM_FACTORY, Location.of(metastoreDir.getAbsolutePath())); + + // TODO: convert to IcebergQueryRunner when there is a replacement for HadoopTables that works with TrinoFileSystem + QueryRunner queryRunner = DistributedQueryRunner.builder(testSessionBuilder() + .setCatalog(ICEBERG_CATALOG) + .setSchema("tpch") + .build()).build(); + + queryRunner.installPlugin(new TpchPlugin()); + queryRunner.createCatalog("tpch", "tpch"); + + dataDir = queryRunner.getCoordinator().getBaseDataDir().resolve("iceberg_data"); + queryRunner.installPlugin(new TestingIcebergPlugin(dataDir, Optional.of(new TestingIcebergFileMetastoreCatalogModule(metastore)))); + queryRunner.createCatalog(ICEBERG_CATALOG, "iceberg", ImmutableMap.of("fs.hadoop.enabled", "true", "iceberg.register-table-procedure.enabled", "true")); + queryRunner.execute("CREATE SCHEMA iceberg.tpch"); + return queryRunner; } - @BeforeClass + @BeforeAll public void initFileSystem() { fileSystem = getFileSystemFactory(getDistributedQueryRunner()).create(SESSION); } - @AfterClass(alwaysRun = true) + @AfterAll public void tearDown() throws IOException { deleteRecursively(metastoreDir.toPath(), ALLOW_INSECURE); } - @DataProvider - public static Object[][] fileFormats() - { - return Stream.of(IcebergFileFormat.values()) - .map(icebergFileFormat -> new Object[] {icebergFileFormat}) - .toArray(Object[][]::new); - } - - @Test(dataProvider = "fileFormats") + @ParameterizedTest + @EnumSource(IcebergFileFormat.class) public void testRegisterTableWithTableLocation(IcebergFileFormat icebergFileFormat) { String tableName = "test_register_table_with_table_location_" + icebergFileFormat.name().toLowerCase(ENGLISH) + "_" + randomNameSuffix(); @@ -123,7 +137,37 @@ public void testRegisterTableWithTableLocation(IcebergFileFormat icebergFileForm assertUpdate(format("DROP TABLE %s", tableName)); } - @Test(dataProvider = "fileFormats") + @Test + public void testRegisterTableTrailingSlash() + { + testRegisterTableTrailingSlash("test_dir", "test_dir"); + testRegisterTableTrailingSlash("test_dir", "test_dir/"); + testRegisterTableTrailingSlash("test_dir/", "test_dir"); + testRegisterTableTrailingSlash("test_dir/", "test_dir/"); + } + + private void testRegisterTableTrailingSlash(String tableDir, String registeredTableDir) + { + String tableName = "test_register_table_trailing_slash_" + randomNameSuffix(); + + String tableLocation = format("%s/%s/%s", dataDir, tableName, tableDir); + String registeredTableLocation = format("%s/%s/%s", dataDir, tableName, registeredTableDir); + + assertUpdate(format("CREATE TABLE %s (a int) WITH (location = '%s')", tableName, tableLocation)); + assertUpdate(format("INSERT INTO %s VALUES 1", tableName), 1); + + // Drop table from metastore and use the same table name to register again with the metadata + dropTableFromMetastore(tableName); + + assertUpdate(format("CALL iceberg.system.register_table (CURRENT_SCHEMA, '%s', '%s')", tableName, registeredTableLocation)); + + assertThat(query("SELECT * FROM " + tableName)) + .matches("VALUES 1"); + assertUpdate("DROP TABLE " + tableName); + } + + @ParameterizedTest + @EnumSource(IcebergFileFormat.class) public void testRegisterPartitionedTable(IcebergFileFormat icebergFileFormat) 
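Above, each @Test(dataProvider = "fileFormats") becomes @ParameterizedTest with @EnumSource(IcebergFileFormat.class), so JUnit feeds every enum constant to the test and the hand-written Object[][] provider disappears. A self-contained sketch of the same pattern over a stand-in enum (the enum and the assertion are illustrative, not from the patch):

    import org.junit.jupiter.params.ParameterizedTest;
    import org.junit.jupiter.params.provider.EnumSource;

    import static org.assertj.core.api.Assertions.assertThat;

    class EnumSourceExample
    {
        // Stand-in for IcebergFileFormat; any enum works the same way.
        enum FileFormat { ORC, PARQUET, AVRO }

        // JUnit invokes this once per enum constant, replacing the TestNG
        // @DataProvider that streamed values() into Object[][].
        @ParameterizedTest
        @EnumSource(FileFormat.class)
        void testFormatName(FileFormat format)
        {
            assertThat(format.name()).isNotEmpty();
        }
    }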
{ String tableName = "test_register_partitioned_table_" + icebergFileFormat.name().toLowerCase(ENGLISH) + "_" + randomNameSuffix(); @@ -145,7 +189,8 @@ public void testRegisterPartitionedTable(IcebergFileFormat icebergFileFormat) assertUpdate("DROP TABLE " + tableName); } - @Test(dataProvider = "fileFormats") + @ParameterizedTest + @EnumSource(IcebergFileFormat.class) public void testRegisterTableWithComments(IcebergFileFormat icebergFileFormat) { String tableName = "test_register_table_with_comments_" + icebergFileFormat.name().toLowerCase(ENGLISH) + "_" + randomNameSuffix(); @@ -171,7 +216,8 @@ public void testRegisterTableWithComments(IcebergFileFormat icebergFileFormat) assertUpdate(format("DROP TABLE %s", tableName)); } - @Test(dataProvider = "fileFormats") + @ParameterizedTest + @EnumSource(IcebergFileFormat.class) public void testRegisterTableWithShowCreateTable(IcebergFileFormat icebergFileFormat) { String tableName = "test_register_table_with_show_create_table_" + icebergFileFormat.name().toLowerCase(ENGLISH) + "_" + randomNameSuffix(); @@ -191,7 +237,8 @@ public void testRegisterTableWithShowCreateTable(IcebergFileFormat icebergFileFo assertUpdate(format("DROP TABLE %s", tableName)); } - @Test(dataProvider = "fileFormats") + @ParameterizedTest + @EnumSource(IcebergFileFormat.class) public void testRegisterTableWithReInsert(IcebergFileFormat icebergFileFormat) { String tableName = "test_register_table_with_re_insert_" + icebergFileFormat.name().toLowerCase(ENGLISH) + "_" + randomNameSuffix(); @@ -215,7 +262,8 @@ public void testRegisterTableWithReInsert(IcebergFileFormat icebergFileFormat) assertUpdate(format("DROP TABLE %s", tableName)); } - @Test(dataProvider = "fileFormats") + @ParameterizedTest + @EnumSource(IcebergFileFormat.class) public void testRegisterTableWithDroppedTable(IcebergFileFormat icebergFileFormat) { String tableName = "test_register_table_with_dropped_table_" + icebergFileFormat.name().toLowerCase(ENGLISH) + "_" + randomNameSuffix(); @@ -233,7 +281,8 @@ public void testRegisterTableWithDroppedTable(IcebergFileFormat icebergFileForma ".*No versioned metadata file exists at location.*"); } - @Test(dataProvider = "fileFormats") + @ParameterizedTest + @EnumSource(IcebergFileFormat.class) public void testRegisterTableWithDifferentTableName(IcebergFileFormat icebergFileFormat) { String tableName = "test_register_table_with_different_table_name_old_" + icebergFileFormat.name().toLowerCase(ENGLISH) + "_" + randomNameSuffix(); @@ -258,7 +307,8 @@ public void testRegisterTableWithDifferentTableName(IcebergFileFormat icebergFil assertUpdate(format("DROP TABLE %s", tableNameNew)); } - @Test(dataProvider = "fileFormats") + @ParameterizedTest + @EnumSource(IcebergFileFormat.class) public void testRegisterTableWithMetadataFile(IcebergFileFormat icebergFileFormat) { String tableName = "test_register_table_with_metadata_file_" + icebergFileFormat.name().toLowerCase(ENGLISH) + "_" + randomNameSuffix(); @@ -449,7 +499,7 @@ public void testRegisterHadoopTableAndRead() // create hadoop table String hadoopTableName = "hadoop_table_" + randomNameSuffix(); String hadoopTableLocation = metastoreDir.getPath() + "/" + hadoopTableName; - HadoopTables hadoopTables = new HadoopTables(newEmptyConfiguration()); + HadoopTables hadoopTables = new HadoopTables(new Configuration(false)); Schema schema = new Schema(ImmutableList.of( Types.NestedField.optional(1, "id", Types.IntegerType.get()), Types.NestedField.optional(2, "name", Types.StringType.get()))); @@ -493,6 +543,20 @@ public void 
testRegisterHadoopTableAndRead() assertUpdate("DROP TABLE " + tempTableName); } + @Test + void testRegisterTableAccessControl() + { + String tableName = "test_register_table_access_control_" + randomNameSuffix(); + assertUpdate("CREATE TABLE " + tableName + " AS SELECT 1 a", 1); + String tableLocation = getTableLocation(tableName); + assertUpdate("CALL system.unregister_table(CURRENT_SCHEMA, '" + tableName + "')"); + + assertAccessDenied( + "CALL system.register_table(CURRENT_SCHEMA, '" + tableName + "', '" + tableLocation + "')", + "Cannot create table .*", + privilege(tableName, CREATE_TABLE)); + } + private String getTableLocation(String tableName) { Pattern locationPattern = Pattern.compile(".*location = '(.*?)'.*", Pattern.DOTALL); @@ -511,11 +575,6 @@ private void dropTableFromMetastore(String tableName) assertThat(metastore.getTable(getSession().getSchema().orElseThrow(), tableName)).as("Table in metastore should be dropped").isEmpty(); } - private String getTableComment(String tableName) - { - return (String) computeScalar("SELECT comment FROM system.metadata.table_comments WHERE catalog_name = 'iceberg' AND schema_name = '" + getSession().getSchema().orElseThrow() + "' AND table_name = '" + tableName + "'"); - } - private String getColumnComment(String tableName, String columnName) { return (String) computeScalar("SELECT comment FROM information_schema.columns WHERE table_schema = '" + getSession().getSchema().orElseThrow() + "' AND table_name = '" + tableName + "' AND column_name = '" + columnName + "'"); diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergSecurityConfig.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergSecurityConfig.java index 7fdf9f6dcde0..7638da2037df 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergSecurityConfig.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergSecurityConfig.java @@ -14,7 +14,7 @@ package io.trino.plugin.iceberg; import com.google.common.collect.ImmutableMap; -import org.testng.annotations.Test; +import org.junit.jupiter.api.Test; import java.util.Map; diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergSplitSource.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergSplitSource.java index af02fe6ec1ef..7457266c7f3b 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergSplitSource.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergSplitSource.java @@ -18,15 +18,23 @@ import com.google.common.collect.ImmutableSet; import io.airlift.units.Duration; import io.trino.filesystem.TrinoFileSystemFactory; -import io.trino.plugin.base.CatalogName; +import io.trino.filesystem.cache.DefaultCachingHostAddressProvider; +import io.trino.metastore.HiveMetastore; +import io.trino.metastore.cache.CachingHiveMetastore; import io.trino.plugin.hive.TrinoViewHiveMetastore; -import io.trino.plugin.hive.metastore.HiveMetastore; -import io.trino.plugin.hive.metastore.cache.CachingHiveMetastore; +import io.trino.plugin.hive.orc.OrcReaderConfig; +import io.trino.plugin.hive.orc.OrcWriterConfig; +import io.trino.plugin.hive.parquet.ParquetReaderConfig; +import io.trino.plugin.hive.parquet.ParquetWriterConfig; import io.trino.plugin.iceberg.catalog.TrinoCatalog; import io.trino.plugin.iceberg.catalog.file.FileMetastoreTableOperationsProvider; import io.trino.plugin.iceberg.catalog.hms.TrinoHiveCatalog; +import 
io.trino.plugin.iceberg.catalog.rest.DefaultIcebergFileSystemFactory; +import io.trino.spi.SplitWeight; +import io.trino.spi.catalog.CatalogName; import io.trino.spi.connector.CatalogHandle; import io.trino.spi.connector.ColumnHandle; +import io.trino.spi.connector.ConnectorSession; import io.trino.spi.connector.DynamicFilter; import io.trino.spi.connector.SchemaTableName; import io.trino.spi.predicate.Domain; @@ -36,45 +44,73 @@ import io.trino.spi.predicate.ValueSet; import io.trino.spi.type.TestingTypeManager; import io.trino.testing.AbstractTestQueryFramework; -import io.trino.testing.DistributedQueryRunner; import io.trino.testing.QueryRunner; +import io.trino.testing.TestingConnectorSession; +import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.PartitionSpecParser; import org.apache.iceberg.SchemaParser; import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.data.parquet.GenericParquetWriter; +import org.apache.iceberg.deletes.PositionDelete; +import org.apache.iceberg.deletes.PositionDeleteWriter; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.metrics.InMemoryMetricsReporter; +import org.apache.iceberg.parquet.Parquet; import org.apache.iceberg.types.Conversions; import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types; -import org.testng.annotations.AfterClass; -import org.testng.annotations.Test; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.TestInstance; +import org.junit.jupiter.api.Timeout; +import java.io.Closeable; import java.io.File; import java.io.IOException; import java.nio.ByteBuffer; import java.nio.file.Files; +import java.util.List; import java.util.Map; import java.util.Optional; import java.util.OptionalLong; import java.util.Set; +import java.util.UUID; import java.util.concurrent.CompletableFuture; -import java.util.concurrent.TimeUnit; import static com.google.common.io.MoreFiles.deleteRecursively; import static com.google.common.io.RecursiveDeleteOption.ALLOW_INSECURE; -import static io.trino.plugin.hive.metastore.cache.CachingHiveMetastore.memoizeMetastore; -import static io.trino.plugin.hive.metastore.file.TestingFileHiveMetastore.createTestingFileHiveMetastore; +import static com.google.common.util.concurrent.MoreExecutors.directExecutor; +import static com.google.common.util.concurrent.MoreExecutors.newDirectExecutorService; +import static io.trino.metastore.cache.CachingHiveMetastore.createPerTransactionCache; +import static io.trino.plugin.iceberg.IcebergSplitSource.createFileStatisticsDomain; +import static io.trino.plugin.iceberg.IcebergTestUtils.FILE_IO_FACTORY; import static io.trino.plugin.iceberg.IcebergTestUtils.getFileSystemFactory; +import static io.trino.plugin.iceberg.IcebergTestUtils.getHiveMetastore; +import static io.trino.plugin.iceberg.util.EqualityDeleteUtils.writeEqualityDeleteForTable; import static io.trino.spi.connector.Constraint.alwaysTrue; import static io.trino.spi.type.BigintType.BIGINT; -import static io.trino.testing.TestingConnectorSession.SESSION; import static io.trino.tpch.TpchTable.NATION; import static java.util.concurrent.TimeUnit.SECONDS; import static org.assertj.core.api.Assertions.assertThat; -import static org.testng.Assert.assertFalse; -import static org.testng.Assert.assertTrue; +import static org.junit.jupiter.api.TestInstance.Lifecycle.PER_CLASS; 
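In the testIncompleteDynamicFilterTimeout hunk below, the dynamic filter's isBlocked() no longer returns CompletableFuture.runAsync(...) wrapping a one-hour sleep; it returns a plain future that the test completes in a finally block, so no background thread is left sleeping once the timeout assertion has run. A self-contained sketch of that pattern (class and variable names are illustrative):

    import java.util.concurrent.CompletableFuture;

    class ManuallyCompletedFutureExample
    {
        public static void main(String[] args)
        {
            // A future that never completes on its own stands in for "filter still incomplete".
            CompletableFuture<Void> isBlocked = new CompletableFuture<>();
            try {
                // Code under test can wait on the future with its own timeout;
                // it observes the future as not done for as long as the test wants.
                System.out.println("blocked: " + !isBlocked.isDone());
            }
            finally {
                // Complete it explicitly so nothing waits on it after the test,
                // instead of abandoning a thread in a one-hour sleep.
                isBlocked.complete(null);
            }
            System.out.println("blocked: " + !isBlocked.isDone());
        }
    }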
+@TestInstance(PER_CLASS) public class TestIcebergSplitSource extends AbstractTestQueryFramework { + private static final ConnectorSession SESSION = TestingConnectorSession.builder() + .setPropertyMetadata(new IcebergSessionProperties( + new IcebergConfig(), + new OrcReaderConfig(), + new OrcWriterConfig(), + new ParquetReaderConfig(), + new ParquetWriterConfig()) + .getSessionProperties()) + .build(); + private File metastoreDir; private TrinoFileSystemFactory fileSystemFactory; private TrinoCatalog catalog; @@ -85,66 +121,56 @@ protected QueryRunner createQueryRunner() { File tempDir = Files.createTempDirectory("test_iceberg_split_source").toFile(); this.metastoreDir = new File(tempDir, "iceberg_data"); - HiveMetastore metastore = createTestingFileHiveMetastore(metastoreDir); - DistributedQueryRunner queryRunner = IcebergQueryRunner.builder() + QueryRunner queryRunner = IcebergQueryRunner.builder() .setInitialTables(NATION) .setMetastoreDirectory(metastoreDir) .build(); + HiveMetastore metastore = getHiveMetastore(queryRunner); + this.fileSystemFactory = getFileSystemFactory(queryRunner); - CachingHiveMetastore cachingHiveMetastore = memoizeMetastore(metastore, 1000); + CachingHiveMetastore cachingHiveMetastore = createPerTransactionCache(metastore, 1000); this.catalog = new TrinoHiveCatalog( new CatalogName("hive"), cachingHiveMetastore, new TrinoViewHiveMetastore(cachingHiveMetastore, false, "trino-version", "test"), fileSystemFactory, + FILE_IO_FACTORY, new TestingTypeManager(), - new FileMetastoreTableOperationsProvider(fileSystemFactory), + new FileMetastoreTableOperationsProvider(fileSystemFactory, FILE_IO_FACTORY), false, false, - false); + false, + new IcebergConfig().isHideMaterializedViewStorageTable(), + directExecutor()); return queryRunner; } - @AfterClass(alwaysRun = true) + @AfterAll public void tearDown() throws IOException { deleteRecursively(metastoreDir.getParentFile().toPath(), ALLOW_INSECURE); } - @Test(timeOut = 30_000) + @Test + @Timeout(30) public void testIncompleteDynamicFilterTimeout() throws Exception { long startMillis = System.currentTimeMillis(); SchemaTableName schemaTableName = new SchemaTableName("tpch", "nation"); Table nationTable = catalog.loadTable(SESSION, schemaTableName); - IcebergTableHandle tableHandle = new IcebergTableHandle( - CatalogHandle.fromId("iceberg:NORMAL:v12345"), - schemaTableName.getSchemaName(), - schemaTableName.getTableName(), - TableType.DATA, - Optional.empty(), - SchemaParser.toJson(nationTable.schema()), - Optional.of(PartitionSpecParser.toJson(nationTable.spec())), - 1, - TupleDomain.all(), - TupleDomain.all(), - OptionalLong.empty(), - ImmutableSet.of(), - Optional.empty(), - nationTable.location(), - nationTable.properties(), - false, - Optional.empty()); + IcebergTableHandle tableHandle = createTableHandle(schemaTableName, nationTable, TupleDomain.all()); + CompletableFuture isBlocked = new CompletableFuture<>(); try (IcebergSplitSource splitSource = new IcebergSplitSource( - fileSystemFactory, + new DefaultIcebergFileSystemFactory(fileSystemFactory), SESSION, tableHandle, + nationTable, nationTable.newScan(), Optional.empty(), new DynamicFilter() @@ -158,14 +184,7 @@ public Set getColumnsCovered() @Override public CompletableFuture isBlocked() { - return CompletableFuture.runAsync(() -> { - try { - TimeUnit.HOURS.sleep(1); - } - catch (InterruptedException e) { - throw new IllegalStateException(e); - } - }); + return isBlocked; } @Override @@ -190,7 +209,10 @@ public TupleDomain getCurrentPredicate() alwaysTrue(), new 
TestingTypeManager(), false, - new IcebergConfig().getMinimumAssignedSplitWeight())) { + new IcebergConfig().getMinimumAssignedSplitWeight(), + new DefaultCachingHostAddressProvider(), + new InMemoryMetricsReporter(), + newDirectExecutorService())) { ImmutableList.Builder splits = ImmutableList.builder(); while (!splitSource.isFinished()) { splitSource.getNextBatch(100).get() @@ -200,189 +222,261 @@ public TupleDomain getCurrentPredicate() .forEach(splits::add); } assertThat(splits.build().size()).isGreaterThan(0); - assertTrue(splitSource.isFinished()); + assertThat(splitSource.isFinished()).isTrue(); assertThat(System.currentTimeMillis() - startMillis) .as("IcebergSplitSource failed to wait for dynamicFilteringWaitTimeout") .isGreaterThanOrEqualTo(2000); } + finally { + isBlocked.complete(null); + } + } + + @Test + public void testFileStatisticsDomain() + throws Exception + { + SchemaTableName schemaTableName = new SchemaTableName("tpch", "nation"); + Table nationTable = catalog.loadTable(SESSION, schemaTableName); + IcebergTableHandle tableHandle = createTableHandle(schemaTableName, nationTable, TupleDomain.all()); + + IcebergSplit split = generateSplit(nationTable, tableHandle, DynamicFilter.EMPTY); + assertThat(split.getFileStatisticsDomain()).isEqualTo(TupleDomain.all()); + + IcebergColumnHandle nationKey = IcebergColumnHandle.optional(new ColumnIdentity(1, "nationkey", ColumnIdentity.TypeCategory.PRIMITIVE, ImmutableList.of())) + .columnType(BIGINT) + .build(); + tableHandle = createTableHandle(schemaTableName, nationTable, TupleDomain.fromFixedValues(ImmutableMap.of(nationKey, NullableValue.of(BIGINT, 1L)))); + split = generateSplit(nationTable, tableHandle, DynamicFilter.EMPTY); + assertThat(split.getFileStatisticsDomain()).isEqualTo(TupleDomain.withColumnDomains( + ImmutableMap.of(nationKey, Domain.create(ValueSet.ofRanges(Range.range(BIGINT, 0L, true, 24L, true)), false)))); + + IcebergColumnHandle regionKey = IcebergColumnHandle.optional(new ColumnIdentity(3, "regionkey", ColumnIdentity.TypeCategory.PRIMITIVE, ImmutableList.of())) + .columnType(BIGINT) + .build(); + split = generateSplit(nationTable, tableHandle, new DynamicFilter() + { + @Override + public Set getColumnsCovered() + { + return ImmutableSet.of(regionKey); + } + + @Override + public CompletableFuture isBlocked() + { + return NOT_BLOCKED; + } + + @Override + public boolean isComplete() + { + return false; + } + + @Override + public boolean isAwaitable() + { + return true; + } + + @Override + public TupleDomain getCurrentPredicate() + { + return TupleDomain.all(); + } + }); + assertThat(split.getFileStatisticsDomain()).isEqualTo(TupleDomain.withColumnDomains( + ImmutableMap.of( + nationKey, Domain.create(ValueSet.ofRanges(Range.range(BIGINT, 0L, true, 24L, true)), false), + regionKey, Domain.create(ValueSet.ofRanges(Range.range(BIGINT, 0L, true, 4L, true)), false)))); } @Test public void testBigintPartitionPruning() { - IcebergColumnHandle bigintColumn = new IcebergColumnHandle( - new ColumnIdentity(1, "name", ColumnIdentity.TypeCategory.PRIMITIVE, ImmutableList.of()), - BIGINT, - ImmutableList.of(), - BIGINT, - Optional.empty()); - assertFalse(IcebergSplitSource.partitionMatchesPredicate( + IcebergColumnHandle bigintColumn = IcebergColumnHandle.optional(new ColumnIdentity(1, "name", ColumnIdentity.TypeCategory.PRIMITIVE, ImmutableList.of())) + .columnType(BIGINT) + .build(); + assertThat(IcebergSplitSource.partitionMatchesPredicate( ImmutableSet.of(bigintColumn), () -> ImmutableMap.of(bigintColumn, 
NullableValue.of(BIGINT, 1000L)), - TupleDomain.fromFixedValues(ImmutableMap.of(bigintColumn, NullableValue.of(BIGINT, 100L))))); - assertTrue(IcebergSplitSource.partitionMatchesPredicate( + TupleDomain.fromFixedValues(ImmutableMap.of(bigintColumn, NullableValue.of(BIGINT, 100L))))).isFalse(); + assertThat(IcebergSplitSource.partitionMatchesPredicate( ImmutableSet.of(bigintColumn), () -> ImmutableMap.of(bigintColumn, NullableValue.of(BIGINT, 1000L)), - TupleDomain.fromFixedValues(ImmutableMap.of(bigintColumn, NullableValue.of(BIGINT, 1000L))))); - assertFalse(IcebergSplitSource.partitionMatchesPredicate( + TupleDomain.fromFixedValues(ImmutableMap.of(bigintColumn, NullableValue.of(BIGINT, 1000L))))).isTrue(); + assertThat(IcebergSplitSource.partitionMatchesPredicate( ImmutableSet.of(bigintColumn), () -> ImmutableMap.of(bigintColumn, NullableValue.of(BIGINT, 1000L)), - TupleDomain.fromFixedValues(ImmutableMap.of(bigintColumn, NullableValue.asNull(BIGINT))))); + TupleDomain.fromFixedValues(ImmutableMap.of(bigintColumn, NullableValue.asNull(BIGINT))))).isFalse(); } @Test public void testBigintStatisticsPruning() { - IcebergColumnHandle bigintColumn = new IcebergColumnHandle( - new ColumnIdentity(1, "name", ColumnIdentity.TypeCategory.PRIMITIVE, ImmutableList.of()), - BIGINT, - ImmutableList.of(), - BIGINT, - Optional.empty()); + IcebergColumnHandle bigintColumn = IcebergColumnHandle.optional(new ColumnIdentity(1, "name", ColumnIdentity.TypeCategory.PRIMITIVE, ImmutableList.of())) + .columnType(BIGINT) + .build(); Map primitiveTypes = ImmutableMap.of(1, Types.LongType.get()); Map lowerBound = ImmutableMap.of(1, Conversions.toByteBuffer(Types.LongType.get(), 1000L)); Map upperBound = ImmutableMap.of(1, Conversions.toByteBuffer(Types.LongType.get(), 2000L)); + TupleDomain domainLowerUpperBound = TupleDomain.withColumnDomains( + ImmutableMap.of(bigintColumn, Domain.create(ValueSet.ofRanges(Range.range(BIGINT, 1000L, true, 2000L, true)), false))); + List predicatedColumns = ImmutableList.of(bigintColumn); + + assertThat(createFileStatisticsDomain(primitiveTypes, lowerBound, upperBound, ImmutableMap.of(1, 0L), predicatedColumns)) + .isEqualTo(domainLowerUpperBound); - assertFalse(IcebergSplitSource.fileMatchesPredicate( - primitiveTypes, - TupleDomain.fromFixedValues(ImmutableMap.of(bigintColumn, NullableValue.of(BIGINT, 0L))), - lowerBound, - upperBound, - ImmutableMap.of(1, 0L))); - assertTrue(IcebergSplitSource.fileMatchesPredicate( - primitiveTypes, - TupleDomain.fromFixedValues(ImmutableMap.of(bigintColumn, NullableValue.of(BIGINT, 1000L))), - lowerBound, - upperBound, - ImmutableMap.of(1, 0L))); - assertTrue(IcebergSplitSource.fileMatchesPredicate( - primitiveTypes, - TupleDomain.fromFixedValues(ImmutableMap.of(bigintColumn, NullableValue.of(BIGINT, 1500L))), - lowerBound, - upperBound, - ImmutableMap.of(1, 0L))); - assertTrue(IcebergSplitSource.fileMatchesPredicate( - primitiveTypes, - TupleDomain.fromFixedValues(ImmutableMap.of(bigintColumn, NullableValue.of(BIGINT, 2000L))), - lowerBound, - upperBound, - ImmutableMap.of(1, 0L))); - assertFalse(IcebergSplitSource.fileMatchesPredicate( - primitiveTypes, - TupleDomain.fromFixedValues(ImmutableMap.of(bigintColumn, NullableValue.of(BIGINT, 3000L))), - lowerBound, - upperBound, - ImmutableMap.of(1, 0L))); - - Domain outsideStatisticsRangeAllowNulls = Domain.create(ValueSet.ofRanges(Range.range(BIGINT, 0L, true, 100L, true)), true); - assertFalse(IcebergSplitSource.fileMatchesPredicate( - primitiveTypes, - 
TupleDomain.withColumnDomains(ImmutableMap.of(bigintColumn, outsideStatisticsRangeAllowNulls)), - lowerBound, - upperBound, - ImmutableMap.of(1, 0L))); - assertTrue(IcebergSplitSource.fileMatchesPredicate( - primitiveTypes, - TupleDomain.withColumnDomains(ImmutableMap.of(bigintColumn, outsideStatisticsRangeAllowNulls)), - lowerBound, - upperBound, - ImmutableMap.of(1, 1L))); - - Domain outsideStatisticsRangeNoNulls = Domain.create(ValueSet.ofRanges(Range.range(BIGINT, 0L, true, 100L, true)), false); - assertFalse(IcebergSplitSource.fileMatchesPredicate( - primitiveTypes, - TupleDomain.withColumnDomains(ImmutableMap.of(bigintColumn, outsideStatisticsRangeNoNulls)), - lowerBound, - upperBound, - ImmutableMap.of(1, 0L))); - assertFalse(IcebergSplitSource.fileMatchesPredicate( - primitiveTypes, - TupleDomain.withColumnDomains(ImmutableMap.of(bigintColumn, outsideStatisticsRangeNoNulls)), - lowerBound, - upperBound, - ImmutableMap.of(1, 1L))); - - Domain insideStatisticsRange = Domain.create(ValueSet.ofRanges(Range.range(BIGINT, 1001L, true, 1002L, true)), false); - assertTrue(IcebergSplitSource.fileMatchesPredicate( - primitiveTypes, - TupleDomain.withColumnDomains(ImmutableMap.of(bigintColumn, insideStatisticsRange)), - lowerBound, - upperBound, - ImmutableMap.of(1, 0L))); - assertTrue(IcebergSplitSource.fileMatchesPredicate( - primitiveTypes, - TupleDomain.withColumnDomains(ImmutableMap.of(bigintColumn, insideStatisticsRange)), - lowerBound, - upperBound, - ImmutableMap.of(1, 1L))); - - Domain overlappingStatisticsRange = Domain.create(ValueSet.ofRanges(Range.range(BIGINT, 990L, true, 1010L, true)), false); - assertTrue(IcebergSplitSource.fileMatchesPredicate( - primitiveTypes, - TupleDomain.withColumnDomains(ImmutableMap.of(bigintColumn, overlappingStatisticsRange)), - lowerBound, - upperBound, - ImmutableMap.of(1, 0L))); - assertTrue(IcebergSplitSource.fileMatchesPredicate( - primitiveTypes, - TupleDomain.withColumnDomains(ImmutableMap.of(bigintColumn, overlappingStatisticsRange)), - lowerBound, - upperBound, - ImmutableMap.of(1, 1L))); + TupleDomain domainLowerUpperBoundAllowNulls = TupleDomain.withColumnDomains( + ImmutableMap.of(bigintColumn, Domain.create(ValueSet.ofRanges(Range.range(BIGINT, 1000L, true, 2000L, true)), true))); + assertThat(createFileStatisticsDomain(primitiveTypes, lowerBound, upperBound, ImmutableMap.of(1, 1L), predicatedColumns)) + .isEqualTo(domainLowerUpperBoundAllowNulls); } @Test public void testNullStatisticsMaps() { - IcebergColumnHandle bigintColumn = new IcebergColumnHandle( - new ColumnIdentity(1, "name", ColumnIdentity.TypeCategory.PRIMITIVE, ImmutableList.of()), - BIGINT, - ImmutableList.of(), - BIGINT, - Optional.empty()); + IcebergColumnHandle bigintColumn = IcebergColumnHandle.optional(new ColumnIdentity(1, "name", ColumnIdentity.TypeCategory.PRIMITIVE, ImmutableList.of())) + .columnType(BIGINT) + .build(); Map primitiveTypes = ImmutableMap.of(1, Types.LongType.get()); Map lowerBound = ImmutableMap.of(1, Conversions.toByteBuffer(Types.LongType.get(), -1000L)); Map upperBound = ImmutableMap.of(1, Conversions.toByteBuffer(Types.LongType.get(), 2000L)); - TupleDomain domainOfZero = TupleDomain.fromFixedValues(ImmutableMap.of(bigintColumn, NullableValue.of(BIGINT, 0L))); - - assertTrue(IcebergSplitSource.fileMatchesPredicate( - primitiveTypes, - domainOfZero, - null, - upperBound, - ImmutableMap.of(1, 0L))); - assertTrue(IcebergSplitSource.fileMatchesPredicate( - primitiveTypes, - domainOfZero, - ImmutableMap.of(), - upperBound, - ImmutableMap.of(1, 
0L))); - - assertTrue(IcebergSplitSource.fileMatchesPredicate( - primitiveTypes, - domainOfZero, - lowerBound, - null, - ImmutableMap.of(1, 0L))); - assertTrue(IcebergSplitSource.fileMatchesPredicate( - primitiveTypes, - domainOfZero, - lowerBound, - ImmutableMap.of(), - ImmutableMap.of(1, 0L))); - - TupleDomain onlyNull = TupleDomain.withColumnDomains(ImmutableMap.of(bigintColumn, Domain.onlyNull(BIGINT))); - assertTrue(IcebergSplitSource.fileMatchesPredicate( - primitiveTypes, - onlyNull, - ImmutableMap.of(), - ImmutableMap.of(), - null)); - assertTrue(IcebergSplitSource.fileMatchesPredicate( - primitiveTypes, - onlyNull, - ImmutableMap.of(), - ImmutableMap.of(), - ImmutableMap.of())); + TupleDomain domainLessThanUpperBound = TupleDomain.withColumnDomains( + ImmutableMap.of(bigintColumn, Domain.create(ValueSet.ofRanges(Range.lessThanOrEqual(BIGINT, 2000L)), false))); + List predicatedColumns = ImmutableList.of(bigintColumn); + + assertThat(createFileStatisticsDomain(primitiveTypes, null, upperBound, ImmutableMap.of(1, 0L), predicatedColumns)) + .isEqualTo(domainLessThanUpperBound); + assertThat(createFileStatisticsDomain(primitiveTypes, ImmutableMap.of(), upperBound, ImmutableMap.of(1, 0L), predicatedColumns)) + .isEqualTo(domainLessThanUpperBound); + + TupleDomain domainGreaterThanLessBound = TupleDomain.withColumnDomains( + ImmutableMap.of(bigintColumn, Domain.create(ValueSet.ofRanges(Range.greaterThanOrEqual(BIGINT, -1000L)), false))); + assertThat(createFileStatisticsDomain(primitiveTypes, lowerBound, null, ImmutableMap.of(1, 0L), predicatedColumns)) + .isEqualTo(domainGreaterThanLessBound); + assertThat(createFileStatisticsDomain(primitiveTypes, lowerBound, ImmutableMap.of(), ImmutableMap.of(1, 0L), predicatedColumns)) + .isEqualTo(domainGreaterThanLessBound); + + assertThat(createFileStatisticsDomain(primitiveTypes, ImmutableMap.of(), ImmutableMap.of(), null, predicatedColumns)) + .isEqualTo(TupleDomain.all()); + assertThat(createFileStatisticsDomain(primitiveTypes, ImmutableMap.of(), ImmutableMap.of(), ImmutableMap.of(), predicatedColumns)) + .isEqualTo(TupleDomain.all()); + assertThat(createFileStatisticsDomain(primitiveTypes, ImmutableMap.of(), ImmutableMap.of(), ImmutableMap.of(1, 1L), predicatedColumns)) + .isEqualTo(TupleDomain.all()); + + assertThat(createFileStatisticsDomain(primitiveTypes, ImmutableMap.of(), ImmutableMap.of(), ImmutableMap.of(1, 0L), predicatedColumns)) + .isEqualTo(TupleDomain.withColumnDomains(ImmutableMap.of(bigintColumn, Domain.notNull(BIGINT)))); + } + + @Test + public void testSplitWeight() + throws Exception + { + SchemaTableName schemaTableName = new SchemaTableName("tpch", "nation"); + Table nationTable = catalog.loadTable(SESSION, schemaTableName); + // Decrease target split size so that changes in split weight are significant enough to be detected + nationTable.updateProperties() + .set(TableProperties.SPLIT_SIZE, "10000") + .commit(); + IcebergTableHandle tableHandle = createTableHandle(schemaTableName, nationTable, TupleDomain.all()); + + IcebergSplit split = generateSplit(nationTable, tableHandle, DynamicFilter.EMPTY); + SplitWeight weightWithoutDelete = split.getSplitWeight(); + + String dataFilePath = (String) computeActual("SELECT file_path FROM \"" + schemaTableName.getTableName() + "$files\" LIMIT 1").getOnlyValue(); + + // Write position delete file + FileIO fileIo = FILE_IO_FACTORY.create(fileSystemFactory.create(SESSION)); + PositionDeleteWriter writer = Parquet.writeDeletes(fileIo.newOutputFile("local:///delete_file_" + 
UUID.randomUUID())) + .createWriterFunc(GenericParquetWriter::create) + .forTable(nationTable) + .overwrite() + .rowSchema(nationTable.schema()) + .withSpec(PartitionSpec.unpartitioned()) + .buildPositionWriter(); + PositionDelete positionDelete = PositionDelete.create(); + PositionDelete record = positionDelete.set(dataFilePath, 0, GenericRecord.create(nationTable.schema())); + try (Closeable ignored = writer) { + writer.write(record); + } + nationTable.newRowDelta().addDeletes(writer.toDeleteFile()).commit(); + + split = generateSplit(nationTable, tableHandle, DynamicFilter.EMPTY); + SplitWeight splitWeightWithPositionDelete = split.getSplitWeight(); + assertThat(splitWeightWithPositionDelete.getRawValue()).isGreaterThan(weightWithoutDelete.getRawValue()); + + // Write equality delete file + writeEqualityDeleteForTable( + nationTable, + fileSystemFactory, + Optional.of(nationTable.spec()), + Optional.of(new PartitionData(new Long[] {1L})), + ImmutableMap.of("regionkey", 1L), + Optional.empty()); + + split = generateSplit(nationTable, tableHandle, DynamicFilter.EMPTY); + assertThat(split.getSplitWeight().getRawValue()).isGreaterThan(splitWeightWithPositionDelete.getRawValue()); + } + + private IcebergSplit generateSplit(Table nationTable, IcebergTableHandle tableHandle, DynamicFilter dynamicFilter) + throws Exception + { + try (IcebergSplitSource splitSource = new IcebergSplitSource( + new DefaultIcebergFileSystemFactory(fileSystemFactory), + SESSION, + tableHandle, + nationTable, + nationTable.newScan(), + Optional.empty(), + dynamicFilter, + new Duration(0, SECONDS), + alwaysTrue(), + new TestingTypeManager(), + false, + 0, + new DefaultCachingHostAddressProvider(), + new InMemoryMetricsReporter(), + newDirectExecutorService())) { + ImmutableList.Builder builder = ImmutableList.builder(); + while (!splitSource.isFinished()) { + splitSource.getNextBatch(100).get() + .getSplits() + .stream() + .map(IcebergSplit.class::cast) + .forEach(builder::add); + } + List splits = builder.build(); + assertThat(splits).hasSize(1); + assertThat(splitSource.isFinished()).isTrue(); + + return splits.getFirst(); + } + } + + private static IcebergTableHandle createTableHandle(SchemaTableName schemaTableName, Table nationTable, TupleDomain unenforcedPredicate) + { + return new IcebergTableHandle( + CatalogHandle.fromId("iceberg:NORMAL:v12345"), + schemaTableName.getSchemaName(), + schemaTableName.getTableName(), + TableType.DATA, + Optional.empty(), + SchemaParser.toJson(nationTable.schema()), + Optional.of(PartitionSpecParser.toJson(nationTable.spec())), + 1, + unenforcedPredicate, + TupleDomain.all(), + OptionalLong.empty(), + ImmutableSet.of(), + Optional.empty(), + nationTable.location(), + nationTable.properties(), + Optional.empty(), + false, + Optional.empty(), + ImmutableSet.of(), + Optional.of(false)); } } diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergStatistics.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergStatistics.java index 30bb2574bb8b..aaa5af59e926 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergStatistics.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergStatistics.java @@ -17,14 +17,16 @@ import com.google.common.math.IntMath; import io.trino.Session; import io.trino.testing.AbstractTestQueryFramework; -import io.trino.testing.DataProviders; import io.trino.testing.QueryRunner; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; 
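In the TestIcebergStatistics hunks below, tests that previously pulled a boolean from DataProviders.trueFalse switch to @ParameterizedTest with @ValueSource(booleans = {true, false}). A self-contained sketch of the pattern (the table-name assertion is illustrative):

    import org.junit.jupiter.params.ParameterizedTest;
    import org.junit.jupiter.params.provider.ValueSource;

    import static org.assertj.core.api.Assertions.assertThat;

    class ValueSourceBooleanExample
    {
        // Runs twice, once with true and once with false, replacing the
        // TestNG trueFalse data provider.
        @ParameterizedTest
        @ValueSource(booleans = {true, false})
        void testWithFlag(boolean collectStatsOnWrite)
        {
            String tableName = "test_analyze_" + collectStatsOnWrite;
            assertThat(tableName).endsWith(String.valueOf(collectStatsOnWrite));
        }
    }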
+import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.MethodSource; +import org.junit.jupiter.params.provider.ValueSource; import java.util.List; import static com.google.common.collect.ImmutableList.toImmutableList; +import static com.google.common.collect.MoreCollectors.onlyElement; import static io.trino.plugin.iceberg.IcebergSessionProperties.COLLECT_EXTENDED_STATISTICS_ON_WRITE; import static io.trino.plugin.iceberg.IcebergSessionProperties.EXPIRE_SNAPSHOTS_MIN_RETENTION; import static io.trino.testing.DataProviders.cartesianProduct; @@ -37,9 +39,6 @@ import static java.math.RoundingMode.UP; import static java.util.stream.Collectors.joining; import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; -import static org.testng.Assert.assertEquals; -import static org.testng.Assert.assertNotEquals; public class TestIcebergStatistics extends AbstractTestQueryFramework @@ -53,19 +52,21 @@ protected QueryRunner createQueryRunner() .build(); } - @Test(dataProviderClass = DataProviders.class, dataProvider = "trueFalse") + @ParameterizedTest + @ValueSource(booleans = {true, false}) public void testAnalyze(boolean collectOnStatsOnWrites) { Session writeSession = withStatsOnWrite(getSession(), collectOnStatsOnWrites); String tableName = "test_analyze_" + collectOnStatsOnWrites; assertUpdate(writeSession, "CREATE TABLE " + tableName + " AS SELECT * FROM tpch.sf1.nation", 25); - String goodStatsInitial = """ + String goodStatsInitial = + """ VALUES ('nationkey', null, 25, 0, null, '0', '24'), ('regionkey', null, 5, 0, null, '0', '4'), - ('comment', 2178.0, 25, 0, null, null, null), - ('name', 594.0, 25, 0, null, null, null), + ('comment', 2162.0, 25, 0, null, null, null), + ('name', 583.0, 25, 0, null, null, null), (null, null, null, null, 25, null, null)"""; if (collectOnStatsOnWrites) { @@ -75,12 +76,12 @@ public void testAnalyze(boolean collectOnStatsOnWrites) assertQuery( "SHOW STATS FOR " + tableName, """ - VALUES - ('nationkey', null, null, 0, null, '0', '24'), - ('regionkey', null, null, 0, null, '0', '4'), - ('comment', 2178.0, null, 0, null, null, null), - ('name', 594.0, null, 0, null, null, null), - (null, null, null, null, 25, null, null)"""); + VALUES + ('nationkey', null, null, 0, null, '0', '24'), + ('regionkey', null, null, 0, null, '0', '4'), + ('comment', 2162.0, null, 0, null, null, null), + ('name', 583.0, null, 0, null, null, null), + (null, null, null, null, 25, null, null)"""); } assertUpdate("ANALYZE " + tableName); @@ -92,25 +93,28 @@ public void testAnalyze(boolean collectOnStatsOnWrites) // insert one more copy; should not influence stats other than rowcount assertUpdate(writeSession, "INSERT INTO " + tableName + " SELECT * FROM tpch.sf1.nation", 25); - String goodStatsAfterFirstInsert = """ + String goodStatsAfterFirstInsert = + """ VALUES ('nationkey', null, 25, 0, null, '0', '24'), ('regionkey', null, 5, 0, null, '0', '4'), - ('comment', 4357.0, 25, 0, null, null, null), - ('name', 1188.0, 25, 0, null, null, null), + ('comment', 4325.0, 25, 0, null, null, null), + ('name', 1166.0, 25, 0, null, null, null), (null, null, null, null, 50, null, null)"""; assertUpdate("ANALYZE " + tableName); assertQuery("SHOW STATS FOR " + tableName, goodStatsAfterFirstInsert); // insert modified rows assertUpdate(writeSession, "INSERT INTO " + tableName + " SELECT nationkey + 25, reverse(name), regionkey + 5, reverse(comment) FROM tpch.sf1.nation", 
25); - String goodStatsAfterSecondInsert = """ + String goodStatsAfterSecondInsert = + """ VALUES ('nationkey', null, 50, 0, null, '0', '49'), ('regionkey', null, 10, 0, null, '0', '9'), - ('comment', 6517.0, 50, 0, null, null, null), - ('name', 1800.0, 50, 0, null, null, null), - (null, null, null, null, 75, null, null)"""; + ('comment', 6463.0, 50, 0, null, null, null), + ('name', 1768.0, 50, 0, null, null, null), + (null, null, null, null, 75, null, null) + """; if (collectOnStatsOnWrites) { assertQuery("SHOW STATS FOR " + tableName, goodStatsAfterSecondInsert); @@ -120,12 +124,13 @@ public void testAnalyze(boolean collectOnStatsOnWrites) assertQuery( "SHOW STATS FOR " + tableName, """ - VALUES - ('nationkey', null, 25, 0, null, '0', '49'), - ('regionkey', null, 5, 0, null, '0', '9'), - ('comment', 6517.0, 25, 0, null, null, null), - ('name', 1800.0, 25, 0, null, null, null), - (null, null, null, null, 75, null, null)"""); + VALUES + ('nationkey', null, 25, 0, null, '0', '49'), + ('regionkey', null, 5, 0, null, '0', '9'), + ('comment', 6463.0, 25, 0, null, null, null), + ('name', 1768.0, 25, 0, null, null, null), + (null, null, null, null, 75, null, null) + """); } // with analyze we should get new NDV @@ -148,43 +153,56 @@ public void testAnalyzeWithSchemaEvolution() assertUpdate("ALTER TABLE " + tableName + " DROP COLUMN comment"); // schema changed, ANALYZE hasn't been re-run yet + double nameDataSize = (double) computeActual("SHOW STATS FOR " + tableName).getMaterializedRows().stream() + .filter(row -> "name".equals(row.getField(0))) + .collect(onlyElement()).getField(1); + assertThat(nameDataSize).isBetween(1000.0, 3000.0); assertQuery( "SHOW STATS FOR " + tableName, """ - VALUES - ('nationkey', null, 25, 0, null, '0', '24'), - ('regionkey', null, 5, 0, null, '0', '4'), - ('name', 1908.0, 25, 0, null, null, null), - ('info', null, null, null, null, null, null), - (null, null, null, null, 50, null, null)"""); + VALUES + ('nationkey', null, 25, 0, null, '0', '24'), + ('regionkey', null, 5, 0, null, '0', '4'), + ('name', %s, 25, 0, null, null, null), + ('info', null, null, null, null, null, null), + (null, null, null, null, 50, null, null) + """.formatted(nameDataSize)); assertUpdate("ANALYZE " + tableName); + double infoDataSize = (double) computeActual("SHOW STATS FOR " + tableName).getMaterializedRows().stream() + .filter(row -> "info".equals(row.getField(0))) + .collect(onlyElement()).getField(1); + assertThat(infoDataSize).isBetween(2000.0, 5000.0); assertQuery( "SHOW STATS FOR " + tableName, """ - VALUES - ('nationkey', null, 25, 0, null, '0', '24'), - ('regionkey', null, 5, 0, null, '0', '4'), - ('name', 1908.0, 25, 0, null, null, null), - ('info', 4417.0, 25, 0.1, null, null, null), - (null, null, null, null, 50, null, null)"""); // Row count statistics do not yet account for position deletes + VALUES + ('nationkey', null, 25, 0, null, '0', '24'), + ('regionkey', null, 5, 0, null, '0', '4'), + ('name', %s, 25, 0, null, null, null), + ('info', %s, 25, 0.1, null, null, null), + (null, null, null, null, 50, null, null) + """.formatted(nameDataSize, infoDataSize)); // Row count statistics do not yet account for position deletes assertUpdate("DROP TABLE " + tableName); } - @Test(dataProviderClass = DataProviders.class, dataProvider = "trueFalse") + @ParameterizedTest + @ValueSource(booleans = {true, false}) public void testAnalyzePartitioned(boolean collectOnStatsOnWrites) { Session writeSession = withStatsOnWrite(getSession(), collectOnStatsOnWrites); String tableName = 
"test_analyze_partitioned_" + collectOnStatsOnWrites; assertUpdate(writeSession, "CREATE TABLE " + tableName + " WITH (partitioning = ARRAY['regionkey']) AS SELECT * FROM tpch.sf1.nation", 25); - String goodStatsInitial = """ + String goodStatsInitial = + """ VALUES ('nationkey', null, 25, 0, null, '0', '24'), ('regionkey', null, 5, 0, null, '0', '4'), - ('comment', 3558.0, 25, 0, null, null, null), - ('name', 1231.0, 25, 0, null, null, null), - (null, null, null, null, 25, null, null)"""; + ('comment', 3507.0, 25, 0, null, null, null), + ('name', 1182.0, 25, 0, null, null, null), + (null, null, null, null, 25, null, null) + """; if (collectOnStatsOnWrites) { assertQuery("SHOW STATS FOR " + tableName, goodStatsInitial); @@ -193,12 +211,13 @@ public void testAnalyzePartitioned(boolean collectOnStatsOnWrites) assertQuery( "SHOW STATS FOR " + tableName, """ - VALUES - ('nationkey', null, null, 0, null, '0', '24'), - ('regionkey', null, null, 0, null, '0', '4'), - ('comment', 3558.0, null, 0, null, null, null), - ('name', 1231.0, null, 0, null, null, null), - (null, null, null, null, 25, null, null)"""); + VALUES + ('nationkey', null, null, 0, null, '0', '24'), + ('regionkey', null, null, 0, null, '0', '4'), + ('comment', 3507.0, null, 0, null, null, null), + ('name', 1182.0, null, 0, null, null, null), + (null, null, null, null, 25, null, null) + """); } assertUpdate("ANALYZE " + tableName); @@ -211,22 +230,25 @@ public void testAnalyzePartitioned(boolean collectOnStatsOnWrites) assertQuery( "SHOW STATS FOR " + tableName, """ - VALUES - ('nationkey', null, 25, 0, null, '0', '24'), - ('regionkey', null, 5, 0, null, '0', '4'), - ('comment', 7117.0, 25, 0, null, null, null), - ('name', 2462.0, 25, 0, null, null, null), - (null, null, null, null, 50, null, null)"""); + VALUES + ('nationkey', null, 25, 0, null, '0', '24'), + ('regionkey', null, 5, 0, null, '0', '4'), + ('comment', 7014.0, 25, 0, null, null, null), + ('name', 2365.0, 25, 0, null, null, null), + (null, null, null, null, 50, null, null) + """); // insert modified rows assertUpdate(writeSession, "INSERT INTO " + tableName + " SELECT nationkey + 25, reverse(name), regionkey + 5, reverse(comment) FROM tpch.sf1.nation", 25); - String goodStatsAfterSecondInsert = """ + String goodStatsAfterSecondInsert = + """ VALUES ('nationkey', null, 50, 0, null, '0', '49'), ('regionkey', null, 10, 0, null, '0', '9'), - ('comment', 10659.0, 50, 0, null, null, null), - ('name', 3715.0, 50, 0, null, null, null), - (null, null, null, null, 75, null, null)"""; + ('comment', 10493.999999999998, 50, 0, null, null, null), + ('name', 3564.0000000000005, 50, 0, null, null, null), + (null, null, null, null, 75, null, null) + """; if (collectOnStatsOnWrites) { assertQuery("SHOW STATS FOR " + tableName, goodStatsAfterSecondInsert); @@ -236,12 +258,13 @@ public void testAnalyzePartitioned(boolean collectOnStatsOnWrites) assertQuery( "SHOW STATS FOR " + tableName, """ - VALUES - ('nationkey', null, 25, 0, null, '0', '49'), - ('regionkey', null, 5, 0, null, '0', '9'), - ('comment', 10659.0, 25, 0, null, null, null), - ('name', 3715.0, 25, 0, null, null, null), - (null, null, null, null, 75, null, null)"""); + VALUES + ('nationkey', null, 25, 0, null, '0', '49'), + ('regionkey', null, 5, 0, null, '0', '9'), + ('comment', 10493.999999999998, 25, 0, null, null, null), + ('name', 3564.0000000000005, 25, 0, null, null, null), + (null, null, null, null, 75, null, null) + """); } // with analyze we should get new NDV @@ -262,23 +285,25 @@ public void testAnalyzeEmpty() 
assertQuery( "SHOW STATS FOR " + tableName, """ - VALUES - ('nationkey', 0, 0, 1, null, null, null), - ('regionkey', 0, 0, 1, null, null, null), - ('comment', 0, 0, 1, null, null, null), - ('name', 0, 0, 1, null, null, null), - (null, null, null, null, 0, null, null)"""); + VALUES + ('nationkey', 0, 0, 1, null, null, null), + ('regionkey', 0, 0, 1, null, null, null), + ('comment', 0, 0, 1, null, null, null), + ('name', 0, 0, 1, null, null, null), + (null, null, null, null, 0, null, null) + """); assertUpdate("ANALYZE " + tableName); assertQuery( "SHOW STATS FOR " + tableName, """ - VALUES - ('nationkey', 0, 0, 1, null, null, null), - ('regionkey', 0, 0, 1, null, null, null), - ('comment', 0, 0, 1, null, null, null), - ('name', 0, 0, 1, null, null, null), - (null, null, null, null, 0, null, null)"""); + VALUES + ('nationkey', 0, 0, 1, null, null, null), + ('regionkey', 0, 0, 1, null, null, null), + ('comment', 0, 0, 1, null, null, null), + ('name', 0, 0, 1, null, null, null), + (null, null, null, null, 0, null, null) + """); // add some data and reanalyze @@ -288,17 +313,19 @@ public void testAnalyzeEmpty() assertQuery( "SHOW STATS FOR " + tableName, """ - VALUES - ('nationkey', null, 25, 0, null, '0', '24'), - ('regionkey', null, 5, 0, null, '0', '4'), - ('comment', 2178.0, 25, 0, null, null, null), - ('name', 594.0, 25, 0, null, null, null), - (null, null, null, null, 25, null, null)"""); + VALUES + ('nationkey', null, 25, 0, null, '0', '24'), + ('regionkey', null, 5, 0, null, '0', '4'), + ('comment', 2162.0, 25, 0, null, null, null), + ('name', 583.0, 25, 0, null, null, null), + (null, null, null, null, 25, null, null) + """); assertUpdate("DROP TABLE " + tableName); } - @Test(dataProvider = "testCollectStatisticsOnWriteDataProvider") + @ParameterizedTest + @MethodSource("testCollectStatisticsOnWriteDataProvider") public void testCollectStatisticsOnWrite(boolean collectOnStatsOnCreateTable, boolean partitioned) { String tableName = "test_collect_stats_insert_" + collectOnStatsOnCreateTable + partitioned; @@ -313,47 +340,52 @@ public void testCollectStatisticsOnWrite(boolean collectOnStatsOnCreateTable, bo "SHOW STATS FOR " + tableName, collectOnStatsOnCreateTable ? """ - VALUES - ('nationkey', null, 7, 0, null, '0', '9'), - ('regionkey', null, 3, 0, null, '0', '2'), - ('comment', %s, 7, 0, null, null, null), - ('name', %s, 7, 0, null, null, null), - (null, null, null, null, 7, null, null)""" - .formatted(partitioned ? "1328.0" : "954.9999999999999", partitioned ? "501.99999999999994" : "280.0") + VALUES + ('nationkey', null, 7, 0, null, '0', '9'), + ('regionkey', null, 3, 0, null, '0', '2'), + ('comment', %s, 7, 0, null, null, null), + ('name', %s, 7, 0, null, null, null), + (null, null, null, null, 7, null, null) + """ + .formatted(partitioned ? "1301.0" : "936.0", partitioned ? "469.0" : "270.0") : """ - VALUES - ('nationkey', null, null, 0, null, '0', '9'), - ('regionkey', null, null, 0, null, '0', '2'), - ('comment', %s, null, 0, null, null, null), - ('name', %s, null, 0, null, null, null), - (null, null, null, null, 7, null, null)""" - .formatted(partitioned ? "1328.0" : "954.9999999999999", partitioned ? "501.99999999999994" : "280.0")); + VALUES + ('nationkey', null, null, 0, null, '0', '9'), + ('regionkey', null, null, 0, null, '0', '2'), + ('comment', %s, null, 0, null, null, null), + ('name', %s, null, 0, null, null, null), + (null, null, null, null, 7, null, null) + """ + .formatted(partitioned ? "1301.0" : "936.0", partitioned ? 
"469.0" : "270.0")); assertUpdate(withStatsOnWrite(getSession(), true), "INSERT INTO " + tableName + " SELECT * FROM tpch.sf1.nation WHERE nationkey >= 12 OR regionkey >= 3", 18); assertQuery( "SHOW STATS FOR " + tableName, collectOnStatsOnCreateTable ? """ - VALUES - ('nationkey', null, 25, 0, null, '0', '24'), - ('regionkey', null, 5, 0, null, '0', '4'), - ('comment', %s, 25, 0, null, null, null), - ('name', %s, 25, 0, null, null, null), - (null, null, null, null, 25, null, null)""" - .formatted(partitioned ? "4141.0" : "2659.0", partitioned ? "1533.0" : "745.0") + VALUES + ('nationkey', null, 25, 0, null, '0', '24'), + ('regionkey', null, 5, 0, null, '0', '4'), + ('comment', %s, 25, 0, null, null, null), + ('name', %s, 25, 0, null, null, null), + (null, null, null, null, 25, null, null) + """ + .formatted(partitioned ? "4058.0" : "2627.0", partitioned ? "1447.0" : "726.0") : """ - VALUES - ('nationkey', null, null, 0, null, '0', '24'), - ('regionkey', null, null, 0, null, '0', '4'), - ('comment', %s, null, 0, null, null, null), - ('name', %s, null, 0, null, null, null), - (null, null, null, null, 25, null, null)""" - .formatted(partitioned ? "4141.0" : "2659.0", partitioned ? "1533.0" : "745.0")); + VALUES + ('nationkey', null, null, 0, null, '0', '24'), + ('regionkey', null, null, 0, null, '0', '4'), + ('comment', %s, null, 0, null, null, null), + ('name', %s, null, 0, null, null, null), + (null, null, null, null, 25, null, null) + """ + .formatted(partitioned ? "4058.0" : "2627.0", partitioned ? "1447.0" : "726.0")); assertUpdate("DROP TABLE " + tableName); } - @Test(dataProvider = "testCollectStatisticsOnWriteDataProvider") + @ParameterizedTest + @MethodSource("testCollectStatisticsOnWriteDataProvider") public void testCollectStatisticsOnWriteToEmptyTable(boolean collectOnStatsOnCreateTable, boolean partitioned) { String tableName = "test_collect_stats_insert_into_empty_" + collectOnStatsOnCreateTable + partitioned; @@ -367,35 +399,37 @@ public void testCollectStatisticsOnWriteToEmptyTable(boolean collectOnStatsOnCre assertQuery( "SHOW STATS FOR " + tableName, """ - VALUES - ('nationkey', 0, 0, 1, null, null, null), - ('regionkey', 0, 0, 1, null, null, null), - ('comment', 0, 0, 1, null, null, null), - ('name', 0, 0, 1, null, null, null), - (null, null, null, null, 0, null, null)"""); + VALUES + ('nationkey', 0, 0, 1, null, null, null), + ('regionkey', 0, 0, 1, null, null, null), + ('comment', 0, 0, 1, null, null, null), + ('name', 0, 0, 1, null, null, null), + (null, null, null, null, 0, null, null) + """); assertUpdate(withStatsOnWrite(getSession(), true), "INSERT INTO " + tableName + " TABLE tpch.sf1.nation", 25); assertQuery( "SHOW STATS FOR " + tableName, """ - VALUES - ('nationkey', null, 25, 0, null, '0', '24'), - ('regionkey', null, 5, 0, null, '0', '4'), - ('comment', %f, 25, 0, null, null, null), - ('name', %f, 25, 0, null, null, null), - (null, null, null, null, 25, null, null)""" - .formatted(partitioned ? 3558.0 : 2178.0, partitioned ? 1231.0 : 594.0)); + VALUES + ('nationkey', null, 25, 0, null, '0', '24'), + ('regionkey', null, 5, 0, null, '0', '4'), + ('comment', %f, 25, 0, null, null, null), + ('name', %f, 25, 0, null, null, null), + (null, null, null, null, 25, null, null) + """ + .formatted(partitioned ? 3507.0 : 2162.0, partitioned ? 
1182.0 : 583)); assertUpdate("DROP TABLE " + tableName); } - @DataProvider public Object[][] testCollectStatisticsOnWriteDataProvider() { return cartesianProduct(trueFalse(), trueFalse()); } - @Test(dataProviderClass = DataProviders.class, dataProvider = "trueFalse") + @ParameterizedTest + @ValueSource(booleans = {true, false}) public void testAnalyzeAfterStatsDrift(boolean withOptimize) { String tableName = "test_analyze_stats_drift_" + withOptimize; @@ -405,10 +439,10 @@ public void testAnalyzeAfterStatsDrift(boolean withOptimize) assertQuery( "SHOW STATS FOR " + tableName, """ - VALUES - ('nationkey', null, 25, 0, null, '0', '24'), - ('regionkey', null, 5, 0, null, '0', '4'), - (null, null, null, null, 25, null, null)"""); + VALUES + ('nationkey', null, 25, 0, null, '0', '24'), + ('regionkey', null, 5, 0, null, '0', '4'), + (null, null, null, null, 25, null, null)"""); // remove two regions in multiple queries List idsToRemove = computeActual("SELECT nationkey FROM tpch.sf1.nation WHERE regionkey IN (2, 4)").getOnlyColumn() @@ -423,20 +457,22 @@ public void testAnalyzeAfterStatsDrift(boolean withOptimize) assertQuery( "SHOW STATS FOR " + tableName, """ - VALUES - ('nationkey', null, 25, 0, null, '0', '24'), - ('regionkey', null, 5, 0, null, '0', '4'), - (null, null, null, null, 25, null, null)"""); + VALUES + ('nationkey', null, 25, 0, null, '0', '24'), + ('regionkey', null, 5, 0, null, '0', '4'), + (null, null, null, null, 25, null, null) + """); if (withOptimize) { assertUpdate("ALTER TABLE " + tableName + " EXECUTE optimize"); assertQuery( "SHOW STATS FOR " + tableName, """ - VALUES - ('nationkey', null, 15, 0, null, '0', '24'), - ('regionkey', null, 4, 0, null, '0', '3'), - (null, null, null, null, 15, null, null)"""); + VALUES + ('nationkey', null, 15, 0, null, '0', '24'), + ('regionkey', null, 4, 0, null, '0', '3'), + (null, null, null, null, 15, null, null) + """); } // ANALYZE can be used to update stats and prevent them from drifting over time @@ -445,17 +481,19 @@ public void testAnalyzeAfterStatsDrift(boolean withOptimize) "SHOW STATS FOR " + tableName, withOptimize ? """ - VALUES - ('nationkey', null, 15, 0, null, '0', '24'), - ('regionkey', null, 4, 0, null, '0', '3'), -- not updated yet - (null, null, null, null, 15, null, null)""" + VALUES + ('nationkey', null, 15, 0, null, '0', '24'), + ('regionkey', null, 4, 0, null, '0', '3'), -- not updated yet + (null, null, null, null, 15, null, null) + """ : // TODO row count and min/max values are incorrect as they are taken from manifest file list """ - VALUES - ('nationkey', null, 15, 0, null, '0', '24'), - ('regionkey', null, 5, 0, null, '0', '4'), -- not updated yet - (null, null, null, null, 25, null, null)"""); + VALUES + ('nationkey', null, 15, 0, null, '0', '24'), + ('regionkey', null, 5, 0, null, '0', '4'), -- not updated yet + (null, null, null, null, 25, null, null) + """); // ANALYZE all columns assertUpdate("ANALYZE " + tableName); @@ -463,17 +501,19 @@ public void testAnalyzeAfterStatsDrift(boolean withOptimize) "SHOW STATS FOR " + tableName, withOptimize ? 
""" - VALUES - ('nationkey', null, 15, 0, null, '0', '24'), - ('regionkey', null, 3, 0, null, '0', '3'), - (null, null, null, null, 15, null, null)""" + VALUES + ('nationkey', null, 15, 0, null, '0', '24'), + ('regionkey', null, 3, 0, null, '0', '3'), + (null, null, null, null, 15, null, null) + """ : // TODO row count and min/max values are incorrect as they are taken from manifest file list """ - VALUES - ('nationkey', null, 15, 0, null, '0', '24'), - ('regionkey', null, 3, 0, null, '0', '4'), - (null, null, null, null, 25, null, null)"""); + VALUES + ('nationkey', null, 15, 0, null, '0', '24'), + ('regionkey', null, 3, 0, null, '0', '4'), + (null, null, null, null, 25, null, null) + """); assertUpdate("DROP TABLE " + tableName); } @@ -486,7 +526,7 @@ public void testAnalyzeSomeColumns() assertUpdate(noStatsOnWrite, "CREATE TABLE " + tableName + " AS SELECT * FROM tpch.sf1.nation", 25); // analyze NULL list of columns - assertQueryFails("ANALYZE " + tableName + " WITH (columns = NULL)", "\\QInvalid null value for catalog 'iceberg' analyze property 'columns' from [null]"); + assertQueryFails("ANALYZE " + tableName + " WITH (columns = NULL)", "\\Qline 1:41: Invalid null value for catalog 'iceberg' analyze property 'columns' from [null]"); // analyze empty list of columns assertQueryFails("ANALYZE " + tableName + " WITH (columns = ARRAY[])", "\\QCannot specify empty list of columns for analysis"); @@ -500,19 +540,20 @@ public void testAnalyzeSomeColumns() // specify NULL column assertQueryFails( "ANALYZE " + tableName + " WITH (columns = ARRAY['nationkey', NULL])", - "\\QUnable to set catalog 'iceberg' analyze property 'columns' to [ARRAY['nationkey',null]]: Invalid null value in analyze columns property"); + "\\Qline 1:41: Unable to set catalog 'iceberg' analyze property 'columns' to [ARRAY['nationkey',null]]: Invalid null value in analyze columns property"); // analyze nationkey and regionkey assertUpdate("ANALYZE " + tableName + " WITH (columns = ARRAY['nationkey', 'regionkey'])"); assertQuery( "SHOW STATS FOR " + tableName, """ - VALUES - ('nationkey', null, 25, 0, null, '0', '24'), - ('regionkey', null, 5, 0, null, '0', '4'), - ('comment', 2178.0, null, 0, null, null, null), - ('name', 594.0, null, 0, null, null, null), - (null, null, null, null, 25, null, null)"""); + VALUES + ('nationkey', null, 25, 0, null, '0', '24'), + ('regionkey', null, 5, 0, null, '0', '4'), + ('comment', 2162.0, null, 0, null, null, null), + ('name', 583.0, null, 0, null, null, null), + (null, null, null, null, 25, null, null) + """); // insert modified rows assertUpdate(noStatsOnWrite, "INSERT INTO " + tableName + " SELECT nationkey + 25, concat(name, '1'), regionkey + 5, concat(comment, '21') FROM tpch.sf1.nation", 25); @@ -522,12 +563,13 @@ public void testAnalyzeSomeColumns() assertQuery( "SHOW STATS FOR " + tableName, """ - VALUES - ('nationkey', null, 50, 0, null, '0', '49'), - ('regionkey', null, 10, 0, null, '0', '9'), - ('comment', 4471.0, null, 0, null, null, null), - ('name', 1215.0, null, 0, null, null, null), - (null, null, null, null, 50, null, null)"""); + VALUES + ('nationkey', null, 50, 0, null, '0', '49'), + ('regionkey', null, 10, 0, null, '0', '9'), + ('comment', 4441.0, null, 0, null, null, null), + ('name', 1193.0, null, 0, null, null, null), + (null, null, null, null, 50, null, null) + """); // drop stats assertUpdate("ALTER TABLE " + tableName + " EXECUTE DROP_EXTENDED_STATS"); @@ -537,12 +579,13 @@ public void testAnalyzeSomeColumns() assertQuery( "SHOW STATS FOR " + tableName, """ - 
VALUES - ('nationkey', null, 50, 0, null, '0', '49'), - ('regionkey', null, 10, 0, null, '0', '9'), - ('comment', 4471.0, 50, 0, null, null, null), - ('name', 1215.0, 50, 0, null, null, null), - (null, null, null, null, 50, null, null)"""); + VALUES + ('nationkey', null, 50, 0, null, '0', '49'), + ('regionkey', null, 10, 0, null, '0', '9'), + ('comment', 4441.0, 50, 0, null, null, null), + ('name', 1193.0, 50, 0, null, null, null), + (null, null, null, null, 50, null, null) + """); // insert modified rows assertUpdate(noStatsOnWrite, "INSERT INTO " + tableName + " SELECT nationkey + 50, concat(name, '2'), regionkey + 10, concat(comment, '22') FROM tpch.sf1.nation", 25); @@ -551,36 +594,39 @@ public void testAnalyzeSomeColumns() assertQuery( "SHOW STATS FOR " + tableName, """ - VALUES - ('nationkey', null, 50, 0, null, '0', '74'), - ('regionkey', null, 10, 0, null, '0', '14'), - ('comment', 6746.999999999999, 50, 0, null, null, null), - ('name', 1836.0, 50, 0, null, null, null), - (null, null, null, null, 75, null, null)"""); + VALUES + ('nationkey', null, 50, 0, null, '0', '74'), + ('regionkey', null, 10, 0, null, '0', '14'), + ('comment', 6701.0, 50, 0, null, null, null), + ('name', 1803.0, 50, 0, null, null, null), + (null, null, null, null, 75, null, null) + """); // reanalyze with a subset of columns assertUpdate("ANALYZE " + tableName + " WITH (columns = ARRAY['nationkey', 'regionkey'])"); assertQuery( "SHOW STATS FOR " + tableName, """ - VALUES - ('nationkey', null, 75, 0, null, '0', '74'), - ('regionkey', null, 15, 0, null, '0', '14'), - ('comment', 6746.999999999999, 50, 0, null, null, null), -- result of previous analyze - ('name', 1836.0, 50, 0, null, null, null), -- result of previous analyze - (null, null, null, null, 75, null, null)"""); + VALUES + ('nationkey', null, 75, 0, null, '0', '74'), + ('regionkey', null, 15, 0, null, '0', '14'), + ('comment', 6701.0, 50, 0, null, null, null), -- result of previous analyze + ('name', 1803.0, 50, 0, null, null, null), -- result of previous analyze + (null, null, null, null, 75, null, null) + """); // analyze all columns assertUpdate("ANALYZE " + tableName); assertQuery( "SHOW STATS FOR " + tableName, """ - VALUES - ('nationkey', null, 75, 0, null, '0', '74'), - ('regionkey', null, 15, 0, null, '0', '14'), - ('comment', 6746.999999999999, 75, 0, null, null, null), - ('name', 1836.0, 75, 0, null, null, null), - (null, null, null, null, 75, null, null)"""); + VALUES + ('nationkey', null, 75, 0, null, '0', '74'), + ('regionkey', null, 15, 0, null, '0', '14'), + ('comment', 6701.0, 75, 0, null, null, null), + ('name', 1803.0, 75, 0, null, null, null), + (null, null, null, null, 75, null, null) + """); assertUpdate("DROP TABLE " + tableName); } @@ -593,8 +639,8 @@ public void testAnalyzeSnapshot() assertUpdate("CREATE TABLE " + tableName + " (a) AS VALUES 11", 1); long snapshotId = getCurrentSnapshotId(tableName); assertUpdate("INSERT INTO " + tableName + " VALUES 22", 1); - assertThatThrownBy(() -> query("ANALYZE \"%s@%d\"".formatted(tableName, snapshotId))) - .hasMessage(format("Invalid Iceberg table name: %s@%d", tableName, snapshotId)); + assertThat(query("ANALYZE \"%s@%d\"".formatted(tableName, snapshotId))) + .failure().hasMessage(format("line 1:1: Table 'iceberg.tpch.\"%s@%s\"' does not exist", tableName, snapshotId)); assertThat(query("SELECT * FROM " + tableName)) .matches("VALUES 11, 22"); @@ -604,12 +650,12 @@ public void testAnalyzeSnapshot() @Test public void testAnalyzeSystemTable() { - assertThatThrownBy(() -> query("ANALYZE 
\"nation$files\"")) + assertThat(query("ANALYZE \"nation$files\"")) // The error message isn't clear to the user, but it doesn't matter - .hasMessage("Cannot record write for catalog not part of transaction"); - assertThatThrownBy(() -> query("ANALYZE \"nation$snapshots\"")) + .nonTrinoExceptionFailure().hasMessage("Cannot record write for catalog not part of transaction"); + assertThat(query("ANALYZE \"nation$snapshots\"")) // The error message isn't clear to the user, but it doesn't matter - .hasMessage("Cannot record write for catalog not part of transaction"); + .nonTrinoExceptionFailure().hasMessage("Cannot record write for catalog not part of transaction"); } @Test @@ -618,20 +664,24 @@ public void testDropExtendedStats() String tableName = "test_drop_extended_stats"; assertUpdate("CREATE TABLE " + tableName + " AS SELECT * FROM tpch.sf1.nation", 25); - String baseStats = """ + String baseStats = + """ VALUES ('nationkey', null, null, 0, null, '0', '24'), ('regionkey', null, null, 0, null, '0', '4'), - ('comment', 2178.0, null, 0, null, null, null), - ('name', 594.0, null, 0, null, null, null), - (null, null, null, null, 25, null, null)"""; - String extendedStats = """ + ('comment', 2162.0, null, 0, null, null, null), + ('name', 583.0, null, 0, null, null, null), + (null, null, null, null, 25, null, null) + """; + String extendedStats = + """ VALUES ('nationkey', null, 25, 0, null, '0', '24'), ('regionkey', null, 5, 0, null, '0', '4'), - ('comment', 2178.0, 25, 0, null, null, null), - ('name', 594.0, 25, 0, null, null, null), - (null, null, null, null, 25, null, null)"""; + ('comment', 2162.0, 25, 0, null, null, null), + ('name', 583.0, 25, 0, null, null, null), + (null, null, null, null, 25, null, null) + """; assertQuery("SHOW STATS FOR " + tableName, extendedStats); @@ -657,12 +707,13 @@ public void testDropMissingStats() assertQuery( "SHOW STATS FOR " + tableName, """ - VALUES - ('nationkey', null, null, 0, null, '0', '24'), - ('regionkey', null, null, 0, null, '0', '4'), - ('comment', 2178.0, null, 0, null, null, null), - ('name', 594.0, null, 0, null, null, null), - (null, null, null, null, 25, null, null)"""); + VALUES + ('nationkey', null, null, 0, null, '0', '24'), + ('regionkey', null, null, 0, null, '0', '4'), + ('comment', 2162.0, null, 0, null, null, null), + ('name', 583.0, null, 0, null, null, null), + (null, null, null, null, 25, null, null) + """); assertUpdate("DROP TABLE " + tableName); } @@ -691,8 +742,8 @@ public void testDropStatsSnapshot() assertUpdate("CREATE TABLE " + tableName + " (a) AS VALUES 11", 1); long snapshotId = getCurrentSnapshotId(tableName); assertUpdate("INSERT INTO " + tableName + " VALUES 22", 1); - assertThatThrownBy(() -> query("ALTER TABLE \"%s@%d\" EXECUTE DROP_EXTENDED_STATS".formatted(tableName, snapshotId))) - .hasMessage(format("Invalid Iceberg table name: %s@%d", tableName, snapshotId)); + assertThat(query("ALTER TABLE \"%s@%d\" EXECUTE DROP_EXTENDED_STATS".formatted(tableName, snapshotId))) + .failure().hasMessage(format("line 1:7: Table 'iceberg.tpch.\"%s@%s\"' does not exist", tableName, snapshotId)); assertThat(query("SELECT * FROM " + tableName)) .matches("VALUES 11, 22"); @@ -702,10 +753,10 @@ public void testDropStatsSnapshot() @Test public void testDropStatsSystemTable() { - assertThatThrownBy(() -> query("ALTER TABLE \"nation$files\" EXECUTE DROP_EXTENDED_STATS")) - .hasMessage("This connector does not support table procedures"); - assertThatThrownBy(() -> query("ALTER TABLE \"nation$snapshots\" EXECUTE DROP_EXTENDED_STATS")) 
- .hasMessage("This connector does not support table procedures"); + assertThat(query("ALTER TABLE \"nation$files\" EXECUTE DROP_EXTENDED_STATS")) + .failure().hasMessage("This connector does not support table procedures"); + assertThat(query("ALTER TABLE \"nation$snapshots\" EXECUTE DROP_EXTENDED_STATS")) + .failure().hasMessage("This connector does not support table procedures"); } @Test @@ -718,32 +769,35 @@ public void testAnalyzeAndRollbackToSnapshot() assertUpdate("ANALYZE " + tableName); long analyzeSnapshot = getCurrentSnapshotId(tableName); // ANALYZE currently does not create a new snapshot - assertEquals(analyzeSnapshot, createSnapshot); + assertThat(analyzeSnapshot).isEqualTo(createSnapshot); assertUpdate("INSERT INTO " + tableName + " SELECT * FROM tpch.sf1.nation WHERE nationkey = 1", 1); - assertNotEquals(getCurrentSnapshotId(tableName), createSnapshot); + assertThat(getCurrentSnapshotId(tableName)) + .isNotEqualTo(createSnapshot); // NDV information present after INSERT assertQuery( "SHOW STATS FOR " + tableName, """ - VALUES - ('nationkey', null, 25, 0, null, '0', '24'), - ('regionkey', null, 5, 0, null, '0', '4'), - ('comment', 2475.0, 25, 0, null, null, null), - ('name', 726.0, 25, 0, null, null, null), - (null, null, null, null, 26, null, null)"""); + VALUES + ('nationkey', null, 25, 0, null, '0', '24'), + ('regionkey', null, 5, 0, null, '0', '4'), + ('comment', 2448.0, 25, 0, null, null, null), + ('name', 704.0, 25, 0, null, null, null), + (null, null, null, null, 26, null, null) + """); - assertUpdate(format("CALL system.rollback_to_snapshot('%s', '%s', %s)", schema, tableName, createSnapshot)); + assertUpdate(format("ALTER TABLE %s.%s EXECUTE rollback_to_snapshot(%s)", schema, tableName, createSnapshot)); // NDV information still present after rollback_to_snapshot assertQuery( "SHOW STATS FOR " + tableName, """ - VALUES - ('nationkey', null, 25, 0, null, '0', '24'), - ('regionkey', null, 5, 0, null, '0', '4'), - ('comment', 2178.0, 25, 0, null, null, null), - ('name', 594.0, 25, 0, null, null, null), - (null, null, null, null, 25, null, null)"""); + VALUES + ('nationkey', null, 25, 0, null, '0', '24'), + ('regionkey', null, 5, 0, null, '0', '4'), + ('comment', 2162.0, 25, 0, null, null, null), + ('name', 583.0, 25, 0, null, null, null), + (null, null, null, null, 25, null, null) + """); assertUpdate("DROP TABLE " + tableName); } @@ -764,12 +818,13 @@ public void testAnalyzeAndDeleteOrphanFiles() assertQuery( "SHOW STATS FOR " + tableName, """ - VALUES - ('nationkey', null, 25, 0, null, '0', '24'), - ('regionkey', null, 5, 0, null, '0', '4'), - ('comment', 2178.0, 25, 0, null, null, null), - ('name', 594.0, 25, 0, null, null, null), - (null, null, null, null, 25, null, null)"""); + VALUES + ('nationkey', null, 25, 0, null, '0', '24'), + ('regionkey', null, 5, 0, null, '0', '4'), + ('comment', 2162.0, 25, 0, null, null, null), + ('name', 583.0, 25, 0, null, null, null), + (null, null, null, null, 25, null, null) + """); assertUpdate("DROP TABLE " + tableName); } @@ -784,10 +839,11 @@ public void testEmptyNoScalarColumns() assertQuery( "SHOW STATS FOR " + tableName, """ - VALUES - ('a', 0, 0, 1, null, null, null), - ('b', 0, 0, 1, null, null, null), - (null, null, null, null, 0, null, null)"""); + VALUES + ('a', 0, 0, 1, null, null, null), + ('b', 0, 0, 1, null, null, null), + (null, null, null, null, 0, null, null) + """); // On empty table assertQueryFails("ANALYZE " + tableName + " WITH (columns = ARRAY[])", "Cannot specify empty list of columns for analysis"); @@ 
-798,10 +854,11 @@ public void testEmptyNoScalarColumns() assertQuery( "SHOW STATS FOR " + tableName, """ - VALUES - ('a', 0, 0, 1, null, null, null), - ('b', 0, 0, 1, null, null, null), - (null, null, null, null, 0, null, null)"""); + VALUES + ('a', 0, 0, 1, null, null, null), + ('b', 0, 0, 1, null, null, null), + (null, null, null, null, 0, null, null) + """); // write with stats collection assertUpdate( @@ -811,10 +868,11 @@ public void testEmptyNoScalarColumns() assertQuery( "SHOW STATS FOR " + tableName, """ - VALUES - ('a', null, null, null, null, null, null), - ('b', null, null, null, null, null, null), - (null, null, null, null, 2, null, null)"""); + VALUES + ('a', null, null, null, null, null, null), + ('b', null, null, null, null, null, null), + (null, null, null, null, 2, null, null) + """); assertUpdate("DROP TABLE " + tableName); } @@ -833,10 +891,11 @@ public void testNoScalarColumns() assertQuery( "SHOW STATS FOR " + tableName, """ - VALUES - ('a', null, null, null, null, null, null), - ('b', null, null, null, null, null, null), - (null, null, null, null, 2, null, null)"""); + VALUES + ('a', null, null, null, null, null, null), + ('b', null, null, null, null, null, null), + (null, null, null, null, 2, null, null) + """); // On non-empty table assertQueryFails("ANALYZE " + tableName + " WITH (columns = ARRAY[])", "Cannot specify empty list of columns for analysis"); @@ -847,10 +906,11 @@ public void testNoScalarColumns() assertQuery( "SHOW STATS FOR " + tableName, """ - VALUES - ('a', null, null, null, null, null, null), - ('b', null, null, null, null, null, null), - (null, null, null, null, 2, null, null)"""); + VALUES + ('a', null, null, null, null, null, null), + ('b', null, null, null, null, null, null), + (null, null, null, null, 2, null, null) + """); // write with stats collection assertUpdate( @@ -860,10 +920,11 @@ public void testNoScalarColumns() assertQuery( "SHOW STATS FOR " + tableName, """ - VALUES - ('a', null, null, null, null, null, null), - ('b', null, null, null, null, null, null), - (null, null, null, null, 4, null, null)"""); + VALUES + ('a', null, null, null, null, null, null), + ('b', null, null, null, null, null, null), + (null, null, null, null, 4, null, null) + """); assertUpdate("DROP TABLE " + tableName); } @@ -887,23 +948,26 @@ public void testShowStatsAsOf() assertQuery( "SHOW STATS FOR (SELECT * FROM show_stats_as_of FOR VERSION AS OF " + beforeAnalyzedSnapshot + ")", """ - VALUES - ('key', null, null, 0, null, '3', '3'), -- NDV not present, as ANALYZE was run on a later snapshot - (null, null, null, null, 1, null, null)"""); + VALUES + ('key', null, null, 0, null, '3', '3'), -- NDV not present, as ANALYZE was run on a later snapshot + (null, null, null, null, 1, null, null) + """); assertQuery( "SHOW STATS FOR (SELECT * FROM show_stats_as_of FOR VERSION AS OF " + analyzedSnapshot + ")", """ - VALUES - ('key', null, 2, 0, null, '3', '4'), -- NDV present, this is the snapshot ANALYZE was run for - (null, null, null, null, 2, null, null)"""); + VALUES + ('key', null, 2, 0, null, '3', '4'), -- NDV present, this is the snapshot ANALYZE was run for + (null, null, null, null, 2, null, null) + """); assertQuery( "SHOW STATS FOR (SELECT * FROM show_stats_as_of FOR VERSION AS OF " + laterSnapshot + ")", """ - VALUES - ('key', null, 2, 0, null, '3', '5'), -- NDV present, stats "inherited" from previous snapshot - (null, null, null, null, 3, null, null)"""); + VALUES + ('key', null, 2, 0, null, '3', '5'), -- NDV present, stats "inherited" from previous 
snapshot + (null, null, null, null, 3, null, null) + """); assertUpdate("DROP TABLE show_stats_as_of"); } @@ -940,31 +1004,35 @@ public void testShowStatsAfterExpiration() assertQuery( "SHOW STATS FOR (SELECT * FROM show_stats_after_expiration FOR VERSION AS OF " + beforeAnalyzedSnapshot + ")", """ - VALUES - ('key', null, null, 0, null, '1', '3'), -- NDV not present, as ANALYZE was run on a later snapshot - (null, null, null, null, 3, null, null)"""); + VALUES + ('key', null, null, 0, null, '1', '3'), -- NDV not present, as ANALYZE was run on a later snapshot + (null, null, null, null, 3, null, null) + """); assertQuery( "SHOW STATS FOR (SELECT * FROM show_stats_after_expiration FOR VERSION AS OF " + analyzedSnapshot + ")", """ - VALUES - ('key', null, 4, 0, null, '1', '4'), -- NDV present, this is the snapshot ANALYZE was run for - (null, null, null, null, 4, null, null)"""); + VALUES + ('key', null, 4, 0, null, '1', '4'), -- NDV present, this is the snapshot ANALYZE was run for + (null, null, null, null, 4, null, null) + """); assertQuery( "SHOW STATS FOR (SELECT * FROM show_stats_after_expiration FOR VERSION AS OF " + laterSnapshot + ")", """ - VALUES - ('key', null, 4, 0, null, '1', '5'), -- NDV present, stats "inherited" from previous snapshot - (null, null, null, null, 5, null, null)"""); + VALUES + ('key', null, 4, 0, null, '1', '5'), -- NDV present, stats "inherited" from previous snapshot + (null, null, null, null, 5, null, null) + """); // Same as laterSnapshot but implicitly assertQuery( "SHOW STATS FOR show_stats_after_expiration", """ - VALUES - ('key', null, 4, 0, null, '1', '5'), -- NDV present, stats "inherited" from previous snapshot - (null, null, null, null, 5, null, null)"""); + VALUES + ('key', null, 4, 0, null, '1', '5'), -- NDV present, stats "inherited" from previous snapshot + (null, null, null, null, 5, null, null) + """); // Re-analyzing after snapshot expired assertUpdate("ANALYZE show_stats_after_expiration"); @@ -972,13 +1040,80 @@ public void testShowStatsAfterExpiration() assertQuery( "SHOW STATS FOR show_stats_after_expiration", """ - VALUES - ('key', null, 5, 0, null, '1', '5'), -- NDV present, stats "inherited" from previous snapshot - (null, null, null, null, 5, null, null)"""); + VALUES + ('key', null, 5, 0, null, '1', '5'), -- NDV present, stats "inherited" from previous snapshot + (null, null, null, null, 5, null, null) + """); assertUpdate("DROP TABLE show_stats_after_expiration"); } + @Test + public void testShowStatsAfterOptimize() + { + String tableName = "show_stats_after_optimize_" + randomNameSuffix(); + + String catalog = getSession().getCatalog().orElseThrow(); + Session writeSession = withStatsOnWrite(getSession(), false); + Session minimalSnapshotRetentionSession = Session.builder(getSession()) + .setCatalogSessionProperty(catalog, EXPIRE_SNAPSHOTS_MIN_RETENTION, "0s") + .build(); + + String expireSnapshotQuery = "ALTER TABLE " + tableName + " EXECUTE expire_snapshots(retention_threshold => '0d')"; + + assertUpdate(writeSession, "CREATE TABLE " + tableName + "(key integer)"); + // create several snapshots + assertUpdate(writeSession, "INSERT INTO " + tableName + " VALUES 1", 1); + assertUpdate(writeSession, "INSERT INTO " + tableName + " VALUES 2", 1); + assertUpdate(writeSession, "INSERT INTO " + tableName + " VALUES 3", 1); + + assertUpdate("ANALYZE " + tableName); + assertUpdate(writeSession, "INSERT INTO " + tableName + " VALUES 4", 1); + + assertQuery( + "SHOW STATS FOR " + tableName, + """ + VALUES + ('key', null, 3, 0, null, '1', 
'4'), -- NDV present, stats "inherited" from previous snapshot + (null, null, null, null, 4, null, null) + """); + + assertUpdate(minimalSnapshotRetentionSession, expireSnapshotQuery); + + // NDV is not present after expire_snapshot as last snapshot did not contained stats + assertQuery( + "SHOW STATS FOR " + tableName, + """ + VALUES + ('key', null, null, 0, null, '1', '4'), -- NDV not present as expire_snapshot removed stats for previous snapshots + (null, null, null, null, 4, null, null) + """); + + assertUpdate("ANALYZE " + tableName); + + assertQuery( + "SHOW STATS FOR " + tableName, + """ + VALUES + ('key', null, 4, 0, null, '1', '4'), -- NDV present + (null, null, null, null, 4, null, null) + """); + + // Optimize should rewrite stats file + assertUpdate("ALTER TABLE " + tableName + " EXECUTE optimize"); + assertUpdate(minimalSnapshotRetentionSession, expireSnapshotQuery); + + assertQuery( + "SHOW STATS FOR " + tableName, + """ + VALUES + ('key', null, 4, 0, null, '1', '4'), -- NDV present + (null, null, null, null, 4, null, null) + """); + + assertUpdate("DROP TABLE " + tableName); + } + @Test public void testStatsAfterDeletingAllRows() { @@ -986,6 +1121,7 @@ public void testStatsAfterDeletingAllRows() assertUpdate("CREATE TABLE " + tableName + " AS SELECT * FROM tpch.sf1.nation", 25); assertThat(query("SHOW STATS FOR " + tableName)) + .result() .projected("column_name", "distinct_values_count", "row_count") .skippingTypesCheck() .containsAll("VALUES " + @@ -996,6 +1132,7 @@ public void testStatsAfterDeletingAllRows() "(null, null, DOUBLE '25')"); assertUpdate("DELETE FROM " + tableName + " WHERE nationkey < 50", 25); assertThat(query("SHOW STATS FOR " + tableName)) + .result() .projected("column_name", "distinct_values_count", "row_count") .skippingTypesCheck() .containsAll("VALUES " + @@ -1006,6 +1143,21 @@ public void testStatsAfterDeletingAllRows() "(null, null, DOUBLE '25')"); } + @Test + public void testNaN() + { + String tableName = "test_nan"; + assertUpdate("CREATE TABLE " + tableName + " AS SELECT 1 AS c1, double 'NaN' AS c2", 1); + assertQuery( + "SHOW STATS FOR " + tableName, + """ + VALUES + ('c1', null, 1.0, 0.0, null, 1, 1), + ('c2', null, 1.0, 0.0, null, null, null), + (null, null, null, null, 1.0, null, null) + """); + } + private long getCurrentSnapshotId(String tableName) { return (long) computeActual(format("SELECT snapshot_id FROM \"%s$snapshots\" ORDER BY committed_at DESC FETCH FIRST 1 ROW WITH TIES", tableName)) diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergTableName.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergTableName.java index 54293b89ea14..2be516346846 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergTableName.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergTableName.java @@ -13,16 +13,10 @@ */ package io.trino.plugin.iceberg; -import org.testng.annotations.Test; +import org.junit.jupiter.api.Test; -import java.util.Optional; - -import static io.trino.spi.StandardErrorCode.NOT_SUPPORTED; -import static io.trino.testing.assertions.TrinoExceptionAssert.assertTrinoExceptionThrownBy; import static org.assertj.core.api.Assertions.assertThat; -import static org.testng.Assert.assertEquals; -import static org.testng.Assert.assertFalse; -import static org.testng.Assert.assertTrue; +import static org.assertj.core.api.Assertions.assertThatThrownBy; public class TestIcebergTableName { @@ -33,70 +27,86 @@ public void testParse() 
assertParseNameAndType("abc$history", "abc", TableType.HISTORY); assertParseNameAndType("abc$snapshots", "abc", TableType.SNAPSHOTS); - assertNoValidTableType("abc$data"); - assertInvalid("abc@123", "Invalid Iceberg table name: abc@123"); - assertInvalid("abc@xyz", "Invalid Iceberg table name: abc@xyz"); - assertNoValidTableType("abc$what"); - assertInvalid("abc@123$data@456", "Invalid Iceberg table name: abc@123$data@456"); - assertInvalid("abc@123$snapshots", "Invalid Iceberg table name: abc@123$snapshots"); - assertInvalid("abc$snapshots@456", "Invalid Iceberg table name: abc$snapshots@456"); - assertInvalid("xyz$data@456", "Invalid Iceberg table name: xyz$data@456"); - assertInvalid("abc$partitions@456", "Invalid Iceberg table name: abc$partitions@456"); - assertInvalid("abc$manifests@456", "Invalid Iceberg table name: abc$manifests@456"); + assertInvalid("abc$data"); + assertInvalid("abc@123"); + assertInvalid("abc@xyz"); + assertInvalid("abc$what"); + assertInvalid("abc@123$data@456"); + assertInvalid("abc@123$snapshots"); + assertInvalid("abc$snapshots@456"); + assertInvalid("xyz$data@456"); + assertInvalid("abc$partitions@456"); + assertInvalid("abc$manifests@456"); } @Test public void testIsDataTable() { - assertTrue(IcebergTableName.isDataTable("abc")); + assertThat(IcebergTableName.isDataTable("abc")).isTrue(); + + assertThatThrownBy(() -> IcebergTableName.isDataTable("abc$data")) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid Iceberg table name: abc$data"); + + assertThat(IcebergTableName.isDataTable("abc$history")).isFalse(); - assertFalse(IcebergTableName.isDataTable("abc$data")); // it's invalid - assertFalse(IcebergTableName.isDataTable("abc$history")); - assertFalse(IcebergTableName.isDataTable("abc$invalid")); + assertThatThrownBy(() -> IcebergTableName.isDataTable("abc$invalid")) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid Iceberg table name: abc$invalid"); } @Test public void testTableNameFrom() { - assertEquals(IcebergTableName.tableNameFrom("abc"), "abc"); - assertEquals(IcebergTableName.tableNameFrom("abc$data"), "abc"); - assertEquals(IcebergTableName.tableNameFrom("abc$history"), "abc"); - assertEquals(IcebergTableName.tableNameFrom("abc$invalid"), "abc"); + assertThat(IcebergTableName.tableNameFrom("abc")).isEqualTo("abc"); + + assertThatThrownBy(() -> IcebergTableName.tableNameFrom("abc$data")) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid Iceberg table name: abc$data"); + + assertThat(IcebergTableName.tableNameFrom("abc$history")).isEqualTo("abc"); + + assertThatThrownBy(() -> IcebergTableName.tableNameFrom("abc$invalid")) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid Iceberg table name: abc$invalid"); } @Test public void testTableTypeFrom() { - assertEquals(IcebergTableName.tableTypeFrom("abc"), Optional.of(TableType.DATA)); - assertEquals(IcebergTableName.tableTypeFrom("abc$data"), Optional.empty()); // it's invalid - assertEquals(IcebergTableName.tableTypeFrom("abc$history"), Optional.of(TableType.HISTORY)); + assertThat(IcebergTableName.tableTypeFrom("abc")).isEqualTo(TableType.DATA); + + assertThatThrownBy(() -> IcebergTableName.tableTypeFrom("abc$data")) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid Iceberg table name: abc$data"); + + assertThat(IcebergTableName.tableTypeFrom("abc$history")).isEqualTo(TableType.HISTORY); - assertEquals(IcebergTableName.tableTypeFrom("abc$invalid"), Optional.empty()); + assertThatThrownBy(() -> 
IcebergTableName.tableTypeFrom("abc$invalid")) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid Iceberg table name: abc$invalid"); } @Test public void testTableNameWithType() { - assertEquals(IcebergTableName.tableNameWithType("abc", TableType.DATA), "abc$data"); - assertEquals(IcebergTableName.tableNameWithType("abc", TableType.HISTORY), "abc$history"); + assertThat(IcebergTableName.tableNameWithType("abc", TableType.DATA)).isEqualTo("abc$data"); + assertThat(IcebergTableName.tableNameWithType("abc", TableType.HISTORY)).isEqualTo("abc$history"); } - private static void assertInvalid(String inputName, String message) + private static void assertInvalid(String inputName) { - assertTrinoExceptionThrownBy(() -> IcebergTableName.tableTypeFrom(inputName)) - .hasErrorCode(NOT_SUPPORTED) - .hasMessage(message); - } + assertThat(IcebergTableName.isIcebergTableName(inputName)).isFalse(); - private static void assertNoValidTableType(String inputName) - { - assertThat(IcebergTableName.tableTypeFrom(inputName)) - .isEmpty(); + assertThatThrownBy(() -> IcebergTableName.tableTypeFrom(inputName)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid Iceberg table name: " + inputName); } private static void assertParseNameAndType(String inputName, String tableName, TableType tableType) { - assertEquals(IcebergTableName.tableNameFrom(inputName), tableName); - assertEquals(IcebergTableName.tableTypeFrom(inputName), Optional.of(tableType)); + assertThat(IcebergTableName.isIcebergTableName(inputName)).isTrue(); + assertThat(IcebergTableName.tableNameFrom(inputName)).isEqualTo(tableName); + assertThat(IcebergTableName.tableTypeFrom(inputName)).isEqualTo(tableType); } } diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergTableWithCustomLocation.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergTableWithCustomLocation.java index 568b7013f939..36edac5015ea 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergTableWithCustomLocation.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergTableWithCustomLocation.java @@ -15,68 +15,51 @@ import io.trino.filesystem.Location; import io.trino.filesystem.TrinoFileSystem; -import io.trino.plugin.hive.metastore.Table; -import io.trino.plugin.hive.metastore.file.FileHiveMetastore; +import io.trino.metastore.HiveMetastore; +import io.trino.metastore.Table; import io.trino.testing.AbstractTestQueryFramework; -import io.trino.testing.DistributedQueryRunner; import io.trino.testing.MaterializedResult; -import org.testng.annotations.AfterClass; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; +import io.trino.testing.QueryRunner; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; -import java.io.File; import java.io.IOException; -import java.nio.file.Files; import java.util.Map; import java.util.Optional; -import static com.google.common.io.MoreFiles.deleteRecursively; -import static com.google.common.io.RecursiveDeleteOption.ALLOW_INSECURE; import static io.trino.plugin.hive.TableType.EXTERNAL_TABLE; -import static io.trino.plugin.hive.metastore.file.TestingFileHiveMetastore.createTestingFileHiveMetastore; import static io.trino.plugin.iceberg.DataFileRecord.toDataFileRecord; import static io.trino.plugin.iceberg.IcebergTestUtils.getFileSystemFactory; +import static io.trino.plugin.iceberg.IcebergTestUtils.getHiveMetastore; import static 
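A minimal usage sketch, not part of the patch, of the IcebergTableName contract that the rewritten TestIcebergTableName above asserts: isIcebergTableName acts as the validity check, tableNameFrom strips the suffix, and tableTypeFrom now returns TableType directly and throws IllegalArgumentException for invalid names. The local variable names are hypothetical.

    String input = "abc$history";
    if (IcebergTableName.isIcebergTableName(input)) {
        String table = IcebergTableName.tableNameFrom(input);  // "abc"
        TableType type = IcebergTableName.tableTypeFrom(input); // TableType.HISTORY
        // ... dispatch on type
    }
    else {
        // e.g. "abc@123" or "abc$what": tableTypeFrom(input) would throw
        // IllegalArgumentException("Invalid Iceberg table name: " + input)
    }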
io.trino.testing.TestingConnectorSession.SESSION; import static java.lang.String.format; import static org.assertj.core.api.Assertions.assertThat; -import static org.testng.Assert.assertEquals; -import static org.testng.Assert.assertFalse; -import static org.testng.Assert.assertNotEquals; -import static org.testng.Assert.assertTrue; public class TestIcebergTableWithCustomLocation extends AbstractTestQueryFramework { - private FileHiveMetastore metastore; - private File metastoreDir; + private HiveMetastore metastore; private TrinoFileSystem fileSystem; @Override - protected DistributedQueryRunner createQueryRunner() + protected QueryRunner createQueryRunner() throws Exception { - metastoreDir = Files.createTempDirectory("test_iceberg").toFile(); - metastore = createTestingFileHiveMetastore(metastoreDir); - - return IcebergQueryRunner.builder() + QueryRunner queryRunner = IcebergQueryRunner.builder() .setIcebergProperties(Map.of("iceberg.unique-table-location", "true")) - .setMetastoreDirectory(metastoreDir) .build(); + + metastore = getHiveMetastore(queryRunner); + + return queryRunner; } - @BeforeClass + @BeforeAll public void initFileSystem() { fileSystem = getFileSystemFactory(getDistributedQueryRunner()).create(SESSION); } - @AfterClass(alwaysRun = true) - public void tearDown() - throws IOException - { - deleteRecursively(metastoreDir.toPath(), ALLOW_INSECURE); - } - @Test public void testTableHasUuidSuffixInLocation() { @@ -98,18 +81,28 @@ public void testCreateAndDrop() assertThat(table.getTableType()).isEqualTo(EXTERNAL_TABLE.name()); Location tableLocation = Location.of(table.getStorage().getLocation()); - assertTrue(fileSystem.newInputFile(tableLocation).exists(), "The directory corresponding to the table storage location should exist"); + assertThat(fileSystem.newInputFile(tableLocation).exists()) + .describedAs("The directory corresponding to the table storage location should exist") + .isTrue(); MaterializedResult materializedResult = computeActual("SELECT * FROM \"test_create_and_drop$files\""); - assertEquals(materializedResult.getRowCount(), 1); + assertThat(materializedResult.getRowCount()).isEqualTo(1); DataFileRecord dataFile = toDataFileRecord(materializedResult.getMaterializedRows().get(0)); Location dataFileLocation = Location.of(dataFile.getFilePath()); - assertTrue(fileSystem.newInputFile(dataFileLocation).exists(), "The data file should exist"); + assertThat(fileSystem.newInputFile(dataFileLocation).exists()) + .describedAs("The data file should exist") + .isTrue(); assertQuerySucceeds(format("DROP TABLE %s", tableName)); - assertFalse(metastore.getTable("tpch", tableName).isPresent(), "Table should be dropped"); - assertFalse(fileSystem.newInputFile(dataFileLocation).exists(), "The data file should have been removed"); - assertFalse(fileSystem.newInputFile(tableLocation).exists(), "The directory corresponding to the dropped Iceberg table should not be removed because it may be shared with other tables"); + assertThat(metastore.getTable("tpch", tableName).isPresent()) + .describedAs("Table should be dropped") + .isFalse(); + assertThat(fileSystem.newInputFile(dataFileLocation).exists()) + .describedAs("The data file should have been removed") + .isFalse(); + assertThat(fileSystem.newInputFile(tableLocation).exists()) + .describedAs("The directory corresponding to the dropped Iceberg table should not be removed because it may be shared with other tables") + .isFalse(); } @Test @@ -126,7 +119,9 @@ public void testCreateRenameDrop() Optional
<Table>
renamedTable = metastore.getTable("tpch", renamedName); assertThat(renamedTable).as("Table should exist").isPresent(); String renamedTableLocation = renamedTable.get().getStorage().getLocation(); - assertEquals(renamedTableLocation, tableInitialLocation, "Location should not be changed"); + assertThat(renamedTableLocation) + .describedAs("Location should not be changed") + .isEqualTo(tableInitialLocation); assertQuerySucceeds(format("DROP TABLE %s", renamedName)); assertThat(metastore.getTable("tpch", tableName)).as("Initial table should not exist").isEmpty(); @@ -147,12 +142,16 @@ public void testCreateRenameCreate() Optional
<Table>
renamedTable = metastore.getTable("tpch", renamedName); assertThat(renamedTable).as("Table should exist").isPresent(); String renamedTableLocation = renamedTable.get().getStorage().getLocation(); - assertEquals(renamedTableLocation, tableInitialLocation, "Location should not be changed"); + assertThat(renamedTableLocation) + .describedAs("Location should not be changed") + .isEqualTo(tableInitialLocation); assertQuerySucceeds(format("CREATE TABLE %s as select 1 as val", tableName)); Optional
<Table>
recreatedTableWithInitialName = metastore.getTable("tpch", tableName); assertThat(recreatedTableWithInitialName).as("Table should exist").isPresent(); String recreatedTableLocation = recreatedTableWithInitialName.get().getStorage().getLocation(); - assertNotEquals(tableInitialLocation, recreatedTableLocation, "Location should be different"); + assertThat(tableInitialLocation) + .describedAs("Location should be different") + .isNotEqualTo(recreatedTableLocation); } } diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergTableWithExternalLocation.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergTableWithExternalLocation.java index cb9f5d87aadd..c80c6b166b77 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergTableWithExternalLocation.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergTableWithExternalLocation.java @@ -15,65 +15,53 @@ import io.trino.filesystem.Location; import io.trino.filesystem.TrinoFileSystem; -import io.trino.plugin.hive.metastore.Table; -import io.trino.plugin.hive.metastore.file.FileHiveMetastore; +import io.trino.metastore.HiveMetastore; +import io.trino.metastore.Table; import io.trino.testing.AbstractTestQueryFramework; -import io.trino.testing.DistributedQueryRunner; import io.trino.testing.MaterializedResult; -import org.testng.annotations.AfterClass; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; +import io.trino.testing.QueryRunner; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.TestInstance; import java.io.File; import java.io.IOException; -import java.nio.file.Files; -import static com.google.common.io.MoreFiles.deleteRecursively; -import static com.google.common.io.RecursiveDeleteOption.ALLOW_INSECURE; import static io.trino.plugin.hive.TableType.EXTERNAL_TABLE; -import static io.trino.plugin.hive.metastore.file.TestingFileHiveMetastore.createTestingFileHiveMetastore; import static io.trino.plugin.iceberg.DataFileRecord.toDataFileRecord; import static io.trino.plugin.iceberg.IcebergTestUtils.getFileSystemFactory; +import static io.trino.plugin.iceberg.IcebergTestUtils.getHiveMetastore; import static io.trino.testing.TestingConnectorSession.SESSION; import static io.trino.testing.TestingNames.randomNameSuffix; import static java.lang.String.format; import static org.assertj.core.api.Assertions.assertThat; -import static org.testng.Assert.assertEquals; -import static org.testng.Assert.assertFalse; -import static org.testng.Assert.assertTrue; +import static org.junit.jupiter.api.TestInstance.Lifecycle.PER_CLASS; +@TestInstance(PER_CLASS) public class TestIcebergTableWithExternalLocation extends AbstractTestQueryFramework { - private FileHiveMetastore metastore; - private File metastoreDir; + private HiveMetastore metastore; private TrinoFileSystem fileSystem; @Override - protected DistributedQueryRunner createQueryRunner() + protected QueryRunner createQueryRunner() throws Exception { - metastoreDir = Files.createTempDirectory("test_iceberg").toFile(); - metastore = createTestingFileHiveMetastore(metastoreDir); - - return IcebergQueryRunner.builder() - .setMetastoreDirectory(metastoreDir) + QueryRunner queryRunner = IcebergQueryRunner.builder() .build(); + + metastore = getHiveMetastore(queryRunner); + + return queryRunner; } - @BeforeClass + @BeforeAll public void initFileSystem() { fileSystem = 
getFileSystemFactory(getDistributedQueryRunner()).create(SESSION); } - @AfterClass(alwaysRun = true) - public void tearDown() - throws IOException - { - deleteRecursively(metastoreDir.toPath(), ALLOW_INSECURE); - } - @Test public void testCreateAndDrop() throws IOException @@ -87,16 +75,24 @@ public void testCreateAndDrop() Table table = metastore.getTable("tpch", tableName).orElseThrow(); assertThat(table.getTableType()).isEqualTo(EXTERNAL_TABLE.name()); Location tableLocation = Location.of(table.getStorage().getLocation()); - assertTrue(fileSystem.newInputFile(tableLocation).exists(), "The directory corresponding to the table storage location should exist"); + assertThat(fileSystem.newInputFile(tableLocation).exists()) + .describedAs("The directory corresponding to the table storage location should exist") + .isTrue(); MaterializedResult materializedResult = computeActual("SELECT * FROM \"test_table_external_create_and_drop$files\""); - assertEquals(materializedResult.getRowCount(), 1); + assertThat(materializedResult.getRowCount()).isEqualTo(1); DataFileRecord dataFile = toDataFileRecord(materializedResult.getMaterializedRows().get(0)); Location dataFileLocation = Location.of(dataFile.getFilePath()); - assertTrue(fileSystem.newInputFile(dataFileLocation).exists(), "The data file should exist"); + assertThat(fileSystem.newInputFile(dataFileLocation).exists()) + .describedAs("The data file should exist") + .isTrue(); assertQuerySucceeds(format("DROP TABLE %s", tableName)); assertThat(metastore.getTable("tpch", tableName)).as("Table should be dropped").isEmpty(); - assertFalse(fileSystem.newInputFile(dataFileLocation).exists(), "The data file should have been removed"); - assertFalse(fileSystem.newInputFile(tableLocation).exists(), "The directory corresponding to the dropped Iceberg table should be removed as we don't allow shared locations."); + assertThat(fileSystem.newInputFile(dataFileLocation).exists()) + .describedAs("The data file should have been removed") + .isFalse(); + assertThat(fileSystem.newInputFile(tableLocation).exists()) + .describedAs("The directory corresponding to the dropped Iceberg table should be removed as we don't allow shared locations.") + .isFalse(); } } diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergTableWithObjectStoreLayout.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergTableWithObjectStoreLayout.java new file mode 100644 index 000000000000..912d88c13d05 --- /dev/null +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergTableWithObjectStoreLayout.java @@ -0,0 +1,72 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.iceberg; + +import io.trino.filesystem.Location; +import io.trino.filesystem.TrinoFileSystem; +import io.trino.metastore.HiveMetastore; +import io.trino.metastore.Table; +import io.trino.testing.AbstractTestQueryFramework; +import io.trino.testing.DistributedQueryRunner; +import org.junit.jupiter.api.Test; + +import static io.trino.plugin.hive.TableType.EXTERNAL_TABLE; +import static io.trino.plugin.iceberg.IcebergTestUtils.getFileSystemFactory; +import static io.trino.plugin.iceberg.IcebergTestUtils.getHiveMetastore; +import static io.trino.testing.TestingConnectorSession.SESSION; +import static org.assertj.core.api.Assertions.assertThat; + +final class TestIcebergTableWithObjectStoreLayout + extends AbstractTestQueryFramework +{ + private HiveMetastore metastore; + private TrinoFileSystem fileSystem; + + @Override + protected DistributedQueryRunner createQueryRunner() + throws Exception + { + DistributedQueryRunner queryRunner = IcebergQueryRunner.builder() + .addIcebergProperty("iceberg.object-store-layout.enabled", "true") + .build(); + + metastore = getHiveMetastore(queryRunner); + + fileSystem = getFileSystemFactory(queryRunner).create(SESSION); + + return queryRunner; + } + + @Test + void testCreateTableWithDataLocation() + throws Exception + { + assertQuerySucceeds("CREATE TABLE test_create_table_with_different_location WITH (data_location = 'local:///table-location/abc') AS SELECT 1 AS val"); + Table table = metastore.getTable("tpch", "test_create_table_with_different_location").orElseThrow(); + assertThat(table.getTableType()).isEqualTo(EXTERNAL_TABLE.name()); + + Location tableLocation = Location.of(table.getStorage().getLocation()); + assertThat(fileSystem.newInputFile(tableLocation).exists()).isTrue(); + + String filePath = (String) computeScalar("SELECT file_path FROM \"test_create_table_with_different_location$files\""); + Location dataFileLocation = Location.of(filePath); + assertThat(fileSystem.newInputFile(dataFileLocation).exists()).isTrue(); + assertThat(filePath).matches("local:///table-location/abc/.{6}/tpch/test_create_table_with_different_location-.*/.*\\.parquet"); + + assertQuerySucceeds("DROP TABLE test_create_table_with_different_location"); + assertThat(metastore.getTable("tpch", "test_create_table_with_different_location")).isEmpty(); + assertThat(fileSystem.newInputFile(dataFileLocation).exists()).isFalse(); + assertThat(fileSystem.newInputFile(tableLocation).exists()).isFalse(); + } +} diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergTaskFailureRecoveryTest.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergTaskFailureRecoveryTest.java index 3c7ffc28847a..3253c77c6b31 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergTaskFailureRecoveryTest.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergTaskFailureRecoveryTest.java @@ -13,19 +13,26 @@ */ package io.trino.plugin.iceberg; +import com.google.inject.Module; import io.trino.operator.RetryPolicy; import io.trino.plugin.exchange.filesystem.FileSystemExchangePlugin; import io.trino.plugin.exchange.filesystem.containers.MinioStorage; import io.trino.testing.QueryRunner; import io.trino.tpch.TpchTable; -import org.testng.annotations.AfterClass; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.TestInstance; +import org.junit.jupiter.api.parallel.Execution; import java.util.List; import java.util.Map; import static 
io.trino.plugin.exchange.filesystem.containers.MinioStorage.getExchangeManagerProperties; import static io.trino.testing.TestingNames.randomNameSuffix; +import static org.junit.jupiter.api.TestInstance.Lifecycle.PER_CLASS; +import static org.junit.jupiter.api.parallel.ExecutionMode.CONCURRENT; +@TestInstance(PER_CLASS) +@Execution(CONCURRENT) public class TestIcebergTaskFailureRecoveryTest extends BaseIcebergFailureRecoveryTest { @@ -40,30 +47,29 @@ protected TestIcebergTaskFailureRecoveryTest() protected QueryRunner createQueryRunner( List> requiredTpchTables, Map configProperties, - Map coordinatorProperties) + Map coordinatorProperties, + Module failureInjectionModule) throws Exception { - this.minioStorage = new MinioStorage("test-exchange-spooling-" + randomNameSuffix()); + this.minioStorage = closeAfterClass(new MinioStorage("test-exchange-spooling-" + randomNameSuffix())); minioStorage.start(); return IcebergQueryRunner.builder() - .setInitialTables(requiredTpchTables) .setCoordinatorProperties(coordinatorProperties) .setExtraProperties(configProperties) .setAdditionalSetup(runner -> { runner.installPlugin(new FileSystemExchangePlugin()); runner.loadExchangeManager("filesystem", getExchangeManagerProperties(minioStorage)); }) + .setAdditionalModule(failureInjectionModule) + .setInitialTables(requiredTpchTables) .build(); } - @AfterClass(alwaysRun = true) + @AfterAll public void destroy() throws Exception { - if (minioStorage != null) { - minioStorage.close(); - minioStorage = null; - } + minioStorage = null; // closed by closeAfterClass } } diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergUtil.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergUtil.java index f0e2ac5358a9..a5df23fe0812 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergUtil.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergUtil.java @@ -13,26 +13,34 @@ */ package io.trino.plugin.iceberg; -import org.testng.annotations.Test; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSet; +import org.apache.iceberg.Schema; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.types.Types.NestedField; +import org.junit.jupiter.api.Test; +import static io.trino.plugin.iceberg.IcebergUtil.getProjectedColumns; import static io.trino.plugin.iceberg.IcebergUtil.parseVersion; +import static io.trino.type.InternalTypeManager.TESTING_TYPE_MANAGER; +import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; -import static org.testng.Assert.assertEquals; +import static org.assertj.core.groups.Tuple.tuple; public class TestIcebergUtil { @Test public void testParseVersion() { - assertEquals(parseVersion("00000-409702ba-4735-4645-8f14-09537cc0b2c8.metadata.json"), 0); - assertEquals(parseVersion("99999-409702ba-4735-4645-8f14-09537cc0b2c8.metadata.json"), 99999); - assertEquals(parseVersion("00010-409702ba-4735-4645-8f14-09537cc0b2c8.metadata.json"), 10); - assertEquals(parseVersion("00011-409702ba-4735-4645-8f14-09537cc0b2c8.metadata.json"), 11); - assertEquals(parseVersion("v0.metadata.json"), 0); - assertEquals(parseVersion("v10.metadata.json"), 10); - assertEquals(parseVersion("v99999.metadata.json"), 99999); - assertEquals(parseVersion("v0.gz.metadata.json"), 0); - assertEquals(parseVersion("v0.metadata.json.gz"), 0); + 
assertThat(parseVersion("00000-409702ba-4735-4645-8f14-09537cc0b2c8.metadata.json")).isEqualTo(0); + assertThat(parseVersion("99999-409702ba-4735-4645-8f14-09537cc0b2c8.metadata.json")).isEqualTo(99999); + assertThat(parseVersion("00010-409702ba-4735-4645-8f14-09537cc0b2c8.metadata.json")).isEqualTo(10); + assertThat(parseVersion("00011-409702ba-4735-4645-8f14-09537cc0b2c8.metadata.json")).isEqualTo(11); + assertThat(parseVersion("v0.metadata.json")).isEqualTo(0); + assertThat(parseVersion("v10.metadata.json")).isEqualTo(10); + assertThat(parseVersion("v99999.metadata.json")).isEqualTo(99999); + assertThat(parseVersion("v0.gz.metadata.json")).isEqualTo(0); + assertThat(parseVersion("v0.metadata.json.gz")).isEqualTo(0); assertThatThrownBy(() -> parseVersion("hdfs://hadoop-master:9000/user/hive/warehouse/orders_5-581fad8517934af6be1857a903559d44/metadata/00000-409702ba-4735-4645-8f14-09537cc0b2c8.metadata.json")) .hasMessageMatching("Not a file name: .*"); @@ -56,4 +64,62 @@ public void testParseVersion() assertThatThrownBy(() -> parseVersion("v-10.metadata.json")) .hasMessageMatching("Invalid metadata file name:.*"); } + + @Test + public void testGetProjectedColumns() + { + Schema schema = new Schema( + NestedField.required(1, "id", Types.LongType.get()), + NestedField.required(2, "nested", Types.StructType.of( + NestedField.required(3, "value", Types.StringType.get()), + NestedField.required(4, "list", Types.ListType.ofRequired(5, Types.StringType.get())), + NestedField.required(6, "nested", Types.StructType.of( + NestedField.required(7, "value", Types.StringType.get())))))); + + assertThat(getProjectedColumns(schema, TESTING_TYPE_MANAGER)) + .extracting(IcebergColumnHandle::getId, IcebergColumnHandle::getName, column -> column.getBaseColumn().getId(), IcebergColumnHandle::getPath) + .containsExactly( + tuple(1, "id", 1, ImmutableList.of()), + tuple(2, "nested", 2, ImmutableList.of()), + tuple(3, "value", 2, ImmutableList.of(3)), + tuple(4, "list", 2, ImmutableList.of(4)), + tuple(5, "element", 2, ImmutableList.of(4, 5)), + tuple(6, "nested", 2, ImmutableList.of(6)), + tuple(7, "value", 2, ImmutableList.of(6, 7))); + + assertThat(getProjectedColumns(schema, TESTING_TYPE_MANAGER, ImmutableSet.of(1))) + .extracting(IcebergColumnHandle::getId, IcebergColumnHandle::getName, column -> column.getBaseColumn().getId(), IcebergColumnHandle::getPath) + .containsExactly(tuple(1, "id", 1, ImmutableList.of())); + assertThat(getProjectedColumns(schema, TESTING_TYPE_MANAGER, ImmutableSet.of(2))) + .extracting(IcebergColumnHandle::getId, IcebergColumnHandle::getName, column -> column.getBaseColumn().getId(), IcebergColumnHandle::getPath) + .containsExactly(tuple(2, "nested", 2, ImmutableList.of())); + assertThat(getProjectedColumns(schema, TESTING_TYPE_MANAGER, ImmutableSet.of(3))) + .extracting(IcebergColumnHandle::getId, IcebergColumnHandle::getName, column -> column.getBaseColumn().getId(), IcebergColumnHandle::getPath) + .containsExactly(tuple(3, "value", 2, ImmutableList.of(3))); + assertThat(getProjectedColumns(schema, TESTING_TYPE_MANAGER, ImmutableSet.of(4))) + .extracting(IcebergColumnHandle::getId, IcebergColumnHandle::getName, column -> column.getBaseColumn().getId(), IcebergColumnHandle::getPath) + .containsExactly(tuple(4, "list", 2, ImmutableList.of(4))); + assertThat(getProjectedColumns(schema, TESTING_TYPE_MANAGER, ImmutableSet.of(5))) + .extracting(IcebergColumnHandle::getId, IcebergColumnHandle::getName, column -> column.getBaseColumn().getId(), IcebergColumnHandle::getPath) + 
.containsExactly(tuple(5, "element", 2, ImmutableList.of(4, 5))); + assertThat(getProjectedColumns(schema, TESTING_TYPE_MANAGER, ImmutableSet.of(6))) + .extracting(IcebergColumnHandle::getId, IcebergColumnHandle::getName, column -> column.getBaseColumn().getId(), IcebergColumnHandle::getPath) + .containsExactly(tuple(6, "nested", 2, ImmutableList.of(6))); + assertThat(getProjectedColumns(schema, TESTING_TYPE_MANAGER, ImmutableSet.of(7))) + .extracting(IcebergColumnHandle::getId, IcebergColumnHandle::getName, column -> column.getBaseColumn().getId(), IcebergColumnHandle::getPath) + .containsExactly(tuple(7, "value", 2, ImmutableList.of(6, 7))); + + assertThat(getProjectedColumns(schema, TESTING_TYPE_MANAGER, ImmutableSet.of(3, 7))) + .extracting(IcebergColumnHandle::getId, IcebergColumnHandle::getName, column -> column.getBaseColumn().getId(), IcebergColumnHandle::getPath) + .containsExactly( + tuple(3, "value", 2, ImmutableList.of(3)), + tuple(7, "value", 2, ImmutableList.of(6, 7))); + + assertThat(getProjectedColumns(schema, TESTING_TYPE_MANAGER, ImmutableSet.of(1, 4, 5))) + .extracting(IcebergColumnHandle::getId, IcebergColumnHandle::getName, column -> column.getBaseColumn().getId(), IcebergColumnHandle::getPath) + .containsExactly( + tuple(1, "id", 1, ImmutableList.of()), + tuple(4, "list", 2, ImmutableList.of(4)), + tuple(5, "element", 2, ImmutableList.of(4, 5))); + } } diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergV2.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergV2.java index 44d840408ace..b8da11f5e764 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergV2.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestIcebergV2.java @@ -15,40 +15,47 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; import io.trino.Session; +import io.trino.filesystem.FileEntry; +import io.trino.filesystem.FileIterator; +import io.trino.filesystem.Location; +import io.trino.filesystem.TrinoFileSystem; import io.trino.filesystem.TrinoFileSystemFactory; -import io.trino.plugin.base.CatalogName; -import io.trino.plugin.base.util.Closables; -import io.trino.plugin.blackhole.BlackHolePlugin; -import io.trino.plugin.hive.TrinoViewHiveMetastore; -import io.trino.plugin.hive.metastore.HiveMetastore; -import io.trino.plugin.hive.metastore.cache.CachingHiveMetastore; -import io.trino.plugin.iceberg.catalog.IcebergTableOperationsProvider; +import io.trino.metastore.Column; +import io.trino.metastore.HiveMetastore; +import io.trino.metastore.HiveType; +import io.trino.metastore.PrincipalPrivileges; +import io.trino.metastore.Storage; +import io.trino.plugin.hive.HiveStorageFormat; +import io.trino.plugin.hive.TestingHivePlugin; import io.trino.plugin.iceberg.catalog.TrinoCatalog; -import io.trino.plugin.iceberg.catalog.file.FileMetastoreTableOperationsProvider; -import io.trino.plugin.iceberg.catalog.hms.TrinoHiveCatalog; -import io.trino.plugin.iceberg.fileio.ForwardingFileIo; import io.trino.spi.connector.SchemaTableName; import io.trino.spi.predicate.Domain; import io.trino.spi.predicate.Range; import io.trino.spi.predicate.TupleDomain; import io.trino.spi.predicate.ValueSet; +import io.trino.spi.statistics.ColumnStatistics; +import io.trino.spi.statistics.DoubleRange; +import io.trino.spi.statistics.Estimate; import io.trino.spi.statistics.TableStatistics; +import io.trino.spi.type.ArrayType; import 
io.trino.spi.type.TestingTypeManager; import io.trino.spi.type.TypeManager; import io.trino.testing.AbstractTestQueryFramework; -import io.trino.testing.DistributedQueryRunner; +import io.trino.testing.MaterializedRow; import io.trino.testing.QueryRunner; import io.trino.testing.sql.TestTable; -import org.apache.hadoop.fs.Path; import org.apache.iceberg.BaseTable; import org.apache.iceberg.DataFile; import org.apache.iceberg.DataFiles; +import org.apache.iceberg.FileContent; import org.apache.iceberg.Metrics; import org.apache.iceberg.PartitionField; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; import org.apache.iceberg.SortField; +import org.apache.iceberg.SortOrder; import org.apache.iceberg.Table; import org.apache.iceberg.TableMetadata; import org.apache.iceberg.TableOperations; @@ -56,100 +63,91 @@ import org.apache.iceberg.data.GenericRecord; import org.apache.iceberg.data.Record; import org.apache.iceberg.data.parquet.GenericParquetWriter; -import org.apache.iceberg.deletes.EqualityDeleteWriter; import org.apache.iceberg.deletes.PositionDelete; import org.apache.iceberg.deletes.PositionDeleteWriter; import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.mapping.MappingUtil; import org.apache.iceberg.parquet.Parquet; -import org.testng.annotations.AfterClass; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.TestInstance; import java.io.Closeable; -import java.io.File; import java.io.IOException; import java.nio.ByteBuffer; -import java.nio.file.Files; -import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.Optional; +import java.util.OptionalLong; +import java.util.Set; import java.util.UUID; -import java.util.concurrent.CyclicBarrier; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Future; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import java.util.stream.Collectors; -import java.util.stream.IntStream; -import static com.google.common.base.Verify.verify; +import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.collect.ImmutableList.toImmutableList; +import static com.google.common.collect.ImmutableSet.toImmutableSet; import static com.google.common.collect.Iterables.getOnlyElement; -import static com.google.common.io.MoreFiles.deleteRecursively; -import static com.google.common.io.RecursiveDeleteOption.ALLOW_INSECURE; -import static io.trino.plugin.hive.metastore.cache.CachingHiveMetastore.memoizeMetastore; -import static io.trino.plugin.hive.metastore.file.TestingFileHiveMetastore.createTestingFileHiveMetastore; +import static com.google.common.util.concurrent.MoreExecutors.newDirectExecutorService; +import static io.trino.plugin.iceberg.IcebergTestUtils.FILE_IO_FACTORY; +import static io.trino.plugin.iceberg.IcebergTestUtils.SESSION; import static io.trino.plugin.iceberg.IcebergTestUtils.getFileSystemFactory; -import static io.trino.plugin.iceberg.IcebergUtil.loadIcebergTable; +import static io.trino.plugin.iceberg.IcebergTestUtils.getHiveMetastore; +import static io.trino.plugin.iceberg.IcebergTestUtils.getMetadataFileAndUpdatedMillis; +import static io.trino.plugin.iceberg.IcebergTestUtils.getTrinoCatalog; +import static io.trino.plugin.iceberg.util.EqualityDeleteUtils.writeEqualityDeleteForTable; 
+import static io.trino.plugin.iceberg.util.EqualityDeleteUtils.writeEqualityDeleteForTableWithSchema; +import static io.trino.spi.type.BigintType.BIGINT; import static io.trino.spi.type.IntegerType.INTEGER; -import static io.trino.testing.TestingConnectorSession.SESSION; +import static io.trino.testing.MaterializedResult.resultBuilder; import static io.trino.testing.TestingNames.randomNameSuffix; import static io.trino.tpch.TpchTable.NATION; import static java.lang.String.format; import static java.nio.ByteOrder.LITTLE_ENDIAN; import static java.nio.charset.StandardCharsets.UTF_8; -import static java.util.concurrent.Executors.newFixedThreadPool; -import static java.util.concurrent.TimeUnit.SECONDS; +import static java.util.Map.entry; +import static org.apache.iceberg.FileContent.EQUALITY_DELETES; +import static org.apache.iceberg.FileContent.POSITION_DELETES; import static org.apache.iceberg.FileFormat.ORC; +import static org.apache.iceberg.FileFormat.PARQUET; +import static org.apache.iceberg.TableProperties.DEFAULT_NAME_MAPPING; +import static org.apache.iceberg.TableProperties.METADATA_DELETE_AFTER_COMMIT_ENABLED; +import static org.apache.iceberg.TableProperties.METADATA_PREVIOUS_VERSIONS_MAX; import static org.apache.iceberg.TableProperties.SPLIT_SIZE; +import static org.apache.iceberg.TableUtil.formatVersion; +import static org.apache.iceberg.mapping.NameMappingParser.toJson; import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatThrownBy; -import static org.testng.Assert.assertEquals; -import static org.testng.Assert.assertTrue; +import static org.junit.jupiter.api.TestInstance.Lifecycle.PER_CLASS; +@TestInstance(PER_CLASS) public class TestIcebergV2 extends AbstractTestQueryFramework { private HiveMetastore metastore; - private java.nio.file.Path tempDir; - private File metastoreDir; private TrinoFileSystemFactory fileSystemFactory; + private TrinoCatalog catalog; @Override protected QueryRunner createQueryRunner() throws Exception { - tempDir = Files.createTempDirectory("test_iceberg_v2"); - metastoreDir = tempDir.resolve("iceberg_data").toFile(); - metastore = createTestingFileHiveMetastore(metastoreDir); - - DistributedQueryRunner queryRunner = IcebergQueryRunner.builder() + QueryRunner queryRunner = IcebergQueryRunner.builder() .setInitialTables(NATION) - .setMetastoreDirectory(metastoreDir) .build(); - try { - queryRunner.installPlugin(new BlackHolePlugin()); - queryRunner.createCatalog("blackhole", "blackhole"); - } - catch (RuntimeException e) { - Closables.closeAllSuppress(e, queryRunner); - throw e; - } + metastore = getHiveMetastore(queryRunner); + fileSystemFactory = getFileSystemFactory(queryRunner); + catalog = getTrinoCatalog(metastore, fileSystemFactory, "iceberg"); - return queryRunner; - } + queryRunner.installPlugin(new TestingHivePlugin(queryRunner.getCoordinator().getBaseDataDir().resolve("iceberg_data"))); + queryRunner.createCatalog("hive", "hive", ImmutableMap.builder() + .put("hive.security", "allow-all") + .buildOrThrow()); - @BeforeClass - public void initFileSystemFactory() - { - fileSystemFactory = getFileSystemFactory(getDistributedQueryRunner()); - } - - @AfterClass(alwaysRun = true) - public void tearDown() - throws IOException - { - deleteRecursively(tempDir, ALLOW_INSECURE); + return queryRunner; } @Test @@ -157,11 +155,11 @@ public void testSettingFormatVersion() { String tableName = "test_seting_format_version_" + randomNameSuffix(); assertUpdate("CREATE TABLE " + tableName + " WITH 
(format_version = 2) AS SELECT * FROM tpch.tiny.nation", 25); - assertThat(loadTable(tableName).operations().current().formatVersion()).isEqualTo(2); + assertThat(formatVersion(loadTable(tableName))).isEqualTo(2); assertUpdate("DROP TABLE " + tableName); assertUpdate("CREATE TABLE " + tableName + " WITH (format_version = 1) AS SELECT * FROM tpch.tiny.nation", 25); - assertThat(loadTable(tableName).operations().current().formatVersion()).isEqualTo(1); + assertThat(formatVersion(loadTable(tableName))).isEqualTo(1); assertUpdate("DROP TABLE " + tableName); } @@ -170,10 +168,50 @@ public void testDefaultFormatVersion() { String tableName = "test_default_format_version_" + randomNameSuffix(); assertUpdate("CREATE TABLE " + tableName + " AS SELECT * FROM tpch.tiny.nation", 25); - assertThat(loadTable(tableName).operations().current().formatVersion()).isEqualTo(2); + assertThat(formatVersion(loadTable(tableName))).isEqualTo(2); assertUpdate("DROP TABLE " + tableName); } + @Test + public void testSetPropertiesObjectStoreLayoutEnabled() + { + try (TestTable table = newTrinoTable("test_object_store", "(x int) WITH (object_store_layout_enabled = false)")) { + assertThat((String) computeScalar("SHOW CREATE TABLE " + table.getName())) + .doesNotContain("object_store_layout_enabled"); + assertThat(loadTable(table.getName()).properties()) + .doesNotContainKey("write.object-storage.enabled"); + + assertUpdate("ALTER TABLE " + table.getName() + " SET PROPERTIES object_store_layout_enabled = true"); + assertThat((String) computeScalar("SHOW CREATE TABLE " + table.getName())) + .contains("object_store_layout_enabled = true"); + assertThat(loadTable(table.getName()).properties()) + .containsEntry("write.object-storage.enabled", "true"); + } + } + + @Test + public void testSetPropertiesDataLocation() + { + try (TestTable table = newTrinoTable("test_data_location", "(x int)")) { + assertThat((String) computeScalar("SHOW CREATE TABLE " + table.getName())) + .doesNotContain("data_location ="); + assertThat(loadTable(table.getName()).properties()) + .doesNotContainKey("write.data.path"); + + assertQueryFails( + "ALTER TABLE " + table.getName() + " SET PROPERTIES data_location = 'local:///data-location'", + "Data location can only be set when object store layout is enabled"); + + assertUpdate("ALTER TABLE " + table.getName() + " SET PROPERTIES object_store_layout_enabled = true, data_location = 'local:///data-location'"); + assertThat((String) computeScalar("SHOW CREATE TABLE " + table.getName())) + .contains("object_store_layout_enabled = true") + .contains("data_location = 'local:///data-location'"); + assertThat(loadTable(table.getName()).properties()) + .containsEntry("write.object-storage.enabled", "true") + .containsEntry("write.data.path", "local:///data-location"); + } + } + @Test public void testV2TableRead() { @@ -193,13 +231,10 @@ public void testV2TableWithPositionDelete() String dataFilePath = (String) computeActual("SELECT file_path FROM \"" + tableName + "$files\" LIMIT 1").getOnlyValue(); - Path metadataDir = new Path(metastoreDir.toURI()); - String deleteFileName = "delete_file_" + UUID.randomUUID(); - FileIO fileIo = new ForwardingFileIo(fileSystemFactory.create(SESSION)); + FileIO fileIo = FILE_IO_FACTORY.create(fileSystemFactory.create(SESSION)); - Path path = new Path(metadataDir, deleteFileName); - PositionDeleteWriter writer = Parquet.writeDeletes(fileIo.newOutputFile(path.toString())) - .createWriterFunc(GenericParquetWriter::buildWriter) + PositionDeleteWriter writer = 
Parquet.writeDeletes(fileIo.newOutputFile("local:///delete_file_" + UUID.randomUUID())) + .createWriterFunc(GenericParquetWriter::create) .forTable(icebergTable) .overwrite() .rowSchema(icebergTable.schema()) @@ -223,10 +258,15 @@ public void testV2TableWithEqualityDelete() String tableName = "test_v2_equality_delete" + randomNameSuffix(); assertUpdate("CREATE TABLE " + tableName + " AS SELECT * FROM tpch.tiny.nation", 25); Table icebergTable = loadTable(tableName); - writeEqualityDeleteToNationTable(icebergTable, Optional.of(icebergTable.spec()), Optional.of(new PartitionData(new Long[]{1L}))); + writeEqualityDeleteToNationTable(icebergTable, Optional.of(icebergTable.spec()), Optional.of(new PartitionData(new Long[] {1L}))); assertQuery("SELECT * FROM " + tableName, "SELECT * FROM nation WHERE regionkey != 1"); // nationkey is before the equality delete column in the table schema, comment is after assertQuery("SELECT nationkey, comment FROM " + tableName, "SELECT nationkey, comment FROM nation WHERE regionkey != 1"); + + assertUpdate("INSERT INTO " + tableName + " SELECT * FROM tpch.tiny.nation", 25); + writeEqualityDeleteToNationTable(icebergTable, Optional.of(icebergTable.spec()), Optional.of(new PartitionData(new Long[] {2L})), ImmutableMap.of("regionkey", 2L)); + // the equality delete file is applied to 2 data files + assertQuery("SELECT count(*) FROM \"" + tableName + "$files\" WHERE content = " + EQUALITY_DELETES.id(), "VALUES 2"); } @Test @@ -252,25 +292,68 @@ public void testV2TableWithEqualityDeleteWhenColumnIsNested() "SELECT regionkey, ARRAY[1,2] array_column, MAP(ARRAY[1], ARRAY[2]) map_column, " + "CAST(ROW(1, 2e0) AS ROW(x BIGINT, y DOUBLE)) row_column FROM tpch.tiny.nation", 25); Table icebergTable = loadTable(tableName); - writeEqualityDeleteToNationTable(icebergTable, Optional.of(icebergTable.spec()), Optional.of(new PartitionData(new Long[]{1L}))); + writeEqualityDeleteToNationTable(icebergTable, Optional.of(icebergTable.spec()), Optional.of(new PartitionData(new Long[] {1L}))); assertQuery("SELECT array_column[1], map_column[1], row_column.x FROM " + tableName, "SELECT 1, 2, 1 FROM nation WHERE regionkey != 1"); } + @Test + public void testParquetMissingFieldId() + throws Exception + { + String hiveTableName = "test_hive_parquet" + randomNameSuffix(); + assertUpdate("CREATE TABLE hive.tpch." + hiveTableName + "(names ARRAY(varchar)) WITH (format = 'PARQUET')"); + assertUpdate("INSERT INTO hive.tpch." 
+ hiveTableName + " VALUES ARRAY['Alice', 'Bob'], ARRAY['Carol', 'Dave'], ARRAY['Eve', 'Frank']", 3); + + String location = metastore.getTable("tpch", hiveTableName).orElseThrow().getStorage().getLocation(); + FileIterator files = fileSystemFactory.create(SESSION).listFiles(Location.of(location)); + ImmutableList.Builder fileEntries = ImmutableList.builder(); + while (files.hasNext()) { + FileEntry file = files.next(); + if (!file.location().path().contains(".trinoSchema") && !file.location().path().contains(".trinoPermissions")) { + fileEntries.add(file); + } + } + FileEntry parquetFile = getOnlyElement(fileEntries.build()); + + String icebergTableName = "test_iceberg_parquet" + randomNameSuffix(); + assertUpdate("CREATE TABLE " + icebergTableName + "(names ARRAY(varchar))"); + Table icebergTable = loadTable(icebergTableName); + icebergTable.newAppend() + .appendFile(DataFiles.builder(icebergTable.spec()) + .withPath(parquetFile.location().toString()) + .withFileSizeInBytes(parquetFile.length()) + .withRecordCount(3) + .withFormat(PARQUET).build()) + .commit(); + icebergTable.updateProperties() + .set("schema.name-mapping.default", "[{\"field-id\":1,\"names\":[\"names\"]}]") + .commit(); + + assertQuery("SELECT * FROM " + icebergTableName, "VALUES ARRAY['Alice', 'Bob'], ARRAY['Carol', 'Dave'], ARRAY['Eve', 'Frank']"); + + assertUpdate("DROP TABLE hive.tpch." + hiveTableName); + assertUpdate("DROP TABLE " + icebergTableName); + } + @Test public void testOptimizingV2TableRemovesEqualityDeletesWhenWholeTableIsScanned() throws Exception { String tableName = "test_optimize_table_cleans_equality_delete_file_when_whole_table_is_scanned" + randomNameSuffix(); - assertUpdate("CREATE TABLE " + tableName + " WITH (partitioning = ARRAY['regionkey']) AS SELECT * FROM tpch.tiny.nation", 25); + assertUpdate("CREATE TABLE " + tableName + " (LIKE nation) WITH (partitioning = ARRAY['regionkey'])"); + // Create multiple files per partition + for (int nationKey = 0; nationKey < 25; nationKey++) { + assertUpdate("INSERT INTO " + tableName + " SELECT * FROM tpch.tiny.nation WHERE nationkey = " + nationKey, 1); + } Table icebergTable = loadTable(tableName); - assertThat(icebergTable.currentSnapshot().summary().get("total-equality-deletes")).isEqualTo("0"); - writeEqualityDeleteToNationTable(icebergTable, Optional.of(icebergTable.spec()), Optional.of(new PartitionData(new Long[]{1L}))); + assertThat(icebergTable.currentSnapshot().summary()).containsEntry("total-equality-deletes", "0"); + writeEqualityDeleteToNationTable(icebergTable, Optional.of(icebergTable.spec()), Optional.of(new PartitionData(new Long[] {1L}))); List initialActiveFiles = getActiveFiles(tableName); - query("ALTER TABLE " + tableName + " EXECUTE OPTIMIZE"); + assertUpdate("ALTER TABLE " + tableName + " EXECUTE OPTIMIZE"); assertQuery("SELECT * FROM " + tableName, "SELECT * FROM nation WHERE regionkey != 1"); // nationkey is before the equality delete column in the table schema, comment is after assertQuery("SELECT nationkey, comment FROM " + tableName, "SELECT nationkey, comment FROM nation WHERE regionkey != 1"); - assertThat(loadTable(tableName).currentSnapshot().summary().get("total-equality-deletes")).isEqualTo("0"); + assertThat(loadTable(tableName).currentSnapshot().summary()).containsEntry("total-equality-deletes", "0"); List updatedFiles = getActiveFiles(tableName); assertThat(updatedFiles).doesNotContain(initialActiveFiles.toArray(new String[0])); } @@ -280,16 +363,20 @@ public void 
testOptimizingV2TableDoesntRemoveEqualityDeletesWhenOnlyPartOfTheTab throws Exception { String tableName = "test_optimize_table_with_equality_delete_file_for_different_partition_" + randomNameSuffix(); - assertUpdate("CREATE TABLE " + tableName + " WITH (partitioning = ARRAY['regionkey']) AS SELECT * FROM tpch.tiny.nation", 25); + assertUpdate("CREATE TABLE " + tableName + " (LIKE nation) WITH (partitioning = ARRAY['regionkey'])"); + // Create multiple files per partition + for (int nationKey = 0; nationKey < 25; nationKey++) { + assertUpdate("INSERT INTO " + tableName + " SELECT * FROM tpch.tiny.nation WHERE nationkey = " + nationKey, 1); + } Table icebergTable = loadTable(tableName); - assertThat(icebergTable.currentSnapshot().summary().get("total-equality-deletes")).isEqualTo("0"); + assertThat(icebergTable.currentSnapshot().summary()).containsEntry("total-equality-deletes", "0"); List initialActiveFiles = getActiveFiles(tableName); - writeEqualityDeleteToNationTable(icebergTable, Optional.of(icebergTable.spec()), Optional.of(new PartitionData(new Long[]{1L}))); - query("ALTER TABLE " + tableName + " EXECUTE OPTIMIZE WHERE regionkey != 1"); + writeEqualityDeleteToNationTable(icebergTable, Optional.of(icebergTable.spec()), Optional.of(new PartitionData(new Long[] {1L}))); + assertUpdate("ALTER TABLE " + tableName + " EXECUTE OPTIMIZE WHERE regionkey != 1"); assertQuery("SELECT * FROM " + tableName, "SELECT * FROM nation WHERE regionkey != 1"); // nationkey is before the equality delete column in the table schema, comment is after assertQuery("SELECT nationkey, comment FROM " + tableName, "SELECT nationkey, comment FROM nation WHERE regionkey != 1"); - assertThat(loadTable(tableName).currentSnapshot().summary().get("total-equality-deletes")).isEqualTo("1"); + assertThat(loadTable(tableName).currentSnapshot().summary()).containsEntry("total-equality-deletes", "1"); List updatedFiles = getActiveFiles(tableName); assertThat(updatedFiles).doesNotContain(initialActiveFiles.stream().filter(path -> !path.contains("regionkey=1")).toArray(String[]::new)); } @@ -301,10 +388,215 @@ public void testSelectivelyOptimizingLeavesEqualityDeletes() String tableName = "test_selectively_optimizing_leaves_eq_deletes_" + randomNameSuffix(); assertUpdate("CREATE TABLE " + tableName + " WITH (partitioning = ARRAY['nationkey']) AS SELECT * FROM tpch.tiny.nation", 25); Table icebergTable = loadTable(tableName); - writeEqualityDeleteToNationTable(icebergTable, Optional.of(icebergTable.spec()), Optional.of(new PartitionData(new Long[]{1L}))); - query("ALTER TABLE " + tableName + " EXECUTE OPTIMIZE WHERE nationkey < 5"); + writeEqualityDeleteToNationTable(icebergTable, Optional.of(icebergTable.spec()), Optional.of(new PartitionData(new Long[] {1L}))); + assertUpdate("ALTER TABLE " + tableName + " EXECUTE OPTIMIZE WHERE nationkey < 5"); assertQuery("SELECT * FROM " + tableName, "SELECT * FROM nation WHERE regionkey != 1 OR nationkey != 1"); - assertThat(loadTable(tableName).currentSnapshot().summary().get("total-equality-deletes")).isEqualTo("1"); + assertThat(loadTable(tableName).currentSnapshot().summary()).containsEntry("total-equality-deletes", "1"); + } + + @Test + public void testOptimizePopulateSplitOffsets() + { + // For optimize we need to set task_min_writer_count to 1, otherwise it will create more than one file. 
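+        // With a single writer, OPTIMIZE should produce one Parquet file, so the $files table is expected to report a single split offset of 4 (the first row group starts right after the 4-byte PAR1 magic).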
+ Session session = Session.builder(getSession()) + .setSystemProperty("task_min_writer_count", "1") + .build(); + + try (TestTable table = newTrinoTable("test_optimize_split_offsets", "AS SELECT * FROM tpch.tiny.nation")) { + assertUpdate(session, "ALTER TABLE " + table.getName() + " EXECUTE optimize"); + assertThat(computeActual("SELECT split_offsets FROM \"" + table.getName() + "$files\"")) + .isEqualTo(resultBuilder(getSession(), ImmutableList.of(new ArrayType(BIGINT))) + .row(ImmutableList.of(4L)) + .build()); + } + } + + @Test + public void testMultipleEqualityDeletes() + throws Exception + { + String tableName = "test_multiple_equality_deletes_" + randomNameSuffix(); + assertUpdate("CREATE TABLE " + tableName + " AS SELECT * FROM tpch.tiny.nation", 25); + Table icebergTable = loadTable(tableName); + assertThat(icebergTable.currentSnapshot().summary()).containsEntry("total-equality-deletes", "0"); + + for (int i = 1; i < 3; i++) { + writeEqualityDeleteToNationTable( + icebergTable, + Optional.empty(), + Optional.empty(), + ImmutableMap.of("regionkey", Integer.toUnsignedLong(i))); + } + + assertQuery("SELECT * FROM " + tableName, "SELECT * FROM nation WHERE (regionkey != 1L AND regionkey != 2L)"); + assertUpdate("DROP TABLE " + tableName); + } + + @Test + public void testEqualityDeleteAppliesOnlyToCorrectDataVersion() + throws Exception + { + String tableName = "test_multiple_equality_deletes_" + randomNameSuffix(); + assertUpdate("CREATE TABLE " + tableName + " AS SELECT * FROM tpch.tiny.nation", 25); + Table icebergTable = loadTable(tableName); + assertThat(icebergTable.currentSnapshot().summary().get("total-equality-deletes")).isEqualTo("0"); + + for (int i = 1; i < 3; i++) { + writeEqualityDeleteToNationTable( + icebergTable, + Optional.empty(), + Optional.empty(), + ImmutableMap.of("regionkey", Integer.toUnsignedLong(i))); + } + + assertQuery("SELECT * FROM " + tableName, "SELECT * FROM nation WHERE (regionkey != 1L AND regionkey != 2L)"); + + // Reinsert the data for regionkey = 1. This should insert the data with a larger datasequence number and the delete file should not apply to it anymore. + // Also delete something again so that the split has deletes and the delete logic is activated. 
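+        // The reinserted regionkey = 1 rows have a higher data sequence number than the earlier equality delete, so they become visible again; the query below therefore only misses regionkey 2 and 3.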
+ assertUpdate("INSERT INTO " + tableName + " SELECT * FROM tpch.tiny.nation WHERE regionkey = 1", 5); + writeEqualityDeleteToNationTable( + icebergTable, + Optional.empty(), + Optional.empty(), + ImmutableMap.of("regionkey", Integer.toUnsignedLong(3))); + + assertQuery("SELECT * FROM " + tableName, "SELECT * FROM nation WHERE (regionkey != 2L AND regionkey != 3L)"); + assertUpdate("DROP TABLE " + tableName); + } + + @Test + public void testMultipleEqualityDeletesWithEquivalentSchemas() + throws Exception + { + String tableName = "test_multiple_equality_deletes_equivalent_schemas_" + randomNameSuffix(); + assertUpdate("CREATE TABLE " + tableName + " AS SELECT * FROM tpch.tiny.nation", 25); + Table icebergTable = loadTable(tableName); + assertThat(icebergTable.currentSnapshot().summary()).containsEntry("total-equality-deletes", "0"); + Schema deleteRowSchema = new Schema(ImmutableList.of("regionkey", "name").stream() + .map(name -> icebergTable.schema().findField(name)) + .collect(toImmutableList())); + List equalityFieldIds = ImmutableList.of("regionkey", "name").stream() + .map(name -> deleteRowSchema.findField(name).fieldId()) + .collect(toImmutableList()); + writeEqualityDeleteToNationTableWithDeleteColumns( + icebergTable, + Optional.empty(), + Optional.empty(), + ImmutableMap.of("regionkey", 1L, "name", "BRAZIL"), + deleteRowSchema, + equalityFieldIds); + Schema equivalentDeleteRowSchema = new Schema(ImmutableList.of("name", "regionkey").stream() + .map(name -> icebergTable.schema().findField(name)) + .collect(toImmutableList())); + writeEqualityDeleteToNationTableWithDeleteColumns( + icebergTable, + Optional.empty(), + Optional.empty(), + ImmutableMap.of("name", "INDIA", "regionkey", 2L), + equivalentDeleteRowSchema, + equalityFieldIds); + + assertQuery("SELECT * FROM " + tableName, "SELECT * FROM nation WHERE NOT ((regionkey = 1 AND name = 'BRAZIL') OR (regionkey = 2 AND name = 'INDIA'))"); + assertUpdate("DROP TABLE " + tableName); + } + + @Test + public void testMultipleEqualityDeletesWithDifferentSchemas() + throws Exception + { + String tableName = "test_multiple_equality_deletes_different_schemas_" + randomNameSuffix(); + assertUpdate("CREATE TABLE " + tableName + " AS SELECT * FROM tpch.tiny.nation", 25); + Table icebergTable = loadTable(tableName); + assertThat(icebergTable.currentSnapshot().summary()).containsEntry("total-equality-deletes", "0"); + writeEqualityDeleteToNationTableWithDeleteColumns( + icebergTable, + Optional.empty(), + Optional.empty(), + ImmutableMap.of("regionkey", 1L, "name", "BRAZIL"), + Optional.of(ImmutableList.of("regionkey", "name"))); + writeEqualityDeleteToNationTableWithDeleteColumns( + icebergTable, + Optional.empty(), + Optional.empty(), + ImmutableMap.of("name", "ALGERIA"), + Optional.of(ImmutableList.of("name"))); + writeEqualityDeleteToNationTableWithDeleteColumns( + icebergTable, + Optional.empty(), + Optional.empty(), + ImmutableMap.of("regionkey", 2L), + Optional.of(ImmutableList.of("regionkey"))); + + assertQuery("SELECT * FROM " + tableName, "SELECT * FROM nation WHERE NOT ((regionkey = 1 AND name = 'BRAZIL') OR regionkey = 2 OR name = 'ALGERIA')"); + assertUpdate("DROP TABLE " + tableName); + } + + @Test + public void testEqualityDeletesAcrossPartitions() + throws Exception + { + String tableName = "test_equality_deletes_across_partitions_" + randomNameSuffix(); + assertUpdate("CREATE TABLE " + tableName + " WITH (partitioning = ARRAY['partition']) AS SELECT 'part_1' as partition, * FROM tpch.tiny.nation", 25); + assertUpdate("INSERT 
INTO " + tableName + " SELECT 'part_2' as partition, * FROM tpch.tiny.nation", 25); + Table icebergTable = loadTable(tableName); + PartitionData partitionData1 = PartitionData.fromJson("{\"partitionValues\":[\"part_1\"]}", new Type[] {Types.StringType.get()}); + PartitionData partitionData2 = PartitionData.fromJson("{\"partitionValues\":[\"part_2\"]}", new Type[] {Types.StringType.get()}); + writeEqualityDeleteToNationTableWithDeleteColumns( + icebergTable, + Optional.of(icebergTable.spec()), + Optional.of(partitionData1), + ImmutableMap.of("regionkey", 1L), + Optional.of(ImmutableList.of("regionkey"))); + // Delete from both partitions so internal code doesn't skip all deletion logic for second partition invalidating this test + writeEqualityDeleteToNationTableWithDeleteColumns( + icebergTable, + Optional.of(icebergTable.spec()), + Optional.of(partitionData2), + ImmutableMap.of("regionkey", 2L), + Optional.of(ImmutableList.of("regionkey"))); + + assertQuery("SELECT * FROM " + tableName, "SELECT 'part_1', * FROM nation WHERE regionkey <> 1 UNION ALL select 'part_2', * FROM NATION where regionkey <> 2"); + assertUpdate("DROP TABLE " + tableName); + } + + @Test + public void testMultipleEqualityDeletesWithNestedFields() + throws Exception + { + String tableName = "test_multiple_equality_deletes_nested_fields_" + randomNameSuffix(); + assertUpdate("CREATE TABLE " + tableName + " ( id BIGINT, root ROW(nested BIGINT, nested_other BIGINT))"); + assertUpdate("INSERT INTO " + tableName + " VALUES (1, row(10, 100))", 1); + assertUpdate("INSERT INTO " + tableName + " VALUES (2, row(20, 200))", 1); + assertUpdate("INSERT INTO " + tableName + " VALUES (2, row(20, 200))", 1); + Table icebergTable = loadTable(tableName); + assertThat(icebergTable.currentSnapshot().summary()).containsEntry("total-equality-deletes", "0"); + + List deleteFileColumns = ImmutableList.of("root.nested"); + Schema deleteRowSchema = icebergTable.schema().select(deleteFileColumns); + List equalityFieldIds = ImmutableList.of("root.nested").stream() + .map(name -> deleteRowSchema.findField(name).fieldId()) + .collect(toImmutableList()); + Types.StructType nestedStructType = (Types.StructType) deleteRowSchema.findField("root").type(); + Record nestedStruct = GenericRecord.create(nestedStructType); + nestedStruct.setField("nested", 20L); + for (int i = 1; i < 3; i++) { + writeEqualityDeleteToNationTableWithDeleteColumns( + icebergTable, + Optional.empty(), + Optional.empty(), + ImmutableMap.of("root", nestedStruct), + deleteRowSchema, + equalityFieldIds); + } + + assertThat(query("SELECT * FROM " + tableName)) + .matches("VALUES (BIGINT '1', CAST(row(10, 100) AS ROW(nested BIGINT, nested_other BIGINT)))"); + + // verify that the equality delete is effective also when not specifying the corresponding column in the projection list + assertThat(query("SELECT id FROM " + tableName)) + .matches("VALUES BIGINT '1'"); + + assertUpdate("DROP TABLE " + tableName); } @Test @@ -314,10 +606,10 @@ public void testOptimizingWholeTableRemovesEqualityDeletes() String tableName = "test_optimizing_whole_table_removes_eq_deletes_" + randomNameSuffix(); assertUpdate("CREATE TABLE " + tableName + " WITH (partitioning = ARRAY['nationkey']) AS SELECT * FROM tpch.tiny.nation", 25); Table icebergTable = loadTable(tableName); - writeEqualityDeleteToNationTable(icebergTable, Optional.of(icebergTable.spec()), Optional.of(new PartitionData(new Long[]{1L}))); - query("ALTER TABLE " + tableName + " EXECUTE OPTIMIZE"); + 
writeEqualityDeleteToNationTable(icebergTable, Optional.of(icebergTable.spec()), Optional.of(new PartitionData(new Long[] {1L}))); + assertUpdate("ALTER TABLE " + tableName + " EXECUTE OPTIMIZE"); assertQuery("SELECT * FROM " + tableName, "SELECT * FROM nation WHERE regionkey != 1 OR nationkey != 1"); - assertThat(loadTable(tableName).currentSnapshot().summary().get("total-equality-deletes")).isEqualTo("0"); + assertThat(loadTable(tableName).currentSnapshot().summary()).containsEntry("total-equality-deletes", "0"); } @Test @@ -327,14 +619,14 @@ public void testOptimizingV2TableWithEmptyPartitionSpec() String tableName = "test_optimize_table_with_global_equality_delete_file_" + randomNameSuffix(); assertUpdate("CREATE TABLE " + tableName + " AS SELECT * FROM tpch.tiny.nation", 25); Table icebergTable = loadTable(tableName); - assertThat(icebergTable.currentSnapshot().summary().get("total-equality-deletes")).isEqualTo("0"); + assertThat(icebergTable.currentSnapshot().summary()).containsEntry("total-equality-deletes", "0"); writeEqualityDeleteToNationTable(icebergTable); List initialActiveFiles = getActiveFiles(tableName); - query("ALTER TABLE " + tableName + " EXECUTE OPTIMIZE"); + assertUpdate("ALTER TABLE " + tableName + " EXECUTE OPTIMIZE"); assertQuery("SELECT * FROM " + tableName, "SELECT * FROM nation WHERE regionkey != 1"); // nationkey is before the equality delete column in the table schema, comment is after assertQuery("SELECT nationkey, comment FROM " + tableName, "SELECT nationkey, comment FROM nation WHERE regionkey != 1"); - assertThat(loadTable(tableName).currentSnapshot().summary().get("total-equality-deletes")).isEqualTo("0"); + assertThat(loadTable(tableName).currentSnapshot().summary()).containsEntry("total-equality-deletes", "0"); List updatedFiles = getActiveFiles(tableName); assertThat(updatedFiles).doesNotContain(initialActiveFiles.toArray(new String[0])); } @@ -344,113 +636,50 @@ public void testOptimizingPartitionsOfV2TableWithGlobalEqualityDeleteFile() throws Exception { String tableName = "test_optimize_partitioned_table_with_global_equality_delete_file_" + randomNameSuffix(); - assertUpdate("CREATE TABLE " + tableName + " WITH (partitioning = ARRAY['regionkey']) AS SELECT * FROM tpch.tiny.nation", 25); + assertUpdate("CREATE TABLE " + tableName + " (LIKE nation) WITH (partitioning = ARRAY['regionkey'])"); + // Create multiple files per partition + for (int nationKey = 0; nationKey < 25; nationKey++) { + assertUpdate("INSERT INTO " + tableName + " SELECT * FROM tpch.tiny.nation WHERE nationkey = " + nationKey, 1); + } Table icebergTable = loadTable(tableName); - assertThat(icebergTable.currentSnapshot().summary().get("total-equality-deletes")).isEqualTo("0"); - writeEqualityDeleteToNationTable(icebergTable, Optional.of(icebergTable.spec()), Optional.of(new PartitionData(new Long[]{1L}))); + assertThat(icebergTable.currentSnapshot().summary()).containsEntry("total-equality-deletes", "0"); + writeEqualityDeleteToNationTable(icebergTable, Optional.of(icebergTable.spec()), Optional.of(new PartitionData(new Long[] {1L}))); List initialActiveFiles = getActiveFiles(tableName); assertQuery("SELECT * FROM " + tableName, "SELECT * FROM nation WHERE regionkey != 1"); - query("ALTER TABLE " + tableName + " EXECUTE OPTIMIZE WHERE regionkey != 1"); + assertUpdate("ALTER TABLE " + tableName + " EXECUTE OPTIMIZE WHERE regionkey != 1"); assertQuery("SELECT * FROM " + tableName, "SELECT * FROM nation WHERE regionkey != 1"); // nationkey is before the equality delete column in the 
table schema, comment is after assertQuery("SELECT nationkey, comment FROM " + tableName, "SELECT nationkey, comment FROM nation WHERE regionkey != 1"); - assertThat(loadTable(tableName).currentSnapshot().summary().get("total-equality-deletes")).isEqualTo("1"); + assertThat(loadTable(tableName).currentSnapshot().summary()).containsEntry("total-equality-deletes", "1"); List updatedFiles = getActiveFiles(tableName); assertThat(updatedFiles) .doesNotContain(initialActiveFiles.stream() - .filter(path -> !path.contains("regionkey=1")) - .toArray(String[]::new)); + .filter(path -> !path.contains("regionkey=1")) + .toArray(String[]::new)); } @Test - public void testOptimizeDuringWriteOperations() + public void testOptimizingWholeTableRemovesDeleteFiles() throws Exception { - runOptimizeDuringWriteOperations(true); - runOptimizeDuringWriteOperations(false); - } + try (TestTable testTable = newTrinoTable("test_optimize_removes_obsolete_delete_files_", "AS SELECT * FROM tpch.tiny.nation")) { + assertUpdate("DELETE FROM " + testTable.getName() + " WHERE regionkey % 2 = 0", 15); + Table icebergTable = loadTable(testTable.getName()); + writeEqualityDeleteToNationTable(icebergTable, Optional.of(icebergTable.spec()), Optional.of(new PartitionData(new Long[] {1L}))); - private void runOptimizeDuringWriteOperations(boolean useSmallFiles) - throws Exception - { - int threads = 5; - int deletionThreads = threads - 1; - int rows = 20; - int rowsPerThread = rows / deletionThreads; - - CyclicBarrier barrier = new CyclicBarrier(threads); - ExecutorService executor = newFixedThreadPool(threads); - - // Slow down the delete operations so optimize is more likely to complete - String blackholeTable = "blackhole_table_" + randomNameSuffix(); - assertUpdate("CREATE TABLE blackhole.default.%s (a INT, b INT) WITH (split_count = 1, pages_per_split = 1, rows_per_page = 1, page_processing_delay = '1s')".formatted(blackholeTable)); - - try (TestTable table = new TestTable( - getQueryRunner()::execute, - "test_optimize_during_write_operations", - "(int_col INT)")) { - String tableName = table.getName(); - - // Testing both situations where a file is fully removed by the delete operation and when a row level delete is required. 
- if (useSmallFiles) { - for (int i = 0; i < rows; i++) { - assertUpdate(format("INSERT INTO %s VALUES %s", tableName, i), 1); - } - } - else { - String values = IntStream.range(0, rows).mapToObj(String::valueOf).collect(Collectors.joining(", ")); - assertUpdate(format("INSERT INTO %s VALUES %s", tableName, values), rows); - } + assertThat(query("SELECT * FROM " + testTable.getName())) + .matches("SELECT * FROM nation WHERE regionkey != 1 AND regionkey % 2 = 1"); - List>> deletionFutures = IntStream.range(0, deletionThreads) - .mapToObj(threadNumber -> executor.submit(() -> { - barrier.await(10, SECONDS); - List successfulDeletes = new ArrayList<>(); - for (int i = 0; i < rowsPerThread; i++) { - try { - int rowNumber = threadNumber * rowsPerThread + i; - getQueryRunner().execute(format("DELETE FROM %s WHERE int_col = %s OR ((SELECT count(*) FROM blackhole.default.%s) > 42)", tableName, rowNumber, blackholeTable)); - successfulDeletes.add(true); - } - catch (RuntimeException e) { - successfulDeletes.add(false); - } - } - return successfulDeletes; - })) - .collect(toImmutableList()); - - Future optimizeFuture = executor.submit(() -> { - try { - barrier.await(10, SECONDS); - // Allow for some deletes to start before running optimize - Thread.sleep(50); - assertUpdate("ALTER TABLE %s EXECUTE optimize".formatted(tableName)); - } - catch (Exception e) { - throw new RuntimeException(e); - } - }); - - List expectedValues = new ArrayList<>(); - for (int threadNumber = 0; threadNumber < deletionThreads; threadNumber++) { - List deleteOutcomes = deletionFutures.get(threadNumber).get(); - verify(deleteOutcomes.size() == rowsPerThread); - for (int rowNumber = 0; rowNumber < rowsPerThread; rowNumber++) { - boolean successfulDelete = deleteOutcomes.get(rowNumber); - if (!successfulDelete) { - expectedValues.add(String.valueOf(threadNumber * rowsPerThread + rowNumber)); - } - } - } + assertQuery("SELECT count(*) FROM \"" + testTable.getName() + "$files\" WHERE content = " + POSITION_DELETES.id(), "VALUES 1"); + assertQuery("SELECT count(*) FROM \"" + testTable.getName() + "$files\" WHERE content = " + EQUALITY_DELETES.id(), "VALUES 1"); - optimizeFuture.get(); - assertThat(expectedValues.size()).isGreaterThan(0).isLessThan(rows); - assertQuery("SELECT * FROM " + tableName, "VALUES " + String.join(", ", expectedValues)); - } - finally { - executor.shutdownNow(); - executor.awaitTermination(10, SECONDS); + assertQuerySucceeds("ALTER TABLE " + testTable.getName() + " EXECUTE OPTIMIZE"); + + assertQuery("SELECT count(*) FROM \"" + testTable.getName() + "$files\" WHERE content = " + POSITION_DELETES.id(), "VALUES 0"); + assertQuery("SELECT count(*) FROM \"" + testTable.getName() + "$files\" WHERE content = " + EQUALITY_DELETES.id(), "VALUES 0"); + + assertThat(query("SELECT * FROM " + testTable.getName())) + .matches("SELECT * FROM nation WHERE regionkey != 1 AND regionkey % 2 = 1"); } } @@ -459,9 +688,9 @@ public void testUpgradeTableToV2FromTrino() { String tableName = "test_upgrade_table_to_v2_from_trino_" + randomNameSuffix(); assertUpdate("CREATE TABLE " + tableName + " WITH (format_version = 1) AS SELECT * FROM tpch.tiny.nation", 25); - assertEquals(loadTable(tableName).operations().current().formatVersion(), 1); + assertThat(formatVersion(loadTable(tableName))).isEqualTo(1); assertUpdate("ALTER TABLE " + tableName + " SET PROPERTIES format_version = 2"); - assertEquals(loadTable(tableName).operations().current().formatVersion(), 2); + assertThat(formatVersion(loadTable(tableName))).isEqualTo(2); 
assertQuery("SELECT * FROM " + tableName, "SELECT * FROM nation"); } @@ -470,8 +699,9 @@ public void testDowngradingV2TableToV1Fails() { String tableName = "test_downgrading_v2_table_to_v1_fails_" + randomNameSuffix(); assertUpdate("CREATE TABLE " + tableName + " WITH (format_version = 2) AS SELECT * FROM tpch.tiny.nation", 25); - assertEquals(loadTable(tableName).operations().current().formatVersion(), 2); - assertThatThrownBy(() -> query("ALTER TABLE " + tableName + " SET PROPERTIES format_version = 1")) + assertThat(formatVersion(loadTable(tableName))).isEqualTo(2); + assertThat(query("ALTER TABLE " + tableName + " SET PROPERTIES format_version = 1")) + .failure() .hasMessage("Failed to set new property values") .rootCause() .hasMessage("Cannot downgrade v2 table to v1"); @@ -482,9 +712,9 @@ public void testUpgradingToInvalidVersionFails() { String tableName = "test_upgrading_to_invalid_version_fails_" + randomNameSuffix(); assertUpdate("CREATE TABLE " + tableName + " WITH (format_version = 2) AS SELECT * FROM tpch.tiny.nation", 25); - assertEquals(loadTable(tableName).operations().current().formatVersion(), 2); - assertThatThrownBy(() -> query("ALTER TABLE " + tableName + " SET PROPERTIES format_version = 42")) - .hasMessage("Unable to set catalog 'iceberg' table property 'format_version' to [42]: format_version must be between 1 and 2"); + assertThat(formatVersion(loadTable(tableName))).isEqualTo(2); + assertThat(query("ALTER TABLE " + tableName + " SET PROPERTIES format_version = 42")) + .failure().hasMessage("line 1:79: Unable to set catalog 'iceberg' table property 'format_version' to [42]: format_version must be between 1 and 2"); } @Test @@ -493,23 +723,23 @@ public void testUpdatingAllTableProperties() String tableName = "test_updating_all_table_properties_" + randomNameSuffix(); assertUpdate("CREATE TABLE " + tableName + " WITH (format_version = 1, format = 'ORC') AS SELECT * FROM tpch.tiny.nation", 25); BaseTable table = loadTable(tableName); - assertEquals(table.operations().current().formatVersion(), 1); - assertTrue(table.properties().get(TableProperties.DEFAULT_FILE_FORMAT).equalsIgnoreCase("ORC")); - assertTrue(table.spec().isUnpartitioned()); + assertThat(formatVersion(table)).isEqualTo(1); + assertThat(table.properties().get(TableProperties.DEFAULT_FILE_FORMAT).equalsIgnoreCase("ORC")).isTrue(); + assertThat(table.spec().isUnpartitioned()).isTrue(); assertUpdate("ALTER TABLE " + tableName + " SET PROPERTIES format_version = 2, partitioning = ARRAY['regionkey'], format = 'PARQUET', sorted_by = ARRAY['comment']"); table = loadTable(tableName); - assertEquals(table.operations().current().formatVersion(), 2); - assertTrue(table.properties().get(TableProperties.DEFAULT_FILE_FORMAT).equalsIgnoreCase("PARQUET")); - assertTrue(table.spec().isPartitioned()); + assertThat(formatVersion(table)).isEqualTo(2); + assertThat(table.properties().get(TableProperties.DEFAULT_FILE_FORMAT).equalsIgnoreCase("PARQUET")).isTrue(); + assertThat(table.spec().isPartitioned()).isTrue(); List partitionFields = table.spec().fields(); assertThat(partitionFields).hasSize(1); - assertEquals(partitionFields.get(0).name(), "regionkey"); - assertTrue(partitionFields.get(0).transform().isIdentity()); - assertTrue(table.sortOrder().isSorted()); + assertThat(partitionFields.get(0).name()).isEqualTo("regionkey"); + assertThat(partitionFields.get(0).transform().isIdentity()).isTrue(); + assertThat(table.sortOrder().isSorted()).isTrue(); List sortFields = table.sortOrder().fields(); - 
assertEquals(sortFields.size(), 1); - assertEquals(getOnlyElement(sortFields).sourceId(), table.schema().findField("comment").fieldId()); + assertThat(sortFields).hasSize(1); + assertThat(getOnlyElement(sortFields).sourceId()).isEqualTo(table.schema().findField("comment").fieldId()); assertQuery("SELECT * FROM " + tableName, "SELECT * FROM nation"); } @@ -520,20 +750,20 @@ public void testUnsettingAllTableProperties() assertUpdate("CREATE TABLE " + tableName + " WITH (format_version = 1, format = 'PARQUET', partitioning = ARRAY['regionkey'], sorted_by = ARRAY['comment']) " + "AS SELECT * FROM tpch.tiny.nation", 25); BaseTable table = loadTable(tableName); - assertEquals(table.operations().current().formatVersion(), 1); - assertTrue(table.properties().get(TableProperties.DEFAULT_FILE_FORMAT).equalsIgnoreCase("PARQUET")); - assertTrue(table.spec().isPartitioned()); + assertThat(formatVersion(table)).isEqualTo(1); + assertThat(table.properties().get(TableProperties.DEFAULT_FILE_FORMAT).equalsIgnoreCase("PARQUET")).isTrue(); + assertThat(table.spec().isPartitioned()).isTrue(); List partitionFields = table.spec().fields(); assertThat(partitionFields).hasSize(1); - assertEquals(partitionFields.get(0).name(), "regionkey"); - assertTrue(partitionFields.get(0).transform().isIdentity()); + assertThat(partitionFields.get(0).name()).isEqualTo("regionkey"); + assertThat(partitionFields.get(0).transform().isIdentity()).isTrue(); assertUpdate("ALTER TABLE " + tableName + " SET PROPERTIES format_version = DEFAULT, format = DEFAULT, partitioning = DEFAULT, sorted_by = DEFAULT"); table = loadTable(tableName); - assertEquals(table.operations().current().formatVersion(), 2); - assertTrue(table.properties().get(TableProperties.DEFAULT_FILE_FORMAT).equalsIgnoreCase("PARQUET")); - assertTrue(table.spec().isUnpartitioned()); - assertTrue(table.sortOrder().isUnsorted()); + assertThat(formatVersion(table)).isEqualTo(2); + assertThat(table.properties().get(TableProperties.DEFAULT_FILE_FORMAT).equalsIgnoreCase("PARQUET")).isTrue(); + assertThat(table.spec().isUnpartitioned()).isTrue(); + assertThat(table.sortOrder().isUnsorted()).isTrue(); assertQuery("SELECT * FROM " + tableName, "SELECT * FROM nation"); } @@ -594,7 +824,7 @@ public void testDeletingEntireFileWithMultipleSplits() long initialSnapshotId = (long) computeScalar("SELECT snapshot_id FROM \"" + tableName + "$snapshots\" ORDER BY committed_at DESC FETCH FIRST 1 ROW WITH TIES"); assertUpdate("DELETE FROM " + tableName + " WHERE regionkey < 10", 25); long parentSnapshotId = (long) computeScalar("SELECT parent_id FROM \"" + tableName + "$snapshots\" ORDER BY committed_at DESC FETCH FIRST 1 ROW WITH TIES"); - assertEquals(initialSnapshotId, parentSnapshotId); + assertThat(initialSnapshotId).isEqualTo(parentSnapshotId); assertThat(query("SELECT * FROM " + tableName)).returnsEmptyResult(); assertThat(this.loadTable(tableName).newScan().planFiles()).hasSize(1); } @@ -610,7 +840,7 @@ public void testMultipleDeletes() long initialSnapshotId = (long) computeScalar("SELECT snapshot_id FROM \"" + tableName + "$snapshots\" ORDER BY committed_at DESC FETCH FIRST 1 ROW WITH TIES"); assertUpdate("DELETE FROM " + tableName + " WHERE regionkey % 2 = 1", "SELECT count(*) FROM nation WHERE regionkey % 2 = 1"); long parentSnapshotId = (long) computeScalar("SELECT parent_id FROM \"" + tableName + "$snapshots\" ORDER BY committed_at DESC FETCH FIRST 1 ROW WITH TIES"); - assertEquals(initialSnapshotId, parentSnapshotId); + 
assertThat(initialSnapshotId).isEqualTo(parentSnapshotId); assertUpdate("DELETE FROM " + tableName + " WHERE regionkey % 2 = 0", "SELECT count(*) FROM nation WHERE regionkey % 2 = 0"); assertThat(query("SELECT * FROM " + tableName)).returnsEmptyResult(); @@ -625,10 +855,10 @@ public void testDeletingEntirePartitionedTable() assertThat(this.loadTable(tableName).newScan().planFiles()).hasSize(5); assertUpdate("DELETE FROM " + tableName + " WHERE regionkey < 10", "SELECT count(*) FROM nation WHERE regionkey < 10"); - assertThat(this.loadTable(tableName).newScan().planFiles()).hasSize(0); + assertThat(this.loadTable(tableName).newScan().planFiles()).isEmpty(); assertUpdate("DELETE FROM " + tableName + " WHERE regionkey < 10"); assertThat(query("SELECT * FROM " + tableName)).returnsEmptyResult(); - assertThat(this.loadTable(tableName).newScan().planFiles()).hasSize(0); + assertThat(this.loadTable(tableName).newScan().planFiles()).isEmpty(); } @Test @@ -636,7 +866,7 @@ public void testFilesTable() throws Exception { String tableName = "test_files_table_" + randomNameSuffix(); - String tableLocation = metastoreDir.getPath() + "/" + tableName; + String tableLocation = "local:///" + tableName; assertUpdate("CREATE TABLE " + tableName + " WITH (location = '" + tableLocation + "', format_version = 2) AS SELECT * FROM tpch.tiny.nation", 25); BaseTable table = loadTable(tableName); Metrics metrics = new Metrics( @@ -657,8 +887,6 @@ public void testFilesTable() .withEncryptionKeyMetadata(ByteBuffer.wrap("Trino".getBytes(UTF_8))) .build(); table.newAppend().appendFile(dataFile).commit(); - // TODO Currently, Trino does not include equality delete files stats in the $files table. - // Once it is fixed by https://github.com/trinodb/trino/pull/16232, include equality delete output in the test. writeEqualityDeleteToNationTable(table); assertQuery( "SELECT " + @@ -680,14 +908,14 @@ public void testFilesTable() (0, 'PARQUET', 25L, - JSON '{"1":141,"2":220,"3":99,"4":807}', + JSON '{"1":137,"2":216,"3":91,"4":801}', JSON '{"1":25,"2":25,"3":25,"4":25}', jSON '{"1":0,"2":0,"3":0,"4":0}', jSON '{}', JSON '{"1":"0","2":"ALGERIA","3":"0","4":" haggle. 
careful"}', JSON '{"1":"24","2":"VIETNAM","3":"4","4":"y final packaget"}', null, - null, + ARRAY[4L], null), (0, 'ORC', @@ -700,47 +928,204 @@ public void testFilesTable() JSON '{"1":"4"}', X'54 72 69 6e 6f', ARRAY[4L], - null) + null), + (2, + 'PARQUET', + 1L, + JSON '{"3":52}', + JSON '{"3":1}', + JSON '{"3":0}', + JSON '{}', + JSON '{"3":"1"}', + JSON '{"3":"1"}', + null, + ARRAY[4], + ARRAY[3]) """); } @Test public void testStatsFilePruning() { - try (TestTable testTable = new TestTable(getQueryRunner()::execute, "test_stats_file_pruning_", "(a INT, b INT) WITH (partitioning = ARRAY['b'])")) { + try (TestTable testTable = newTrinoTable("test_stats_file_pruning_", "(a INT, b INT) WITH (partitioning = ARRAY['b'])")) { assertUpdate("INSERT INTO " + testTable.getName() + " VALUES (1, 10), (10, 10)", 2); assertUpdate("INSERT INTO " + testTable.getName() + " VALUES (200, 10), (300, 20)", 2); Optional snapshotId = Optional.of((long) computeScalar("SELECT snapshot_id FROM \"" + testTable.getName() + "$snapshots\" ORDER BY committed_at DESC FETCH FIRST 1 ROW WITH TIES")); TypeManager typeManager = new TestingTypeManager(); Table table = loadTable(testTable.getName()); - TableStatistics withNoFilter = TableStatisticsReader.makeTableStatistics(typeManager, table, snapshotId, TupleDomain.all(), TupleDomain.all(), true); - assertEquals(withNoFilter.getRowCount().getValue(), 4.0); + TableStatistics withNoFilter = TableStatisticsReader.makeTableStatistics( + typeManager, + table, + snapshotId, + TupleDomain.all(), + TupleDomain.all(), + ImmutableSet.of(), + true, + newDirectExecutorService(), + fileSystemFactory.create(SESSION)); + assertThat(withNoFilter.getRowCount().getValue()).isEqualTo(4.0); TableStatistics withPartitionFilter = TableStatisticsReader.makeTableStatistics( typeManager, table, snapshotId, TupleDomain.withColumnDomains(ImmutableMap.of( - new IcebergColumnHandle(ColumnIdentity.primitiveColumnIdentity(1, "b"), INTEGER, ImmutableList.of(), INTEGER, Optional.empty()), + IcebergColumnHandle.optional(ColumnIdentity.primitiveColumnIdentity(2, "b")).columnType(INTEGER).build(), Domain.singleValue(INTEGER, 10L))), TupleDomain.all(), - true); - assertEquals(withPartitionFilter.getRowCount().getValue(), 3.0); + ImmutableSet.of(), + true, + newDirectExecutorService(), + fileSystemFactory.create(SESSION)); + assertThat(withPartitionFilter.getRowCount().getValue()).isEqualTo(3.0); + IcebergColumnHandle column = IcebergColumnHandle.optional(ColumnIdentity.primitiveColumnIdentity(1, "a")).columnType(INTEGER).build(); TableStatistics withUnenforcedFilter = TableStatisticsReader.makeTableStatistics( typeManager, table, snapshotId, TupleDomain.all(), TupleDomain.withColumnDomains(ImmutableMap.of( - new IcebergColumnHandle(ColumnIdentity.primitiveColumnIdentity(0, "a"), INTEGER, ImmutableList.of(), INTEGER, Optional.empty()), + column, Domain.create(ValueSet.ofRanges(Range.greaterThan(INTEGER, 100L)), true))), - true); - assertEquals(withUnenforcedFilter.getRowCount().getValue(), 2.0); + ImmutableSet.of(column), + true, + newDirectExecutorService(), + fileSystemFactory.create(SESSION)); + assertThat(withUnenforcedFilter.getRowCount().getValue()).isEqualTo(2.0); + } + } + + @Test + public void testColumnStatsPruning() + { + try (TestTable testTable = newTrinoTable("test_column_stats_pruning_", "(a INT, b INT) WITH (partitioning = ARRAY['b'])")) { + assertUpdate("INSERT INTO " + testTable.getName() + " VALUES (1, 10), (10, 10)", 2); + assertUpdate("INSERT INTO " + testTable.getName() + " VALUES (200, 
10), (300, 20)", 2); + + Optional snapshotId = Optional.of((long) computeScalar("SELECT snapshot_id FROM \"" + testTable.getName() + "$snapshots\" ORDER BY committed_at DESC FETCH FIRST 1 ROW WITH TIES")); + TypeManager typeManager = new TestingTypeManager(); + Table table = loadTable(testTable.getName()); + TableStatistics withNoProjectedColumns = TableStatisticsReader.makeTableStatistics( + typeManager, + table, + snapshotId, + TupleDomain.all(), + TupleDomain.all(), + ImmutableSet.of(), + true, + newDirectExecutorService(), + fileSystemFactory.create(SESSION)); + assertThat(withNoProjectedColumns.getRowCount().getValue()).isEqualTo(4.0); + assertThat(withNoProjectedColumns.getColumnStatistics()).isEmpty(); + + IcebergColumnHandle column = IcebergColumnHandle.optional(ColumnIdentity.primitiveColumnIdentity(1, "a")).columnType(INTEGER).build(); + TableStatistics withProjectedColumns = TableStatisticsReader.makeTableStatistics( + typeManager, + table, + snapshotId, + TupleDomain.all(), + TupleDomain.all(), + ImmutableSet.of(column), + true, + newDirectExecutorService(), + fileSystemFactory.create(SESSION)); + assertThat(withProjectedColumns.getRowCount().getValue()).isEqualTo(4.0); + assertThat(withProjectedColumns.getColumnStatistics()).containsOnlyKeys(column); + assertThat(withProjectedColumns.getColumnStatistics().get(column)) + .isEqualTo(ColumnStatistics.builder() + .setNullsFraction(Estimate.zero()) + .setDistinctValuesCount(Estimate.of(4.0)) + .setRange(new DoubleRange(1.0, 300.0)) + .build()); + + TableStatistics withPartitionFilterAndProjectedColumn = TableStatisticsReader.makeTableStatistics( + typeManager, + table, + snapshotId, + TupleDomain.all(), + TupleDomain.withColumnDomains(ImmutableMap.of( + IcebergColumnHandle.optional(ColumnIdentity.primitiveColumnIdentity(2, "b")).columnType(INTEGER).build(), + Domain.singleValue(INTEGER, 10L))), + ImmutableSet.of(column), + true, + newDirectExecutorService(), + fileSystemFactory.create(SESSION)); + assertThat(withPartitionFilterAndProjectedColumn.getRowCount().getValue()).isEqualTo(3.0); + assertThat(withPartitionFilterAndProjectedColumn.getColumnStatistics()).containsOnlyKeys(column); + assertThat(withPartitionFilterAndProjectedColumn.getColumnStatistics().get(column)) + .isEqualTo(ColumnStatistics.builder() + .setNullsFraction(Estimate.zero()) + .setDistinctValuesCount(Estimate.of(4.0)) + .setRange(new DoubleRange(1.0, 200.0)) + .build()); } } + @Test + public void testInt96TimestampWithTimeZone() + { + assertUpdate("CREATE TABLE hive.tpch.test_timestamptz_base (t timestamp) WITH (format = 'PARQUET')"); + assertUpdate("INSERT INTO hive.tpch.test_timestamptz_base (t) VALUES (timestamp '2022-07-26 12:13')", 1); + + // Writing TIMESTAMP WITH LOCAL TIME ZONE is not supported, so we first create Parquet object by writing unzoned + // timestamp (which is converted to UTC using default timezone) and then creating another table that reads from the same file. 
+ String tableLocation = metastore.getTable("tpch", "test_timestamptz_base").orElseThrow().getStorage().getLocation(); + + // TIMESTAMP WITH LOCAL TIME ZONE is not mapped to any Trino type, so we need to create the metastore entry manually + metastore.createTable( + new io.trino.metastore.Table( + "tpch", + "test_timestamptz", + Optional.of("hive"), + "EXTERNAL_TABLE", + new Storage( + HiveStorageFormat.PARQUET.toStorageFormat(), + Optional.of(tableLocation), + Optional.empty(), + false, + ImmutableMap.of()), + List.of(new Column("t", HiveType.HIVE_TIMESTAMPLOCALTZ, Optional.empty(), Map.of())), + List.of(), + ImmutableMap.of(), + Optional.empty(), + Optional.empty(), + OptionalLong.empty()), + PrincipalPrivileges.fromHivePrivilegeInfos(ImmutableSet.of())); + + assertThat(query("SELECT * FROM hive.tpch.test_timestamptz")) + .matches("VALUES TIMESTAMP '2022-07-26 17:13:00.000 UTC'"); + + String path = (String) computeScalar("SELECT \"$path\" FROM hive.tpch.test_timestamptz_base"); + long size = (long) computeScalar("SELECT \"$file_size\" FROM hive.tpch.test_timestamptz_base"); + + // Read a Parquet file from Iceberg connector + assertUpdate("CREATE TABLE iceberg.tpch.test_timestamptz_migrated(t TIMESTAMP(6) WITH TIME ZONE)"); + + BaseTable table = loadTable("test_timestamptz_migrated"); + + table.updateProperties() + .set(DEFAULT_NAME_MAPPING, toJson(MappingUtil.create(table.schema()))) + .commit(); + + table.newAppend() + .appendFile(DataFiles.builder(table.spec()) + .withPath(path) + .withFormat(PARQUET) + .withFileSizeInBytes(size) + .withRecordCount(1) + .build()) + .commit(); + + assertThat(query("SELECT * FROM iceberg.tpch.test_timestamptz_migrated")) + .matches("VALUES TIMESTAMP '2022-07-26 17:13:00.000000 UTC'"); + + assertUpdate("DROP TABLE hive.tpch.test_timestamptz_base"); + assertUpdate("DROP TABLE hive.tpch.test_timestamptz"); + assertUpdate("DROP TABLE iceberg.tpch.test_timestamptz_migrated"); + } + @Test public void testSnapshotReferenceSystemTable() { @@ -784,6 +1169,397 @@ public void testSnapshotReferenceSystemTable() "('main', 'BRANCH', " + snapshotId3 + ", null, null, null)"); } + @Test + public void testReadingSnapshotReference() + { + String tableName = "test_reading_snapshot_reference" + randomNameSuffix(); + assertUpdate("CREATE TABLE " + tableName + " WITH (partitioning = ARRAY['regionkey']) AS SELECT * FROM tpch.tiny.nation", 25); + Table icebergTable = loadTable(tableName); + long refSnapshotId = icebergTable.currentSnapshot().snapshotId(); + icebergTable.manageSnapshots() + .createTag("test-tag", refSnapshotId) + .createBranch("test-branch", refSnapshotId) + .commit(); + assertQuery("SELECT * FROM \"" + tableName + "$refs\"", + "VALUES ('test-tag', 'TAG', " + refSnapshotId + ", null, null, null)," + + "('test-branch', 'BRANCH', " + refSnapshotId + ", null, null, null)," + + "('main', 'BRANCH', " + refSnapshotId + ", null, null, null)"); + + assertUpdate("INSERT INTO " + tableName + " SELECT * FROM tpch.tiny.nation LIMIT 5", 5); + assertQuery("SELECT * FROM " + tableName + " FOR VERSION AS OF " + refSnapshotId, + "SELECT * FROM nation"); + assertQuery("SELECT * FROM " + tableName + " FOR VERSION AS OF 'test-tag'", + "SELECT * FROM nation"); + assertQuery("SELECT * FROM " + tableName + " FOR VERSION AS OF 'test-branch'", + "SELECT * FROM nation"); + + assertQueryFails("SELECT * FROM " + tableName + " FOR VERSION AS OF 'test-wrong-ref'", + ".*?Cannot find snapshot with reference name: test-wrong-ref"); + assertQueryFails("SELECT * FROM " + tableName + " FOR VERSION 
AS OF 'TEST-TAG'", + ".*?Cannot find snapshot with reference name: TEST-TAG"); + } + + @Test + public void testNestedFieldPartitioning() + { + String tableName = "test_nested_field_partitioning_" + randomNameSuffix(); + assertUpdate("CREATE TABLE " + tableName + " (id INT, district ROW(name VARCHAR), state ROW(name VARCHAR)) WITH (partitioning = ARRAY['\"state.name\"'])"); + + assertUpdate( + "INSERT INTO " + tableName + " VALUES " + + "(1, ROW('Patna'), ROW('BH')), " + + "(2, ROW('Patna'), ROW('BH')), " + + "(3, ROW('Bengaluru'), ROW('KA')), " + + "(4, ROW('Bengaluru'), ROW('KA'))", + 4); + assertUpdate( + "INSERT INTO " + tableName + " VALUES " + + "(5, ROW('Patna'), ROW('BH')), " + + "(6, ROW('Patna'), ROW('BH')), " + + "(7, ROW('Bengaluru'), ROW('KA')), " + + "(8, ROW('Bengaluru'), ROW('KA'))", + 4); + assertThat(loadTable(tableName).newScan().planFiles()).hasSize(4); + + assertUpdate("DELETE FROM " + tableName + " WHERE district.name = 'Bengaluru'", 4); + assertThat(loadTable(tableName).newScan().planFiles()).hasSize(4); + + assertUpdate("ALTER TABLE " + tableName + " SET PROPERTIES partitioning = ARRAY['\"state.name\"', '\"district.name\"']"); + Table icebergTable = loadTable(tableName); + assertThat(icebergTable.spec().fields().stream().map(PartitionField::name).toList()) + .containsExactlyInAnyOrder("state.name", "district.name"); + + assertUpdate( + "INSERT INTO " + tableName + " VALUES " + + "(9, ROW('Patna'), ROW('BH')), " + + "(10, ROW('Bengaluru'), ROW('BH')), " + + "(11, ROW('Bengaluru'), ROW('KA')), " + + "(12, ROW('Bengaluru'), ROW('KA'))", + 4); + assertThat(loadTable(tableName).newScan().planFiles()).hasSize(7); + + assertQuery("SELECT id, district.name, state.name FROM " + tableName, "VALUES " + + "(1, 'Patna', 'BH'), " + + "(2, 'Patna', 'BH'), " + + "(5, 'Patna', 'BH'), " + + "(6, 'Patna', 'BH'), " + + "(9, 'Patna', 'BH'), " + + "(10, 'Bengaluru', 'BH'), " + + "(11, 'Bengaluru', 'KA'), " + + "(12, 'Bengaluru', 'KA')"); + + assertUpdate("ALTER TABLE " + tableName + " EXECUTE OPTIMIZE"); + assertThat(loadTable(tableName).newScan().planFiles()).hasSize(3); + + assertUpdate("DROP TABLE " + tableName); + } + + @Test + public void testHighlyNestedFieldPartitioning() + { + String tableName = "test_highly_nested_field_partitioning_" + randomNameSuffix(); + assertUpdate("CREATE TABLE " + tableName + " (id INT, country ROW(name VARCHAR, state ROW(name VARCHAR, district ROW(name VARCHAR))))" + + " WITH (partitioning = ARRAY['\"country.state.district.name\"'])"); + + assertUpdate( + "INSERT INTO " + tableName + " VALUES " + + "(1, ROW('India', ROW('BH', ROW('Patna')))), " + + "(2, ROW('India', ROW('BH', ROW('Patna')))), " + + "(3, ROW('India', ROW('KA', ROW('Bengaluru')))), " + + "(4, ROW('India', ROW('KA', ROW('Bengaluru'))))", + 4); + assertUpdate( + "INSERT INTO " + tableName + " VALUES " + + "(5, ROW('India', ROW('BH', ROW('Patna')))), " + + "(6, ROW('India', ROW('BH', ROW('Patna')))), " + + "(7, ROW('India', ROW('KA', ROW('Bengaluru')))), " + + "(8, ROW('India', ROW('KA', ROW('Bengaluru'))))", + 4); + assertThat(loadTable(tableName).newScan().planFiles()).hasSize(4); + + assertQuery("SELECT partition.\"country.state.district.name\" FROM \"" + tableName + "$partitions\"", "VALUES 'Patna', 'Bengaluru'"); + + assertUpdate("DELETE FROM " + tableName + " WHERE country.state.district.name = 'Bengaluru'", 4); + assertThat(loadTable(tableName).newScan().planFiles()).hasSize(2); + + assertUpdate("ALTER TABLE " + tableName + " SET PROPERTIES partitioning = 
ARRAY['\"country.state.district.name\"', '\"country.state.name\"']"); + Table icebergTable = loadTable(tableName); + assertThat(icebergTable.spec().fields().stream().map(PartitionField::name).toList()) + .containsExactlyInAnyOrder("country.state.district.name", "country.state.name"); + + assertUpdate( + "INSERT INTO " + tableName + " VALUES " + + "(9, ROW('India', ROW('BH', ROW('Patna')))), " + + "(10, ROW('India', ROW('BH', ROW('Bengaluru')))), " + + "(11, ROW('India', ROW('KA', ROW('Bengaluru')))), " + + "(12, ROW('India', ROW('KA', ROW('Bengaluru'))))", + 4); + + assertThat(loadTable(tableName).newScan().planFiles()).hasSize(5); + + assertQuery("SELECT id, country.name, country.state.name, country.state.district.name FROM " + tableName, "VALUES " + + "(1, 'India', 'BH', 'Patna'), " + + "(2, 'India', 'BH', 'Patna'), " + + "(5, 'India', 'BH', 'Patna'), " + + "(6, 'India', 'BH', 'Patna'), " + + "(9, 'India', 'BH', 'Patna'), " + + "(10, 'India', 'BH', 'Bengaluru'), " + + "(11, 'India', 'KA', 'Bengaluru'), " + + "(12, 'India', 'KA', 'Bengaluru')"); + + assertUpdate("ALTER TABLE " + tableName + " EXECUTE OPTIMIZE"); + assertThat(loadTable(tableName).newScan().planFiles()).hasSize(3); + + assertUpdate("DROP TABLE " + tableName); + } + + @Test + public void testHighlyNestedFieldPartitioningWithTruncateTransform() + { + String tableName = "test_highly_nested_field_partitioning_with_transform_" + randomNameSuffix(); + assertUpdate("CREATE TABLE " + tableName + " (id INT, country ROW(name VARCHAR, state ROW(name VARCHAR, district ROW(name VARCHAR))))" + + " WITH (partitioning = ARRAY['truncate(\"country.state.district.name\", 5)'])"); + + assertUpdate( + "INSERT INTO " + tableName + " VALUES " + + "(1, ROW('India', ROW('BH', ROW('Patna')))), " + + "(2, ROW('India', ROW('BH', ROW('Patna_Truncate')))), " + + "(3, ROW('India', ROW('DL', ROW('Delhi')))), " + + "(4, ROW('India', ROW('DL', ROW('Delhi_Truncate'))))", + 4); + + assertThat(loadTable(tableName).newScan().planFiles()).hasSize(2); + List files = computeActual("SELECT file_path, record_count FROM \"" + tableName + "$files\"").getMaterializedRows(); + List partitionedFiles = files.stream() + .filter(file -> ((String) file.getField(0)).contains("country.state.district.name_trunc=")) + .collect(toImmutableList()); + + assertThat(partitionedFiles).hasSize(2); + assertThat(partitionedFiles.stream().mapToLong(row -> (long) row.getField(1)).sum()).isEqualTo(4L); + + assertQuery("SELECT id, country.state.district.name, country.state.name, country.name FROM " + tableName, "VALUES " + + "(1, 'Patna', 'BH', 'India'), " + + "(2, 'Patna_Truncate', 'BH', 'India'), " + + "(3, 'Delhi', 'DL', 'India'), " + + "(4, 'Delhi_Truncate', 'DL', 'India')"); + + assertUpdate("DROP TABLE " + tableName); + } + + @Test + public void testHighlyNestedFieldPartitioningWithBucketTransform() + { + String tableName = "test_highly_nested_field_partitioning_with_transform_" + randomNameSuffix(); + assertUpdate("CREATE TABLE " + tableName + " (id INT, country ROW(name VARCHAR, state ROW(name VARCHAR, district ROW(name VARCHAR))))" + + " WITH (partitioning = ARRAY['bucket(\"country.state.district.name\", 2)'])"); + + assertUpdate( + "INSERT INTO " + tableName + " VALUES " + + "(1, ROW('India', ROW('BH', ROW('Patna')))), " + + "(2, ROW('India', ROW('MH', ROW('Mumbai')))), " + + "(3, ROW('India', ROW('DL', ROW('Delhi')))), " + + "(4, ROW('India', ROW('KA', ROW('Bengaluru'))))", + 4); + + assertThat(loadTable(tableName).newScan().planFiles()).hasSize(2); + List files = 
computeActual("SELECT file_path, record_count FROM \"" + tableName + "$files\"").getMaterializedRows(); + List partitionedFiles = files.stream() + .filter(file -> ((String) file.getField(0)).contains("country.state.district.name_bucket=")) + .collect(toImmutableList()); + + assertThat(partitionedFiles).hasSize(2); + assertThat(partitionedFiles.stream().mapToLong(row -> (long) row.getField(1)).sum()).isEqualTo(4L); + + assertQuery("SELECT id, country.state.district.name, country.state.name, country.name FROM " + tableName, "VALUES " + + "(1, 'Patna', 'BH', 'India'), " + + "(2, 'Mumbai', 'MH', 'India'), " + + "(3, 'Delhi', 'DL', 'India'), " + + "(4, 'Bengaluru', 'KA', 'India')"); + + assertUpdate("DROP TABLE " + tableName); + } + + @Test + public void testHighlyNestedFieldPartitioningWithTimestampTransform() + { + testHighlyNestedFieldPartitioningWithTimestampTransform( + "ARRAY['year(\"grandparent.parent.ts\")']", + ".*?(grandparent\\.parent\\.ts_year=.*/).*", + ImmutableSet.of("grandparent.parent.ts_year=2021/", "grandparent.parent.ts_year=2022/", "grandparent.parent.ts_year=2023/")); + testHighlyNestedFieldPartitioningWithTimestampTransform( + "ARRAY['month(\"grandparent.parent.ts\")']", + ".*?(grandparent\\.parent\\.ts_month=.*/).*", + ImmutableSet.of("grandparent.parent.ts_month=2021-01/", "grandparent.parent.ts_month=2022-02/", "grandparent.parent.ts_month=2023-03/")); + testHighlyNestedFieldPartitioningWithTimestampTransform( + "ARRAY['day(\"grandparent.parent.ts\")']", + ".*?(grandparent\\.parent\\.ts_day=.*/).*", + ImmutableSet.of("grandparent.parent.ts_day=2021-01-01/", "grandparent.parent.ts_day=2022-02-02/", "grandparent.parent.ts_day=2023-03-03/")); + testHighlyNestedFieldPartitioningWithTimestampTransform( + "ARRAY['hour(\"grandparent.parent.ts\")']", + ".*?(grandparent\\.parent\\.ts_hour=.*/).*", + ImmutableSet.of("grandparent.parent.ts_hour=2021-01-01-01/", "grandparent.parent.ts_hour=2022-02-02-02/", "grandparent.parent.ts_hour=2023-03-03-03/")); + } + + @Test + void testMapValueSchemaChange() + { + testMapValueSchemaChange("PARQUET", "map(array[1], array[NULL])"); + testMapValueSchemaChange("ORC", "map(array[1], array[row(NULL)])"); + testMapValueSchemaChange("AVRO", "map(array[1], array[row(NULL)])"); + } + + private void testMapValueSchemaChange(String format, String expectedValue) + { + try (TestTable table = newTrinoTable( + "test_map_value_schema_change", + "WITH (format = '" + format + "') AS SELECT CAST(map(array[1], array[row(2)]) AS map(integer, row(field integer))) col")) { + Table icebergTable = loadTable(table.getName()); + icebergTable.updateSchema() + .addColumn("col.value", "new_field", Types.IntegerType.get()) + .deleteColumn("col.value.field") + .commit(); + assertThat(query("SELECT * FROM " + table.getName())) + .as("Format: %s", format) + .matches("SELECT CAST(" + expectedValue + " AS map(integer, row(new_field integer)))"); + } + } + + @Test + public void testUpdateAfterEqualityDelete() + throws Exception + { + String tableName = "test_update_after_equality_delete_" + randomNameSuffix(); + for (String format : ImmutableList.of("PARQUET", "ORC", "AVRO")) { + assertUpdate("CREATE TABLE " + tableName + " WITH (format = '" + format + "') AS SELECT * FROM tpch.tiny.nation", 25); + Table icebergTable = loadTable(tableName); + assertThat(icebergTable.currentSnapshot().summary()).containsEntry("total-equality-deletes", "0"); + writeEqualityDeleteToNationTable(icebergTable); + 
assertThat(icebergTable.currentSnapshot().summary()).containsEntry("total-equality-deletes", "1"); + assertUpdate("UPDATE " + tableName + " SET comment = 'test'", 20); + assertQuery("SELECT nationkey, comment FROM " + tableName, "SELECT nationkey, 'test' FROM nation WHERE regionkey != 1"); + assertUpdate("DROP TABLE " + tableName); + } + } + + @Test + @Disabled // TODO https://github.com/trinodb/trino/issues/24539 Fix flaky test + void testEnvironmentContext() + { + try (TestTable table = newTrinoTable("test_environment_context", "(x int)")) { + Table icebergTable = loadTable(table.getName()); + assertThat(icebergTable.currentSnapshot().summary()) + .contains(entry("engine-name", "trino"), entry("engine-version", "testversion")); + } + } + + @Test + public void testMetadataDeleteAfterCommitEnabled() + throws IOException + { + int metadataPreviousVersionCount = 5; + String tableName = "test_metadata_delete_after_commit_enabled" + randomNameSuffix(); + assertUpdate("CREATE TABLE " + tableName + "(_bigint BIGINT, _varchar VARCHAR)"); + BaseTable icebergTable = loadTable(tableName); + String location = icebergTable.location(); + icebergTable.updateProperties() + .set(METADATA_DELETE_AFTER_COMMIT_ENABLED, "true") + .set(METADATA_PREVIOUS_VERSIONS_MAX, String.valueOf(metadataPreviousVersionCount)) + .commit(); + + TrinoFileSystem trinoFileSystem = fileSystemFactory.create(SESSION); + Map historyMetadataFiles = getMetadataFileAndUpdatedMillis(trinoFileSystem, location); + for (int i = 0; i < 10; i++) { + assertUpdate("INSERT INTO " + tableName + " VALUES (1, 'a')", 1); + Map metadataFiles = getMetadataFileAndUpdatedMillis(trinoFileSystem, location); + historyMetadataFiles.putAll(metadataFiles); + assertThat(metadataFiles.size()).isLessThanOrEqualTo(1 + metadataPreviousVersionCount); + Set expectMetadataFiles = historyMetadataFiles + .entrySet() + .stream() + .sorted(Map.Entry.comparingByValue().reversed()) + .limit(metadataPreviousVersionCount + 1) + .map(Map.Entry::getKey) + .collect(Collectors.toSet()); + assertThat(metadataFiles.keySet()).containsAll(expectMetadataFiles); + } + assertUpdate("DROP TABLE " + tableName); + } + + @Test + void testAnalyzeNoSnapshot() + { + String table = "test_analyze_no_snapshot" + randomNameSuffix(); + SchemaTableName schemaTableName = new SchemaTableName("tpch", table); + + catalog.newCreateTableTransaction( + SESSION, + schemaTableName, + new Schema(Types.NestedField.of(1, true, "x", Types.LongType.get())), + PartitionSpec.unpartitioned(), + SortOrder.unsorted(), + Optional.ofNullable(catalog.defaultTableLocation(SESSION, schemaTableName)), + ImmutableMap.of()) + .commitTransaction(); + + String expectedStats = """ + VALUES + ('x', 0e0, 0e0, 1e0, NULL, NULL, NULL), + (NULL, NULL, NULL, NULL, 0e0, NULL, NULL) + """; + + assertThat(query("SHOW STATS FOR " + table)) + .skippingTypesCheck() + .matches(expectedStats); + + assertUpdate("ANALYZE " + table); + + assertThat(query("SHOW STATS FOR " + table)) + .skippingTypesCheck() + .matches(expectedStats); + + catalog.dropTable(SESSION, schemaTableName); + } + + private void testHighlyNestedFieldPartitioningWithTimestampTransform(String partitioning, String partitionDirectoryRegex, Set expectedPartitionDirectories) + { + String tableName = "test_highly_nested_field_partitioning_with_timestamp_transform_" + randomNameSuffix(); + assertUpdate("CREATE TABLE " + tableName + " (id INTEGER, grandparent ROW(parent ROW(ts TIMESTAMP(6), a INT), b INT)) WITH (partitioning = " + partitioning + ")"); + assertUpdate( + "INSERT INTO 
" + tableName + " VALUES " + + "(1, ROW(ROW(TIMESTAMP '2021-01-01 01:01:01.111111', 1), 1)), " + + "(2, ROW(ROW(TIMESTAMP '2022-02-02 02:02:02.222222', 2), 2)), " + + "(3, ROW(ROW(TIMESTAMP '2023-03-03 03:03:03.333333', 3), 3)), " + + "(4, ROW(ROW(TIMESTAMP '2022-02-02 02:04:04.444444', 4), 4))", + 4); + + assertThat(loadTable(tableName).newScan().planFiles()).hasSize(3); + Set partitionedDirectories = computeActual("SELECT file_path FROM \"" + tableName + "$files\"") + .getMaterializedRows().stream() + .map(entry -> extractPartitionFolder((String) entry.getField(0), partitionDirectoryRegex)) + .flatMap(Optional::stream) + .collect(toImmutableSet()); + + assertThat(partitionedDirectories).isEqualTo(expectedPartitionDirectories); + + assertQuery("SELECT id, grandparent.parent.ts, grandparent.parent.a, grandparent.b FROM " + tableName, "VALUES " + + "(1, '2021-01-01 01:01:01.111111', 1, 1), " + + "(2, '2022-02-02 02:02:02.222222', 2, 2), " + + "(3, '2023-03-03 03:03:03.333333', 3, 3), " + + "(4, '2022-02-02 02:04:04.444444', 4, 4)"); + + assertUpdate("DROP TABLE " + tableName); + } + + private Optional extractPartitionFolder(String file, String regex) + { + Pattern pattern = Pattern.compile(regex); + Matcher matcher = pattern.matcher(file); + if (matcher.matches()) { + return Optional.of(matcher.group(1)); + } + return Optional.empty(); + } + private void writeEqualityDeleteToNationTable(Table icebergTable) throws Exception { @@ -796,36 +1572,37 @@ private void writeEqualityDeleteToNationTable(Table icebergTable, Optional partitionSpec, Optional partitionData, Map overwriteValues) + private void writeEqualityDeleteToNationTable( + Table icebergTable, + Optional partitionSpec, + Optional partitionData, + Map overwriteValues) throws Exception { - Path metadataDir = new Path(metastoreDir.toURI()); - String deleteFileName = "delete_file_" + UUID.randomUUID(); - FileIO fileIo = new ForwardingFileIo(fileSystemFactory.create(SESSION)); - - Schema deleteRowSchema = icebergTable.schema().select(overwriteValues.keySet()); - List equalityFieldIds = overwriteValues.keySet().stream() - .map(name -> deleteRowSchema.findField(name).fieldId()) - .collect(toImmutableList()); - Parquet.DeleteWriteBuilder writerBuilder = Parquet.writeDeletes(fileIo.newOutputFile(new Path(metadataDir, deleteFileName).toString())) - .forTable(icebergTable) - .rowSchema(deleteRowSchema) - .createWriterFunc(GenericParquetWriter::buildWriter) - .equalityFieldIds(equalityFieldIds) - .overwrite(); - if (partitionSpec.isPresent() && partitionData.isPresent()) { - writerBuilder = writerBuilder - .withSpec(partitionSpec.get()) - .withPartition(partitionData.get()); - } - EqualityDeleteWriter writer = writerBuilder.buildEqualityWriter(); + writeEqualityDeleteToNationTableWithDeleteColumns(icebergTable, partitionSpec, partitionData, overwriteValues, Optional.empty()); + } - Record dataDelete = GenericRecord.create(deleteRowSchema); - try (Closeable ignored = writer) { - writer.write(dataDelete.copy(overwriteValues)); - } + private void writeEqualityDeleteToNationTableWithDeleteColumns( + Table icebergTable, + Optional partitionSpec, + Optional partitionData, + Map overwriteValues, + Optional> deleteFileColumns) + throws Exception + { + writeEqualityDeleteForTable(icebergTable, fileSystemFactory, partitionSpec, partitionData, overwriteValues, deleteFileColumns); + } - icebergTable.newRowDelta().addDeletes(writer.toDeleteFile()).commit(); + private void writeEqualityDeleteToNationTableWithDeleteColumns( + Table icebergTable, + Optional 
partitionSpec, + Optional partitionData, + Map overwriteValues, + Schema deleteRowSchema, + List equalityDeleteFieldIds) + throws Exception + { + writeEqualityDeleteForTableWithSchema(icebergTable, fileSystemFactory, partitionSpec, partitionData, deleteRowSchema, equalityDeleteFieldIds, overwriteValues); } private Table updateTableToV2(String tableName) @@ -833,6 +1610,7 @@ private Table updateTableToV2(String tableName) BaseTable table = loadTable(tableName); TableOperations operations = table.operations(); TableMetadata currentMetadata = operations.current(); + checkArgument(currentMetadata.formatVersion() != 2, "Format version is already 2: '%s'", tableName); operations.commit(currentMetadata, currentMetadata.upgradeToFormatVersion(2)); return table; @@ -840,24 +1618,12 @@ private Table updateTableToV2(String tableName) private BaseTable loadTable(String tableName) { - IcebergTableOperationsProvider tableOperationsProvider = new FileMetastoreTableOperationsProvider(fileSystemFactory); - CachingHiveMetastore cachingHiveMetastore = memoizeMetastore(metastore, 1000); - TrinoCatalog catalog = new TrinoHiveCatalog( - new CatalogName("hive"), - cachingHiveMetastore, - new TrinoViewHiveMetastore(cachingHiveMetastore, false, "trino-version", "test"), - fileSystemFactory, - new TestingTypeManager(), - tableOperationsProvider, - false, - false, - false); - return (BaseTable) loadIcebergTable(catalog, tableOperationsProvider, SESSION, new SchemaTableName("tpch", tableName)); + return IcebergTestUtils.loadTable(tableName, metastore, fileSystemFactory, "hive", "tpch"); } private List getActiveFiles(String tableName) { - return computeActual(format("SELECT file_path FROM \"%s$files\"", tableName)).getOnlyColumn() + return computeActual(format("SELECT file_path FROM \"%s$files\" WHERE content = %d", tableName, FileContent.DATA.id())).getOnlyColumn() .map(String.class::cast) .collect(toImmutableList()); } diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestMetadataQueryOptimization.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestMetadataQueryOptimization.java index 032c26f41658..e85bfe65dfe7 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestMetadataQueryOptimization.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestMetadataQueryOptimization.java @@ -16,14 +16,13 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import io.trino.Session; -import io.trino.metadata.InternalFunctionBundle; -import io.trino.plugin.hive.metastore.Database; -import io.trino.plugin.hive.metastore.HiveMetastore; -import io.trino.plugin.iceberg.catalog.file.TestingIcebergFileMetastoreCatalogModule; +import io.trino.metastore.Database; +import io.trino.metastore.HiveMetastore; +import io.trino.metastore.HiveMetastoreFactory; import io.trino.spi.security.PrincipalType; +import io.trino.sql.ir.Constant; import io.trino.sql.planner.assertions.BasePushdownPlanTest; -import io.trino.sql.tree.LongLiteral; -import io.trino.testing.LocalQueryRunner; +import io.trino.testing.PlanTester; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.Test; @@ -35,9 +34,8 @@ import static com.google.common.io.MoreFiles.deleteRecursively; import static com.google.common.io.RecursiveDeleteOption.ALLOW_INSECURE; -import static com.google.inject.util.Modules.EMPTY_MODULE; -import static io.trino.SystemSessionProperties.TASK_PARTITIONED_WRITER_COUNT; -import static 
io.trino.plugin.hive.metastore.file.TestingFileHiveMetastore.createTestingFileHiveMetastore; +import static io.trino.SystemSessionProperties.TASK_MAX_WRITER_COUNT; +import static io.trino.spi.type.IntegerType.INTEGER; import static io.trino.sql.planner.assertions.PlanMatchPattern.anyTree; import static io.trino.sql.planner.assertions.PlanMatchPattern.values; import static io.trino.testing.TestingSession.testSessionBuilder; @@ -51,13 +49,13 @@ public class TestMetadataQueryOptimization private File baseDir; @Override - protected LocalQueryRunner createLocalQueryRunner() + protected PlanTester createPlanTester() { Session session = testSessionBuilder() .setCatalog(ICEBERG_CATALOG) .setSchema(SCHEMA_NAME) // optimize_metadata_queries doesn't work when files are written by different writers - .setSystemProperty(TASK_PARTITIONED_WRITER_COUNT, "1") + .setSystemProperty(TASK_MAX_WRITER_COUNT, "1") .build(); try { @@ -66,17 +64,13 @@ protected LocalQueryRunner createLocalQueryRunner() catch (IOException e) { throw new UncheckedIOException(e); } - HiveMetastore metastore = createTestingFileHiveMetastore(baseDir); - LocalQueryRunner queryRunner = LocalQueryRunner.create(session); + PlanTester planTester = PlanTester.create(session); + planTester.installPlugin(new TestingIcebergPlugin(baseDir.toPath())); + planTester.createCatalog(ICEBERG_CATALOG, "iceberg", ImmutableMap.of()); - InternalFunctionBundle.InternalFunctionBundleBuilder functions = InternalFunctionBundle.builder(); - new IcebergPlugin().getFunctions().forEach(functions::functions); - queryRunner.addFunctions(functions.build()); - - queryRunner.createCatalog( - ICEBERG_CATALOG, - new TestingIcebergConnectorFactory(Optional.of(new TestingIcebergFileMetastoreCatalogModule(metastore)), Optional.empty(), EMPTY_MODULE), - ImmutableMap.of()); + HiveMetastore metastore = ((IcebergConnector) planTester.getConnector(ICEBERG_CATALOG)).getInjector() + .getInstance(HiveMetastoreFactory.class) + .createMetastore(Optional.empty()); Database database = Database.builder() .setDatabaseName(SCHEMA_NAME) @@ -85,7 +79,7 @@ protected LocalQueryRunner createLocalQueryRunner() .build(); metastore.createDatabase(database); - return queryRunner; + return planTester; } @Test @@ -93,11 +87,11 @@ public void testOptimization() { String testTable = "test_metadata_optimization"; - getQueryRunner().execute(format( + getPlanTester().executeStatement(format( "CREATE TABLE %s (a, b, c) WITH (PARTITIONING = ARRAY['b', 'c']) AS VALUES (5, 6, 7), (8, 9, 10)", testTable)); - Session session = Session.builder(getQueryRunner().getDefaultSession()) + Session session = Session.builder(getPlanTester().getDefaultSession()) .setSystemProperty("optimize_metadata_queries", "true") .build(); @@ -107,15 +101,24 @@ public void testOptimization() anyTree(values( ImmutableList.of("b", "c"), ImmutableList.of( - ImmutableList.of(new LongLiteral("6"), new LongLiteral("7")), - ImmutableList.of(new LongLiteral("9"), new LongLiteral("10")))))); + ImmutableList.of(new Constant(INTEGER, 9L), new Constant(INTEGER, 10L)), + ImmutableList.of(new Constant(INTEGER, 6L), new Constant(INTEGER, 7L)))))); + + assertPlan( + format("SELECT DISTINCT b, c FROM %s LIMIT 10", testTable), + session, + anyTree(values( + ImmutableList.of("b", "c"), + ImmutableList.of( + ImmutableList.of(new Constant(INTEGER, 9L), new Constant(INTEGER, 10L)), + ImmutableList.of(new Constant(INTEGER, 6L), new Constant(INTEGER, 7L)))))); assertPlan( format("SELECT DISTINCT b, c FROM %s WHERE b > 7", testTable), session, anyTree(values( 
ImmutableList.of("b", "c"), - ImmutableList.of(ImmutableList.of(new LongLiteral("9"), new LongLiteral("10")))))); + ImmutableList.of(ImmutableList.of(new Constant(INTEGER, 9L), new Constant(INTEGER, 10L)))))); assertPlan( format("SELECT DISTINCT b, c FROM %s WHERE b > 7 AND c < 8", testTable), @@ -124,6 +127,57 @@ public void testOptimization() values(ImmutableList.of("b", "c"), ImmutableList.of()))); } + @Test + public void testOptimizationOnPartitionWithMultipleFiles() + { + String testTable = "test_metadata_optimization_on_partition_with_multiple_files"; + + getPlanTester().executeStatement(format( + "CREATE TABLE %s (a, b, c) WITH (PARTITIONING = ARRAY['b', 'c']) AS VALUES (1, 8, 9), (2, 8, 9)", + testTable)); + + Session session = Session.builder(getPlanTester().getDefaultSession()) + .setSystemProperty("optimize_metadata_queries", "true") + .build(); + + // Insert again to generate another file in same partition + getPlanTester().executeStatement(format( + "INSERT INTO %s VALUES (3, 8, 9)", + testTable)); + + assertPlan( + format("SELECT DISTINCT b, c FROM %s ORDER BY b", testTable), + session, + anyTree(values( + ImmutableList.of("b", "c"), + ImmutableList.of( + ImmutableList.of(new Constant(INTEGER, 8L), new Constant(INTEGER, 9L)))))); + } + + @Test + public void testOptimizationWithNullPartitions() + { + String testTable = "test_metadata_optimization_with_null_partitions"; + + getPlanTester().executeStatement(format( + "CREATE TABLE %s (a, b, c) WITH (PARTITIONING = ARRAY['b', 'c'])" + + "AS VALUES (5, 6, CAST(NULL AS INTEGER)), (8, 9, CAST(NULL AS INTEGER))", + testTable)); + + Session session = Session.builder(getPlanTester().getDefaultSession()) + .setSystemProperty("optimize_metadata_queries", "true") + .build(); + + assertPlan( + format("SELECT DISTINCT b, c FROM %s ORDER BY b", testTable), + session, + anyTree(values( + ImmutableList.of("b", "c"), + ImmutableList.of( + ImmutableList.of(new Constant(INTEGER, 6L), new Constant(INTEGER, null)), + ImmutableList.of(new Constant(INTEGER, 9L), new Constant(INTEGER, null)))))); + } + @AfterAll public void cleanup() throws Exception diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestMetricsWrapper.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestMetricsWrapper.java index f37e86c95c9a..cf42535ae7c6 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestMetricsWrapper.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestMetricsWrapper.java @@ -19,7 +19,7 @@ import io.airlift.json.JsonCodec; import io.airlift.json.ObjectMapperProvider; import org.apache.iceberg.Metrics; -import org.testng.annotations.Test; +import org.junit.jupiter.api.Test; import java.lang.reflect.Method; import java.lang.reflect.Type; @@ -30,7 +30,6 @@ import static com.google.common.collect.ImmutableSet.toImmutableSet; import static io.airlift.json.JsonCodec.jsonCodec; import static org.assertj.core.api.Assertions.assertThat; -import static org.testng.Assert.assertEquals; public class TestMetricsWrapper { @@ -51,13 +50,13 @@ public void testRoundTrip() Metrics actual = CODEC.fromJson(CODEC.toJson(new MetricsWrapper(expected))).metrics(); - assertEquals(actual.recordCount(), recordCount); - assertEquals(actual.columnSizes(), columnSizes); - assertEquals(actual.valueCounts(), valueCounts); - assertEquals(actual.nullValueCounts(), nullValueCounts); - assertEquals(actual.nanValueCounts(), nanValueCounts); - assertEquals(actual.lowerBounds(), lowerBounds); - 
assertEquals(actual.upperBounds(), upperBounds); + assertThat(actual.recordCount()).isEqualTo(recordCount); + assertThat(actual.columnSizes()).isEqualTo(columnSizes); + assertThat(actual.valueCounts()).isEqualTo(valueCounts); + assertThat(actual.nullValueCounts()).isEqualTo(nullValueCounts); + assertThat(actual.nanValueCounts()).isEqualTo(nanValueCounts); + assertThat(actual.lowerBounds()).isEqualTo(lowerBounds); + assertThat(actual.upperBounds()).isEqualTo(upperBounds); } @Test diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestParquetPredicates.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestParquetPredicates.java new file mode 100644 index 000000000000..9a403fdc3cd0 --- /dev/null +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestParquetPredicates.java @@ -0,0 +1,207 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.iceberg; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import io.trino.spi.predicate.Domain; +import io.trino.spi.predicate.TupleDomain; +import io.trino.spi.type.RowType; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType; +import org.junit.jupiter.api.Test; + +import java.util.List; +import java.util.Map; + +import static io.trino.parquet.ParquetTypeUtils.getDescriptors; +import static io.trino.plugin.iceberg.ColumnIdentity.TypeCategory.PRIMITIVE; +import static io.trino.plugin.iceberg.ColumnIdentity.TypeCategory.STRUCT; +import static io.trino.plugin.iceberg.IcebergPageSourceProvider.getParquetTupleDomain; +import static io.trino.spi.predicate.TupleDomain.withColumnDomains; +import static io.trino.spi.type.IntegerType.INTEGER; +import static io.trino.spi.type.RowType.rowType; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; +import static org.apache.parquet.schema.Type.Repetition.OPTIONAL; +import static org.assertj.core.api.Assertions.assertThat; + +public class TestParquetPredicates +{ + @Test + public void testParquetTupleDomainStructWithPrimitiveColumnPredicate() + { + // trino type + RowType baseType = rowType( + RowType.field("a", INTEGER), + RowType.field("b", INTEGER), + RowType.field("c", INTEGER)); + + // iceberg type + ColumnIdentity fieldA = new ColumnIdentity(1, "a", PRIMITIVE, ImmutableList.of()); + ColumnIdentity fieldB = new ColumnIdentity(2, "b", PRIMITIVE, ImmutableList.of()); + ColumnIdentity fieldC = new ColumnIdentity(3, "c", PRIMITIVE, ImmutableList.of()); + + // parquet type + MessageType fileSchema = new MessageType("iceberg_schema", + new GroupType(OPTIONAL, "row_field", + new PrimitiveType(OPTIONAL, INT32, "a").withId(1), + new PrimitiveType(OPTIONAL, INT32, "b").withId(2), + new PrimitiveType(OPTIONAL, INT32, "c").withId(3))); + + // predicate domain + IcebergColumnHandle projectedColumn = IcebergColumnHandle.required(new 
ColumnIdentity( + 5, + "row_field", + STRUCT, + ImmutableList.of(fieldA, fieldB, fieldC))) + .fieldType(baseType, INTEGER) + .path(2) + .build(); + Domain predicateDomain = Domain.singleValue(INTEGER, 123L); + TupleDomain tupleDomain = withColumnDomains(ImmutableMap.of(projectedColumn, predicateDomain)); + + Map, ColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, fileSchema); + TupleDomain calculatedTupleDomain = getParquetTupleDomain(descriptorsByPath, tupleDomain); + + assertThat(calculatedTupleDomain.getDomains().orElseThrow()).hasSize(1); + ColumnDescriptor selectedColumnDescriptor = descriptorsByPath.get(ImmutableList.of("row_field", "b")); + assertThat(calculatedTupleDomain.getDomains().orElseThrow().get(selectedColumnDescriptor)).isEqualTo(predicateDomain); + } + + @Test + public void testParquetTupleDomainStructWithPrimitiveColumnDifferentIdPredicate() + { + // trino type + RowType baseType = rowType( + RowType.field("a", INTEGER), + RowType.field("b", INTEGER), + RowType.field("c", INTEGER)); + + // iceberg type + ColumnIdentity fieldA = new ColumnIdentity(1, "a", PRIMITIVE, ImmutableList.of()); + ColumnIdentity fieldB = new ColumnIdentity(2, "b", PRIMITIVE, ImmutableList.of()); + ColumnIdentity fieldC = new ColumnIdentity(4, "c", PRIMITIVE, ImmutableList.of()); + + // parquet type + MessageType fileSchema = new MessageType("iceberg_schema", + new GroupType(OPTIONAL, "row_field", + new PrimitiveType(OPTIONAL, INT32, "a").withId(1), + new PrimitiveType(OPTIONAL, INT32, "b").withId(2), + new PrimitiveType(OPTIONAL, INT32, "c").withId(3)).withId(5)); + + // predicate domain + IcebergColumnHandle projectedColumn = IcebergColumnHandle.required(new ColumnIdentity( + 5, + "row_field", + STRUCT, + ImmutableList.of(fieldA, fieldB, fieldC))) + .fieldType(baseType, INTEGER) + .path(4) + .build(); + Domain predicateDomain = Domain.singleValue(INTEGER, 123L); + TupleDomain tupleDomain = withColumnDomains(ImmutableMap.of(projectedColumn, predicateDomain)); + + Map, ColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, fileSchema); + TupleDomain calculatedTupleDomain = getParquetTupleDomain(descriptorsByPath, tupleDomain); + // same name but different Id between iceberg and parquet for field c + assertThat(calculatedTupleDomain.isAll()).isTrue(); + } + + @Test + public void testParquetTupleDomainStructWithComplexColumnPredicate() + { + // trino type + RowType nestedType = rowType( + RowType.field("c1", INTEGER), + RowType.field("c2", INTEGER)); + RowType baseType = rowType( + RowType.field("a", INTEGER), + RowType.field("b", INTEGER), + RowType.field("c", nestedType)); + + // iceberg type + ColumnIdentity fieldC11 = new ColumnIdentity(1, "c1", PRIMITIVE, ImmutableList.of()); + ColumnIdentity fieldC12 = new ColumnIdentity(2, "c2", PRIMITIVE, ImmutableList.of()); + ColumnIdentity fieldA = new ColumnIdentity(3, "a", PRIMITIVE, ImmutableList.of()); + ColumnIdentity fieldB = new ColumnIdentity(4, "b", PRIMITIVE, ImmutableList.of()); + ColumnIdentity fieldC = new ColumnIdentity(5, "c", STRUCT, ImmutableList.of(fieldC11, fieldC12)); + + // parquet type + MessageType fileSchema = new MessageType("iceberg_schema", + new GroupType(OPTIONAL, "row_field", + new PrimitiveType(OPTIONAL, INT32, "a").withId(3), + new PrimitiveType(OPTIONAL, INT32, "b").withId(4), + new GroupType(OPTIONAL, + "c", + new PrimitiveType(OPTIONAL, INT32, "c1").withId(1), + new PrimitiveType(OPTIONAL, INT32, "c2").withId(2)).withId(5))); + // predicate domain + IcebergColumnHandle projectedColumn = 
IcebergColumnHandle.required(new ColumnIdentity( + 6, + "row_field", + STRUCT, + ImmutableList.of(fieldA, fieldB, fieldC))) + .fieldType(baseType, nestedType) + .path(5) + .build(); + + Domain predicateDomain = Domain.onlyNull(nestedType); + TupleDomain tupleDomain = withColumnDomains(ImmutableMap.of(projectedColumn, predicateDomain)); + + Map, ColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, fileSchema); + TupleDomain calculatedTupleDomain = getParquetTupleDomain(descriptorsByPath, tupleDomain); + + assertThat(calculatedTupleDomain.isAll()).isTrue(); + } + + @Test + public void testParquetTupleDomainStructWithMissingPrimitiveColumn() + { + // trino type + RowType baseType = rowType( + RowType.field("a", INTEGER), + RowType.field("b", INTEGER), + RowType.field("missing", INTEGER)); + + // iceberg type + ColumnIdentity fieldA = new ColumnIdentity(1, "a", PRIMITIVE, ImmutableList.of()); + ColumnIdentity fieldB = new ColumnIdentity(2, "b", PRIMITIVE, ImmutableList.of()); + ColumnIdentity fieldC = new ColumnIdentity(3, "missing", PRIMITIVE, ImmutableList.of()); + + // parquet type + MessageType fileSchema = new MessageType("iceberg_schema", + new GroupType(OPTIONAL, "row_field", + new PrimitiveType(OPTIONAL, INT32, "a").withId(1), + new PrimitiveType(OPTIONAL, INT32, "b").withId(2))); + + // predicate domain + IcebergColumnHandle projectedColumn = IcebergColumnHandle.required(new ColumnIdentity( + 5, + "row_field", + STRUCT, + ImmutableList.of(fieldA, fieldB, fieldC))) + .fieldType(baseType, INTEGER) + .path(3) + .build(); + Domain predicateDomain = Domain.singleValue(INTEGER, 123L); + TupleDomain tupleDomain = withColumnDomains(ImmutableMap.of(projectedColumn, predicateDomain)); + + Map, ColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, fileSchema); + TupleDomain calculatedTupleDomain = getParquetTupleDomain(descriptorsByPath, tupleDomain); + + assertThat(calculatedTupleDomain.isAll()).isTrue(); + } +} diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestPartitionFields.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestPartitionFields.java index 4deb541c1259..aacbda15dbaa 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestPartitionFields.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestPartitionFields.java @@ -13,24 +13,30 @@ */ package io.trino.plugin.iceberg; +import com.google.common.collect.ImmutableList; +import org.apache.iceberg.PartitionField; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; import org.apache.iceberg.exceptions.ValidationException; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; import org.apache.iceberg.types.Types.DoubleType; import org.apache.iceberg.types.Types.ListType; import org.apache.iceberg.types.Types.LongType; import org.apache.iceberg.types.Types.NestedField; import org.apache.iceberg.types.Types.StringType; import org.apache.iceberg.types.Types.TimestampType; -import org.testng.annotations.Test; +import org.junit.jupiter.api.Test; +import java.util.List; import java.util.function.Consumer; import static com.google.common.collect.Iterables.getOnlyElement; import static io.trino.plugin.iceberg.PartitionFields.parsePartitionField; +import static io.trino.plugin.iceberg.PartitionFields.parsePartitionFields; import static io.trino.plugin.iceberg.PartitionFields.toPartitionFields; -import static org.assertj.core.api.AssertionsForClassTypes.assertThatThrownBy; -import static 
org.testng.Assert.assertEquals; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; public class TestPartitionFields { @@ -67,29 +73,77 @@ public void testParse() assertParse("void(\"quoted field\")", partitionSpec(builder -> builder.alwaysNull("quoted field"))); assertParse("truncate(\"\"\"another\"\" \"\"quoted\"\" \"\"field\"\"\", 13)", partitionSpec(builder -> builder.truncate("\"another\" \"quoted\" \"field\"", 13))); assertParse("void(\"\"\"another\"\" \"\"quoted\"\" \"\"field\"\"\")", partitionSpec(builder -> builder.alwaysNull("\"another\" \"quoted\" \"field\""))); + assertParse("\"nested.value\"", partitionSpec(builder -> builder.identity("nested.value"))); + assertParse("year(\"nested.ts\")", partitionSpec(builder -> builder.year("nested.ts"))); + assertParse("month(\"nested.ts\")", partitionSpec(builder -> builder.month("nested.ts"))); + assertParse("day(\"nested.ts\")", partitionSpec(builder -> builder.day("nested.ts"))); + assertParse("hour(\"nested.nested.ts\")", partitionSpec(builder -> builder.hour("nested.nested.ts"))); + assertParse("truncate(\"nested.nested.value\", 13)", partitionSpec(builder -> builder.truncate("nested.nested.value", 13))); + assertParse("bucket(\"nested.nested.value\", 42)", partitionSpec(builder -> builder.bucket("nested.nested.value", 42))); + assertParse("void(\"nested.nested.value\")", partitionSpec(builder -> builder.alwaysNull("nested.nested.value"))); + assertParse("\"MixedTs\"", partitionSpec(builder -> builder.identity("MixedTs"))); + assertParse("\"MixedNested.MixedValue\"", partitionSpec(builder -> builder.identity("MixedNested.MixedValue"))); + assertParse("year(\"MixedTs\")", partitionSpec(builder -> builder.year("MixedTs"))); + assertParse("month(\"MixedTs\")", partitionSpec(builder -> builder.month("MixedTs"))); + assertParse("day(\"MixedTs\")", partitionSpec(builder -> builder.day("MixedTs"))); + assertParse("hour(\"MixedTs\")", partitionSpec(builder -> builder.hour("MixedTs"))); + assertParse("bucket(\"MixedTs\", 42)", partitionSpec(builder -> builder.bucket("MixedTs", 42))); + assertParse("truncate(\"MixedString\", 13)", partitionSpec(builder -> builder.truncate("MixedString", 13))); + assertParse("void(\"MixedString\")", partitionSpec(builder -> builder.alwaysNull("MixedString"))); assertInvalid("bucket()", "Invalid partition field declaration: bucket()"); + assertInvalid(".nested", "Invalid partition field declaration: .nested"); assertInvalid("abc", "Cannot find source column: abc"); assertInvalid("notes", "Cannot partition by non-primitive source field: list"); - assertInvalid("bucket(price, 42)", "Cannot bucket by type: double"); - assertInvalid("bucket(notes, 88)", "Cannot bucket by type: list"); - assertInvalid("truncate(ts, 13)", "Cannot truncate type: timestamp"); - assertInvalid("year(order_key)", "Cannot partition type long by year"); + assertInvalid("bucket(price, 42)", "Invalid source type double for transform: bucket[42]"); + assertInvalid("bucket(notes, 88)", "Cannot partition by non-primitive source field: list"); + assertInvalid("truncate(ts, 13)", "Invalid source type timestamp for transform: truncate[13]"); + assertInvalid("year(order_key)", "Invalid source type long for transform: year"); assertInvalid("\"test\"", "Cannot find source column: test"); assertInvalid("\"test with space\"", "Cannot find source column: test with space"); assertInvalid("\"test \"with space\"", "Invalid partition field declaration: \"test \"with space\""); 
assertInvalid("\"test \"\"\"with space\"", "Invalid partition field declaration: \"test \"\"\"with space\""); assertInvalid("ABC", "Cannot find source column: abc"); - assertInvalid("\"ABC\"", "Uppercase characters in identifier '\"ABC\"' are not supported."); + assertInvalid("\"ABC\"", "Cannot find source column: ABC"); assertInvalid("year(ABC)", "Cannot find source column: abc"); - assertInvalid("bucket(\"ABC\", 12)", "Uppercase characters in identifier '\"ABC\"' are not supported."); + assertInvalid("bucket(\"ABC\", 12)", "Cannot find source column: ABC"); + assertInvalid("\"nested.list\"", "Cannot partition by non-primitive source field: list"); + } + + @Test + public void testConflicts() + { + assertParseName(List.of("col", "col_year"), TimestampType.withZone(), List.of("year(col)"), List.of("col_year_2")); + assertParseName(List.of("col", "col_month"), TimestampType.withZone(), List.of("month(col)"), List.of("col_month_2")); + assertParseName(List.of("col", "col_day"), TimestampType.withZone(), List.of("day(col)"), List.of("col_day_2")); + assertParseName(List.of("col", "col_hour"), TimestampType.withZone(), List.of("hour(col)"), List.of("col_hour_2")); + assertParseName(List.of("col", "col_bucket"), TimestampType.withZone(), List.of("bucket(col,10)"), List.of("col_bucket_2")); + assertParseName(List.of("col", "col_trunc"), StringType.get(), List.of("truncate(col,10)"), List.of("col_trunc_2")); + assertParseName(List.of("col", "col_null"), TimestampType.withZone(), List.of("void(col)"), List.of("col_null_2")); + + assertParseName(List.of("col", "col_year", "col_year_2"), TimestampType.withZone(), List.of("year(col)"), List.of("col_year_3")); + assertParseName(List.of("col", "col_year", "col_year_3"), TimestampType.withZone(), List.of("year(col)"), List.of("col_year_2")); + + assertParseName(List.of("col", "col_year", "col_year_2"), TimestampType.withZone(), List.of("year(col)", "col_year_2"), List.of("col_year_3", "col_year_2")); + } + + private static void assertParseName(List columnNames, Type type, List partitions, List expected) + { + ImmutableList.Builder columns = ImmutableList.builderWithExpectedSize(columnNames.size()); + int i = 1; + for (String name : columnNames) { + columns.add(NestedField.required(i++, name, type)); + } + PartitionSpec spec = parsePartitionFields(new Schema(columns.build()), partitions); + assertThat(spec.fields()).extracting(PartitionField::name) + .containsExactlyElementsOf(expected); } private static void assertParse(String value, PartitionSpec expected, String canonicalRepresentation) { - assertEquals(expected.fields().size(), 1); - assertEquals(parseField(value), expected); - assertEquals(getOnlyElement(toPartitionFields(expected)), canonicalRepresentation); + assertThat(expected.fields()).hasSize(1); + assertThat(parseField(value)).isEqualTo(expected); + assertThat(getOnlyElement(toPartitionFields(expected))).isEqualTo(canonicalRepresentation); } private static void assertParse(String value, PartitionSpec expected) @@ -109,7 +163,7 @@ private static void assertInvalid(String value, String message) private static PartitionSpec parseField(String value) { - return partitionSpec(builder -> parsePartitionField(builder, value)); + return partitionSpec(builder -> parsePartitionField(builder, value, "")); } private static PartitionSpec partitionSpec(Consumer consumer) @@ -122,7 +176,18 @@ private static PartitionSpec partitionSpec(Consumer consu NestedField.optional(5, "notes", ListType.ofRequired(6, StringType.get())), NestedField.optional(7, "quoted 
field", StringType.get()), NestedField.optional(8, "quoted ts", TimestampType.withoutZone()), - NestedField.optional(9, "\"another\" \"quoted\" \"field\"", StringType.get())); + NestedField.optional(9, "\"another\" \"quoted\" \"field\"", StringType.get()), + NestedField.required(10, "nested", Types.StructType.of( + NestedField.required(12, "value", StringType.get()), + NestedField.required(13, "ts", TimestampType.withZone()), + NestedField.required(14, "list", ListType.ofRequired(15, StringType.get())), + NestedField.required(16, "nested", Types.StructType.of( + NestedField.required(17, "value", StringType.get()), + NestedField.required(18, "ts", TimestampType.withZone()))))), + NestedField.required(19, "MixedTs", TimestampType.withoutZone()), + NestedField.optional(20, "MixedString", StringType.get()), + NestedField.required(21, "MixedNested", Types.StructType.of( + NestedField.required(22, "MixedValue", StringType.get())))); PartitionSpec.Builder builder = PartitionSpec.builderFor(schema); consumer.accept(builder); diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestPartitionTransforms.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestPartitionTransforms.java index 9648e7b82c6b..affeeb9b0890 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestPartitionTransforms.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestPartitionTransforms.java @@ -17,7 +17,7 @@ import org.apache.iceberg.types.Types.DateType; import org.apache.iceberg.types.Types.StringType; import org.apache.iceberg.types.Types.TimestampType; -import org.testng.annotations.Test; +import org.junit.jupiter.api.Test; import java.time.LocalDateTime; import java.time.LocalTime; @@ -29,7 +29,7 @@ import static io.trino.plugin.iceberg.PartitionTransforms.epochYear; import static java.lang.Math.toIntExact; import static java.util.concurrent.TimeUnit.SECONDS; -import static org.testng.Assert.assertEquals; +import static org.assertj.core.api.Assertions.assertThat; public class TestPartitionTransforms { @@ -39,13 +39,13 @@ public class TestPartitionTransforms @Test public void testToStringMatchesSpecification() { - assertEquals(Transforms.identity().toString(), "identity"); - assertEquals(Transforms.bucket(13).bind(StringType.get()).toString(), "bucket[13]"); - assertEquals(Transforms.truncate(19).bind(StringType.get()).toString(), "truncate[19]"); - assertEquals(Transforms.year().toString(), "year"); - assertEquals(Transforms.month().toString(), "month"); - assertEquals(Transforms.day().toString(), "day"); - assertEquals(Transforms.hour().toString(), "hour"); + assertThat(Transforms.identity().toString()).isEqualTo("identity"); + assertThat(Transforms.bucket(13).bind(StringType.get()).toString()).isEqualTo("bucket[13]"); + assertThat(Transforms.truncate(19).bind(StringType.get()).toString()).isEqualTo("truncate[19]"); + assertThat(Transforms.year().toString()).isEqualTo("year"); + assertThat(Transforms.month().toString()).isEqualTo("month"); + assertThat(Transforms.day().toString()).isEqualTo("day"); + assertThat(Transforms.hour().toString()).isEqualTo("hour"); } @Test @@ -65,16 +65,30 @@ public void testEpochTransforms() if (time.toLocalTime().equals(LocalTime.MIDNIGHT)) { int epochDay = toIntExact(time.toLocalDate().toEpochDay()); - assertEquals(actualYear, (int) Transforms.year().bind(ICEBERG_DATE).apply(epochDay), time.toString()); - assertEquals(actualMonth, (int) Transforms.month().bind(ICEBERG_DATE).apply(epochDay), time.toString()); - 
assertEquals(actualDay, (int) Transforms.day().bind(ICEBERG_DATE).apply(epochDay), time.toString()); + assertThat(actualYear) + .describedAs(time.toString()) + .isEqualTo((int) Transforms.year().bind(ICEBERG_DATE).apply(epochDay)); + assertThat(actualMonth) + .describedAs(time.toString()) + .isEqualTo((int) Transforms.month().bind(ICEBERG_DATE).apply(epochDay)); + assertThat(actualDay) + .describedAs(time.toString()) + .isEqualTo((int) Transforms.day().bind(ICEBERG_DATE).apply(epochDay)); } long epochMicro = SECONDS.toMicros(epochSecond); - assertEquals(actualYear, (int) Transforms.year().bind(ICEBERG_TIMESTAMP).apply(epochMicro), time.toString()); - assertEquals(actualMonth, (int) Transforms.month().bind(ICEBERG_TIMESTAMP).apply(epochMicro), time.toString()); - assertEquals(actualDay, (int) Transforms.day().bind(ICEBERG_TIMESTAMP).apply(epochMicro), time.toString()); - assertEquals(actualHour, (int) Transforms.hour().bind(ICEBERG_TIMESTAMP).apply(epochMicro), time.toString()); + assertThat(actualYear) + .describedAs(time.toString()) + .isEqualTo((int) Transforms.year().bind(ICEBERG_TIMESTAMP).apply(epochMicro)); + assertThat(actualMonth) + .describedAs(time.toString()) + .isEqualTo((int) Transforms.month().bind(ICEBERG_TIMESTAMP).apply(epochMicro)); + assertThat(actualDay) + .describedAs(time.toString()) + .isEqualTo((int) Transforms.day().bind(ICEBERG_TIMESTAMP).apply(epochMicro)); + assertThat(actualHour) + .describedAs(time.toString()) + .isEqualTo((int) Transforms.hour().bind(ICEBERG_TIMESTAMP).apply(epochMicro)); } } } diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestSharedHiveMetastore.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestSharedHiveMetastore.java index c3a3240fd2d1..7a1151d9007f 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestSharedHiveMetastore.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestSharedHiveMetastore.java @@ -21,17 +21,22 @@ import io.trino.testing.DistributedQueryRunner; import io.trino.testing.QueryRunner; import io.trino.tpch.TpchTable; -import org.testng.annotations.AfterClass; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.TestInstance; +import org.junit.jupiter.api.parallel.Execution; import java.nio.file.Path; -import static io.trino.plugin.hive.metastore.file.TestingFileHiveMetastore.createTestingFileHiveMetastore; import static io.trino.plugin.iceberg.IcebergQueryRunner.ICEBERG_CATALOG; import static io.trino.plugin.tpch.TpchMetadata.TINY_SCHEMA_NAME; import static io.trino.testing.QueryAssertions.copyTpchTables; import static io.trino.testing.TestingSession.testSessionBuilder; import static java.lang.String.format; +import static org.junit.jupiter.api.TestInstance.Lifecycle.PER_CLASS; +import static org.junit.jupiter.api.parallel.ExecutionMode.CONCURRENT; +@TestInstance(PER_CLASS) +@Execution(CONCURRENT) public class TestSharedHiveMetastore extends BaseSharedMetastoreTest { @@ -44,14 +49,14 @@ protected QueryRunner createQueryRunner() { Session icebergSession = testSessionBuilder() .setCatalog(ICEBERG_CATALOG) - .setSchema(schema) + .setSchema(tpchSchema) .build(); Session hiveSession = testSessionBuilder() .setCatalog(HIVE_CATALOG) - .setSchema(schema) + .setSchema(tpchSchema) .build(); - DistributedQueryRunner queryRunner = DistributedQueryRunner.builder(icebergSession).build(); + QueryRunner queryRunner = DistributedQueryRunner.builder(icebergSession).build(); queryRunner.installPlugin(new TpchPlugin()); 
queryRunner.createCatalog("tpch", "tpch"); @@ -65,46 +70,50 @@ protected QueryRunner createQueryRunner() "iceberg", ImmutableMap.of( "iceberg.catalog.type", "TESTING_FILE_METASTORE", - "hive.metastore.catalog.dir", dataDirectory.toString())); + "hive.metastore.catalog.dir", dataDirectory.toString(), + "fs.hadoop.enabled", "true")); queryRunner.createCatalog( "iceberg_with_redirections", "iceberg", ImmutableMap.of( "iceberg.catalog.type", "TESTING_FILE_METASTORE", "hive.metastore.catalog.dir", dataDirectory.toString(), - "iceberg.hive-catalog-name", "hive")); + "iceberg.hive-catalog-name", "hive", + "fs.hadoop.enabled", "true")); - queryRunner.installPlugin(new TestingHivePlugin(createTestingFileHiveMetastore(dataDirectory.toFile()))); - queryRunner.createCatalog(HIVE_CATALOG, "hive", ImmutableMap.of("hive.allow-drop-table", "true")); + queryRunner.installPlugin(new TestingHivePlugin(dataDirectory)); + queryRunner.createCatalog(HIVE_CATALOG, "hive"); queryRunner.createCatalog( "hive_with_redirections", "hive", ImmutableMap.of("hive.iceberg-catalog-name", "iceberg")); - queryRunner.execute("CREATE SCHEMA " + schema); + queryRunner.execute("CREATE SCHEMA " + tpchSchema); copyTpchTables(queryRunner, "tpch", TINY_SCHEMA_NAME, icebergSession, ImmutableList.of(TpchTable.NATION)); copyTpchTables(queryRunner, "tpch", TINY_SCHEMA_NAME, hiveSession, ImmutableList.of(TpchTable.REGION)); + queryRunner.execute("CREATE SCHEMA " + testSchema); return queryRunner; } - @AfterClass(alwaysRun = true) + @AfterAll public void cleanup() { - assertQuerySucceeds("DROP TABLE IF EXISTS hive." + schema + ".region"); - assertQuerySucceeds("DROP TABLE IF EXISTS iceberg." + schema + ".nation"); - assertQuerySucceeds("DROP SCHEMA IF EXISTS hive." + schema); + assertQuerySucceeds("DROP TABLE IF EXISTS hive." + tpchSchema + ".region"); + assertQuerySucceeds("DROP TABLE IF EXISTS iceberg." + tpchSchema + ".nation"); + assertQuerySucceeds("DROP SCHEMA IF EXISTS hive." + tpchSchema); + assertQuerySucceeds("DROP SCHEMA IF EXISTS hive." + testSchema); } @Override protected String getExpectedHiveCreateSchema(String catalogName) { - String expectedHiveCreateSchema = "CREATE SCHEMA %s.%s\n" + - "WITH (\n" + - " location = 'file:%s/%s'\n" + - ")"; - - return format(expectedHiveCreateSchema, catalogName, schema, dataDirectory, schema); + return """ + CREATE SCHEMA %s.%s + WITH ( + location = 'local:///%s' + )""" + .formatted(catalogName, tpchSchema, tpchSchema); } @Override @@ -115,6 +124,6 @@ protected String getExpectedIcebergCreateSchema(String catalogName) "WITH (\n" + " location = '%s/%s'\n" + ")"; - return format(expectedIcebergCreateSchema, catalogName, schema, dataDirectory, schema); + return format(expectedIcebergCreateSchema, catalogName, tpchSchema, dataDirectory, tpchSchema); } } diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestSharedHiveThriftMetastore.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestSharedHiveThriftMetastore.java new file mode 100644 index 000000000000..0e7ed42192d8 --- /dev/null +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestSharedHiveThriftMetastore.java @@ -0,0 +1,178 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.iceberg; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import io.trino.Session; +import io.trino.plugin.hive.TestingHivePlugin; +import io.trino.plugin.hive.containers.Hive3MinioDataLake; +import io.trino.plugin.hive.containers.HiveMinioDataLake; +import io.trino.plugin.tpch.TpchPlugin; +import io.trino.testing.DistributedQueryRunner; +import io.trino.testing.QueryRunner; +import io.trino.tpch.TpchTable; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.TestInstance; +import org.junit.jupiter.api.parallel.Execution; + +import java.nio.file.Path; +import java.util.Map; + +import static io.trino.plugin.iceberg.IcebergQueryRunner.ICEBERG_CATALOG; +import static io.trino.plugin.tpch.TpchMetadata.TINY_SCHEMA_NAME; +import static io.trino.testing.QueryAssertions.copyTpchTables; +import static io.trino.testing.TestingNames.randomNameSuffix; +import static io.trino.testing.TestingSession.testSessionBuilder; +import static io.trino.testing.containers.Minio.MINIO_ACCESS_KEY; +import static io.trino.testing.containers.Minio.MINIO_REGION; +import static io.trino.testing.containers.Minio.MINIO_SECRET_KEY; +import static java.lang.String.format; +import static org.junit.jupiter.api.TestInstance.Lifecycle.PER_CLASS; +import static org.junit.jupiter.api.parallel.ExecutionMode.CONCURRENT; + +@TestInstance(PER_CLASS) +@Execution(CONCURRENT) +public class TestSharedHiveThriftMetastore + extends BaseSharedMetastoreTest +{ + private static final String HIVE_CATALOG = "hive"; + private String bucketName; + + @Override + protected QueryRunner createQueryRunner() + throws Exception + { + bucketName = "test-iceberg-shared-metastore" + randomNameSuffix(); + HiveMinioDataLake hiveMinioDataLake = closeAfterClass(new Hive3MinioDataLake(bucketName)); + hiveMinioDataLake.start(); + + Session icebergSession = testSessionBuilder() + .setCatalog(ICEBERG_CATALOG) + .setSchema(tpchSchema) + .build(); + Session hiveSession = testSessionBuilder() + .setCatalog(HIVE_CATALOG) + .setSchema(tpchSchema) + .build(); + + QueryRunner queryRunner = DistributedQueryRunner.builder(icebergSession).build(); + + queryRunner.installPlugin(new TpchPlugin()); + queryRunner.createCatalog("tpch", "tpch"); + + Path dataDirectory = queryRunner.getCoordinator().getBaseDataDir().resolve("iceberg_data"); + dataDirectory.toFile().deleteOnExit(); + + queryRunner.installPlugin(new IcebergPlugin()); + queryRunner.createCatalog( + ICEBERG_CATALOG, + "iceberg", + ImmutableMap.builder() + .put("iceberg.catalog.type", "HIVE_METASTORE") + .put("hive.metastore.uri", hiveMinioDataLake.getHiveHadoop().getHiveMetastoreEndpoint().toString()) + .put("hive.metastore.thrift.client.read-timeout", "1m") // read timed out sometimes happens with the default timeout + .put("fs.native-s3.enabled", "true") + .put("s3.aws-access-key", MINIO_ACCESS_KEY) + .put("s3.aws-secret-key", MINIO_SECRET_KEY) + .put("s3.region", MINIO_REGION) + .put("s3.endpoint", hiveMinioDataLake.getMinio().getMinioAddress()) + .put("s3.path-style-access", "true") + 
.put("s3.streaming.part-size", "5MB") // minimize memory usage + .put("s3.max-connections", "2") // verify no leaks + .put("iceberg.register-table-procedure.enabled", "true") + .put("iceberg.writer-sort-buffer-size", "1MB") + .buildOrThrow()); + queryRunner.createCatalog( + "iceberg_with_redirections", + "iceberg", + ImmutableMap.builder() + .put("iceberg.catalog.type", "HIVE_METASTORE") + .put("hive.metastore.uri", hiveMinioDataLake.getHiveHadoop().getHiveMetastoreEndpoint().toString()) + .put("hive.metastore.thrift.client.read-timeout", "1m") // read timed out sometimes happens with the default timeout + .put("fs.native-s3.enabled", "true") + .put("s3.aws-access-key", MINIO_ACCESS_KEY) + .put("s3.aws-secret-key", MINIO_SECRET_KEY) + .put("s3.region", MINIO_REGION) + .put("s3.endpoint", hiveMinioDataLake.getMinio().getMinioAddress()) + .put("s3.path-style-access", "true") + .put("s3.streaming.part-size", "5MB") // minimize memory usage + .put("s3.max-connections", "2") // verify no leaks + .put("iceberg.register-table-procedure.enabled", "true") + .put("iceberg.writer-sort-buffer-size", "1MB") + .put("iceberg.hive-catalog-name", "hive") + .buildOrThrow()); + + queryRunner.installPlugin(new TestingHivePlugin(dataDirectory)); + Map hiveProperties = ImmutableMap.builder() + .put("hive.metastore", "thrift") + .put("hive.metastore.uri", hiveMinioDataLake.getHiveHadoop().getHiveMetastoreEndpoint().toString()) + .put("fs.native-s3.enabled", "true") + .put("s3.aws-access-key", MINIO_ACCESS_KEY) + .put("s3.aws-secret-key", MINIO_SECRET_KEY) + .put("s3.region", MINIO_REGION) + .put("s3.endpoint", hiveMinioDataLake.getMinio().getMinioAddress()) + .put("s3.path-style-access", "true") + .put("s3.streaming.part-size", "5MB") + .put("hive.max-partitions-per-scan", "1000") + .put("hive.max-partitions-for-eager-load", "1000") + .put("hive.security", "allow-all") + .buildOrThrow(); + queryRunner.createCatalog(HIVE_CATALOG, "hive", hiveProperties); + queryRunner.createCatalog( + "hive_with_redirections", + "hive", + ImmutableMap.builder() + .putAll(hiveProperties).put("hive.iceberg-catalog-name", "iceberg") + .buildOrThrow()); + + queryRunner.execute("CREATE SCHEMA " + tpchSchema + " WITH (location = 's3://" + bucketName + "/" + tpchSchema + "')"); + copyTpchTables(queryRunner, "tpch", TINY_SCHEMA_NAME, icebergSession, ImmutableList.of(TpchTable.NATION)); + copyTpchTables(queryRunner, "tpch", TINY_SCHEMA_NAME, hiveSession, ImmutableList.of(TpchTable.REGION)); + queryRunner.execute("CREATE SCHEMA " + testSchema + " WITH (location = 's3://" + bucketName + "/" + testSchema + "')"); + + return queryRunner; + } + + @AfterAll + public void cleanup() + { + assertQuerySucceeds("DROP TABLE IF EXISTS hive." + tpchSchema + ".region"); + assertQuerySucceeds("DROP TABLE IF EXISTS iceberg." + tpchSchema + ".nation"); + assertQuerySucceeds("DROP SCHEMA IF EXISTS hive." + tpchSchema); + assertQuerySucceeds("DROP SCHEMA IF EXISTS hive." 
+ testSchema); + } + + @Override + protected String getExpectedHiveCreateSchema(String catalogName) + { + return """ + CREATE SCHEMA %s.%s + WITH ( + location = 's3://%s/%s' + )""" + .formatted(catalogName, tpchSchema, bucketName, tpchSchema); + } + + @Override + protected String getExpectedIcebergCreateSchema(String catalogName) + { + String expectedIcebergCreateSchema = "CREATE SCHEMA %s.%s\n" + + "AUTHORIZATION USER user\n" + + "WITH (\n" + + " location = 's3://%s/%s'\n" + + ")"; + return format(expectedIcebergCreateSchema, catalogName, tpchSchema, bucketName, tpchSchema); + } +} diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestSortFieldUtils.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestSortFieldUtils.java index 42e5c524f380..46759fa6ae88 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestSortFieldUtils.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestSortFieldUtils.java @@ -19,13 +19,13 @@ import org.apache.iceberg.SortOrder; import org.apache.iceberg.types.Types; import org.intellij.lang.annotations.Language; -import org.testng.annotations.Test; +import org.junit.jupiter.api.Test; import java.util.function.Consumer; import static io.trino.plugin.iceberg.SortFieldUtils.parseSortFields; +import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; -import static org.testng.Assert.assertEquals; public class TestSortFieldUtils { @@ -51,14 +51,16 @@ public void testParse() // uppercase assertParse("ORDER_KEY ASC NULLS LAST", sortOrder(builder -> builder.asc("order_key", NullOrder.NULLS_LAST))); assertParse("ORDER_KEY DESC NULLS FIRST", sortOrder(builder -> builder.desc("order_key", NullOrder.NULLS_FIRST))); - assertDoesNotParse("\"ORDER_KEY\" ASC NULLS LAST", "Uppercase characters in identifier '\"ORDER_KEY\"' are not supported."); - assertDoesNotParse("\"ORDER_KEY\" DESC NULLS FIRST", "Uppercase characters in identifier '\"ORDER_KEY\"' are not supported."); + assertDoesNotParse("\"ORDER_KEY\" ASC NULLS LAST", "Cannot find field 'ORDER_KEY' .*"); + assertDoesNotParse("\"ORDER_KEY\" DESC NULLS FIRST", "Cannot find field 'ORDER_KEY' .*"); // mixed case + assertParse("\"MixedCase\" ASC NULLS LAST", sortOrder(builder -> builder.asc("MixedCase", NullOrder.NULLS_LAST))); + assertParse("\"MixedCase\" DESC NULLS FIRST", sortOrder(builder -> builder.desc("MixedCase", NullOrder.NULLS_FIRST))); assertParse("OrDER_keY Asc NullS LAst", sortOrder(builder -> builder.asc("order_key", NullOrder.NULLS_LAST))); assertParse("OrDER_keY Desc NullS FIrsT", sortOrder(builder -> builder.desc("order_key", NullOrder.NULLS_FIRST))); - assertDoesNotParse("\"OrDER_keY\" Asc NullS LAst", "Uppercase characters in identifier '\"OrDER_keY\"' are not supported."); - assertDoesNotParse("\"OrDER_keY\" Desc NullS FIrsT", "Uppercase characters in identifier '\"OrDER_keY\"' are not supported."); + assertDoesNotParse("\"OrDER_keY\" Asc NullS LAst", "Cannot find field 'OrDER_keY' .*"); + assertDoesNotParse("\"OrDER_keY\" Desc NullS FIrsT", "Cannot find field 'OrDER_keY' .*"); assertParse("comment", sortOrder(builder -> builder.asc("comment"))); assertParse("\"comment\"", sortOrder(builder -> builder.asc("comment"))); @@ -100,19 +102,19 @@ public void testParse() private static void assertParse(@Language("SQL") String value, SortOrder expected) { - assertEquals(expected.fields().size(), 1); - assertEquals(parseField(value), expected); + 
assertThat(expected.fields()).hasSize(1);
+        assertThat(parseField(value)).isEqualTo(expected);
     }

     private static void assertDoesNotParse(@Language("SQL") String value)
     {
-        assertDoesNotParse(value, "Unable to parse sort field: [%s]".formatted(value));
+        assertDoesNotParse(value, "\\QUnable to parse sort field: [%s]".formatted(value));
     }

-    private static void assertDoesNotParse(@Language("SQL") String value, String expectedMessage)
+    private static void assertDoesNotParse(@Language("SQL") String value, @Language("RegExp") String expectedMessage)
     {
         assertThatThrownBy(() -> parseField(value))
-                .hasMessage(expectedMessage);
+                .hasMessageMatching(expectedMessage);
     }

     private static SortOrder parseField(String value)
@@ -130,7 +132,8 @@ private static SortOrder sortOrder(Consumer<SortOrder.Builder> consumer)
                 Types.NestedField.optional(5, "notes", Types.ListType.ofRequired(6, Types.StringType.get())),
                 Types.NestedField.optional(7, "quoted field", Types.StringType.get()),
                 Types.NestedField.optional(8, "quoted ts", Types.TimestampType.withoutZone()),
-                Types.NestedField.optional(9, "\"another\" \"quoted\" \"field\"", Types.StringType.get()));
+                Types.NestedField.optional(9, "\"another\" \"quoted\" \"field\"", Types.StringType.get()),
+                Types.NestedField.optional(10, "MixedCase", Types.StringType.get()));

         SortOrder.Builder builder = SortOrder.builderFor(schema);
         consumer.accept(builder);
diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestStructLikeWrapperWithFieldIdToIndex.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestStructLikeWrapperWithFieldIdToIndex.java
index 09ad20afc743..4cb3de25b720 100644
--- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestStructLikeWrapperWithFieldIdToIndex.java
+++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestStructLikeWrapperWithFieldIdToIndex.java
@@ -19,7 +19,7 @@ import org.apache.iceberg.types.Types.StringType;
 import org.apache.iceberg.types.Types.StructType;
 import org.apache.iceberg.util.StructLikeWrapper;
-import org.testng.annotations.Test;
+import org.junit.jupiter.api.Test;

 import static org.assertj.core.api.Assertions.assertThat;

@@ -36,8 +36,8 @@ public void testStructLikeWrapperWithFieldIdToIndexEquals()
                 NestedField.optional(1001, "level", IntegerType.get()));
         PartitionData firstPartitionData = PartitionData.fromJson("{\"partitionValues\":[\"ERROR\",\"449245\"]}", new Type[] {StringType.get(), IntegerType.get()});
         PartitionData secondPartitionData = PartitionData.fromJson("{\"partitionValues\":[\"449245\",\"ERROR\"]}", new Type[] {IntegerType.get(), StringType.get()});
-        PartitionTable.StructLikeWrapperWithFieldIdToIndex first = new PartitionTable.StructLikeWrapperWithFieldIdToIndex(StructLikeWrapper.forType(firstStructType).set(firstPartitionData), firstStructType);
-        PartitionTable.StructLikeWrapperWithFieldIdToIndex second = new PartitionTable.StructLikeWrapperWithFieldIdToIndex(StructLikeWrapper.forType(secondStructType).set(secondPartitionData), secondStructType);
+        StructLikeWrapperWithFieldIdToIndex first = new StructLikeWrapperWithFieldIdToIndex(StructLikeWrapper.forType(firstStructType).set(firstPartitionData), firstStructType);
+        StructLikeWrapperWithFieldIdToIndex second = new StructLikeWrapperWithFieldIdToIndex(StructLikeWrapper.forType(secondStructType).set(secondPartitionData), secondStructType);
         assertThat(first).isNotEqualTo(second);
     }
 }
diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestTableStatisticsWriter.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestTableStatisticsWriter.java
index a81677f5b428..3ad580669fbe 100644
--- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestTableStatisticsWriter.java
+++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestTableStatisticsWriter.java
@@ -15,7 +15,7 @@ package io.trino.plugin.iceberg;

 import org.apache.iceberg.puffin.PuffinCompressionCodec;
-import org.testng.annotations.Test;
+import org.junit.jupiter.api.Test;

 import static org.assertj.core.api.Assertions.assertThat;
diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestingIcebergConnectorFactory.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestingIcebergConnectorFactory.java
index fbc412b548c2..84edaa76bc6d 100644
--- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestingIcebergConnectorFactory.java
+++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestingIcebergConnectorFactory.java
@@ -13,30 +13,47 @@ */
 package io.trino.plugin.iceberg;

+import com.google.common.collect.ImmutableMap;
 import com.google.inject.Module;
 import io.trino.filesystem.TrinoFileSystemFactory;
+import io.trino.filesystem.local.LocalFileSystemFactory;
+import io.trino.plugin.hive.metastore.file.FileHiveMetastoreConfig;
 import io.trino.spi.connector.Connector;
 import io.trino.spi.connector.ConnectorContext;
 import io.trino.spi.connector.ConnectorFactory;

+import java.nio.file.Path;
 import java.util.Map;
 import java.util.Optional;

-import static io.trino.plugin.iceberg.InternalIcebergConnectorFactory.createConnector;
+import static com.google.inject.multibindings.MapBinder.newMapBinder;
+import static io.airlift.configuration.ConfigBinder.configBinder;
+import static io.trino.plugin.iceberg.IcebergConnectorFactory.createConnector;
 import static java.util.Objects.requireNonNull;

 public class TestingIcebergConnectorFactory
         implements ConnectorFactory
 {
     private final Optional<Module> icebergCatalogModule;
-    private final Optional<TrinoFileSystemFactory> fileSystemFactory;
     private final Module module;

-    public TestingIcebergConnectorFactory(Optional<Module> icebergCatalogModule, Optional<TrinoFileSystemFactory> fileSystemFactory, Module module)
+    public TestingIcebergConnectorFactory(Path localFileSystemRootPath)
     {
+        this(localFileSystemRootPath, Optional.empty());
+    }
+
+    @Deprecated
+    public TestingIcebergConnectorFactory(
+            Path localFileSystemRootPath,
+            Optional<Module> icebergCatalogModule)
+    {
+        boolean ignored = localFileSystemRootPath.toFile().mkdirs();
         this.icebergCatalogModule = requireNonNull(icebergCatalogModule, "icebergCatalogModule is null");
-        this.fileSystemFactory = requireNonNull(fileSystemFactory, "fileSystemFactory is null");
-        this.module = requireNonNull(module, "module is null");
+        this.module = binder -> {
+            newMapBinder(binder, String.class, TrinoFileSystemFactory.class)
+                    .addBinding("local").toInstance(new LocalFileSystemFactory(localFileSystemRootPath));
+            configBinder(binder).bindConfigDefaults(FileHiveMetastoreConfig.class, config -> config.setCatalogDirectory("local:///"));
+        };
     }

     @Override
@@ -48,6 +65,12 @@ public String getName()
     @Override
     public Connector create(String catalogName, Map<String, String> config, ConnectorContext context)
     {
-        return createConnector(catalogName, config, context, module, icebergCatalogModule, fileSystemFactory);
+        if (!config.containsKey("iceberg.catalog.type")) {
+            config = ImmutableMap.<String, String>builder()
+                    .putAll(config)
+                    .put("iceberg.catalog.type", "TESTING_FILE_METASTORE")
+                    .buildOrThrow();
+        }
+        return createConnector(catalogName, config, context, module, icebergCatalogModule);
     }
 }
diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestingIcebergPlugin.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestingIcebergPlugin.java
index 7ed7cb4c18c8..fa94c7db61c3 100644
--- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestingIcebergPlugin.java
+++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/TestingIcebergPlugin.java
@@ -15,9 +15,9 @@ import com.google.common.collect.ImmutableList;
 import com.google.inject.Module;
-import io.trino.filesystem.TrinoFileSystemFactory;
 import io.trino.spi.connector.ConnectorFactory;

+import java.nio.file.Path;
 import java.util.List;
 import java.util.Optional;

@@ -27,15 +27,19 @@ public class TestingIcebergPlugin
         extends IcebergPlugin
 {
+    private final Path localFileSystemRootPath;
     private final Optional<Module> icebergCatalogModule;
-    private final Optional<TrinoFileSystemFactory> fileSystemFactory;
-    private final Module module;

-    public TestingIcebergPlugin(Optional<Module> icebergCatalogModule, Optional<TrinoFileSystemFactory> fileSystemFactory, Module module)
+    public TestingIcebergPlugin(Path localFileSystemRootPath)
     {
+        this(localFileSystemRootPath, Optional.empty());
+    }
+
+    @Deprecated
+    public TestingIcebergPlugin(Path localFileSystemRootPath, Optional<Module> icebergCatalogModule)
+    {
+        this.localFileSystemRootPath = requireNonNull(localFileSystemRootPath, "localFileSystemRootPath is null");
         this.icebergCatalogModule = requireNonNull(icebergCatalogModule, "icebergCatalogModule is null");
-        this.fileSystemFactory = requireNonNull(fileSystemFactory, "fileSystemFactory is null");
-        this.module = requireNonNull(module, "module is null");
     }

     @Override
@@ -44,6 +48,6 @@ public Iterable<ConnectorFactory> getConnectorFactories()
         List<ConnectorFactory> connectorFactories = ImmutableList.copyOf(super.getConnectorFactories());
         verify(connectorFactories.size() == 1, "Unexpected connector factories: %s", connectorFactories);

-        return ImmutableList.of(new TestingIcebergConnectorFactory(icebergCatalogModule, fileSystemFactory, module));
+        return ImmutableList.of(new TestingIcebergConnectorFactory(localFileSystemRootPath, icebergCatalogModule));
     }
 }
diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/catalog/BaseTrinoCatalogTest.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/catalog/BaseTrinoCatalogTest.java
index da32418fad8d..aa13dc42780c 100644
--- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/catalog/BaseTrinoCatalogTest.java
+++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/catalog/BaseTrinoCatalogTest.java
@@ -147,7 +147,7 @@ public void testCreateTable()
                 new Schema(Types.NestedField.of(1, true, "col1", Types.LongType.get())),
                 PartitionSpec.unpartitioned(),
                 SortOrder.unsorted(),
-                tableLocation,
+                Optional.of(tableLocation),
                 tableProperties)
                 .commitTransaction();
         assertThat(catalog.listTables(SESSION, Optional.of(namespace))).contains(schemaTableName);
@@ -208,7 +208,7 @@ public void testCreateWithSortTable()
                 tableSchema,
                 PartitionSpec.unpartitioned(),
                 sortOrder,
-                tableLocation,
+                Optional.of(tableLocation),
                 ImmutableMap.of())
                 .commitTransaction();
         assertThat(catalog.listTables(SESSION, Optional.of(namespace))).contains(schemaTableName);
@@ -263,7 +263,7 @@ public void testRenameTable()
                 new Schema(Types.NestedField.of(1, true, "col1", Types.LongType.get())),
                 PartitionSpec.unpartitioned(),
                 SortOrder.unsorted(),
-                arbitraryTableLocation(catalog, SESSION, sourceSchemaTableName),
+                Optional.of(arbitraryTableLocation(catalog, SESSION, sourceSchemaTableName)),
ImmutableMap.of()) .commitTransaction(); assertThat(catalog.listTables(SESSION, Optional.of(namespace))).contains(sourceSchemaTableName); diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/catalog/nessie/TestTrinoNessieCatalog.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/catalog/nessie/TestTrinoNessieCatalog.java index 45030e2f09e5..7bcdceb2c597 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/catalog/nessie/TestTrinoNessieCatalog.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/catalog/nessie/TestTrinoNessieCatalog.java @@ -31,8 +31,8 @@ import io.trino.spi.security.TrinoPrincipal; import io.trino.spi.type.TestingTypeManager; import org.apache.iceberg.nessie.NessieIcebergClient; -import org.projectnessie.client.api.NessieApiV1; -import org.projectnessie.client.http.HttpClientBuilder; +import org.projectnessie.client.NessieClientBuilder; +import org.projectnessie.client.api.NessieApiV2; import org.testng.annotations.AfterClass; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; @@ -88,9 +88,9 @@ protected TrinoCatalog createTrinoCatalog(boolean useUniqueTableLocations) TrinoFileSystemFactory fileSystemFactory = new HdfsFileSystemFactory(HDFS_ENVIRONMENT, HDFS_FILE_SYSTEM_STATS); IcebergNessieCatalogConfig icebergNessieCatalogConfig = new IcebergNessieCatalogConfig() .setServerUri(URI.create(nessieContainer.getRestApiUri())); - NessieApiV1 nessieApi = HttpClientBuilder.builder() + NessieApiV2 nessieApi = NessieClientBuilder.createClientBuilderFromSystemSettings() .withUri(nessieContainer.getRestApiUri()) - .build(NessieApiV1.class); + .build(NessieApiV2.class); NessieIcebergClient nessieClient = new NessieIcebergClient(nessieApi, icebergNessieCatalogConfig.getDefaultReferenceName(), null, ImmutableMap.of()); return new TrinoNessieCatalog( new CatalogName("catalog_name"), @@ -112,9 +112,9 @@ public void testDefaultLocation() IcebergNessieCatalogConfig icebergNessieCatalogConfig = new IcebergNessieCatalogConfig() .setDefaultWarehouseDir(tmpDirectory.toAbsolutePath().toString()) .setServerUri(URI.create(nessieContainer.getRestApiUri())); - NessieApiV1 nessieApi = HttpClientBuilder.builder() + NessieApiV2 nessieApi = NessieClientBuilder.createClientBuilderFromSystemSettings() .withUri(nessieContainer.getRestApiUri()) - .build(NessieApiV1.class); + .build(NessieApiV2.class); NessieIcebergClient nessieClient = new NessieIcebergClient(nessieApi, icebergNessieCatalogConfig.getDefaultReferenceName(), null, ImmutableMap.of()); TrinoCatalog catalogWithDefaultLocation = new TrinoNessieCatalog( new CatalogName("catalog_name"), diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/catalog/rest/TestIcebergRestCatalogConfig.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/catalog/rest/TestIcebergRestCatalogConfig.java index 5e411999f304..56094d056b65 100644 --- a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/catalog/rest/TestIcebergRestCatalogConfig.java +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/catalog/rest/TestIcebergRestCatalogConfig.java @@ -14,13 +14,17 @@ package io.trino.plugin.iceberg.catalog.rest; import com.google.common.collect.ImmutableMap; -import org.testng.annotations.Test; +import io.airlift.units.Duration; +import org.apache.iceberg.CatalogProperties; +import org.junit.jupiter.api.Test; import java.util.Map; import static io.airlift.configuration.testing.ConfigAssertions.assertFullMapping; import static 
io.airlift.configuration.testing.ConfigAssertions.assertRecordedDefaults; import static io.airlift.configuration.testing.ConfigAssertions.recordDefaults; +import static java.util.concurrent.TimeUnit.MILLISECONDS; +import static java.util.concurrent.TimeUnit.MINUTES; public class TestIcebergRestCatalogConfig { @@ -29,9 +33,16 @@ public void testDefaults() { assertRecordedDefaults(recordDefaults(IcebergRestCatalogConfig.class) .setBaseUri(null) + .setPrefix(null) .setWarehouse(null) + .setNestedNamespaceEnabled(false) .setSessionType(IcebergRestCatalogConfig.SessionType.NONE) - .setSecurity(IcebergRestCatalogConfig.Security.NONE)); + .setSessionTimeout(new Duration(CatalogProperties.AUTH_SESSION_TIMEOUT_MS_DEFAULT, MILLISECONDS)) + .setSecurity(IcebergRestCatalogConfig.Security.NONE) + .setVendedCredentialsEnabled(false) + .setViewEndpointsEnabled(true) + .setCaseInsensitiveNameMatching(false) + .setCaseInsensitiveNameMatchingCacheTtl(new Duration(1, MINUTES))); } @Test @@ -39,16 +50,30 @@ public void testExplicitPropertyMappings() { Map properties = ImmutableMap.builder() .put("iceberg.rest-catalog.uri", "http://localhost:1234") + .put("iceberg.rest-catalog.prefix", "dev") .put("iceberg.rest-catalog.warehouse", "test_warehouse_identifier") + .put("iceberg.rest-catalog.nested-namespace-enabled", "true") .put("iceberg.rest-catalog.security", "OAUTH2") .put("iceberg.rest-catalog.session", "USER") + .put("iceberg.rest-catalog.session-timeout", "100ms") + .put("iceberg.rest-catalog.vended-credentials-enabled", "true") + .put("iceberg.rest-catalog.view-endpoints-enabled", "false") + .put("iceberg.rest-catalog.case-insensitive-name-matching", "true") + .put("iceberg.rest-catalog.case-insensitive-name-matching.cache-ttl", "3m") .buildOrThrow(); IcebergRestCatalogConfig expected = new IcebergRestCatalogConfig() .setBaseUri("http://localhost:1234") + .setPrefix("dev") .setWarehouse("test_warehouse_identifier") + .setNestedNamespaceEnabled(true) .setSessionType(IcebergRestCatalogConfig.SessionType.USER) - .setSecurity(IcebergRestCatalogConfig.Security.OAUTH2); + .setSessionTimeout(new Duration(100, MILLISECONDS)) + .setSecurity(IcebergRestCatalogConfig.Security.OAUTH2) + .setVendedCredentialsEnabled(true) + .setViewEndpointsEnabled(false) + .setCaseInsensitiveNameMatching(true) + .setCaseInsensitiveNameMatchingCacheTtl(new Duration(3, MINUTES)); assertFullMapping(properties, expected); } diff --git a/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/catalog/rest/TestIcebergS3TablesConnectorSmokeTest.java b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/catalog/rest/TestIcebergS3TablesConnectorSmokeTest.java new file mode 100644 index 000000000000..406b051e6b72 --- /dev/null +++ b/plugin/trino-iceberg/src/test/java/io/trino/plugin/iceberg/catalog/rest/TestIcebergS3TablesConnectorSmokeTest.java @@ -0,0 +1,293 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.plugin.iceberg.catalog.rest; + +import io.trino.filesystem.Location; +import io.trino.plugin.iceberg.BaseIcebergConnectorSmokeTest; +import io.trino.plugin.iceberg.IcebergConfig; +import io.trino.plugin.iceberg.IcebergConnector; +import io.trino.plugin.iceberg.IcebergQueryRunner; +import io.trino.plugin.iceberg.catalog.TrinoCatalog; +import io.trino.plugin.iceberg.catalog.TrinoCatalogFactory; +import io.trino.spi.connector.SchemaTableName; +import io.trino.testing.DistributedQueryRunner; +import io.trino.testing.QueryRunner; +import io.trino.testing.TestingConnectorBehavior; +import org.apache.iceberg.BaseTable; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.TestInstance; + +import static io.trino.testing.SystemEnvironmentUtils.requireEnv; +import static io.trino.testing.TestingNames.randomNameSuffix; +import static java.lang.String.format; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.junit.jupiter.api.TestInstance.Lifecycle.PER_CLASS; + +@TestInstance(PER_CLASS) +final class TestIcebergS3TablesConnectorSmokeTest + extends BaseIcebergConnectorSmokeTest +{ + public static final String S3_TABLES_BUCKET = requireEnv("S3_TABLES_BUCKET"); + public static final String AWS_ACCESS_KEY_ID = requireEnv("AWS_ACCESS_KEY_ID"); + public static final String AWS_SECRET_ACCESS_KEY = requireEnv("AWS_SECRET_ACCESS_KEY"); + public static final String AWS_REGION = requireEnv("AWS_REGION"); + + public TestIcebergS3TablesConnectorSmokeTest() + { + super(new IcebergConfig().getFileFormat().toIceberg()); + } + + @Override + protected boolean hasBehavior(TestingConnectorBehavior connectorBehavior) + { + return switch (connectorBehavior) { + case SUPPORTS_CREATE_MATERIALIZED_VIEW, + SUPPORTS_RENAME_MATERIALIZED_VIEW, + SUPPORTS_RENAME_SCHEMA, + SUPPORTS_RENAME_TABLE -> false; + default -> super.hasBehavior(connectorBehavior); + }; + } + + @Override + protected QueryRunner createQueryRunner() + throws Exception + { + return IcebergQueryRunner.builder() + .addIcebergProperty("iceberg.file-format", format.name()) + .addIcebergProperty("iceberg.register-table-procedure.enabled", "true") + .addIcebergProperty("iceberg.catalog.type", "rest") + .addIcebergProperty("iceberg.rest-catalog.uri", "https://glue.%s.amazonaws.com/iceberg".formatted(AWS_REGION)) + .addIcebergProperty("iceberg.rest-catalog.warehouse", "s3tablescatalog/" + S3_TABLES_BUCKET) + .addIcebergProperty("iceberg.rest-catalog.view-endpoints-enabled", "false") + .addIcebergProperty("iceberg.rest-catalog.security", "sigv4") + .addIcebergProperty("iceberg.rest-catalog.signing-name", "glue") + .addIcebergProperty("iceberg.writer-sort-buffer-size", "1MB") + .addIcebergProperty("iceberg.allowed-extra-properties", "write.metadata.delete-after-commit.enabled,write.metadata.previous-versions-max") + .addIcebergProperty("fs.native-s3.enabled", "true") + .addIcebergProperty("s3.region", AWS_REGION) + .addIcebergProperty("s3.aws-access-key", AWS_ACCESS_KEY_ID) + .addIcebergProperty("s3.aws-secret-key", AWS_SECRET_ACCESS_KEY) + .setInitialTables(REQUIRED_TPCH_TABLES) + .build(); + } + + @Override + protected String getMetadataLocation(String tableName) + { + DistributedQueryRunner queryRunner = getDistributedQueryRunner(); + TrinoCatalogFactory catalogFactory = ((IcebergConnector) queryRunner.getCoordinator().getConnector("iceberg")).getInjector().getInstance(TrinoCatalogFactory.class); + TrinoCatalog trinoCatalog = 
catalogFactory.create(getSession().getIdentity().toConnectorIdentity()); + BaseTable table = (BaseTable) trinoCatalog.loadTable(getSession().toConnectorSession(), new SchemaTableName(getSession().getSchema().orElseThrow(), tableName)); + return table.operations().current().metadataFileLocation(); + } + + @Override + protected String schemaPath() + { + return "dummy"; + } + + @Override + protected boolean locationExists(String location) + { + throw new UnsupportedOperationException(); + } + + @Override + protected boolean isFileSorted(Location path, String sortColumnName) + { + throw new UnsupportedOperationException(); + } + + @Override + protected void deleteDirectory(String location) + { + throw new UnsupportedOperationException(); + } + + @Override + protected void dropTableFromMetastore(String tableName) + { + throw new UnsupportedOperationException(); + } + + @Test + @Override // Override because the location pattern differs + public void testShowCreateTable() + { + assertThat((String) computeScalar("SHOW CREATE TABLE region")) + .matches("CREATE TABLE iceberg.tpch.region \\(\n" + + " regionkey bigint,\n" + + " name varchar,\n" + + " comment varchar\n" + + "\\)\n" + + "WITH \\(\n" + + " compression_codec = 'ZSTD',\n" + + " format = 'PARQUET',\n" + + " format_version = 2,\n" + + " location = 's3://.*--table-s3',\n" + + " max_commit_retry = 4\n" + + "\\)"); + } + + @Test + @Override + public void testRenameSchema() + { + assertThatThrownBy(super::testRenameSchema) + .hasMessageContaining("renameNamespace is not supported for Iceberg REST catalog"); + } + + @Test + @Override + public void testMaterializedView() + { + assertThatThrownBy(super::testMaterializedView) + .hasMessageContaining("createMaterializedView is not supported for Iceberg REST catalog"); + } + + @Test + @Override // Override because S3 Tables does not support specifying the location + public void testCreateTableWithTrailingSpaceInLocation() + { + String tableName = "test_create_table_with_trailing_space_" + randomNameSuffix(); + String tableLocationWithTrailingSpace = schemaPath() + tableName + " "; + + assertQueryFails( + format("CREATE TABLE %s WITH (location = '%s') AS SELECT 1 AS a, 'INDIA' AS b, true AS c", tableName, tableLocationWithTrailingSpace), + "Failed to create transaction"); + } + + @Test + @Override + public void testRenameTable() + { + assertThatThrownBy(super::testRenameTable) + .hasStackTraceContaining("Unable to process: RenameTable endpoint is not supported for Glue Catalog"); + } + + @Test + @Override + public void testView() + { + assertThatThrownBy(super::testView) + .hasMessageContaining("Server does not support endpoint: POST /v1/{prefix}/namespaces/{namespace}/views"); + } + + @Test + @Override + public void testCommentViewColumn() + { + assertThatThrownBy(super::testCommentViewColumn) + .hasMessageContaining("Server does not support endpoint: POST /v1/{prefix}/namespaces/{namespace}/views"); + } + + @Test + @Override + public void testCommentView() + { + assertThatThrownBy(super::testCommentView) + .hasMessageContaining("Server does not support endpoint: POST /v1/{prefix}/namespaces/{namespace}/views"); + } + + @Test + @Override // The locationExists helper method is unsupported + public void testCreateTableWithNonExistingSchemaVerifyLocation() {} + + @Test + @Override // The TrinoFileSystem.deleteFile is unsupported + public void testDropTableWithMissingMetadataFile() {} + + @Test + @Override // The TrinoFileSystem.deleteFile is unsupported + public void 
testDropTableWithMissingManifestListFile() {} + + @Test + @Override // The TrinoFileSystem.deleteFile is unsupported + public void testDropTableWithMissingSnapshotFile() {} + + @Test + @Override // The TrinoFileSystem.listFiles is unsupported + public void testDropTableWithMissingDataFile() {} + + @Test + @Override // The TrinoFileSystem.deleteDirectory is unsupported + public void testDropTableWithNonExistentTableLocation() {} + + @Test + @Override // BaseIcebergConnectorSmokeTest.isFileSorted method is unsupported + public void testSortedNationTable() {} + + @Test + @Override // The TrinoFileSystem.deleteFile is unsupported + public void testFileSortingWithLargerTable() {} + + @Test + @Override // The procedure is unsupported in S3 Tables + public void testRegisterTableWithTableLocation() {} + + @Test + @Override // The procedure is unsupported in S3 Tables + public void testRegisterTableWithComments() {} + + @Test + @Override // The procedure is unsupported in S3 Tables + public void testRegisterTableWithShowCreateTable() {} + + @Test + @Override // The procedure is unsupported in S3 Tables + public void testRegisterTableWithReInsert() {} + + @Test + @Override // The procedure is unsupported in S3 Tables + public void testRegisterTableWithDroppedTable() {} + + @Test + @Override // The procedure is unsupported in S3 Tables + public void testRegisterTableWithDifferentTableName() {} + + @Test + @Override // The procedure is unsupported in S3 Tables + public void testRegisterTableWithMetadataFile() {} + + @Test + @Override // The procedure is unsupported in S3 Tables + public void testRegisterTableWithTrailingSpaceInLocation() {} + + @Test + @Override // The procedure is unsupported in S3 Tables + public void testUnregisterTable() {} + + @Test + @Override // The procedure is unsupported in S3 Tables + public void testUnregisterBrokenTable() {} + + @Test + @Override // The procedure is unsupported in S3 Tables + public void testUnregisterTableNotExistingSchema() {} + + @Test + @Override // The procedure is unsupported in S3 Tables + public void testUnregisterTableNotExistingTable() {} + + @Test + @Override // The procedure is unsupported in S3 Tables + public void testRepeatUnregisterTable() {} + + @Test + @Override // The procedure is unsupported in S3 Tables + public void testUnregisterTableAccessControl() {} +} diff --git a/plugin/trino-iceberg/src/test/java/org/apache/iceberg/rest/RestCatalogServlet.java b/plugin/trino-iceberg/src/test/java/org/apache/iceberg/rest/RestCatalogServlet.java index 0ccb550e88b2..51cd5327f700 100644 --- a/plugin/trino-iceberg/src/test/java/org/apache/iceberg/rest/RestCatalogServlet.java +++ b/plugin/trino-iceberg/src/test/java/org/apache/iceberg/rest/RestCatalogServlet.java @@ -22,7 +22,7 @@ import org.apache.iceberg.exceptions.RESTException; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.io.CharStreams; -import org.apache.iceberg.rest.RESTCatalogAdapter.HTTPMethod; +import org.apache.iceberg.rest.HTTPRequest.HTTPMethod; import org.apache.iceberg.rest.RESTCatalogAdapter.Route; import org.apache.iceberg.rest.responses.ErrorResponse; import org.apache.iceberg.util.Pair; @@ -98,15 +98,18 @@ protected void execute(ServletRequestContext context, HttpServletResponse respon return; } + HTTPRequest request = restCatalogAdapter.buildRequest( + context.method(), + context.path(), + context.queryParams(), + context.headers(), + context.body()); try { Object responseBody = 
restCatalogAdapter.execute( - context.method(), - context.path(), - context.queryParams(), - context.body(), + request, context.route().responseClass(), - context.headers(), - handle(response)); + handle(response), + x -> {}); if (responseBody != null) { RESTObjectMapper.mapper().writeValue(response.getWriter(), responseBody); diff --git a/plugin/trino-pinot/pom.xml b/plugin/trino-pinot/pom.xml index bc33a4d1b8c6..18897fb29f9d 100755 --- a/plugin/trino-pinot/pom.xml +++ b/plugin/trino-pinot/pom.xml @@ -144,6 +144,10 @@ log4j log4j + + org.apache.commons + commons-lang3 + org.apache.logging.log4j log4j-core @@ -197,6 +201,10 @@ commons-codec commons-codec + + commons-io + commons-io + commons-logging commons-logging @@ -313,6 +321,10 @@ com.fasterxml.jackson.core jackson-databind + + commons-io + commons-io + commons-logging commons-logging @@ -419,10 +431,18 @@ pinot-spi ${dep.pinot.version} + + commons-io + commons-io + commons-logging commons-logging + + org.apache.commons + commons-lang3 + org.apache.logging.log4j log4j-1.2-api @@ -500,7 +520,7 @@ org.apache.commons commons-lang3 - 3.11 + 3.18.0 runtime diff --git a/pom.xml b/pom.xml index 2b12d5a52bd5..f65d865fdaad 100644 --- a/pom.xml +++ b/pom.xml @@ -153,12 +153,14 @@ 4.13.0 235 12.0.1 - 1.11.1 + 1.12.0 + 2.0.17 + 2.19.2 ${dep.airlift.version} 2.1.1 - 1.12.505 - 2.20.93 - 0.11.5 + 1.12.788 + 2.32.25 + 0.12.7 21.9.0.0 1.21 200 @@ -171,16 +173,17 @@ 3.3.2 4.14.0 8.4.5 - 1.3.0 + 1.9.2 3.23.2 4.5.0 - 4.1.93.Final + 4.2.4.Final 5.13.0 3.3.0 9.21.0 - 1.13.1 + 1.15.2 1.37 + --add-modules=jdk.incubator.vector 81 @@ -328,6 +331,10 @@ joda-time joda-time + + org.apache.httpcomponents + httpclient + @@ -576,7 +583,7 @@ commons-codec commons-codec - 1.15 + 1.19.0 @@ -594,7 +601,7 @@ io.airlift aircompressor - 0.25 + 2.0.2 @@ -612,7 +619,7 @@ io.airlift units - 1.9 + 1.10 @@ -986,6 +993,12 @@ io.trino trino-hive-formats ${project.version} + + + com.fasterxml.jackson.core + jackson-databind + + @@ -1375,6 +1388,20 @@ io.trino.hive hive-apache 3.1.2-22 + + + javax.annotation + javax.annotation-api + + + org.apache.parquet + parquet-hadoop + + + org.apache.parquet + parquet-jackson + + @@ -1569,13 +1596,13 @@ org.apache.commons commons-compress - 1.23.0 + 1.28.0 org.apache.commons commons-lang3 - 3.12.0 + 3.18.0 @@ -1596,6 +1623,12 @@ + + org.apache.iceberg + iceberg-aws + ${dep.iceberg.version} + + org.apache.iceberg iceberg-core @@ -1863,7 +1896,7 @@ org.roaringbitmap RoaringBitmap - 0.9.47 + 1.3.0 diff --git a/testing/trino-product-tests-launcher/pom.xml b/testing/trino-product-tests-launcher/pom.xml index c76756253de5..aef9d5a3d30f 100644 --- a/testing/trino-product-tests-launcher/pom.xml +++ b/testing/trino-product-tests-launcher/pom.xml @@ -35,6 +35,12 @@ com.fasterxml.jackson.dataformat jackson-dataformat-yaml + + + org.yaml + snakeyaml + + diff --git a/testing/trino-product-tests/pom.xml b/testing/trino-product-tests/pom.xml index 816a244fcbad..81d6ca778557 100644 --- a/testing/trino-product-tests/pom.xml +++ b/testing/trino-product-tests/pom.xml @@ -36,6 +36,12 @@ com.datastax.oss java-driver-core + + + org.reactivestreams + reactive-streams + + @@ -183,6 +189,10 @@ io.trino.tempto tempto-core + + commons-io + commons-io + org.apache.httpcomponents httpclient diff --git a/testing/trino-testing-services/src/main/java/io/trino/testing/SystemEnvironmentUtils.java b/testing/trino-testing-services/src/main/java/io/trino/testing/SystemEnvironmentUtils.java new file mode 100644 index 000000000000..0f866033cf56 --- /dev/null +++ 
b/testing/trino-testing-services/src/main/java/io/trino/testing/SystemEnvironmentUtils.java
@@ -0,0 +1,34 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.trino.testing;
+
+import static java.util.Objects.requireNonNull;
+
+public final class SystemEnvironmentUtils
+{
+    private SystemEnvironmentUtils() {}
+
+    /**
+     * Get the named environment variable, throwing an exception if it is not set.
+     */
+    public static String requireEnv(String variable)
+    {
+        return requireNonNull(System.getenv(variable), () -> "environment variable not set: " + variable);
+    }
+
+    public static boolean isEnvSet(String variable)
+    {
+        return System.getenv(variable) != null;
+    }
+}
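For reference, a minimal usage sketch of the new SystemEnvironmentUtils helpers, mirroring how TestIcebergS3TablesConnectorSmokeTest above resolves its S3_TABLES_BUCKET and AWS settings. This snippet is illustrative only and not part of the patch; the class name and environment variable names are assumptions.

import static io.trino.testing.SystemEnvironmentUtils.isEnvSet;
import static io.trino.testing.SystemEnvironmentUtils.requireEnv;

final class ExampleEnvironmentBackedTest
{
    // requireEnv fails fast at class initialization with
    // "environment variable not set: EXAMPLE_BUCKET" instead of a later NullPointerException.
    private static final String BUCKET = requireEnv("EXAMPLE_BUCKET");

    // isEnvSet is the non-throwing variant for optional configuration.
    private static final boolean DEBUG_LOGGING = isEnvSet("EXAMPLE_DEBUG");
}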