[Managed Iceberg] unbounded source #33504

Open
wants to merge 36 commits into base: master
Changes from 10 commits
Commits (36)
bb87511
initial
ahmedabu98 Jan 6, 2025
853de4d
let CombinedScanTask do splitting (based on Parquet row groups)
ahmedabu98 Jan 7, 2025
69fd988
perf improv
ahmedabu98 Jan 7, 2025
da2f33f
create one read task descriptor per snapshot range
ahmedabu98 Jan 8, 2025
73c8992
Merge branch 'master' of https://github.com/ahmedabu98/beam into iceb…
ahmedabu98 Jan 22, 2025
81ca709
some improvements
ahmedabu98 Jan 25, 2025
e319d76
use GiB for streaming, Redistribute for batch; update docs
ahmedabu98 Jan 30, 2025
c25cd75
Merge branch 'master' of https://github.com/ahmedabu98/beam into iceb…
ahmedabu98 Jan 30, 2025
af1ec85
use static value
ahmedabu98 Feb 3, 2025
f5d3268
add some test
ahmedabu98 Feb 3, 2025
df40239
Merge branch 'master' of https://github.com/ahmedabu98/beam into iceb…
ahmedabu98 Feb 3, 2025
43ab88f
Merge branch 'master' of https://github.com/ahmedabu98/beam into iceb…
ahmedabu98 Feb 4, 2025
20db0ee
Merge branch 'master' of https://github.com/ahmedabu98/beam into iceb…
ahmedabu98 Feb 7, 2025
622625f
add a java doc; don't use static block to create coder
ahmedabu98 Feb 10, 2025
4c25d3f
spotless
ahmedabu98 Feb 10, 2025
8666166
add options: from/to timestamp, starting strategy, and streaming toggle
ahmedabu98 Feb 13, 2025
297c309
trigger integration tests
ahmedabu98 Feb 13, 2025
5e3a2cc
small test fix
ahmedabu98 Feb 13, 2025
8b131fd
Merge branch 'master' of https://github.com/ahmedabu98/beam into iceb…
ahmedabu98 Feb 14, 2025
887eff1
scan every snapshot individually; use snapshot commit timestamp to ma…
ahmedabu98 Feb 25, 2025
6cfc2d8
new schematransform for cdc streaming; add watermark configs
ahmedabu98 Mar 3, 2025
fbad86e
cleanup
ahmedabu98 Mar 3, 2025
50f9497
add guava import
ahmedabu98 Mar 3, 2025
4f1f40b
remove iceberg_cdc_read from xlang auto-wrapper gen
ahmedabu98 Mar 3, 2025
633365c
fix javadoc
ahmedabu98 Mar 3, 2025
37485f1
cleanup
ahmedabu98 Mar 3, 2025
4ede0e8
spotless
ahmedabu98 Mar 4, 2025
db9fd63
use CDC schema for batch and streaming; re-introduce boolean 'streami…
ahmedabu98 Mar 4, 2025
79ab16a
add to CHANGES.md and discussion docs
ahmedabu98 Mar 4, 2025
06a4cee
spotless
ahmedabu98 Mar 4, 2025
132034f
Merge branch 'master' of https://github.com/ahmedabu98/beam into iceb…
ahmedabu98 Mar 4, 2025
795c87c
address review comments about java docs
ahmedabu98 Mar 5, 2025
c6461c9
remove raw guava dep
ahmedabu98 Mar 5, 2025
7dbf3e1
add another test for read utils
ahmedabu98 Mar 5, 2025
5263a13
Merge branch 'master' of https://github.com/ahmedabu98/beam into iceb…
ahmedabu98 Mar 6, 2025
40fe4ab
use cached schemas
ahmedabu98 Mar 7, 2025
4 changes: 3 additions & 1 deletion sdks/java/io/iceberg/bqms/build.gradle
@@ -21,7 +21,9 @@ plugins {

applyJavaNature(
automaticModuleName: 'org.apache.beam.sdk.io.iceberg.bqms',
shadowClosure: {},
shadowClosure: {
relocate "com.google.auth", getJavaRelocatedPath("bqms.com.google.auth")
},
exportJavadoc: false,
publish: false, // it's an intermediate jar for io-expansion-service
validateShadowJar: false
1 change: 1 addition & 0 deletions sdks/java/io/iceberg/build.gradle
@@ -50,6 +50,7 @@ dependencies {
implementation library.java.slf4j_api
implementation library.java.joda_time
implementation "org.apache.parquet:parquet-column:$parquet_version"
implementation "org.apache.parquet:parquet-hadoop:$parquet_version"
implementation "org.apache.orc:orc-core:$orc_version"
implementation "org.apache.iceberg:iceberg-core:$iceberg_version"
implementation "org.apache.iceberg:iceberg-api:$iceberg_version"
sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/CreateReadTasksDoFn.java
@@ -0,0 +1,90 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.beam.sdk.io.iceberg;

import java.io.IOException;
import java.util.concurrent.ExecutionException;
import org.apache.beam.sdk.metrics.Counter;
import org.apache.beam.sdk.metrics.Metrics;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.values.KV;
import org.apache.iceberg.CombinedScanTask;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.IncrementalAppendScan;
import org.apache.iceberg.ScanTaskParser;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableProperties;
import org.apache.iceberg.io.CloseableIterable;
import org.checkerframework.checker.nullness.qual.Nullable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* Scans the given {@link SnapshotRange}, and creates multiple {@link ReadTask}s. Each task
* represents a portion of a data file that was appended within the snapshot range.
*/
class CreateReadTasksDoFn extends DoFn<SnapshotRange, KV<ReadTaskDescriptor, ReadTask>> {
private static final Logger LOG = LoggerFactory.getLogger(CreateReadTasksDoFn.class);
private static final Counter numFileScanTasks =
Metrics.counter(CreateReadTasksDoFn.class, "numFileScanTasks");
private final IcebergCatalogConfig catalogConfig;

CreateReadTasksDoFn(IcebergCatalogConfig catalogConfig) {
this.catalogConfig = catalogConfig;
}

@ProcessElement
public void process(
@Element SnapshotRange range, OutputReceiver<KV<ReadTaskDescriptor, ReadTask>> out)
throws IOException, ExecutionException {
Table table = TableCache.get(range.getTableIdentifier(), catalogConfig.catalog());
@Nullable Long fromSnapshot = range.getFromSnapshotExclusive();
long toSnapshot = range.getToSnapshot();

LOG.info("Planning to scan snapshot range ({}, {}]", fromSnapshot, toSnapshot);
IncrementalAppendScan scan =
table
.newIncrementalAppendScan()
.toSnapshot(toSnapshot)
.option(TableProperties.SPLIT_SIZE, String.valueOf(TableProperties.SPLIT_SIZE_DEFAULT));
if (fromSnapshot != null) {
scan = scan.fromSnapshotExclusive(fromSnapshot);
}

try (CloseableIterable<CombinedScanTask> combinedScanTasks = scan.planTasks()) {
for (CombinedScanTask combinedScanTask : combinedScanTasks) {
// A single DataFile can be broken up into multiple FileScanTasks
// if it is large enough.
for (FileScanTask fileScanTask : combinedScanTask.tasks()) {
ReadTask task =
ReadTask.builder()
.setTableIdentifierString(range.getTableIdentifierString())
.setFileScanTaskJson(ScanTaskParser.toJson(fileScanTask))
.setByteSize(fileScanTask.sizeBytes())
.build();
ReadTaskDescriptor descriptor =
ReadTaskDescriptor.builder()
.setTableIdentifierString(range.getTableIdentifierString())
.build();
out.output(KV.of(descriptor, task));
numFileScanTasks.inc();
}
}
}
}
}
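For reference, the DoFn above delegates split planning to Iceberg's incremental append scan API. The following standalone sketch shows that underlying planning step outside of Beam; it is an illustration, not code from this PR, and the catalog properties, table name, and snapshot IDs are hypothetical placeholders.

import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.CatalogUtil;
import org.apache.iceberg.CombinedScanTask;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.IncrementalAppendScan;
import org.apache.iceberg.Table;
import org.apache.iceberg.catalog.Catalog;
import org.apache.iceberg.catalog.TableIdentifier;
import org.apache.iceberg.io.CloseableIterable;

public class IncrementalAppendScanSketch {
  public static void main(String[] args) throws Exception {
    // Hypothetical Hadoop catalog and table; substitute your own warehouse and table.
    Map<String, String> props = new HashMap<>();
    props.put("type", "hadoop");
    props.put("warehouse", "/tmp/warehouse");
    Catalog catalog = CatalogUtil.buildIcebergCatalog("demo", props, new Configuration());
    Table table = catalog.loadTable(TableIdentifier.of("db", "my_table"));

    // Plan an incremental append scan over a snapshot range, mirroring what
    // CreateReadTasksDoFn does for each SnapshotRange element.
    IncrementalAppendScan scan =
        table
            .newIncrementalAppendScan()
            .fromSnapshotExclusive(1L) // hypothetical starting snapshot (exclusive)
            .toSnapshot(42L); // hypothetical ending snapshot (inclusive)

    try (CloseableIterable<CombinedScanTask> combinedTasks = scan.planTasks()) {
      for (CombinedScanTask combined : combinedTasks) {
        // A single large DataFile may be broken up into multiple FileScanTasks.
        for (FileScanTask fileTask : combined.tasks()) {
          System.out.println(fileTask.file().path() + " -> " + fileTask.sizeBytes() + " bytes");
        }
      }
    }
  }
}

Each FileScanTask planned this way is what the DoFn serializes via ScanTaskParser.toJson and emits as a ReadTask keyed by its ReadTaskDescriptor.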
sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergCatalogConfig.java
@@ -51,26 +51,25 @@ public static Builder builder() {
}

public org.apache.iceberg.catalog.Catalog catalog() {
if (cachedCatalog != null) {
return cachedCatalog;
if (cachedCatalog == null) {
String catalogName = getCatalogName();
if (catalogName == null) {
catalogName = "apache-beam-" + ReleaseInfo.getReleaseInfo().getVersion();
}
Map<String, String> catalogProps = getCatalogProperties();
if (catalogProps == null) {
catalogProps = Maps.newHashMap();
}
Map<String, String> confProps = getConfigProperties();
if (confProps == null) {
confProps = Maps.newHashMap();
}
Configuration config = new Configuration();
for (Map.Entry<String, String> prop : confProps.entrySet()) {
config.set(prop.getKey(), prop.getValue());
}
cachedCatalog = CatalogUtil.buildIcebergCatalog(catalogName, catalogProps, config);
}
String catalogName = getCatalogName();
if (catalogName == null) {
catalogName = "apache-beam-" + ReleaseInfo.getReleaseInfo().getVersion();
}
Map<String, String> catalogProps = getCatalogProperties();
if (catalogProps == null) {
catalogProps = Maps.newHashMap();
}
Map<String, String> confProps = getConfigProperties();
if (confProps == null) {
confProps = Maps.newHashMap();
}
Configuration config = new Configuration();
for (Map.Entry<String, String> prop : confProps.entrySet()) {
config.set(prop.getKey(), prop.getValue());
}
cachedCatalog = CatalogUtil.buildIcebergCatalog(catalogName, catalogProps, config);
return cachedCatalog;
}

sdks/java/io/iceberg/src/main/java/org/apache/beam/sdk/io/iceberg/IcebergIO.java
@@ -81,10 +81,15 @@
* template to use dynamic destinations (see the `Dynamic Destinations` section below for details). </td>
* </tr>
* <tr>
* <td> {@code triggering_frequency_seconds} </td> <td> {@code int} </td> <td> Required for streaming writes. Roughly every
* <td> {@code triggering_frequency_seconds} </td>
* <td> {@code int} </td>
* <td>
* <p><b>Sink:</b> Required for streaming writes. Roughly every
* {@code triggering_frequency_seconds} duration, the sink will write records to data files and produce a table snapshot.
* Generally, a higher value will produce fewer, larger data files.
* </td>
* <p><b>Source:</b> Enables streaming reads. Roughly every {@code triggering_frequency_seconds} duration, the source
* will scan the table for new snapshots and read new records.
* </td>
* </tr>
* <tr>
* <td> {@code catalog_name} </td> <td> {@code str} </td> <td> The name of the catalog. Defaults to {@code apache-beam-<VERSION>}. </td>
@@ -101,6 +106,20 @@
* implementation, but <a href="https://iceberg.apache.org/docs/latest/configuration/#hadoop-configuration">this list</a>
* is a good starting point.
* </tr>
* <tr>
* <td> {@code from_snapshot_exclusive} </td>
* <td> {@code long} </td>
* <td> For the source; starts reading from this snapshot ID (exclusive). If unset, it will start reading from the
* oldest snapshot (inclusive).
* </td>
* </tr>
* <tr>
* <td> {@code to_snapshot} </td>
* <td> {@code long} </td>
* <td> For the source; reads up to this snapshot ID (inclusive). If unset and the source is bounded, it will read
* up to the current snapshot (inclusive). If unset and the source is unbounded, it will continue polling for new snapshots forever.
* </td>
* </tr>
* </table>
*
* <p><b>Additional configuration options are provided in the `Pre-filtering Options` section below,
@@ -405,6 +424,12 @@ public abstract static class ReadRows extends PTransform<PBegin, PCollection<Row

abstract @Nullable TableIdentifier getTableIdentifier();

abstract @Nullable Long getFromSnapshotExclusive();

abstract @Nullable Long getToSnapshot();

abstract @Nullable Duration getTriggeringFrequency();

abstract Builder toBuilder();

@AutoValue.Builder
@@ -413,29 +438,56 @@ abstract static class Builder {

abstract Builder setTableIdentifier(TableIdentifier identifier);

abstract Builder setFromSnapshotExclusive(@Nullable Long fromSnapshotExclusive);

abstract Builder setToSnapshot(@Nullable Long toSnapshot);

abstract Builder setTriggeringFrequency(Duration triggeringFrequency);

abstract ReadRows build();
}

public ReadRows from(TableIdentifier tableIdentifier) {
return toBuilder().setTableIdentifier(tableIdentifier).build();
}

public ReadRows fromSnapshotExclusive(@Nullable Long fromSnapshotExclusive) {
return toBuilder().setFromSnapshotExclusive(fromSnapshotExclusive).build();
}

public ReadRows toSnapshot(@Nullable Long toSnapshot) {
return toBuilder().setToSnapshot(toSnapshot).build();
}

public ReadRows withTriggeringFrequency(Duration triggeringFrequency) {
return toBuilder().setTriggeringFrequency(triggeringFrequency).build();
}

@Override
public PCollection<Row> expand(PBegin input) {
TableIdentifier tableId =
checkStateNotNull(getTableIdentifier(), "Must set a table to read from.");

Table table = getCatalogConfig().catalog().loadTable(tableId);

return input.apply(
Read.from(
new ScanSource(
IcebergScanConfig.builder()
.setCatalogConfig(getCatalogConfig())
.setScanType(IcebergScanConfig.ScanType.TABLE)
.setTableIdentifier(tableId)
.setSchema(IcebergUtils.icebergSchemaToBeamSchema(table.schema()))
.build())));
IcebergScanConfig scanConfig =
IcebergScanConfig.builder()
.setCatalogConfig(getCatalogConfig())
.setScanType(IcebergScanConfig.ScanType.TABLE)
.setTableIdentifier(tableId)
.setSchema(IcebergUtils.icebergSchemaToBeamSchema(table.schema()))
.setFromSnapshotExclusive(getFromSnapshotExclusive())
.setToSnapshot(getToSnapshot())
.build();
if (getTriggeringFrequency() != null
|| scanConfig.getToSnapshot() != null
|| scanConfig.getFromSnapshotExclusive() != null) {
return input
.apply(new IncrementalScanSource(scanConfig, getTriggeringFrequency()))
.setRowSchema(IcebergUtils.icebergSchemaToBeamSchema(table.schema()));
}

return input.apply(Read.from(new ScanSource(scanConfig)));
}
}
}
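For reference, a rough usage sketch of the new ReadRows options. It assumes the existing IcebergIO.readRows(IcebergCatalogConfig) entry point (not shown in this diff) and a hypothetical Hadoop catalog; it is not code from this PR. Setting a triggering frequency or a snapshot bound makes expand() choose the unbounded IncrementalScanSource path instead of the bounded ScanSource.

import java.util.HashMap;
import java.util.Map;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.iceberg.IcebergCatalogConfig;
import org.apache.beam.sdk.io.iceberg.IcebergIO;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.Row;
import org.apache.iceberg.catalog.TableIdentifier;
import org.joda.time.Duration;

public class StreamingIcebergReadSketch {
  public static void main(String[] args) {
    Pipeline pipeline = Pipeline.create();

    // Hypothetical catalog properties; adjust to your environment.
    Map<String, String> catalogProps = new HashMap<>();
    catalogProps.put("type", "hadoop");
    catalogProps.put("warehouse", "/tmp/warehouse");

    IcebergCatalogConfig catalogConfig =
        IcebergCatalogConfig.builder()
            .setCatalogName("demo")
            .setCatalogProperties(catalogProps)
            .build();

    // A triggering frequency (or a from/to snapshot bound) switches the read
    // to the incremental, unbounded source.
    PCollection<Row> rows =
        pipeline.apply(
            IcebergIO.readRows(catalogConfig)
                .from(TableIdentifier.of("db", "my_table"))
                .fromSnapshotExclusive(1L) // hypothetical snapshot ID
                .withTriggeringFrequency(Duration.standardSeconds(60)));

    pipeline.run();
  }
}

Through the Managed API, the same options surface as the triggering_frequency_seconds, from_snapshot_exclusive, and to_snapshot configuration fields documented in the table above.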