@@ -823,6 +823,7 @@ public enum LogKeys implements LogKey {
TIMEOUT,
TIMER,
TIMESTAMP,
TIMESTAMP_COLUMN_NAME,
TIME_UNITS,
TIP,
TOKEN,
@@ -19,15 +19,19 @@ package org.apache.spark.sql.kafka010

import java.{util => ju}

import org.apache.kafka.common.record.TimestampType

import org.apache.spark.TaskContext
import org.apache.spark.internal.Logging
import org.apache.spark.internal.{Logging, LogKeys}
import org.apache.spark.internal.LogKeys._
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.UnsafeRow
import org.apache.spark.sql.connector.metric.CustomTaskMetric
import org.apache.spark.sql.connector.read.{InputPartition, PartitionReader, PartitionReaderFactory}
import org.apache.spark.sql.connector.read.streaming.SupportsRealTimeRead
import org.apache.spark.sql.connector.read.streaming.SupportsRealTimeRead.RecordStatus
import org.apache.spark.sql.execution.streaming.runtime.{MicroBatchExecution, StreamExecution}
import org.apache.spark.sql.kafka010.consumer.KafkaDataConsumer
import org.apache.spark.sql.kafka010.consumer.{KafkaDataConsumer, KafkaDataConsumerIterator}

/** A [[InputPartition]] for reading Kafka data in a batch based streaming query. */
private[kafka010] case class KafkaBatchInputPartition(
@@ -67,7 +71,8 @@ private case class KafkaBatchPartitionReader(
executorKafkaParams: ju.Map[String, Object],
pollTimeoutMs: Long,
failOnDataLoss: Boolean,
includeHeaders: Boolean) extends PartitionReader[InternalRow] with Logging {
includeHeaders: Boolean)
extends SupportsRealTimeRead[InternalRow] with Logging {

private val consumer = KafkaDataConsumer.acquire(offsetRange.topicPartition, executorKafkaParams)

@@ -77,6 +82,12 @@

private var nextOffset = rangeToRead.fromOffset
private var nextRow: UnsafeRow = _
private var iteratorForRealTimeMode: Option[KafkaDataConsumerIterator] = None

// Boolean flag that indicates whether we have logged the type of timestamp (i.e. create time,
// log-append time, etc.) for the Kafka source. We log upon reading the first record, and we
// then skip logging for subsequent records.
private var timestampTypeLogged = false

override def next(): Boolean = {
if (nextOffset < rangeToRead.untilOffset) {
@@ -93,6 +104,38 @@
}
}

override def nextWithTimeout(timeoutMs: java.lang.Long): RecordStatus = {
if (iteratorForRealTimeMode.isEmpty) {
logInfo(s"Getting a new Kafka consumer iterator for ${offsetRange.topicPartition} " +
s"starting from ${nextOffset}, timeoutMs ${timeoutMs}")
iteratorForRealTimeMode = Some(consumer.getIterator(nextOffset))
}
assert(iteratorForRealTimeMode.isDefined)
val nextRecord = iteratorForRealTimeMode.get.nextWithTimeout(timeoutMs)
nextRecord.map { record =>
nextRow = unsafeRowProjector(record)
nextOffset = record.offset + 1
if (record.timestampType() == TimestampType.LOG_APPEND_TIME ||
record.timestampType() == TimestampType.CREATE_TIME) {
if (!timestampTypeLogged) {
logInfo(log"Kafka source record timestamp type is " +
log"${MDC(LogKeys.TIMESTAMP_COLUMN_NAME, record.timestampType())}")
timestampTypeLogged = true
}
Comment on lines +119 to +125

Member: Could you explain more on this logging behavior? Why do we need this logging?

Contributor Author: This tells us the semantics of the timestamp column of a Kafka record: whether the timestamp for records from this topic is set to the log-append time (when the record is persisted by the Kafka brokers) or to the create time, which is either when the record is produced by a Kafka producer or user-defined. This information is used when calculating latency, to understand which journey we are actually measuring.
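For reference, a minimal illustrative sketch (not code from this PR) of the latency measurement being referred to; what the computed interval means depends on the record's TimestampType:

// Illustrative only: `record` is a Kafka ConsumerRecord, as in the surrounding reader code.
// CREATE_TIME     -> timestamp set by the producer (or user): measures the end-to-end journey.
// LOG_APPEND_TIME -> timestamp set when the broker persists the record: measures broker-to-consumer only.
val arrivalLatencyMs = System.currentTimeMillis() - record.timestamp()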


RecordStatus.newStatusWithArrivalTimeMs(record.timestamp())
} else {
RecordStatus.newStatusWithoutArrivalTime(true)
}
}.getOrElse(RecordStatus.newStatusWithoutArrivalTime(false))
}

override def getOffset(): KafkaSourcePartitionOffset = {
KafkaSourcePartitionOffset(offsetRange.topicPartition, nextOffset)
}

override def get(): UnsafeRow = {
assert(nextRow != null)
nextRow
@@ -60,7 +60,11 @@ private[kafka010] class KafkaMicroBatchStream(
metadataPath: String,
startingOffsets: KafkaOffsetRangeLimit,
failOnDataLoss: Boolean)
extends SupportsTriggerAvailableNow with ReportsSourceMetrics with MicroBatchStream with Logging {
extends SupportsTriggerAvailableNow
with SupportsRealTimeMode
with ReportsSourceMetrics
with MicroBatchStream
with Logging {

private[kafka010] val pollTimeoutMs = options.getLong(
KafkaSourceProvider.CONSUMER_POLL_TIMEOUT,
@@ -93,6 +97,11 @@

private var isTriggerAvailableNow: Boolean = false

private var inRealTimeMode = false
override def prepareForRealTimeMode(): Unit = {
inRealTimeMode = true
}

/**
* Lazily initialize `initialPartitionOffsets` to make sure that `KafkaConsumer.poll` is only
* called in StreamExecutionThread. Otherwise, interrupting a thread while running
@@ -218,6 +227,93 @@
}.toArray
}

override def planInputPartitions(start: Offset): Array[InputPartition] = {
// This function is used for real time mode. Trigger restrictions won't be supported.
if (maxOffsetsPerTrigger.isDefined) {
throw new UnsupportedOperationException(
"maxOffsetsPerTrigger is not compatible with real time mode")
}
if (minOffsetPerTrigger.isDefined) {
throw new UnsupportedOperationException(
"minOffsetsPerTrigger is not compatible with real time mode"
)
}
if (options.containsKey(KafkaSourceProvider.MIN_PARTITIONS_OPTION_KEY)) {
throw new UnsupportedOperationException(
"minpartitions is not compatible with real time mode"
)
}
if (options.containsKey(KafkaSourceProvider.ENDING_TIMESTAMP_OPTION_KEY)) {
throw new UnsupportedOperationException(
"endingtimestamp is not compatible with real time mode"
)
}
if (options.containsKey(KafkaSourceProvider.MAX_TRIGGER_DELAY)) {
throw new UnsupportedOperationException(
"maxtriggerdelay is not compatible with real time mode"
)
}

// This function is used by Low Latency Mode, where we expect 1:1 mapping between a
Member: Suggested change:
- // This function is used by Low Latency Mode, where we expect 1:1 mapping between a
+ // This function is used by real time mode, where we expect 1:1 mapping between a

// topic partition and an input partition.
// We are skipping partition range check for performance reason. We can always try to do
// it in tasks if needed.
val startPartitionOffsets = start.asInstanceOf[KafkaSourceOffset].partitionToOffsets

// Here we compare the previous topic partitions with the latest partition offsets to see
// whether the partition list needs to be updated. The list does not have to be absolutely
// up to date, because there may already be minutes of delay since a new partition was
// created, and latestPartitionOffsets should have been fetched recently anyway.
// If the topic partitions change, we fetch the earliest offsets for all new partitions
// and add them to the list.
assert(latestPartitionOffsets != null, "latestPartitionOffsets should be set in latestOffset")
val latestTopicPartitions = latestPartitionOffsets.keySet
val newStartPartitionOffsets = if (startPartitionOffsets.keySet == latestTopicPartitions) {
startPartitionOffsets
} else {
val newPartitions = latestTopicPartitions.diff(startPartitionOffsets.keySet)
// Instead of fetching the earliest offsets, we could fill in offset 0 here and avoid this
// extra admin call. But new partitions are rare, and fetching the earliest offsets aligns
// with what we do in micro-batch mode and can potentially enable more sanity checks on the
// executor side.
val newPartitionOffsets = kafkaOffsetReader.fetchEarliestOffsets(newPartitions.toSeq)
Member: KafkaMicroBatchStream's existing planInputPartitions calls kafkaOffsetReader.getOffsetRangesFromResolvedOffsets to handle partition offsets. It handles the deleted-partitions case, but this new planInputPartitions doesn't; should we also do the same?

Contributor Author: Kafka doesn't support deleting partitions, so I am not sure that case is worth checking. If the topic was deleted and recreated, the offsets would not be valid and we would fail in that case anyway.

Member: Hmm, this is what getOffsetRangesFromResolvedOffsets, called by KafkaMicroBatchStream.planInputPartitions, currently does:

if (newPartitionInitialOffsets.keySet != newPartitions) {
  // We cannot get from offsets for some partitions. It means they got deleted.
  val deletedPartitions = newPartitions.diff(newPartitionInitialOffsets.keySet)
  reportDataLoss(
    s"Cannot find earliest offsets of ${deletedPartitions}. Some data may have been missed",
    () => KafkaExceptions.initialOffsetNotFoundForPartitions(deletedPartitions))
}

The behavior of reportDataLoss is configurable: it can be a failure like what you did here, or a log warning. I would suggest following the existing behavior instead of having two different behaviors.

Contributor Author: ok
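To make the suggestion concrete, a hypothetical sketch (not the final change) of how this planInputPartitions could route missing earliest offsets through the same reportDataLoss path, assuming reportDataLoss and KafkaExceptions.initialOffsetNotFoundForPartitions are reachable here as they are in getOffsetRangesFromResolvedOffsets:

// Hypothetical sketch: surface partitions whose earliest offsets could not be fetched
// through the same configurable reportDataLoss path used in micro-batch planning.
if (newPartitionOffsets.keySet != newPartitions) {
  val deletedPartitions = newPartitions.diff(newPartitionOffsets.keySet)
  reportDataLoss(
    s"Cannot find earliest offsets of ${deletedPartitions}. Some data may have been missed",
    () => KafkaExceptions.initialOffsetNotFoundForPartitions(deletedPartitions))
}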


assert(
newPartitionOffsets.keys.forall(!startPartitionOffsets.contains(_)),
"startPartitionOffsets should not contain any key in newPartitionOffsets")

// Filter out new partition offsets that are not 0 and log a warning
val nonZeroNewPartitionOffsets = newPartitionOffsets.filter {
case (_, offset) => offset != 0
}
// Log the non-zero new partition offsets
if (nonZeroNewPartitionOffsets.nonEmpty) {
logWarning(log"new partitions should start from offset 0: " +
log"${MDC(OFFSETS, nonZeroNewPartitionOffsets)}")
}
Comment on lines 286 to 300

Member: For the non-zero-offset new-partitions case, getOffsetRangesFromResolvedOffsets delegates to the reportDataLoss closure. Should we do the same?

Contributor Author: Let me add that and make the behavior consistent.


logInfo(log"Added new partition offsets: ${MDC(OFFSETS, newPartitionOffsets)}")
startPartitionOffsets ++ newPartitionOffsets
}

newStartPartitionOffsets.keySet.toSeq.map { tp =>
val fromOffset = newStartPartitionOffsets(tp)
KafkaBatchInputPartition(
KafkaOffsetRange(tp, fromOffset, Long.MaxValue, preferredLoc = None),
executorKafkaParams,
pollTimeoutMs,
failOnDataLoss,
includeHeaders)
}.toArray
}

override def mergeOffsets(offsets: Array[PartitionOffset]): Offset = {
val mergedMap = offsets.map {
case KafkaSourcePartitionOffset(p, o) => (p, o)
}.toMap
KafkaSourceOffset(mergedMap)
}

override def createReaderFactory(): PartitionReaderFactory = {
KafkaBatchReaderFactory
}
@@ -235,7 +331,30 @@
override def toString(): String = s"KafkaV2[$kafkaOffsetReader]"

override def metrics(latestConsumedOffset: Optional[Offset]): ju.Map[String, String] = {
KafkaMicroBatchStream.metrics(latestConsumedOffset, latestPartitionOffsets)
var rtmFetchLatestOffsetsTimeMs = Option.empty[Long]
val reCalculatedLatestPartitionOffsets =
if (inRealTimeMode) {
if (!latestConsumedOffset.isPresent) {
// this means a batch has no end offsets, which should not happen
None
} else {
Some {
val startTime = System.currentTimeMillis()
val latestOffsets = kafkaOffsetReader.fetchLatestOffsets(
Some(latestConsumedOffset.get.asInstanceOf[KafkaSourceOffset].partitionToOffsets))
val endTime = System.currentTimeMillis()
rtmFetchLatestOffsetsTimeMs = Some(endTime - startTime)
Member: Hmm, I'm not sure if I missed something. rtmFetchLatestOffsetsTimeMs is assigned here, but it is not used anywhere?

Contributor Author: Let me remove this variable; it is not necessary.

latestOffsets
}
}
} else {
// If we are in micro-batch mode, we need to get the latest partition offsets at the
// start of the batch and recalculate the latest offsets at the end for backlog
// estimation.
Member: This comment is not needed, right?

Contributor Author: Will revert this code based on this thread:
https://github.com/apache/spark/pull/52729/files#r2482113567
So this is not needed.

Some(kafkaOffsetReader.fetchLatestOffsets(Some(latestPartitionOffsets)))
Member: This changes the original behavior? Previously it just used latestPartitionOffsets without fetching the latest offsets again.

Contributor Author: This is actually fixing an issue with non-RTM queries using Kafka. The calculation is not correct here and will always result in the backlog metrics being zero. latestPartitionOffsets is computed when latestOffset is called at the beginning of a batch; it is essentially the offset this batch will read up to, so for non-RTM streaming queries latestConsumedOffset will be the same as latestPartitionOffsets, resulting in zero backlog. What we should be doing is fetch the latest offsets from the source Kafka topic after the batch is processed, i.e. when metrics() is called, to calculate a useful backlog metric. I know this is not really related to RTM, so let me know if I should just create a separate PR for this.

Member: Yeah, let's focus on RTM in this PR and not change/fix existing behavior here. Please open a separate PR to fix it if you think it is an issue.

Contributor Author: ok
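A worked toy example (hypothetical offsets, not from this PR) of the backlog difference described above:

// Offsets the batch actually consumed:
val consumed = Map("tp-0" -> 100L)
// Snapshot taken when the batch was planned (latestPartitionOffsets); the batch reads up to it,
// so comparing against it always yields zero backlog for non-RTM queries:
val planningSnapshot = Map("tp-0" -> 100L)
planningSnapshot.map { case (tp, o) => tp -> (o - consumed(tp)) }  // Map(tp-0 -> 0)
// Offsets re-fetched when metrics() runs reflect the lag that accumulated during processing:
val refetched = Map("tp-0" -> 150L)
refetched.map { case (tp, o) => tp -> (o - consumed(tp)) }         // Map(tp-0 -> 50)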

}

KafkaMicroBatchStream.metrics(latestConsumedOffset, reCalculatedLatestPartitionOffsets)
}

/**
@@ -386,13 +505,14 @@ object KafkaMicroBatchStream extends Logging {
*/
def metrics(
latestConsumedOffset: Optional[Offset],
latestAvailablePartitionOffsets: PartitionOffsetMap): ju.Map[String, String] = {
latestAvailablePartitionOffsets: Option[PartitionOffsetMap]): ju.Map[String, String] = {
val offset = Option(latestConsumedOffset.orElse(null))

if (offset.nonEmpty && latestAvailablePartitionOffsets != null) {
if (offset.nonEmpty && latestAvailablePartitionOffsets.isDefined) {
val consumedPartitionOffsets = offset.map(KafkaSourceOffset(_)).get.partitionToOffsets
val offsetsBehindLatest = latestAvailablePartitionOffsets
.map(partitionOffset => partitionOffset._2 - consumedPartitionOffsets(partitionOffset._1))
val offsetsBehindLatest = latestAvailablePartitionOffsets.get
.map(partitionOffset => partitionOffset._2 -
consumedPartitionOffsets.getOrElse(partitionOffset._1, 0L))
if (offsetsBehindLatest.nonEmpty) {
val avgOffsetBehindLatest = offsetsBehindLatest.sum.toDouble / offsetsBehindLatest.size
return Map[String, String](
@@ -63,6 +63,13 @@ private[kafka010] class InternalKafkaConsumer(
private[consumer] var kafkaParamsWithSecurity: ju.Map[String, Object] = _
private val consumer = createConsumer()

def poll(pollTimeoutMs: Long): ju.List[ConsumerRecord[Array[Byte], Array[Byte]]] = {
val p = consumer.poll(Duration.ofMillis(pollTimeoutMs))
val r = p.records(topicPartition)
logDebug(s"Polled $groupId ${p.partitions()} ${r.size}")
r
}

/**
* Poll messages from Kafka starting from `offset` and returns a pair of "list of consumer record"
* and "offset after poll". The list of consumer record may be empty if the Kafka consumer fetches
@@ -131,7 +138,7 @@
c
}

private def seek(offset: Long): Unit = {
def seek(offset: Long): Unit = {
logDebug(s"Seeking to $groupId $topicPartition $offset")
consumer.seek(topicPartition, offset)
}
@@ -228,6 +235,19 @@ private[consumer] case class FetchedRecord(
}
}

/**
* This iterator keeps returning the next record. If no new record is available, it keeps
* polling until the timeout. It is used by KafkaBatchPartitionReader.nextWithTimeout() to
* reduce seeking overhead in real-time mode.
*/
private[sql] trait KafkaDataConsumerIterator {
/**
* Return the next record
* @return None if no new record is available after `timeoutMs`.
*/
def nextWithTimeout(timeoutMs: Long): Option[ConsumerRecord[Array[Byte], Array[Byte]]]
}
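A minimal usage sketch of this trait, with a hypothetical caller (real usage is KafkaBatchPartitionReader.nextWithTimeout()):

// Hypothetical helper; ConsumerRecord is already imported in this file.
def readOne(it: KafkaDataConsumerIterator, timeoutMs: Long): Unit = {
  it.nextWithTimeout(timeoutMs) match {
    case Some(record) => println(s"offset=${record.offset} timestamp=${record.timestamp}")
    case None => println(s"no record arrived within ${timeoutMs} ms")
  }
}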

/**
* This class helps caller to read from Kafka leveraging consumer pool as well as fetched data pool.
* This class throws error when data loss is detected while reading from Kafka.
@@ -272,6 +292,82 @@ private[kafka010] class KafkaDataConsumer(
// Starting timestamp when the consumer is created.
private var startTimestampNano: Long = System.nanoTime()

/**
* Get an iterator that can return the next entry. It is used exclusively for real-time
* mode.
*
* It is called by KafkaBatchPartitionReader.nextWithTimeout(). Unlike get(), there is no
* out-of-bound check in this function. Since there is no endOffset given, we assume any
* record at or after `offset` is valid to return.
*
* @param startOffsets, the starting positions to read from, inclusive.
*/
def getIterator(offset: Long): KafkaDataConsumerIterator = {
Member: The param doc says startOffsets instead of offset. Suggested change:
- def getIterator(offset: Long): KafkaDataConsumerIterator = {
+ def getIterator(startOffsets: Long): KafkaDataConsumerIterator = {

new KafkaDataConsumerIterator {
private var fetchedRecordList
: Option[ju.ListIterator[ConsumerRecord[Array[Byte], Array[Byte]]]] = None
private val consumer = getOrRetrieveConsumer()
private var firstRecord = true
private var _currentOffset: Long = offset - 1

private def fetchedRecordListHasNext(): Boolean = {
fetchedRecordList.map(_.hasNext).getOrElse(false)
}

override def nextWithTimeout(
timeoutMs: Long): Option[ConsumerRecord[Array[Byte], Array[Byte]]] = {
var timeLeftMs = timeoutMs

def timeAndDeductFromTimeLeftMs[T](body: => T): Unit = {
// To avoid timing the same operation twice, we reuse the timing result both for
// totalTimeReadNanos and for deducting from timeLeftMs.
val prevTime = totalTimeReadNanos
timeNanos {
body
}
timeLeftMs -= (totalTimeReadNanos - prevTime) / 1000000
}

if (firstRecord) {
timeAndDeductFromTimeLeftMs {
consumer.seek(offset)
firstRecord = false
}
}
while (!fetchedRecordListHasNext() && timeLeftMs > 0) {
timeAndDeductFromTimeLeftMs {
try {
val records = consumer.poll(timeLeftMs)
numPolls += 1
if (!records.isEmpty) {
numRecordsPolled += records.size
fetchedRecordList = Some(records.listIterator)
}
} catch {
case ex: OffsetOutOfRangeException =>
if (_currentOffset != -1) {
throw ex
} else {
Thread.sleep(10) // retry until the source partition is populated
assert(offset == 0)
consumer.seek(offset)
}
}
}
}
if (fetchedRecordListHasNext()) {
totalRecordsRead += 1
val nextRecord = fetchedRecordList.get.next()
assert(nextRecord.offset > _currentOffset, "Kafka offset should be incremental.")
_currentOffset = nextRecord.offset
Some(nextRecord)
} else {
None
}
}
}
}

/**
* Get the record for the given offset if available.
*