 package com.highperformancespark.examples.structuredstreaming
 
-// tag::streaming_ex_json_window_test[]
-// Test for JsonWindowedAggExample: verifies late rows are dropped and state is bounded
-
 import org.scalatest.funsuite.AnyFunSuite
-import org.apache.spark.sql.SparkSession
-import org.apache.spark.sql.streaming.Trigger
+import org.apache.spark.sql.{SaveMode, SparkSession}
 import org.apache.spark.sql.functions._
+import org.apache.spark.sql.streaming.Trigger
+import java.nio.file.Files
 import java.sql.Timestamp
 
-class JsonWindowedAggExampleSuite extends AnyFunSuite {
-  test("windowed agg drops late rows beyond watermark") {
-    val spark = SparkSession.builder()
+class JsonWindowedAggExampleFileIT extends AnyFunSuite {
+
+  private def withSpark[T](f: SparkSession => T): T = {
+    val spark = SparkSession.builder()
+      .appName("JsonWindowedAggExampleFileIT")
       .master("local[2]")
-      .appName("JsonWindowedAggExampleSuite")
+      .config("spark.ui.enabled", "false")
+      .config("spark.sql.shuffle.partitions", "2")
       .getOrCreate()
-    import spark.implicits._
-
-    import org.apache.spark.sql.execution.streaming.MemoryStream
-    val inputStream = MemoryStream[(Timestamp, String)](1, spark.sqlContext)
-    val now = System.currentTimeMillis()
-    val rows = Seq(
-      (new Timestamp(now - 1000 * 60 * 5), "foo"),  // within window
-      (new Timestamp(now - 1000 * 60 * 50), "bar"), // late, beyond watermark
-      (new Timestamp(now - 1000 * 60 * 2), "foo")   // within window
-    )
-    inputStream.addData(rows: _*)
-    val df = inputStream.toDF().toDF("timestamp", "word")
-    val withWatermark = df.withWatermark("timestamp", "42 minutes")
-    val windowed = withWatermark
-      .groupBy(window(col("timestamp"), "10 minutes"), col("word"))
-      .count()
-
-    val query = windowed.writeStream
-      .outputMode("append")
-      .format("memory")
-      .queryName("json_windowed_agg")
-      .trigger(Trigger.Once())
-      .option("checkpointLocation", "./tmp/checkpoints/json_windowed_agg_test")
-      .start()
-    query.processAllAvailable()
-    query.awaitTermination()
-
-    val result = spark.sql("select word, count from json_windowed_agg").collect().map(_.getString(0)).toSet
-    assert(result.contains("foo"))
-    assert(!result.contains("bar"), "Late row 'bar' should be dropped")
-    spark.stop()
+    try f(spark) finally spark.stop()
+  }
+
+  test("file JSON source: sequential writes close windows via watermark (append mode)") {
+    withSpark { spark =>
+      import spark.implicits._
+
+      val inputDir = Files.createTempDirectory("json-input-it").toFile.getAbsolutePath
+      val chkDir = Files.createTempDirectory("chk-it").toFile.getAbsolutePath
+      val qName = "json_winagg_mem_it"
+
+      // Start the stream FIRST, using a periodic trigger and a watermark
+      val q = JsonWindowedAggExample.makeQueryWith(
+        spark,
+        inputPath = inputDir,
+        checkpointDir = chkDir,
+        outputFormat = "memory", // assertable sink
+        queryName = Some(qName),
+        trigger = Trigger.ProcessingTime("250 milliseconds"),
+        addWatermark = true // watermark = 5 minutes (set in builder)
+      )
+
+      // --- Batch 1: events in [10:00, 10:10)
+      Seq(
+        ("2025-01-01 10:01:00", "hello"),
+        ("2025-01-01 10:05:00", "hello"),
+        ("2025-01-01 10:05:00", "world")
+      ).map { case (ts, w) => (Timestamp.valueOf(ts), w) }
+        .toDF("timestamp", "word")
+        .write.mode(SaveMode.Append).json(inputDir)
+
+      // Let the stream pick up batch 1
+      q.processAllAvailable() // ok in tests
+
+      // Nothing should be emitted yet in append mode (window not closed)
+      assert(spark.table(qName).count() == 0)
+
+      // --- Batch 2: later event at 10:16 moves max event time to 10:16.
+      // Watermark = maxEventTime - 5m = 10:11 >= 10:10, so [10:00, 10:10) closes and emits.
+      Seq(("2025-01-01 10:16:00", "hello"))
+        .map { case (ts, w) => (Timestamp.valueOf(ts), w) }
+        .toDF("timestamp", "word")
+        .write.mode(SaveMode.Append).json(inputDir)
+
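+      // Note (editor's assumption): once the watermark advances, Spark may need an
+      // extra no-data micro-batch to flush the newly closed window to the sink;
+      // processAllAvailable() is expected to cover that in a test setting.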
+      q.processAllAvailable()
+
+      val afterBatch2 = spark.table(qName)
+        .select(
+          date_format(col("window.start"), "yyyy-MM-dd HH:mm:ss").as("start"),
+          date_format(col("window.end"), "yyyy-MM-dd HH:mm:ss").as("end"),
+          col("word"),
+          col("count")
+        )
+        .collect()
+        .map(r => (r.getString(0), r.getString(1), r.getString(2), r.getLong(3)))
+        .toSet
+
+      val expectedAfterBatch2 = Set(
+        ("2025-01-01 10:00:00", "2025-01-01 10:10:00", "hello", 2L),
+        ("2025-01-01 10:00:00", "2025-01-01 10:10:00", "world", 1L)
+      )
+      assert(afterBatch2 == expectedAfterBatch2)
+
+      // --- Batch 3: event at 10:26 closes [10:10, 10:20).
+      // New watermark = 10:21 >= 10:20 ⇒ the second window can now emit.
+      Seq(("2025-01-01 10:26:00", "noop"))
+        .map { case (ts, w) => (Timestamp.valueOf(ts), w) }
+        .toDF("timestamp", "word")
+        .write.mode(SaveMode.Append).json(inputDir)
+
+      q.processAllAvailable()
+
+      val finalOut = spark.table(qName)
+        .select(
+          date_format(col("window.start"), "yyyy-MM-dd HH:mm:ss").as("start"),
+          date_format(col("window.end"), "yyyy-MM-dd HH:mm:ss").as("end"),
+          col("word"),
+          col("count")
+        )
+        .collect()
+        .map(r => (r.getString(0), r.getString(1), r.getString(2), r.getLong(3)))
+        .toSet
+
+      val expectedFinal = expectedAfterBatch2 ++ Set(
+        ("2025-01-01 10:10:00", "2025-01-01 10:20:00", "hello", 1L)
+      )
+      assert(finalOut == expectedFinal)
+
+      q.stop()
+    }
   }
 }
-// end::streaming_ex_json_window_test[]
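
The test drives JsonWindowedAggExample.makeQueryWith, which is not part of this diff. For orientation, below is a minimal sketch of what a compatible helper could look like: the signature is inferred from the call site above, and the 10-minute tumbling window, 5-minute watermark, and append output mode are assumptions implied by the test's expected output, not the example's actual source.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.streaming.{StreamingQuery, Trigger}
import org.apache.spark.sql.types.{StringType, StructField, StructType, TimestampType}

object JsonWindowedAggExample {
  // Hypothetical helper; signature inferred from the test's call site.
  def makeQueryWith(
      spark: SparkSession,
      inputPath: String,
      checkpointDir: String,
      outputFormat: String,
      queryName: Option[String],
      trigger: Trigger,
      addWatermark: Boolean): StreamingQuery = {
    // Streaming file sources require an explicit schema.
    val schema = StructType(Seq(
      StructField("timestamp", TimestampType),
      StructField("word", StringType)))
    val raw = spark.readStream.schema(schema).json(inputPath)
    // 5-minute watermark and 10-minute tumbling windows, as implied by the
    // assertions in the test (assumption, not confirmed by this diff).
    val withWm =
      if (addWatermark) raw.withWatermark("timestamp", "5 minutes") else raw
    val counts = withWm
      .groupBy(window(col("timestamp"), "10 minutes"), col("word"))
      .count()
    val writer = counts.writeStream
      .outputMode("append") // append emits a window only after the watermark passes its end
      .format(outputFormat)
      .option("checkpointLocation", checkpointDir)
      .trigger(trigger)
    queryName.fold(writer)(writer.queryName).start()
  }
}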