Skip to content

Configurable blocksize mode for streaming executor in unit tests #19146

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 15 commits into
base: branch-25.08
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions ci/run_cudf_polars_pytests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ python -m pytest --cache-clear "$@" tests
# Test the "streaming" executor
python -m pytest --cache-clear "$@" tests --executor streaming

# Test the "streaming" executor with small blocksize
python -m pytest --cache-clear "$@" tests --executor streaming --blocksize-mode small

# Run experimental tests with Distributed cluster
python -m pytest --cache-clear "$@" "tests/experimental" \
--executor streaming \
Expand Down
27 changes: 23 additions & 4 deletions python/cudf_polars/cudf_polars/testing/asserts.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,14 @@
from __future__ import annotations

from pathlib import Path
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Any, Literal

import polars as pl
from polars import GPUEngine
from polars.testing.asserts import assert_frame_equal

from cudf_polars.dsl.translate import Translator
from cudf_polars.utils.config import StreamingFallbackMode

if TYPE_CHECKING:
from cudf_polars.typing import OptimizationArgs
Expand All @@ -29,6 +30,7 @@
# and `--scheduler` command-line arguments
DEFAULT_EXECUTOR = "in-memory"
DEFAULT_SCHEDULER = "synchronous"
DEFAULT_BLOCKSIZE_MODE: Literal["small", "default"] = "default"


def assert_gpu_result_equal(
Expand All @@ -46,6 +48,7 @@ def assert_gpu_result_equal(
atol: float = 1e-08,
categorical_as_str: bool = False,
executor: str | None = None,
blocksize_mode: Literal["small", "default"] | None = None,
) -> None:
"""
Assert that collection of a lazyframe on GPU produces correct results.
Expand Down Expand Up @@ -86,6 +89,12 @@ def assert_gpu_result_equal(
executor
The executor configuration to pass to `GPUEngine`. If not specified
uses the module level `Executor` attribute.
blocksize_mode
The "mode" to use for choosing the blocksize for the streaming executor.
If not specified, uses the module level ``DEFAULT_BLOCKSIZE_MODE`` attribute.
Set to "small" to configure small values for ``max_rows_per_partition``
and ``target_partition_size``, which will typically cause many partitions
to be created while executing the query.

Raises
------
Expand All @@ -95,13 +104,23 @@ def assert_gpu_result_equal(
If GPU collection failed in some way.
"""
if engine is None:
executor_options: dict[str, Any] = {}
executor = executor or DEFAULT_EXECUTOR
if executor == "streaming":
executor_options["scheduler"] = DEFAULT_SCHEDULER

blocksize_mode = blocksize_mode or DEFAULT_BLOCKSIZE_MODE

if blocksize_mode == "small": # pragma: no cover
executor_options["max_rows_per_partition"] = 4
executor_options["target_partition_size"] = 10
# We expect many tests to fall back, so silence the warnings
executor_options["fallback_mode"] = StreamingFallbackMode.SILENT

engine = GPUEngine(
raise_on_fail=True,
executor=executor,
executor_options=(
{"scheduler": DEFAULT_SCHEDULER} if executor == "streaming" else {}
),
executor_options=executor_options,
)

final_polars_collect_kwargs, final_cudf_collect_kwargs = _process_kwargs(
Expand Down
14 changes: 14 additions & 0 deletions python/cudf_polars/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,17 @@ def pytest_addoption(parser):
help="Scheduler to use for 'streaming' executor.",
)

parser.addoption(
"--blocksize-mode",
action="store",
default="default",
choices=("small", "default"),
help=(
"Blocksize to use for 'streaming' executor. Set to 'small' "
"to run most tests with multiple partitions."
),
)


def pytest_configure(config):
import cudf_polars.testing.asserts
Expand All @@ -43,6 +54,9 @@ def pytest_configure(config):

cudf_polars.testing.asserts.DEFAULT_EXECUTOR = config.getoption("--executor")
cudf_polars.testing.asserts.DEFAULT_SCHEDULER = config.getoption("--scheduler")
cudf_polars.testing.asserts.DEFAULT_BLOCKSIZE_MODE = config.getoption(
"--blocksize-mode"
)


def pytest_sessionstart(session):
Expand Down
19 changes: 19 additions & 0 deletions python/cudf_polars/tests/containers/test_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,25 @@ def test_non_scalar_access_raises():
_ = column.obj_scalar


def test_check_sorted():
    """check_sorted reports ascending order both before and after set_sorted."""
    int8 = DataType(pl.Int8())
    ascending = plc.types.Order.ASCENDING
    nulls_after = plc.types.NullOrder.AFTER
    col = Column(
        plc.Column.from_iterable_of_py([0, 1, 2], int8.plc),
        dtype=int8,
    )
    # Checked before any sortedness metadata has been set on the column.
    assert col.check_sorted(order=ascending, null_order=nulls_after)
    # Mark the column as sorted, then verify check_sorted still agrees.
    col.set_sorted(
        is_sorted=plc.types.Sorted.YES,
        order=ascending,
        null_order=nulls_after,
    )
    assert col.check_sorted(order=ascending, null_order=nulls_after)


@pytest.mark.parametrize("length", [0, 1])
def test_length_leq_one_always_sorted(length):
dtype = DataType(pl.Int8())
Expand Down
7 changes: 6 additions & 1 deletion python/cudf_polars/tests/expressions/test_agg.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from cudf_polars.dsl import expr
from cudf_polars.testing.asserts import (
DEFAULT_BLOCKSIZE_MODE,
assert_gpu_result_equal,
assert_ir_translation_raises,
)
Expand Down Expand Up @@ -74,7 +75,11 @@ def test_agg(df, agg):

# https://github.com/rapidsai/cudf/issues/15852
check_dtypes = agg not in {"n_unique", "median"}
if not check_dtypes and q.collect_schema()["a"] != pl.Float64:
if (
not check_dtypes
and q.collect_schema()["a"] != pl.Float64
and DEFAULT_BLOCKSIZE_MODE == "default"
Copy link
Contributor Author

@TomAugspurger TomAugspurger Jun 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This line is the new condition. The IR graph produced by the multi-partition executor includes some Cast nodes that mean we do match polars, so we don't raise an AssertionError in this block (since we've worked around #15852).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Question: Should we introduce those casts in the in-memory executor instead (during translation time).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe... Matching the output type is obviously nice. But I do like that it's obvious that we're not sacrificing performance with an ad-hoc cast.

):
with pytest.raises(AssertionError):
assert_gpu_result_equal(q)
assert_gpu_result_equal(q, check_dtypes=check_dtypes, check_exact=False)
Expand Down
3 changes: 2 additions & 1 deletion python/cudf_polars/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,8 @@ def test_groupby_maintain_order_random(nrows, nkeys, with_nulls):
)
)
q = df.lazy().group_by(key_names, maintain_order=True).agg(pl.col("value").sum())
assert_gpu_result_equal(q)
# The streaming executor is too slow for large n_rows with blocksize_mode="small"
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know if there's a better way to find slow tests, other than watching out for tests that use a large number of rows with the default engine / blocksize_mode. pytest does have a --durations flag we could include.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you saying we could use --durations to find the slow tests and then possibly set blocksize="default" on those tests?

assert_gpu_result_equal(q, blocksize_mode="default" if nrows > 30 else None)


def test_groupby_len_with_nulls():
Expand Down
12 changes: 8 additions & 4 deletions python/cudf_polars/tests/test_join.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import polars as pl

from cudf_polars.testing.asserts import (
DEFAULT_BLOCKSIZE_MODE,
assert_gpu_result_equal,
assert_ir_translation_raises,
)
Expand Down Expand Up @@ -71,7 +72,7 @@ def test_non_coalesce_join(left, right, how, nulls_equal, join_expr):
query = left.join(
right, on=join_expr, how=how, nulls_equal=nulls_equal, coalesce=False
)
assert_gpu_result_equal(query, check_row_order=how == "left")
assert_gpu_result_equal(query, check_row_order=False)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here, and in many of the other join tests, we no longer check row order.

We could make this slightly more complicated like check_row_order=(how == "left" and BLOCKSIZE_MODE="default"). My understanding is that newer versions of polars don't even guarantee the row order for left joins so I opted to just not check that.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, that's a good decision. We should also remove the "by default reorder" for left joins if we don't already.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.



@pytest.mark.parametrize(
Expand All @@ -85,16 +86,19 @@ def test_coalesce_join(left, right, how, nulls_equal, join_expr):
query = left.join(
right, on=join_expr, how=how, nulls_equal=nulls_equal, coalesce=True
)
assert_gpu_result_equal(query, check_row_order=how == "left")
assert_gpu_result_equal(query, check_row_order=False)


def test_left_join_with_slice(left, right, nulls_equal, zlice):
q = left.join(right, on="a", how="left", nulls_equal=nulls_equal, coalesce=True)

if zlice is not None:
if DEFAULT_BLOCKSIZE_MODE == "small" and zlice != (0, None):
pytest.skip("Cannot match polars' ordering with multiple partitions.")
Comment on lines +96 to +97
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For left.join(right).slice(...), the actual output depends on the row ordering, which I'm calling an implementation detail here.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, that's reasonable. It also depends on row ordering for a single partition, so how do we match polars for single partitions? Do we sort afterwards?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just speculation, but I think that for polars and the streaming-executor-with-one-partition just happens to still maintain the order for left joins. Perhaps as https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.join.html notes

Do not rely on any observed ordering without explicitly setting this parameter, as your code may break in a future release.

and we should relax this test to just check that the number of rows is 2 and that the schema matches.


q = q.slice(*zlice)

assert_gpu_result_equal(q)
assert_gpu_result_equal(q, check_row_order=False)


def test_cross_join(left, right, zlice):
Expand All @@ -114,7 +118,7 @@ def test_cross_join(left, right, zlice):
)
def test_join_literal_key(left, right, left_on, right_on):
    """Inner join where one side of the key pair may be a literal expression."""
    query = left.join(right, left_on=left_on, right_on=right_on, how="inner")
    # Row order after a join is an implementation detail (and differs across
    # partition counts), so compare results without checking row order.
    assert_gpu_result_equal(query, check_row_order=False)


@pytest.mark.parametrize(
Expand Down
7 changes: 5 additions & 2 deletions python/cudf_polars/tests/test_scan.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,10 +331,13 @@ def test_scan_parquet_only_row_index_raises(df, tmp_path):
assert_ir_translation_raises(q, NotImplementedError)


def test_scan_include_file_path(request, tmp_path, format, scan_fn, df):
@pytest.mark.parametrize("n_rows", [None, 2])
def test_scan_include_file_path(request, tmp_path, format, scan_fn, df, n_rows):
if n_rows is not None:
df = df.head(n_rows)
Comment on lines +336 to +337
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was needed for test coverage with blocksize_mode="small".

make_partitioned_source(df, tmp_path / "file", format)

q = scan_fn(tmp_path / "file", include_file_paths="files")
q = scan_fn(tmp_path / "file", include_file_paths="files", n_rows=n_rows)

if format == "ndjson":
assert_ir_translation_raises(q, NotImplementedError)
Expand Down
Loading