Configurable blocksize mode for streaming executor in unit tests #19146
Base branch: branch-25.08
Changes to the aggregation expression tests:
@@ -8,6 +8,7 @@
 from cudf_polars.dsl import expr

 from cudf_polars.testing.asserts import (
+    DEFAULT_BLOCKSIZE_MODE,
     assert_gpu_result_equal,
     assert_ir_translation_raises,
 )
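The diff imports DEFAULT_BLOCKSIZE_MODE from cudf_polars.testing.asserts, but its definition is not shown in this view. As a rough sketch of the idea only (the environment-variable name and accepted values below are assumptions, not the PR's actual implementation), a module-level switch like this would let the test session choose the blocksize mode:

```python
# Hypothetical sketch -- not the PR's code. Assumes the test-session default
# is picked up from an environment variable with values "default" or "small".
import os

DEFAULT_BLOCKSIZE_MODE: str = os.environ.get(
    "CUDF_POLARS_TEST_BLOCKSIZE_MODE", "default"  # variable name is made up
)

if DEFAULT_BLOCKSIZE_MODE not in {"default", "small"}:
    raise ValueError(f"Unsupported blocksize mode: {DEFAULT_BLOCKSIZE_MODE!r}")
```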
@@ -74,7 +75,11 @@ def test_agg(df, agg):
     # https://github.com/rapidsai/cudf/issues/15852
     check_dtypes = agg not in {"n_unique", "median"}
-    if not check_dtypes and q.collect_schema()["a"] != pl.Float64:
+    if (
+        not check_dtypes
+        and q.collect_schema()["a"] != pl.Float64
+        and DEFAULT_BLOCKSIZE_MODE == "default"
+    ):
         with pytest.raises(AssertionError):
             assert_gpu_result_equal(q)
     assert_gpu_result_equal(q, check_dtypes=check_dtypes, check_exact=False)

Review discussion on the new condition:

- This line is the new condition. The IR graph produced by the multi-partition executor includes some …
- Question: should we introduce those casts in the in-memory executor instead (during translation time)?
- Maybe... Matching the output type is obviously nice. But I do like that it's obvious that we're not sacrificing performance with an ad-hoc cast.
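For context on why n_unique and median need the dtype escape hatch at all: polars promotes the output dtype of some aggregations, so an engine that keeps the input dtype has to cast somewhere to match the CPU schema exactly. A small self-contained illustration of standard polars behaviour (not code from this PR):

```python
import polars as pl

lf = pl.LazyFrame({"a": [1, 2, 3, 4]})  # "a" is Int64

# polars reports the promoted output dtypes in the schema:
print(lf.select(pl.col("a").median()).collect_schema()["a"])    # Float64
print(lf.select(pl.col("a").n_unique()).collect_schema()["a"])  # UInt32
```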
Changes to the group-by tests:
@@ -258,7 +258,8 @@ def test_groupby_maintain_order_random(nrows, nkeys, with_nulls):
         )
     )
     q = df.lazy().group_by(key_names, maintain_order=True).agg(pl.col("value").sum())
-    assert_gpu_result_equal(q)
+    # The streaming executor is too slow for large n_rows with blocksize_mode="small"
+    assert_gpu_result_equal(q, blocksize_mode="default" if nrows > 30 else None)


 def test_groupby_len_with_nulls():

Review discussion on the blocksize_mode override:

- I don't know if there's a better way to find slow tests, other than watching out for tests that use a large number of rows and the default engine / …
- Are you saying we could use …
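On finding slow tests: pytest's built-in `--durations=N` report is one existing way to surface the slowest tests in a run. As for the `blocksize_mode=...` argument used above, a plausible reading (an assumption about this PR's helper, not its actual signature) is that an explicit value overrides the session default and `None` falls back to it:

```python
# Hypothetical sketch of the fallback logic; the real assert_gpu_result_equal
# in this PR may resolve the mode differently.
DEFAULT_BLOCKSIZE_MODE = "small"  # stand-in for the module-level default


def resolve_blocksize_mode(blocksize_mode: str | None) -> str:
    # None means "use the test-session default"; an explicit value
    # (e.g. "default" for the slow large-nrows group-by case) wins.
    return DEFAULT_BLOCKSIZE_MODE if blocksize_mode is None else blocksize_mode


assert resolve_blocksize_mode(None) == "small"
assert resolve_blocksize_mode("default") == "default"
```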
Changes to the join tests:
@@ -7,6 +7,7 @@
 import polars as pl

 from cudf_polars.testing.asserts import (
+    DEFAULT_BLOCKSIZE_MODE,
     assert_gpu_result_equal,
     assert_ir_translation_raises,
 )
@@ -71,7 +72,7 @@ def test_non_coalesce_join(left, right, how, nulls_equal, join_expr):
     query = left.join(
         right, on=join_expr, how=how, nulls_equal=nulls_equal, coalesce=False
     )
-    assert_gpu_result_equal(query, check_row_order=how == "left")
+    assert_gpu_result_equal(query, check_row_order=False)


 @pytest.mark.parametrize(

Review discussion on dropping the row-order check:

- Here, and in many of the other join tests, we no longer check row order. We could make this slightly more complicated, like …
- Yes, that's a good decision. We should also remove the "by default reorder" for left joins if we don't already.
- Looks like @Matt711 is handling that at https://github.com/rapidsai/cudf/pull/17698/files#diff-58a9bfc6f3a54d841c1a798c95354d72b05555be355adec331ca5d5d1c67ad16R1260 in the PR adding support for …
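What dropping `check_row_order` amounts to in practice: the comparison stays exact on contents while ignoring row order, which is what a hash join split across multiple partitions produces. A minimal illustration using polars' own testing helper (not this PR's assertion code):

```python
import polars as pl
from polars.testing import assert_frame_equal

a = pl.DataFrame({"key": [1, 2, 3], "val": ["x", "y", "z"]})
b = pl.DataFrame({"key": [3, 1, 2], "val": ["z", "x", "y"]})

# Same rows in a different order: passes only because row order is ignored.
assert_frame_equal(a, b, check_row_order=False)
```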
@@ -85,16 +86,19 @@ def test_coalesce_join(left, right, how, nulls_equal, join_expr):
     query = left.join(
         right, on=join_expr, how=how, nulls_equal=nulls_equal, coalesce=True
     )
-    assert_gpu_result_equal(query, check_row_order=how == "left")
+    assert_gpu_result_equal(query, check_row_order=False)


 def test_left_join_with_slice(left, right, nulls_equal, zlice):
     q = left.join(right, on="a", how="left", nulls_equal=nulls_equal, coalesce=True)

     if zlice is not None:
+        if DEFAULT_BLOCKSIZE_MODE == "small" and zlice != (0, None):
+            pytest.skip("Cannot match polars' ordering with multiple partitions.")
+
         q = q.slice(*zlice)

-    assert_gpu_result_equal(q)
+    assert_gpu_result_equal(q, check_row_order=False)


 def test_cross_join(left, right, zlice):

Review discussion on the added pytest.skip (lines +96 to +97):

- For …
- Yes, that's reasonable. It also depends on row ordering for a single partition, so how do we match polars for single partitions? Do we sort afterwards?
- Just speculation, but I think that polars and the streaming executor with one partition just happen to still maintain the order for left joins. Perhaps, as https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.join.html notes, … and we should relax this test to just check that the number of rows is 2 and that the schema matches.
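A sketch of the relaxed check floated in that thread, asserting only the row count and schema rather than matching polars' ordering after a left join plus slice (the frames and the slice here are made up for illustration):

```python
import polars as pl

left = pl.LazyFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
right = pl.LazyFrame({"a": [1, 2], "c": [10, 20]})

q = left.join(right, on="a", how="left", coalesce=True).slice(0, 2)
result = q.collect()

# Order-insensitive checks: row count and schema only.
assert result.height == 2
assert result.schema == q.collect_schema()
```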
@@ -114,7 +118,7 @@ def test_cross_join(left, right, zlice):
 )
 def test_join_literal_key(left, right, left_on, right_on):
     q = left.join(right, left_on=left_on, right_on=right_on, how="inner")
-    assert_gpu_result_equal(q)
+    assert_gpu_result_equal(q, check_row_order=False)


 @pytest.mark.parametrize(
Changes to the scan tests:
@@ -331,10 +331,13 @@ def test_scan_parquet_only_row_index_raises(df, tmp_path):
     assert_ir_translation_raises(q, NotImplementedError)


-def test_scan_include_file_path(request, tmp_path, format, scan_fn, df):
+@pytest.mark.parametrize("n_rows", [None, 2])
+def test_scan_include_file_path(request, tmp_path, format, scan_fn, df, n_rows):
+    if n_rows is not None:
+        df = df.head(n_rows)
     make_partitioned_source(df, tmp_path / "file", format)

-    q = scan_fn(tmp_path / "file", include_file_paths="files")
+    q = scan_fn(tmp_path / "file", include_file_paths="files", n_rows=n_rows)

     if format == "ndjson":
         assert_ir_translation_raises(q, NotImplementedError)

Review discussion on the added lines (+336 to +337):

- This was needed for test coverage with …
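For reference, the polars-level behaviour the new parametrization exercises, limiting rows while also materialising the source file path, looks like this with plain polars (the file path and data are made up):

```python
import polars as pl

pl.DataFrame({"a": [1, 2, 3]}).write_parquet("/tmp/example.parquet")

q = pl.scan_parquet(
    "/tmp/example.parquet",
    n_rows=2,                    # mirrors the new n_rows=2 parametrization
    include_file_paths="files",  # adds a "files" column holding the path
)
print(q.collect())  # 2 rows, columns: a, files
```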