From 1455bf53cf1320ca2e7ad35e6737b18d7514811e Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 5 Aug 2025 12:13:37 -0700 Subject: [PATCH 01/23] Add overflow support for signed integers and decimals --- .../cudf/detail/aggregation/aggregation.hpp | 6 +- .../detail/aggregation/device_aggregators.cuh | 43 +- cpp/src/aggregation/aggregation.cu | 19 +- cpp/src/groupby/groupby.cu | 24 +- .../hash/create_sparse_results_table.cu | 10 +- cpp/src/groupby/sort/aggregate.cpp | 5 + cpp/tests/CMakeLists.txt | 1 + cpp/tests/groupby/sum_tests.cpp | 204 ------- cpp/tests/groupby/sum_with_overflow_tests.cpp | 498 ++++++++++++++++++ 9 files changed, 579 insertions(+), 231 deletions(-) create mode 100644 cpp/tests/groupby/sum_with_overflow_tests.cpp diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index 81084f8bdfb..b1f167fde58 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -1411,8 +1411,12 @@ struct target_type_impl + requires((cudf::is_integral() && !cuda::std::is_same_v && + cuda::std::is_signed_v) || + cudf::is_fixed_point()) struct target_type_impl { using type = struct_view; // SUM_WITH_OVERFLOW outputs a struct with sum and overflow fields }; diff --git a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh index 0c5b57c51a7..a3d61b13292 100644 --- a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh +++ b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh @@ -23,7 +23,9 @@ #include #include #include +#include #include +#include #include #include @@ -156,7 +158,10 @@ struct update_target_element { }; template - requires(cuda::std::is_same_v) + requires(((cudf::is_integral() && !cuda::std::is_same_v && + cuda::std::is_signed_v) || + cudf::is_fixed_point()) && + cudf::has_atomic_support>()) struct update_target_element { __device__ void operator()(mutable_column_device_view target, size_type target_index, @@ -168,9 +173,20 @@ struct update_target_element { auto sum_column = target.child(0); auto overflow_column = target.child(1); - auto const source_value = source.element(source_index); + using DeviceType = device_storage_type_t; + + // Extract source value: for integral types, read as Source then cast; for decimal types, read + // as DeviceType directly + DeviceType device_source_value; + if constexpr (cudf::is_fixed_point()) { + device_source_value = source.element(source_index); + } else { + auto const source_value = source.element(source_index); + device_source_value = static_cast(source_value); + } + auto const old_sum = - cudf::detail::atomic_add(&sum_column.element(target_index), source_value); + cudf::detail::atomic_add(&sum_column.element(target_index), device_source_value); // Early exit if overflow is already set to avoid unnecessary overflow checking auto bool_ref = cuda::atomic_ref{ @@ -178,15 +194,18 @@ struct update_target_element { if (bool_ref.load(cuda::memory_order_relaxed)) { return; } // Check for overflow before performing the addition to avoid UB - // For positive overflow: old_sum > 0, source_value > 0, and old_sum > max - source_value - // For negative overflow: old_sum < 0, source_value < 0, and old_sum < min - source_value - // TODO: to be replaced by CCCL equivalents once https://github.com/NVIDIA/cccl/pull/3755 is - // ready - auto constexpr int64_max = cuda::std::numeric_limits::max(); - auto constexpr int64_min = cuda::std::numeric_limits::min(); - auto const overflow = - ((old_sum > 0 && source_value > 0 && old_sum > int64_max - source_value) || - (old_sum < 0 && source_value < 0 && old_sum < int64_min - source_value)); + auto constexpr type_max = cuda::std::numeric_limits::max(); + auto constexpr type_min = cuda::std::numeric_limits::min(); + + bool overflow = false; + // Check for positive overflow (signed types only) + if (old_sum > 0 && device_source_value > 0 && old_sum > type_max - device_source_value) { + overflow = true; + } + // Check for negative overflow (signed types only) + if (old_sum < 0 && device_source_value < 0 && old_sum < type_min - device_source_value) { + overflow = true; + } if (overflow) { // Atomically set overflow flag to true (use atomic_max since true > false) cudf::detail::atomic_max(&overflow_column.element(target_index), true); diff --git a/cpp/src/aggregation/aggregation.cu b/cpp/src/aggregation/aggregation.cu index c58d1f7af7c..98f24e3b239 100644 --- a/cpp/src/aggregation/aggregation.cu +++ b/cpp/src/aggregation/aggregation.cu @@ -96,17 +96,22 @@ struct identity_initializer { requires(is_supported()) { if constexpr (k == aggregation::SUM_WITH_OVERFLOW) { - // SUM_WITH_OVERFLOW uses a struct with sum (int64_t) and overflow (bool) children - // Initialize sum child to 0 and overflow child to false + // SUM_WITH_OVERFLOW uses a struct with sum and overflow children + // We need to initialize each child separately since we don't know the sum column type at + // compile time auto sum_col = col.child(0); auto overflow_col = col.child(1); - auto zip_begin = thrust::make_zip_iterator( - thrust::make_tuple(sum_col.begin(), overflow_col.begin())); + // Initialize overflow column to false thrust::fill(rmm::exec_policy_nosync(stream), - zip_begin, - zip_begin + col.size(), - thrust::make_tuple(int64_t{0}, false)); + overflow_col.begin(), + overflow_col.end(), + false); + + // Initialize sum column to 0 based on its actual type + // Use SUM aggregation identity initializer which sets to 0 + dispatch_type_and_aggregation( + sum_col.type(), aggregation::SUM, identity_initializer{}, sum_col, stream); } else if constexpr (std::is_same_v) { // This should only happen for SUM_WITH_OVERFLOW, but handle it just in case CUDF_FAIL("Struct columns are only supported for SUM_WITH_OVERFLOW aggregation"); diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index bf3ee2a49aa..a67ac823d74 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -125,9 +125,10 @@ struct empty_column_constructor { if constexpr (k == aggregation::Kind::MERGE_HISTOGRAM) { return empty_like(values); } if constexpr (k == aggregation::Kind::SUM_WITH_OVERFLOW) { - // SUM_WITH_OVERFLOW returns a struct with sum (int64_t) and overflow (bool) children + // SUM_WITH_OVERFLOW returns a struct with sum (same type as input) and overflow (bool) + // children std::vector> children; - children.push_back(make_empty_column(cudf::data_type{cudf::type_id::INT64})); + children.push_back(make_empty_column(values.type())); children.push_back(make_empty_column(cudf::data_type{cudf::type_id::BOOL8})); return make_structs_column(0, std::move(children), 0, {}, stream, mr); } @@ -212,6 +213,25 @@ void verify_valid_requests(host_span requests) }); }), "Invalid type/aggregation combination."); + + // Additional validation for SUM_WITH_OVERFLOW: only signed integers and decimals are supported + for (auto const& request : requests) { + for (auto const& agg : request.aggregations) { + if (agg->kind == aggregation::SUM_WITH_OVERFLOW) { + auto const type_id = request.values.type().id(); + bool is_supported = + (type_id == type_id::INT8 || type_id == type_id::INT16 || type_id == type_id::INT32 || + type_id == type_id::INT64 || type_id == type_id::DECIMAL32 || + type_id == type_id::DECIMAL64 || type_id == type_id::DECIMAL128); + + CUDF_EXPECTS( + is_supported, + "SUM_WITH_OVERFLOW aggregation only supports signed integer types (int8_t, int16_t, " + "int32_t, int64_t) and decimal types (decimal32, decimal64, decimal128). " + "Unsigned integers, bool, dictionary columns, and other types are not supported."); + } + } + } } } // namespace diff --git a/cpp/src/groupby/hash/create_sparse_results_table.cu b/cpp/src/groupby/hash/create_sparse_results_table.cu index dd622bbcc8f..d4bf95221b4 100644 --- a/cpp/src/groupby/hash/create_sparse_results_table.cu +++ b/cpp/src/groupby/hash/create_sparse_results_table.cu @@ -70,12 +70,12 @@ struct sparse_column_creator { }; // Lambda to create children for SUM_WITH_OVERFLOW struct column - auto make_children = [&make_empty_column](cudf::size_type size, cudf::mask_state mask_state) { + auto make_children = [&make_empty_column, &col_type](cudf::size_type size, + cudf::mask_state mask_state) { std::vector> children; - // Create sum child column (int64_t) - no null mask needed, struct-level mask handles - // nullability - children.push_back( - make_empty_column(cudf::type_id::INT64, size, cudf::mask_state::UNALLOCATED)); + // Create sum child column (same type as input column) - no null mask needed, struct-level + // mask handles nullability + children.push_back(make_empty_column(col_type.id(), size, cudf::mask_state::UNALLOCATED)); // Create overflow child column (bool) - no null mask needed, only value matters children.push_back( make_empty_column(cudf::type_id::BOOL8, size, cudf::mask_state::UNALLOCATED)); diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index 793461a7b0e..0e6898b7f4e 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -875,6 +875,11 @@ std::pair, std::vector> groupby::sort auto store_functor = detail::aggregate_result_functor(request.values, helper(), cache, stream, mr); for (auto const& agg : request.aggregations) { + // SUM_WITH_OVERFLOW is only supported with hash-based groupby, not sort-based + CUDF_EXPECTS(agg->kind != aggregation::SUM_WITH_OVERFLOW, + "SUM_WITH_OVERFLOW aggregation is only supported with hash-based groupby, not " + "sort-based groupby"); + // TODO (dm): single pass compute all supported reductions cudf::detail::aggregation_dispatcher(agg->kind, store_functor, *agg); } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 65e3a608660..089897b7d2e 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -160,6 +160,7 @@ ConfigureTest( groupby/sum_of_squares_tests.cpp groupby/sum_scan_tests.cpp groupby/sum_tests.cpp + groupby/sum_with_overflow_tests.cpp groupby/tdigest_tests.cu groupby/var_tests.cpp GPUS 1 diff --git a/cpp/tests/groupby/sum_tests.cpp b/cpp/tests/groupby/sum_tests.cpp index f0ab60faab2..a537af0e08c 100644 --- a/cpp/tests/groupby/sum_tests.cpp +++ b/cpp/tests/groupby/sum_tests.cpp @@ -230,207 +230,3 @@ TYPED_TEST(GroupBySumFixedPointTest, GroupByHashSumDecimalAsValue) EXPECT_THROW(test_single_agg(keys, vals, expect_keys, {}, std::move(agg8)), cudf::logic_error); } } - -// SUM_WITH_OVERFLOW tests - only supports int64_t input values and outputs int64_t -template -struct groupby_sum_with_overflow_test : public cudf::test::BaseFixture {}; - -using sum_with_overflow_supported_types = cudf::test::Types; - -TYPED_TEST_SUITE(groupby_sum_with_overflow_test, sum_with_overflow_supported_types); - -TYPED_TEST(groupby_sum_with_overflow_test, basic) -{ - using V = TypeParam; - - cudf::test::fixed_width_column_wrapper keys{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; - cudf::test::fixed_width_column_wrapper vals{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - - cudf::test::fixed_width_column_wrapper expect_keys{1, 2, 3}; - - // Create expected struct column with sum and overflow children - auto sum_col = cudf::test::fixed_width_column_wrapper{9, 19, 17}; - auto overflow_col = cudf::test::fixed_width_column_wrapper{false, false, false}; - std::vector> children; - children.push_back(sum_col.release()); - children.push_back(overflow_col.release()); - auto expect_vals = cudf::create_structs_hierarchy(3, std::move(children), 0, {}); - - auto agg = cudf::make_sum_with_overflow_aggregation(); - test_single_agg(keys, vals, expect_keys, *expect_vals, std::move(agg)); - - // Note: SUM_WITH_OVERFLOW only works with hash groupby, not sort groupby -} - -TYPED_TEST(groupby_sum_with_overflow_test, empty_cols) -{ - using V = TypeParam; - - cudf::test::fixed_width_column_wrapper keys{}; - cudf::test::fixed_width_column_wrapper vals{}; - - cudf::test::fixed_width_column_wrapper expect_keys{}; - - // Create expected empty struct column with sum and overflow children - auto sum_col = cudf::test::fixed_width_column_wrapper{}; - auto overflow_col = cudf::test::fixed_width_column_wrapper{}; - std::vector> children; - children.push_back(sum_col.release()); - children.push_back(overflow_col.release()); - auto expect_vals = cudf::create_structs_hierarchy(0, std::move(children), 0, {}); - - auto agg = cudf::make_sum_with_overflow_aggregation(); - test_single_agg(keys, vals, expect_keys, *expect_vals, std::move(agg)); - - // Note: SUM_WITH_OVERFLOW only works with hash groupby, not sort groupby -} - -TYPED_TEST(groupby_sum_with_overflow_test, zero_valid_keys) -{ - using V = TypeParam; - - cudf::test::fixed_width_column_wrapper keys({1, 2, 3}, cudf::test::iterators::all_nulls()); - cudf::test::fixed_width_column_wrapper vals{3, 4, 5}; - - cudf::test::fixed_width_column_wrapper expect_keys{}; - - // Create expected empty struct column with sum and overflow children - auto sum_col = cudf::test::fixed_width_column_wrapper{}; - auto overflow_col = cudf::test::fixed_width_column_wrapper{}; - std::vector> children; - children.push_back(sum_col.release()); - children.push_back(overflow_col.release()); - auto expect_vals = cudf::create_structs_hierarchy(0, std::move(children), 0, {}); - - auto agg = cudf::make_sum_with_overflow_aggregation(); - test_single_agg(keys, vals, expect_keys, *expect_vals, std::move(agg)); - - // Note: SUM_WITH_OVERFLOW only works with hash groupby, not sort groupby -} - -TYPED_TEST(groupby_sum_with_overflow_test, zero_valid_values) -{ - using V = TypeParam; - - cudf::test::fixed_width_column_wrapper keys{1, 1, 1}; - cudf::test::fixed_width_column_wrapper vals({3, 4, 5}, cudf::test::iterators::all_nulls()); - - cudf::test::fixed_width_column_wrapper expect_keys{1}; - - // Create expected struct column with sum and overflow children (null result) - // Child columns have no null masks, only struct-level null mask matters - auto sum_col = cudf::test::fixed_width_column_wrapper({0}); - auto overflow_col = cudf::test::fixed_width_column_wrapper({false}); - std::vector> children; - children.push_back(sum_col.release()); - children.push_back(overflow_col.release()); - std::vector validity{0}; // null struct - auto [validity_mask, null_count] = - cudf::test::detail::make_null_mask(validity.begin(), validity.end()); - auto expect_vals = - cudf::create_structs_hierarchy(1, std::move(children), null_count, std::move(validity_mask)); - - auto agg = cudf::make_sum_with_overflow_aggregation(); - test_single_agg(keys, vals, expect_keys, *expect_vals, std::move(agg)); - - // Note: SUM_WITH_OVERFLOW only works with hash groupby, not sort groupby -} - -TYPED_TEST(groupby_sum_with_overflow_test, null_keys_and_values) -{ - using V = TypeParam; - - cudf::test::fixed_width_column_wrapper keys( - {1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, - {true, true, true, true, true, true, true, false, true, true, true}); - cudf::test::fixed_width_column_wrapper vals({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4}, - {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0}); - - // { 1, 1, 2, 2, 2, 3, 3, 4} - cudf::test::fixed_width_column_wrapper expect_keys({1, 2, 3, 4}, - cudf::test::iterators::no_nulls()); - - // Create expected struct column with sum and overflow children - // { 3, 6, 1, 4, 9, 2, 8, -} - // Child columns have no null masks, only struct-level null mask matters - auto sum_col = cudf::test::fixed_width_column_wrapper({9, 14, 10, 0}); - auto overflow_col = cudf::test::fixed_width_column_wrapper({false, false, false, false}); - std::vector> children; - children.push_back(sum_col.release()); - children.push_back(overflow_col.release()); - std::vector validity{1, 1, 1, 0}; - auto [validity_mask, null_count] = - cudf::test::detail::make_null_mask(validity.begin(), validity.end()); - auto expect_vals = - cudf::create_structs_hierarchy(4, std::move(children), null_count, std::move(validity_mask)); - - auto agg = cudf::make_sum_with_overflow_aggregation(); - test_single_agg(keys, vals, expect_keys, *expect_vals, std::move(agg)); - - // Note: SUM_WITH_OVERFLOW only works with hash groupby, not sort groupby -} - -TYPED_TEST(groupby_sum_with_overflow_test, overflow_detection) -{ - using V = TypeParam; - - cudf::test::fixed_width_column_wrapper keys{1, 2, 3, 4, 1, 2, 2, 1, 3, 3, 2, 4, 4}; - // Mix of values that will cause positive and negative overflow for some groups but not others - cudf::test::fixed_width_column_wrapper vals{ - 9223372036854775800L, // Close to INT64_MAX - 100L, // Small value - 200L, // Small value - -9223372036854775800L, // Close to INT64_MIN - 20L, // Small value that will cause positive overflow when added to first - 200L, // Small value - 300L, // Small value - 9223372036854775800L, // Close to INT64_MAX - 9223372036854775800L, // Close to INT64_MAX - 1L, // Small value - 400L, // Small value - -20L, // Small value that will cause negative overflow when added to fourth - -9223372036854775800L}; // Close to INT64_MIN - - cudf::test::fixed_width_column_wrapper expect_keys{1, 2, 3, 4}; - - // Create expected struct column with sum and overflow children - // Group 1: 9223372036854775800 + 20 + 9223372036854775800 = positive overflow - // Group 2: 100 + 200 + 300 + 400 = 1000 (no overflow) - // Group 3: 200 + 9223372036854775800 + 1 = positive overflow - // Group 4: -9223372036854775800 + (-20) + (-9223372036854775800) = negative overflow - auto sum_col = cudf::test::fixed_width_column_wrapper{ - 4L, // Positive overflow result for group 1 - 1000L, // Normal sum for group 2 (no overflow) - -9223372036854775615L, // Positive overflow result for group 3 - -4L // Negative overflow result for group 4 - }; - auto overflow_col = cudf::test::fixed_width_column_wrapper{true, false, true, true}; - std::vector> children; - children.push_back(sum_col.release()); - children.push_back(overflow_col.release()); - auto expect_vals = cudf::create_structs_hierarchy(4, std::move(children), 0, {}); - - auto agg = cudf::make_sum_with_overflow_aggregation(); - test_single_agg(keys, vals, expect_keys, *expect_vals, std::move(agg)); - - // Note: SUM_WITH_OVERFLOW only works with hash groupby, not sort groupby -} - -// Test that SUM_WITH_OVERFLOW throws an error for invalid value types -TEST(groupby_sum_with_overflow_error_test, invalid_value_type) -{ - using K = int32_t; - using V = int32_t; // Invalid type for SUM_WITH_OVERFLOW, should only support int64_t - - cudf::test::fixed_width_column_wrapper keys{1, 1, 1, 2, 2, 2, 3, 3, 3}; - cudf::test::fixed_width_column_wrapper vals{1, 2, 3, 4, 5, 6, 7, 8, 9}; - - cudf::test::fixed_width_column_wrapper expect_keys{1, 2, 3}; - - auto agg = cudf::make_sum_with_overflow_aggregation(); - - // SUM_WITH_OVERFLOW should throw a logic_error when used with non-int64_t value types - EXPECT_THROW( - test_single_agg(keys, vals, expect_keys, {}, std::move(agg), force_use_sort_impl::NO), - cudf::logic_error); -} diff --git a/cpp/tests/groupby/sum_with_overflow_tests.cpp b/cpp/tests/groupby/sum_with_overflow_tests.cpp new file mode 100644 index 00000000000..1b50659d8f7 --- /dev/null +++ b/cpp/tests/groupby/sum_with_overflow_tests.cpp @@ -0,0 +1,498 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include + +#include +#include +#include + +using namespace cudf::test::iterators; +using namespace numeric; + +using K = int32_t; + +// SUM_WITH_OVERFLOW tests - supports all integer and decimal types +template +struct groupby_sum_with_overflow_test : public cudf::test::BaseFixture {}; + +using sum_with_overflow_supported_types = + cudf::test::Concat, + cudf::test::FixedPointTypes>; + +TYPED_TEST_SUITE(groupby_sum_with_overflow_test, sum_with_overflow_supported_types); + +TYPED_TEST(groupby_sum_with_overflow_test, basic) +{ + using V = TypeParam; + + cudf::test::fixed_width_column_wrapper keys{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; + cudf::test::fixed_width_column_wrapper expect_keys{1, 2, 3}; + + if constexpr (cudf::is_fixed_point()) { + // For decimal types + using RepType = cudf::device_storage_type_t; + auto const scale = scale_type{0}; + auto vals = + cudf::test::fixed_point_column_wrapper{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, scale}; + + // Create expected struct column with sum and overflow children + auto sum_col = cudf::test::fixed_point_column_wrapper{{9, 19, 17}, scale}; + auto overflow_col = cudf::test::fixed_width_column_wrapper{false, false, false}; + std::vector> children; + children.push_back(sum_col.release()); + children.push_back(overflow_col.release()); + auto expect_vals = cudf::create_structs_hierarchy(3, std::move(children), 0, {}); + + auto agg = cudf::make_sum_with_overflow_aggregation(); + test_single_agg(keys, vals, expect_keys, *expect_vals, std::move(agg)); + + // SUM_WITH_OVERFLOW should throw with sort-based groupby + auto agg_sort = cudf::make_sum_with_overflow_aggregation(); + EXPECT_THROW( + test_single_agg( + keys, vals, expect_keys, *expect_vals, std::move(agg_sort), force_use_sort_impl::YES), + cudf::logic_error); + } else { + // For integer types + cudf::test::fixed_width_column_wrapper vals{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + + // Create expected struct column with sum and overflow children + // Sum column type matches input type V + auto sum_col = cudf::test::fixed_width_column_wrapper{9, 19, 17}; + auto overflow_col = cudf::test::fixed_width_column_wrapper{false, false, false}; + std::vector> children; + children.push_back(sum_col.release()); + children.push_back(overflow_col.release()); + auto expect_vals = cudf::create_structs_hierarchy(3, std::move(children), 0, {}); + + auto agg = cudf::make_sum_with_overflow_aggregation(); + test_single_agg(keys, vals, expect_keys, *expect_vals, std::move(agg)); + + // SUM_WITH_OVERFLOW should throw with sort-based groupby + auto agg_sort = cudf::make_sum_with_overflow_aggregation(); + EXPECT_THROW( + test_single_agg( + keys, vals, expect_keys, *expect_vals, std::move(agg_sort), force_use_sort_impl::YES), + cudf::logic_error); + } + + // Note: SUM_WITH_OVERFLOW only works with hash groupby, not sort groupby +} + +TYPED_TEST(groupby_sum_with_overflow_test, empty_cols) +{ + using V = TypeParam; + + cudf::test::fixed_width_column_wrapper keys{}; + cudf::test::fixed_width_column_wrapper expect_keys{}; + + // For integer types + cudf::test::fixed_width_column_wrapper vals{}; + + // Create expected empty struct column with sum and overflow children + auto sum_col = cudf::test::fixed_width_column_wrapper{}; + auto overflow_col = cudf::test::fixed_width_column_wrapper{}; + std::vector> children; + children.push_back(sum_col.release()); + children.push_back(overflow_col.release()); + auto expect_vals = cudf::create_structs_hierarchy(0, std::move(children), 0, {}); + + auto agg = cudf::make_sum_with_overflow_aggregation(); + test_single_agg(keys, vals, expect_keys, *expect_vals, std::move(agg)); + + // Note: SUM_WITH_OVERFLOW only works with hash groupby, not sort groupby +} + +TYPED_TEST(groupby_sum_with_overflow_test, zero_valid_keys) +{ + using V = TypeParam; + + cudf::test::fixed_width_column_wrapper keys({1, 2, 3}, cudf::test::iterators::all_nulls()); + cudf::test::fixed_width_column_wrapper expect_keys{}; + + // For integer types + cudf::test::fixed_width_column_wrapper vals{3, 4, 5}; + + // Create expected empty struct column with sum and overflow children + auto sum_col = cudf::test::fixed_width_column_wrapper{}; + auto overflow_col = cudf::test::fixed_width_column_wrapper{}; + std::vector> children; + children.push_back(sum_col.release()); + children.push_back(overflow_col.release()); + auto expect_vals = cudf::create_structs_hierarchy(0, std::move(children), 0, {}); + + auto agg = cudf::make_sum_with_overflow_aggregation(); + test_single_agg(keys, vals, expect_keys, *expect_vals, std::move(agg)); + + // Note: SUM_WITH_OVERFLOW only works with hash groupby, not sort groupby +} + +TYPED_TEST(groupby_sum_with_overflow_test, zero_valid_values) +{ + using V = TypeParam; + + cudf::test::fixed_width_column_wrapper keys{1, 1, 1}; + cudf::test::fixed_width_column_wrapper vals({3, 4, 5}, cudf::test::iterators::all_nulls()); + + cudf::test::fixed_width_column_wrapper expect_keys{1}; + + // Create expected struct column with sum and overflow children (null result) + // Child columns have no null masks, only struct-level null mask matters + auto sum_col = cudf::test::fixed_width_column_wrapper({0}); + auto overflow_col = cudf::test::fixed_width_column_wrapper({false}); + std::vector> children; + children.push_back(sum_col.release()); + children.push_back(overflow_col.release()); + std::vector validity{0}; // null struct + auto [validity_mask, null_count] = + cudf::test::detail::make_null_mask(validity.begin(), validity.end()); + auto expect_vals = + cudf::create_structs_hierarchy(1, std::move(children), null_count, std::move(validity_mask)); + + auto agg = cudf::make_sum_with_overflow_aggregation(); + test_single_agg(keys, vals, expect_keys, *expect_vals, std::move(agg)); + + // Note: SUM_WITH_OVERFLOW only works with hash groupby, not sort groupby +} + +TYPED_TEST(groupby_sum_with_overflow_test, null_keys_and_values) +{ + using V = TypeParam; + + cudf::test::fixed_width_column_wrapper keys( + {1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, + {true, true, true, true, true, true, true, false, true, true, true}); + cudf::test::fixed_width_column_wrapper vals({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4}, + {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0}); + + // { 1, 1, 2, 2, 2, 3, 3, 4} + cudf::test::fixed_width_column_wrapper expect_keys({1, 2, 3, 4}, + cudf::test::iterators::no_nulls()); + + // Create expected struct column with sum and overflow children + // { 3, 6, 1, 4, 9, 2, 8, -} + // Child columns have no null masks, only struct-level null mask matters + auto sum_col = cudf::test::fixed_width_column_wrapper({9, 14, 10, 0}); + auto overflow_col = cudf::test::fixed_width_column_wrapper({false, false, false, false}); + std::vector> children; + children.push_back(sum_col.release()); + children.push_back(overflow_col.release()); + std::vector validity{1, 1, 1, 0}; + auto [validity_mask, null_count] = + cudf::test::detail::make_null_mask(validity.begin(), validity.end()); + auto expect_vals = + cudf::create_structs_hierarchy(4, std::move(children), null_count, std::move(validity_mask)); + + auto agg = cudf::make_sum_with_overflow_aggregation(); + test_single_agg(keys, vals, expect_keys, *expect_vals, std::move(agg)); + + // Note: SUM_WITH_OVERFLOW only works with hash groupby, not sort groupby +} + +TYPED_TEST(groupby_sum_with_overflow_test, overflow_detection) +{ + using V = TypeParam; + + if constexpr (cudf::is_fixed_point()) { + using namespace numeric; + using RepType = cudf::device_storage_type_t; + + // Skip only decimal128 since it doesn't have atomic support + if constexpr (std::is_same_v) { + GTEST_SKIP() + << "decimal128 overflow detection test skipped - no atomic support for __int128_t"; + } else { + // Test decimal32 and decimal64 overflow detection + auto constexpr scale = scale_type{0}; // Use scale 0 for simplicity + + // Use type-specific values that will cause overflow for decimal types + auto constexpr type_max = cuda::std::numeric_limits::max(); + auto constexpr type_min = cuda::std::numeric_limits::min(); + + cudf::test::fixed_width_column_wrapper keys{1, 2, 3, 4, 1, 2, 2, 1, 3, 3, 2, 4, 4}; + + // Create values that will cause overflow for this specific decimal type + RepType large_positive = type_max - 5; // Close to max + RepType small_increment = 10; // Will cause overflow when added to large_positive + RepType large_negative = type_min + 5; // Close to min (decimal types are always signed) + RepType small_decrement = -10; // Will cause underflow when added to large_negative + + // Use values that fit within the type range for non-overflowing groups + RepType small_val1 = 10; + RepType small_val2 = 20; + RepType small_val3 = 30; + RepType small_val4 = 40; + + cudf::test::fixed_point_column_wrapper vals{ + {large_positive, // Group 1: Close to max + small_val1, // Group 2: Small value + small_val2, // Group 3: Small value + large_negative, // Group 4: Close to min + small_increment, // Group 1: Will cause positive overflow + small_val2, // Group 2: Small value + small_val3, // Group 2: Small value + large_positive, // Group 1: Close to max (second occurrence) + large_positive, // Group 3: Close to max + 1, // Group 3: Small value + small_val4, // Group 2: Small value + small_decrement, // Group 4: Will cause negative overflow + large_negative}, // Group 4: Close to min (second occurrence) + scale}; + + cudf::test::fixed_width_column_wrapper expect_keys{1, 2, 3, 4}; + + // Expected sums (with overflow handled by wrapping) + auto overflow_sum_1 = static_cast(static_cast(large_positive) + + static_cast(small_increment) + + static_cast(large_positive)); + auto normal_sum_2 = static_cast(small_val1 + small_val2 + small_val3 + small_val4); + auto overflow_sum_3 = + static_cast(static_cast(small_val2) + + static_cast(large_positive) + static_cast(1)); + auto overflow_sum_4 = static_cast(static_cast(large_negative) + + static_cast(small_decrement) + + static_cast(large_negative)); + + cudf::test::fixed_point_column_wrapper expect_sum_vals{ + {overflow_sum_1, normal_sum_2, overflow_sum_3, overflow_sum_4}, scale}; + cudf::test::fixed_width_column_wrapper expect_overflow_vals{true, false, true, true}; + + std::vector> children; + children.push_back(expect_sum_vals.release()); + children.push_back(expect_overflow_vals.release()); + auto expect_vals = cudf::create_structs_hierarchy(4, std::move(children), 0, {}); + + auto agg = cudf::make_sum_with_overflow_aggregation(); + test_single_agg(keys, vals, expect_keys, *expect_vals, std::move(agg)); + + // Verify that sort-based groupby throws for decimals + auto agg2 = cudf::make_sum_with_overflow_aggregation(); + EXPECT_THROW( + test_single_agg( + keys, vals, expect_keys, *expect_vals, std::move(agg2), force_use_sort_impl::YES), + cudf::logic_error); + } + } else { + using DeviceType = cudf::device_storage_type_t; + + // Use type-specific values that will cause overflow for each integer type + auto constexpr type_max = cuda::std::numeric_limits::max(); + auto constexpr type_min = cuda::std::numeric_limits::min(); + + cudf::test::fixed_width_column_wrapper keys{1, 2, 3, 4, 1, 2, 2, 1, 3, 3, 2, 4, 4}; + + // Create values that will cause overflow for this specific type + // Use smaller increments for smaller types to avoid immediate wrapping + DeviceType large_positive = type_max - 5; // Close to max + DeviceType small_increment = 10; // Will cause overflow when added to large_positive + DeviceType large_negative, small_decrement; + if constexpr (cuda::std::is_signed_v) { + large_negative = type_min + 5; // Close to min (only for signed types) + small_decrement = -10; // Will cause underflow when added to large_negative (signed only) + } + + // Use values that fit within the type range for non-overflowing groups + DeviceType small_val1 = 10; + DeviceType small_val2 = 20; + DeviceType small_val3 = 30; + DeviceType small_val4 = 40; + + if constexpr (cuda::std::is_signed_v) { + // For signed types: test both positive and negative overflow + cudf::test::fixed_width_column_wrapper vals{ + static_cast(large_positive), // Group 1: Close to max + static_cast(small_val1), // Group 2: Small value + static_cast(small_val2), // Group 3: Small value + static_cast(large_negative), // Group 4: Close to min + static_cast(small_increment), // Group 1: Will cause positive overflow + static_cast(small_val2), // Group 2: Small value + static_cast(small_val3), // Group 2: Small value + static_cast(large_positive), // Group 1: Close to max (second occurrence) + static_cast(large_positive), // Group 3: Close to max + static_cast(1), // Group 3: Small value + static_cast(small_val4), // Group 2: Small value + static_cast(small_decrement), // Group 4: Will cause negative overflow + static_cast(large_negative)}; // Group 4: Close to min (second occurrence) + + cudf::test::fixed_width_column_wrapper expect_keys{1, 2, 3, 4}; + + // Expected results: Groups 1, 3, and 4 overflow; Group 2 does not + auto sum_col = cudf::test::fixed_width_column_wrapper{ + static_cast(static_cast(large_positive) + small_increment + + static_cast(large_positive)), // Group 1: overflowed result + static_cast(small_val1 + small_val2 + small_val3 + small_val4), // Group 2: no overflow + static_cast(static_cast(small_val2) + + static_cast(large_positive) + + static_cast(1)), // Group 3: overflowed result + static_cast(static_cast(large_negative) + small_decrement + + static_cast(large_negative)) // Group 4: overflowed result + }; + auto overflow_col = cudf::test::fixed_width_column_wrapper{true, false, true, true}; + std::vector> children; + children.push_back(sum_col.release()); + children.push_back(overflow_col.release()); + auto expect_vals = cudf::create_structs_hierarchy(4, std::move(children), 0, {}); + + auto agg = cudf::make_sum_with_overflow_aggregation(); + test_single_agg(keys, vals, expect_keys, *expect_vals, std::move(agg)); + } else { + // For unsigned types: only test positive overflow + cudf::test::fixed_width_column_wrapper vals{ + static_cast(large_positive), // Group 1: Close to max + static_cast(small_val1), // Group 2: Small value + static_cast(small_val2), // Group 3: Small value + static_cast(small_val1), // Group 4: Small value + static_cast(small_increment), // Group 1: Will cause positive overflow + static_cast(small_val2), // Group 2: Small value + static_cast(small_val3), // Group 2: Small value + static_cast(large_positive), // Group 1: Close to max (second occurrence) + static_cast(large_positive), // Group 3: Close to max + static_cast(1), // Group 3: Small value + static_cast(small_val4), // Group 2: Small value + static_cast(small_val2), // Group 4: Small value + static_cast(small_val3)}; // Group 4: Small value + + cudf::test::fixed_width_column_wrapper expect_keys{1, 2, 3, 4}; + + // Expected results: Groups 1 and 3 overflow; Groups 2 and 4 do not + auto sum_col = cudf::test::fixed_width_column_wrapper{ + static_cast(static_cast(large_positive) + small_increment + + static_cast(large_positive)), // Group 1: overflowed result + static_cast(small_val1 + small_val2 + small_val3 + small_val4), // Group 2: no overflow + static_cast(static_cast(small_val2) + + static_cast(large_positive) + + static_cast(1)), // Group 3: overflowed result + static_cast(small_val1 + small_val2 + small_val3) // Group 4: no overflow + }; + auto overflow_col = cudf::test::fixed_width_column_wrapper{true, false, true, false}; + std::vector> children; + children.push_back(sum_col.release()); + children.push_back(overflow_col.release()); + auto expect_vals = cudf::create_structs_hierarchy(4, std::move(children), 0, {}); + + auto agg = cudf::make_sum_with_overflow_aggregation(); + test_single_agg(keys, vals, expect_keys, *expect_vals, std::move(agg)); + } + + // Note: SUM_WITH_OVERFLOW only works with hash groupby, not sort groupby + } // end else block for non-decimal types +} + +// Test that SUM_WITH_OVERFLOW works with int32_t (which used to be unsupported) +TEST(groupby_sum_with_overflow_compatibility_test, int32_support) +{ + using K = int32_t; + using V = int32_t; // int32_t is now supported by SUM_WITH_OVERFLOW + + cudf::test::fixed_width_column_wrapper keys{1, 1, 1, 2, 2, 2, 3, 3, 3}; + cudf::test::fixed_width_column_wrapper vals{1, 2, 3, 4, 5, 6, 7, 8, 9}; + + cudf::test::fixed_width_column_wrapper expect_keys{1, 2, 3}; + + // Create expected struct column with sum and overflow children + auto sum_col = cudf::test::fixed_width_column_wrapper{6, 15, 24}; + auto overflow_col = cudf::test::fixed_width_column_wrapper{false, false, false}; + std::vector> children; + children.push_back(sum_col.release()); + children.push_back(overflow_col.release()); + auto expect_vals = cudf::create_structs_hierarchy(3, std::move(children), 0, {}); + + auto agg = cudf::make_sum_with_overflow_aggregation(); + + // SUM_WITH_OVERFLOW should now work with int32_t value types + test_single_agg(keys, vals, expect_keys, *expect_vals, std::move(agg), force_use_sort_impl::NO); + + // SUM_WITH_OVERFLOW should throw with sort-based groupby + auto agg_sort = cudf::make_sum_with_overflow_aggregation(); + EXPECT_THROW( + test_single_agg( + keys, vals, expect_keys, *expect_vals, std::move(agg_sort), force_use_sort_impl::YES), + cudf::logic_error); +} + +// Test that SUM_WITH_OVERFLOW throws an error for bool type (which is not supported) +TEST(groupby_sum_with_overflow_error_test, bool_type_not_supported) +{ + using K = int32_t; + using V = bool; // bool type should not be supported by SUM_WITH_OVERFLOW + + cudf::test::fixed_width_column_wrapper keys{1, 1, 1}; + cudf::test::fixed_width_column_wrapper vals{true, false, true}; + + auto agg = cudf::make_sum_with_overflow_aggregation(); + + std::vector requests; + requests.emplace_back(); + requests[0].values = vals; + requests[0].aggregations.push_back(std::move(agg)); + + auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys})); + + // This should throw an exception since bool is not supported + EXPECT_THROW(gb_obj.aggregate(requests), cudf::logic_error); +} + +// Test that SUM_WITH_OVERFLOW throws an error for unsigned integer types (which are not supported) +TEST(groupby_sum_with_overflow_error_test, unsigned_types_not_supported) +{ + using K = int32_t; + + // Test uint32_t + { + using V = uint32_t; // unsigned types should not be supported by SUM_WITH_OVERFLOW + + cudf::test::fixed_width_column_wrapper keys{1, 1, 1}; + cudf::test::fixed_width_column_wrapper vals{1, 2, 3}; + + auto agg = cudf::make_sum_with_overflow_aggregation(); + + std::vector requests; + requests.emplace_back(); + requests[0].values = vals; + requests[0].aggregations.push_back(std::move(agg)); + + auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys})); + + // This should throw an exception since unsigned types are not supported + EXPECT_THROW(gb_obj.aggregate(requests), cudf::logic_error); + } + + // Test uint64_t + { + using V = uint64_t; // unsigned types should not be supported by SUM_WITH_OVERFLOW + + cudf::test::fixed_width_column_wrapper keys{1, 1, 1}; + cudf::test::fixed_width_column_wrapper vals{1, 2, 3}; + + auto agg = cudf::make_sum_with_overflow_aggregation(); + + std::vector requests; + requests.emplace_back(); + requests[0].values = vals; + requests[0].aggregations.push_back(std::move(agg)); + + auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys})); + + // This should throw an exception since unsigned types are not supported + EXPECT_THROW(gb_obj.aggregate(requests), cudf::logic_error); + } +} From 3badfc789e6f9f07d69b069ffdc4ba635e49da51 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 21 Oct 2025 10:54:00 -0700 Subject: [PATCH 02/23] Resolve conflicts --- cpp/src/groupby/hash/output_utils.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/groupby/hash/output_utils.cu b/cpp/src/groupby/hash/output_utils.cu index 860845a5313..5a8b0fc2ed0 100644 --- a/cpp/src/groupby/hash/output_utils.cu +++ b/cpp/src/groupby/hash/output_utils.cu @@ -72,11 +72,11 @@ struct result_column_creator { return make_fixed_width_column(data_type{type_id}, size, mask_state, stream, mr); }; - auto make_children = [&make_empty_column](size_type size) { + auto make_children = [&make_empty_column, col_type = col.type()](size_type size) { std::vector> children; // Create sum child column (int64_t) - no null mask needed, struct-level mask handles // nullability - children.push_back(make_empty_column(type_id::INT64, size, mask_state::UNALLOCATED)); + children.push_back(make_empty_column(col_type.id(), size, mask_state::UNALLOCATED)); // Create overflow child column (bool) - no null mask needed, only value matters children.push_back(make_empty_column(type_id::BOOL8, size, mask_state::UNALLOCATED)); return children; From 8a446391cad95ab119d807fcac663b8eff7e5ac8 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 21 Oct 2025 12:02:37 -0700 Subject: [PATCH 03/23] Cleanups --- .../cudf/detail/aggregation/aggregation.hpp | 3 +- .../detail/aggregation/device_aggregators.cuh | 48 ++++++------------- cpp/src/aggregation/aggregation.cu | 22 ++++----- cpp/src/groupby/groupby.cu | 13 ++--- 4 files changed, 30 insertions(+), 56 deletions(-) diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index d96a0cd244c..e13234d80ab 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -1419,8 +1419,7 @@ struct target_type_impl - requires((cudf::is_integral() && !cuda::std::is_same_v && - cuda::std::is_signed_v) || + requires(cudf::is_integral_not_bool() && cudf::is_signed() || cudf::is_fixed_point()) struct target_type_impl { using type = struct_view; // SUM_WITH_OVERFLOW outputs a struct with sum and overflow fields diff --git a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh index 4b6d2af5dcf..53b628a4cd1 100644 --- a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh +++ b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh @@ -140,58 +140,40 @@ struct update_target_element { }; template - requires(((cudf::is_integral() && !cuda::std::is_same_v && - cuda::std::is_signed_v) || - cudf::is_fixed_point()) && - cudf::has_atomic_support>()) + requires((cudf::is_integral_not_bool() && cudf::is_signed()) || + cudf::is_fixed_point() && + cudf::has_atomic_support>()) struct update_target_element { + using DeviceType = device_storage_type_t; + static constexpr auto type_max = cuda::std::numeric_limits::max(); + static constexpr auto type_min = cuda::std::numeric_limits::min(); + __device__ void operator()(mutable_column_device_view target, size_type target_index, column_device_view source, size_type source_index) const noexcept { - // For SUM_WITH_OVERFLOW, target is a struct with sum value at child(0) and overflow flag at - // child(1) auto sum_column = target.child(0); auto overflow_column = target.child(1); - using DeviceType = device_storage_type_t; - - // Extract source value: for integral types, read as Source then cast; for decimal types, read - // as DeviceType directly - DeviceType device_source_value; - if constexpr (cudf::is_fixed_point()) { - device_source_value = source.element(source_index); - } else { - auto const source_value = source.element(source_index); - device_source_value = static_cast(source_value); - } - + auto const source_value = source.element(source_index); auto const old_sum = - cudf::detail::atomic_add(&sum_column.element(target_index), device_source_value); + cudf::detail::atomic_add(&sum_column.element(target_index), source_value); - // Early exit if overflow is already set to avoid unnecessary overflow checking + // Early exit if overflow is already set auto bool_ref = cuda::atomic_ref{ *(overflow_column.data() + target_index)}; if (bool_ref.load(cuda::memory_order_relaxed)) { return; } - // Check for overflow before performing the addition to avoid UB - auto constexpr type_max = cuda::std::numeric_limits::max(); - auto constexpr type_min = cuda::std::numeric_limits::min(); - + // Check for overflow bool overflow = false; - // Check for positive overflow (signed types only) - if (old_sum > 0 && device_source_value > 0 && old_sum > type_max - device_source_value) { + if (source_value > 0 && old_sum > type_max - source_value) { overflow = true; - } - // Check for negative overflow (signed types only) - if (old_sum < 0 && device_source_value < 0 && old_sum < type_min - device_source_value) { + } else if (source_value < 0 && old_sum < type_min - source_value) { overflow = true; } - if (overflow) { - // Atomically set overflow flag to true (use atomic_max since true > false) - cudf::detail::atomic_max(&overflow_column.element(target_index), true); - } + + if (overflow) { cudf::detail::atomic_max(&overflow_column.element(target_index), true); } } }; diff --git a/cpp/src/aggregation/aggregation.cu b/cpp/src/aggregation/aggregation.cu index 652f8037de8..bf795c24d9b 100644 --- a/cpp/src/aggregation/aggregation.cu +++ b/cpp/src/aggregation/aggregation.cu @@ -97,21 +97,19 @@ struct identity_initializer { { if constexpr (k == aggregation::SUM_WITH_OVERFLOW) { // SUM_WITH_OVERFLOW uses a struct with sum and overflow children - // We need to initialize each child separately since we don't know the sum column type at - // compile time auto sum_col = col.child(0); auto overflow_col = col.child(1); - // Initialize overflow column to false - thrust::fill(rmm::exec_policy_nosync(stream), - overflow_col.begin(), - overflow_col.end(), - false); - - // Initialize sum column to 0 based on its actual type - // Use SUM aggregation identity initializer which sets to 0 - dispatch_type_and_aggregation( - sum_col.type(), aggregation::SUM, identity_initializer{}, sum_col, stream); + // Initialize both columns using device storage type + cudf::type_dispatcher(sum_col.type(), [&]() { + using DeviceType = device_storage_type_t; + auto zip_it = + thrust::make_zip_iterator(sum_col.begin(), overflow_col.begin()); + thrust::fill_n(rmm::exec_policy_nosync(stream), + zip_it, + col.size(), + thrust::make_tuple(DeviceType{0}, false)); + }); } else if constexpr (std::is_same_v) { // This should only happen for SUM_WITH_OVERFLOW, but handle it just in case CUDF_FAIL("Struct columns are only supported for SUM_WITH_OVERFLOW aggregation"); diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index a67ac823d74..90cf9fefd06 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -218,17 +218,12 @@ void verify_valid_requests(host_span requests) for (auto const& request : requests) { for (auto const& agg : request.aggregations) { if (agg->kind == aggregation::SUM_WITH_OVERFLOW) { - auto const type_id = request.values.type().id(); - bool is_supported = - (type_id == type_id::INT8 || type_id == type_id::INT16 || type_id == type_id::INT32 || - type_id == type_id::INT64 || type_id == type_id::DECIMAL32 || - type_id == type_id::DECIMAL64 || type_id == type_id::DECIMAL128); - CUDF_EXPECTS( - is_supported, + cudf::detail::is_valid_aggregation(request.values.type(), aggregation::SUM_WITH_OVERFLOW), "SUM_WITH_OVERFLOW aggregation only supports signed integer types (int8_t, int16_t, " - "int32_t, int64_t) and decimal types (decimal32, decimal64, decimal128). " - "Unsigned integers, bool, dictionary columns, and other types are not supported."); + "int32_t, int64_t) and decimal types (decimal32, decimal64). " + "decimal128, unsigned integers, bool, dictionary columns, and other types are not " + "supported."); } } } From e0616f42f613894a70c990e0bfec6e1bdc07272c Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 21 Oct 2025 12:18:40 -0700 Subject: [PATCH 04/23] Fix --- cpp/src/aggregation/aggregation.cu | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/cpp/src/aggregation/aggregation.cu b/cpp/src/aggregation/aggregation.cu index bf795c24d9b..4f5a731979d 100644 --- a/cpp/src/aggregation/aggregation.cu +++ b/cpp/src/aggregation/aggregation.cu @@ -100,16 +100,15 @@ struct identity_initializer { auto sum_col = col.child(0); auto overflow_col = col.child(1); - // Initialize both columns using device storage type - cudf::type_dispatcher(sum_col.type(), [&]() { - using DeviceType = device_storage_type_t; - auto zip_it = - thrust::make_zip_iterator(sum_col.begin(), overflow_col.begin()); - thrust::fill_n(rmm::exec_policy_nosync(stream), - zip_it, - col.size(), - thrust::make_tuple(DeviceType{0}, false)); - }); + // Initialize overflow column to false + thrust::fill(rmm::exec_policy_nosync(stream), + overflow_col.begin(), + overflow_col.end(), + false); + + // Initialize sum column based on its actual type + dispatch_type_and_aggregation( + sum_col.type(), aggregation::SUM, identity_initializer{}, sum_col, stream); } else if constexpr (std::is_same_v) { // This should only happen for SUM_WITH_OVERFLOW, but handle it just in case CUDF_FAIL("Struct columns are only supported for SUM_WITH_OVERFLOW aggregation"); From e5fd7ee8bb28ef3197dd4170385dcd1e29e8f8be Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 21 Oct 2025 12:47:54 -0700 Subject: [PATCH 05/23] Cleanups --- cpp/src/aggregation/aggregation.cu | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/cpp/src/aggregation/aggregation.cu b/cpp/src/aggregation/aggregation.cu index 4f5a731979d..e761a9fa817 100644 --- a/cpp/src/aggregation/aggregation.cu +++ b/cpp/src/aggregation/aggregation.cu @@ -44,6 +44,7 @@ namespace { * * Only works on columns of fixed-width types. */ + struct identity_initializer { private: template @@ -100,15 +101,12 @@ struct identity_initializer { auto sum_col = col.child(0); auto overflow_col = col.child(1); - // Initialize overflow column to false - thrust::fill(rmm::exec_policy_nosync(stream), - overflow_col.begin(), - overflow_col.end(), - false); - - // Initialize sum column based on its actual type + // Initialize sum column using SUM aggregation dispatch (same supported types) + // and overflow column separately dispatch_type_and_aggregation( sum_col.type(), aggregation::SUM, identity_initializer{}, sum_col, stream); + thrust::fill_n( + rmm::exec_policy_nosync(stream), overflow_col.begin(), col.size(), false); } else if constexpr (std::is_same_v) { // This should only happen for SUM_WITH_OVERFLOW, but handle it just in case CUDF_FAIL("Struct columns are only supported for SUM_WITH_OVERFLOW aggregation"); From 11f716297c63b5e2c51efb8b5ce63adc62a3bc35 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 21 Oct 2025 13:52:06 -0700 Subject: [PATCH 06/23] Disable decimal128 support for now --- cpp/include/cudf/detail/aggregation/aggregation.hpp | 5 +++-- cpp/src/aggregation/aggregation.cu | 3 +-- cpp/src/groupby/groupby.cu | 6 ++---- cpp/tests/groupby/sum_with_overflow_tests.cpp | 4 ++-- 4 files changed, 8 insertions(+), 10 deletions(-) diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index e13234d80ab..7ef2299c13c 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -1417,10 +1417,11 @@ struct target_type_impl requires(cudf::is_integral_not_bool() && cudf::is_signed() || - cudf::is_fixed_point()) + (cudf::is_fixed_point() && !std::is_same_v)) struct target_type_impl { using type = struct_view; // SUM_WITH_OVERFLOW outputs a struct with sum and overflow fields }; diff --git a/cpp/src/aggregation/aggregation.cu b/cpp/src/aggregation/aggregation.cu index e761a9fa817..f27df722412 100644 --- a/cpp/src/aggregation/aggregation.cu +++ b/cpp/src/aggregation/aggregation.cu @@ -101,8 +101,7 @@ struct identity_initializer { auto sum_col = col.child(0); auto overflow_col = col.child(1); - // Initialize sum column using SUM aggregation dispatch (same supported types) - // and overflow column separately + // Initialize sum column using standard SUM aggregation dispatch dispatch_type_and_aggregation( sum_col.type(), aggregation::SUM, identity_initializer{}, sum_col, stream); thrust::fill_n( diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index 90cf9fefd06..3397e79ebf7 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -220,10 +220,8 @@ void verify_valid_requests(host_span requests) if (agg->kind == aggregation::SUM_WITH_OVERFLOW) { CUDF_EXPECTS( cudf::detail::is_valid_aggregation(request.values.type(), aggregation::SUM_WITH_OVERFLOW), - "SUM_WITH_OVERFLOW aggregation only supports signed integer types (int8_t, int16_t, " - "int32_t, int64_t) and decimal types (decimal32, decimal64). " - "decimal128, unsigned integers, bool, dictionary columns, and other types are not " - "supported."); + "SUM_WITH_OVERFLOW aggregation only supports signed integer types and decimal types. " + "Unsigned integers, bool, dictionary columns, and other types are not supported."); } } } diff --git a/cpp/tests/groupby/sum_with_overflow_tests.cpp b/cpp/tests/groupby/sum_with_overflow_tests.cpp index 1b50659d8f7..d0bef40b479 100644 --- a/cpp/tests/groupby/sum_with_overflow_tests.cpp +++ b/cpp/tests/groupby/sum_with_overflow_tests.cpp @@ -30,13 +30,13 @@ using namespace numeric; using K = int32_t; -// SUM_WITH_OVERFLOW tests - supports all integer and decimal types +// SUM_WITH_OVERFLOW tests - supports signed integer and decimal types (excluding decimal128) template struct groupby_sum_with_overflow_test : public cudf::test::BaseFixture {}; using sum_with_overflow_supported_types = cudf::test::Concat, - cudf::test::FixedPointTypes>; + cudf::test::Types>; TYPED_TEST_SUITE(groupby_sum_with_overflow_test, sum_with_overflow_supported_types); From a9149907e81f23ca2dcb2d180d80c3133517b91e Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 21 Oct 2025 14:19:40 -0700 Subject: [PATCH 07/23] Minor cleanups --- .../cudf/detail/aggregation/device_aggregators.cuh | 9 ++------- cpp/src/aggregation/aggregation.cu | 1 - 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh index 53b628a4cd1..55f45a23873 100644 --- a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh +++ b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh @@ -165,13 +165,8 @@ struct update_target_element { *(overflow_column.data() + target_index)}; if (bool_ref.load(cuda::memory_order_relaxed)) { return; } - // Check for overflow - bool overflow = false; - if (source_value > 0 && old_sum > type_max - source_value) { - overflow = true; - } else if (source_value < 0 && old_sum < type_min - source_value) { - overflow = true; - } + auto const overflow = ((old_sum > 0 && source_value > 0 && old_sum > type_max - source_value) || + (old_sum < 0 && source_value < 0 && old_sum < type_min - source_value)); if (overflow) { cudf::detail::atomic_max(&overflow_column.element(target_index), true); } } diff --git a/cpp/src/aggregation/aggregation.cu b/cpp/src/aggregation/aggregation.cu index f27df722412..271ba926159 100644 --- a/cpp/src/aggregation/aggregation.cu +++ b/cpp/src/aggregation/aggregation.cu @@ -44,7 +44,6 @@ namespace { * * Only works on columns of fixed-width types. */ - struct identity_initializer { private: template From 512701b05b95955d9994e1beb2b8f0fca057f301 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 21 Oct 2025 14:23:21 -0700 Subject: [PATCH 08/23] Update comments --- cpp/include/cudf/detail/aggregation/device_aggregators.cuh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh index 55f45a23873..0861bd50057 100644 --- a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh +++ b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh @@ -165,6 +165,11 @@ struct update_target_element { *(overflow_column.data() + target_index)}; if (bool_ref.load(cuda::memory_order_relaxed)) { return; } + // Check for overflow before performing the addition to avoid UB + // For positive overflow: old_sum > 0, source_value > 0, and old_sum > max - source_value + // For negative overflow: old_sum < 0, source_value < 0, and old_sum < min - source_value + // TODO: to be replaced by CCCL equivalents once https://github.com/NVIDIA/cccl/pull/3755 is + // ready auto const overflow = ((old_sum > 0 && source_value > 0 && old_sum > type_max - source_value) || (old_sum < 0 && source_value < 0 && old_sum < type_min - source_value)); From 2de441774137f971751d89ff104970a495182cdc Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 21 Oct 2025 15:53:32 -0700 Subject: [PATCH 09/23] Add decimal128 support --- .../cudf/detail/aggregation/aggregation.hpp | 5 +- .../detail/aggregation/device_aggregators.cuh | 39 +++++ .../cudf/detail/utilities/device_atomics.cuh | 79 ++++++++++ .../device_atomics/device_atomics_test.cu | 40 ++++- cpp/tests/groupby/sum_with_overflow_tests.cpp | 138 +++++++++--------- 5 files changed, 225 insertions(+), 76 deletions(-) diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index 7ef2299c13c..e13234d80ab 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -1417,11 +1417,10 @@ struct target_type_impl requires(cudf::is_integral_not_bool() && cudf::is_signed() || - (cudf::is_fixed_point() && !std::is_same_v)) + cudf::is_fixed_point()) struct target_type_impl { using type = struct_view; // SUM_WITH_OVERFLOW outputs a struct with sum and overflow fields }; diff --git a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh index 0861bd50057..9b29d69004a 100644 --- a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh +++ b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh @@ -177,6 +177,45 @@ struct update_target_element { } }; +/** + * @brief Specialization of update_target_element for SUM_WITH_OVERFLOW with decimal128 + * + * Uses two-64-bit atomic operations to provide atomic addition for __int128_t. + * This enables SUM_WITH_OVERFLOW for decimal128 on all GPU architectures. + */ +template <> +struct update_target_element { + using DeviceType = device_storage_type_t; // __int128_t + static constexpr auto type_max = cuda::std::numeric_limits::max(); + static constexpr auto type_min = cuda::std::numeric_limits::min(); + + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + auto sum_column = target.child(0); + auto overflow_column = target.child(1); + + auto const source_value = source.element(source_index); + + // Use the new 128-bit atomic addition implementation + auto const old_sum = + cudf::detail::atomic_add(&sum_column.element(target_index), source_value); + + // Early exit if overflow is already set + auto bool_ref = cuda::atomic_ref{ + *(overflow_column.data() + target_index)}; + if (bool_ref.load(cuda::memory_order_relaxed)) { return; } + + // Check for overflow using the improved logic + auto const overflow = ((old_sum > 0 && source_value > 0 && old_sum > type_max - source_value) || + (old_sum < 0 && source_value < 0 && old_sum < type_min - source_value)); + + if (overflow) { cudf::detail::atomic_max(&overflow_column.element(target_index), true); } + } +}; + /** * @brief Function object to update a single element in a target column using * the dictionary key addressed by the specific index. diff --git a/cpp/include/cudf/detail/utilities/device_atomics.cuh b/cpp/include/cudf/detail/utilities/device_atomics.cuh index 9eff232fa96..3ec86d9b7a0 100644 --- a/cpp/include/cudf/detail/utilities/device_atomics.cuh +++ b/cpp/include/cudf/detail/utilities/device_atomics.cuh @@ -346,5 +346,84 @@ __forceinline__ __device__ T atomic_cas(T* address, T compare, T val) return cudf::detail::typesAtomicCASImpl()(address, compare, val); } +/** + * @brief Helper function to calculate carry for 64-bit addition + * + * @param old_val The original 64-bit value + * @param add_val The value being added + * @param carry_in Carry from previous addition + * @return 1 if carry is generated, 0 otherwise + */ +__device__ __forceinline__ uint64_t calculate_carry_64(uint64_t old_val, + uint64_t add_val, + uint64_t carry_in) +{ + // Use __uint128_t to detect overflow beyond 64-bit range + __uint128_t sum = static_cast<__uint128_t>(old_val) + add_val + carry_in; + return (sum > cuda::std::numeric_limits::max()) ? 1 : 0; +} + +/** + * @brief Atomic addition for __int128_t with GPU architecture optimization + * + * Uses native atomicAdd on Hopper+ GPUs (compute capability 9.0+) where 128-bit + * atomic operations are supported. Falls back to two 64-bit atomic CAS operations + * with carry propagation on older GPU architectures. + * + * @param address Pointer to the __int128_t value + * @param val Value to add + * @return The old value before addition + */ +__forceinline__ __device__ __int128_t atomic_add(__int128_t* address, __int128_t val) +{ +#if __CUDA_ARCH__ >= 900 // Hopper and newer (compute capability 9.0+) + // Use native 128-bit atomic support on Hopper+ GPUs + return atomicAdd(address, val); +#else + // Fall back to two 64-bit atomic implementation for older GPUs + + // Cast to signed for atomic_cas compatibility and treat as array of two int64_t + auto* target_ptr = reinterpret_cast(address); + auto add_val_unsigned = static_cast<__uint128_t>(val); + + // Split the 128-bit add value into two 64-bit parts + int64_t add_low = static_cast(static_cast(add_val_unsigned)); + int64_t add_high = static_cast(static_cast(add_val_unsigned >> 64)); + + int64_t carry = 0; + int64_t old_parts[2]; + + // Process both 64-bit parts with carry propagation + for (int i = 0; i < 2; ++i) { + int64_t current_add = (i == 0) ? add_low : add_high; + int64_t old_part, new_part, expected_part; + + // Atomic compare-and-swap loop for this 64-bit part + do { + expected_part = target_ptr[i]; + new_part = expected_part + current_add + carry; + old_part = atomicCAS(reinterpret_cast(&target_ptr[i]), + static_cast(expected_part), + static_cast(new_part)); + old_part = static_cast(old_part); + } while (old_part != expected_part); + + // Store the old value for result reconstruction + old_parts[i] = old_part; + + // Calculate carry for next iteration (convert back to unsigned for carry calculation) + carry = static_cast(calculate_carry_64(static_cast(old_part), + static_cast(current_add), + static_cast(carry))); + } + + // Reconstruct the original 128-bit value from the old parts + __uint128_t old_val_unsigned = + (static_cast<__uint128_t>(static_cast(old_parts[1])) << 64) | + static_cast(old_parts[0]); + return static_cast<__int128_t>(old_val_unsigned); +#endif +} + } // namespace detail } // namespace cudf diff --git a/cpp/tests/device_atomics/device_atomics_test.cu b/cpp/tests/device_atomics/device_atomics_test.cu index 2d0079f875b..b74b25c90a8 100644 --- a/cpp/tests/device_atomics/device_atomics_test.cu +++ b/cpp/tests/device_atomics/device_atomics_test.cu @@ -23,11 +23,14 @@ #include #include #include +#include #include #include #include -#include +#include + +#include #include @@ -261,4 +264,39 @@ TYPED_TEST(AtomicsTest, atomicCASRandom) this->atomic_test(input_array, is_cas_test, block_size, grid_size); } +__global__ void test_single_atomic_add_kernel(__int128_t* target, __int128_t value) +{ + cudf::detail::atomic_add(target, value); +} + +class Atomic128Test : public cudf::test::BaseFixture { + public: + void run_atomic_add_test(__int128_t initial_value, + __int128_t add_value, + __int128_t expected_result) + { + rmm::device_scalar<__int128_t> d_target(initial_value, cudf::test::get_default_stream()); + test_single_atomic_add_kernel<<<32, 256>>>(d_target.data(), add_value); + CUDF_CHECK_CUDA(cudf::test::get_default_stream().value()); + __int128_t result = d_target.value(cudf::test::get_default_stream()); + EXPECT_EQ(result, expected_result); + } +}; + +TEST_F(Atomic128Test, BasicAddition) { run_atomic_add_test(0, 1, 32 * 256 * 1); } + +TEST_F(Atomic128Test, CarryPropagation) +{ + constexpr int total_threads = 32 * 256; + __int128_t initial_value = cuda::std::numeric_limits::max() - total_threads + 1; + __int128_t expected = static_cast<__int128_t>(cuda::std::numeric_limits::max()) + 1; + run_atomic_add_test(initial_value, 1, expected); +} + +TEST_F(Atomic128Test, NegativeNumbers) +{ + constexpr int total_threads = 32 * 256; + run_atomic_add_test(total_threads * 100, -50, total_threads * 50); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/groupby/sum_with_overflow_tests.cpp b/cpp/tests/groupby/sum_with_overflow_tests.cpp index d0bef40b479..863fd72d5ca 100644 --- a/cpp/tests/groupby/sum_with_overflow_tests.cpp +++ b/cpp/tests/groupby/sum_with_overflow_tests.cpp @@ -30,13 +30,13 @@ using namespace numeric; using K = int32_t; -// SUM_WITH_OVERFLOW tests - supports signed integer and decimal types (excluding decimal128) +// SUM_WITH_OVERFLOW tests - supports signed integer and decimal types template struct groupby_sum_with_overflow_test : public cudf::test::BaseFixture {}; using sum_with_overflow_supported_types = cudf::test::Concat, - cudf::test::Types>; + cudf::test::FixedPointTypes>; TYPED_TEST_SUITE(groupby_sum_with_overflow_test, sum_with_overflow_supported_types); @@ -216,81 +216,75 @@ TYPED_TEST(groupby_sum_with_overflow_test, overflow_detection) using namespace numeric; using RepType = cudf::device_storage_type_t; - // Skip only decimal128 since it doesn't have atomic support - if constexpr (std::is_same_v) { - GTEST_SKIP() - << "decimal128 overflow detection test skipped - no atomic support for __int128_t"; - } else { - // Test decimal32 and decimal64 overflow detection - auto constexpr scale = scale_type{0}; // Use scale 0 for simplicity - - // Use type-specific values that will cause overflow for decimal types - auto constexpr type_max = cuda::std::numeric_limits::max(); - auto constexpr type_min = cuda::std::numeric_limits::min(); - - cudf::test::fixed_width_column_wrapper keys{1, 2, 3, 4, 1, 2, 2, 1, 3, 3, 2, 4, 4}; - - // Create values that will cause overflow for this specific decimal type - RepType large_positive = type_max - 5; // Close to max - RepType small_increment = 10; // Will cause overflow when added to large_positive - RepType large_negative = type_min + 5; // Close to min (decimal types are always signed) - RepType small_decrement = -10; // Will cause underflow when added to large_negative - - // Use values that fit within the type range for non-overflowing groups - RepType small_val1 = 10; - RepType small_val2 = 20; - RepType small_val3 = 30; - RepType small_val4 = 40; - - cudf::test::fixed_point_column_wrapper vals{ - {large_positive, // Group 1: Close to max - small_val1, // Group 2: Small value - small_val2, // Group 3: Small value - large_negative, // Group 4: Close to min - small_increment, // Group 1: Will cause positive overflow - small_val2, // Group 2: Small value - small_val3, // Group 2: Small value - large_positive, // Group 1: Close to max (second occurrence) - large_positive, // Group 3: Close to max - 1, // Group 3: Small value - small_val4, // Group 2: Small value - small_decrement, // Group 4: Will cause negative overflow - large_negative}, // Group 4: Close to min (second occurrence) - scale}; + // Test decimal overflow detection for all decimal types + auto constexpr scale = scale_type{0}; // Use scale 0 for simplicity - cudf::test::fixed_width_column_wrapper expect_keys{1, 2, 3, 4}; + // Use type-specific values that will cause overflow for decimal types + auto constexpr type_max = cuda::std::numeric_limits::max(); + auto constexpr type_min = cuda::std::numeric_limits::min(); - // Expected sums (with overflow handled by wrapping) - auto overflow_sum_1 = static_cast(static_cast(large_positive) + - static_cast(small_increment) + - static_cast(large_positive)); - auto normal_sum_2 = static_cast(small_val1 + small_val2 + small_val3 + small_val4); - auto overflow_sum_3 = - static_cast(static_cast(small_val2) + - static_cast(large_positive) + static_cast(1)); - auto overflow_sum_4 = static_cast(static_cast(large_negative) + - static_cast(small_decrement) + - static_cast(large_negative)); - - cudf::test::fixed_point_column_wrapper expect_sum_vals{ - {overflow_sum_1, normal_sum_2, overflow_sum_3, overflow_sum_4}, scale}; - cudf::test::fixed_width_column_wrapper expect_overflow_vals{true, false, true, true}; + cudf::test::fixed_width_column_wrapper keys{1, 2, 3, 4, 1, 2, 2, 1, 3, 3, 2, 4, 4}; - std::vector> children; - children.push_back(expect_sum_vals.release()); - children.push_back(expect_overflow_vals.release()); - auto expect_vals = cudf::create_structs_hierarchy(4, std::move(children), 0, {}); + // Create values that will cause overflow for this specific decimal type + RepType large_positive = type_max - 5; // Close to max + RepType small_increment = 10; // Will cause overflow when added to large_positive + RepType large_negative = type_min + 5; // Close to min (decimal types are always signed) + RepType small_decrement = -10; // Will cause underflow when added to large_negative - auto agg = cudf::make_sum_with_overflow_aggregation(); - test_single_agg(keys, vals, expect_keys, *expect_vals, std::move(agg)); + // Use values that fit within the type range for non-overflowing groups + RepType small_val1 = 10; + RepType small_val2 = 20; + RepType small_val3 = 30; + RepType small_val4 = 40; + + cudf::test::fixed_point_column_wrapper vals{ + {large_positive, // Group 1: Close to max + small_val1, // Group 2: Small value + small_val2, // Group 3: Small value + large_negative, // Group 4: Close to min + small_increment, // Group 1: Will cause positive overflow + small_val2, // Group 2: Small value + small_val3, // Group 2: Small value + large_positive, // Group 1: Close to max (second occurrence) + large_positive, // Group 3: Close to max + 1, // Group 3: Small value + small_val4, // Group 2: Small value + small_decrement, // Group 4: Will cause negative overflow + large_negative}, // Group 4: Close to min (second occurrence) + scale}; + + cudf::test::fixed_width_column_wrapper expect_keys{1, 2, 3, 4}; + + // Expected sums (with overflow handled by wrapping) + auto overflow_sum_1 = static_cast(static_cast(large_positive) + + static_cast(small_increment) + + static_cast(large_positive)); + auto normal_sum_2 = static_cast(small_val1 + small_val2 + small_val3 + small_val4); + auto overflow_sum_3 = + static_cast(static_cast(small_val2) + static_cast(large_positive) + + static_cast(1)); + auto overflow_sum_4 = static_cast(static_cast(large_negative) + + static_cast(small_decrement) + + static_cast(large_negative)); + + cudf::test::fixed_point_column_wrapper expect_sum_vals{ + {overflow_sum_1, normal_sum_2, overflow_sum_3, overflow_sum_4}, scale}; + cudf::test::fixed_width_column_wrapper expect_overflow_vals{true, false, true, true}; - // Verify that sort-based groupby throws for decimals - auto agg2 = cudf::make_sum_with_overflow_aggregation(); - EXPECT_THROW( - test_single_agg( - keys, vals, expect_keys, *expect_vals, std::move(agg2), force_use_sort_impl::YES), - cudf::logic_error); - } + std::vector> children; + children.push_back(expect_sum_vals.release()); + children.push_back(expect_overflow_vals.release()); + auto expect_vals = cudf::create_structs_hierarchy(4, std::move(children), 0, {}); + + auto agg = cudf::make_sum_with_overflow_aggregation(); + test_single_agg(keys, vals, expect_keys, *expect_vals, std::move(agg)); + + // Verify that sort-based groupby throws for decimals + auto agg2 = cudf::make_sum_with_overflow_aggregation(); + EXPECT_THROW( + test_single_agg( + keys, vals, expect_keys, *expect_vals, std::move(agg2), force_use_sort_impl::YES), + cudf::logic_error); } else { using DeviceType = cudf::device_storage_type_t; From 7ae723ab56ee4861a76d9295dcddf2583a061ac5 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 21 Oct 2025 15:56:06 -0700 Subject: [PATCH 10/23] Update tests --- cpp/tests/device_atomics/device_atomics_test.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tests/device_atomics/device_atomics_test.cu b/cpp/tests/device_atomics/device_atomics_test.cu index b74b25c90a8..08e9c2db3d3 100644 --- a/cpp/tests/device_atomics/device_atomics_test.cu +++ b/cpp/tests/device_atomics/device_atomics_test.cu @@ -296,7 +296,7 @@ TEST_F(Atomic128Test, CarryPropagation) TEST_F(Atomic128Test, NegativeNumbers) { constexpr int total_threads = 32 * 256; - run_atomic_add_test(total_threads * 100, -50, total_threads * 50); + run_atomic_add_test(0 - 50, -total_threads * 50); } CUDF_TEST_PROGRAM_MAIN() From 0429b9e9242498fe7ac211e93cda5caea7fe949e Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 21 Oct 2025 16:16:09 -0700 Subject: [PATCH 11/23] Cleanups --- .../cudf/detail/utilities/device_atomics.cuh | 57 +++++++------------ .../device_atomics/device_atomics_test.cu | 14 ++++- 2 files changed, 33 insertions(+), 38 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/device_atomics.cuh b/cpp/include/cudf/detail/utilities/device_atomics.cuh index 3ec86d9b7a0..2d85da43efe 100644 --- a/cpp/include/cudf/detail/utilities/device_atomics.cuh +++ b/cpp/include/cudf/detail/utilities/device_atomics.cuh @@ -39,6 +39,7 @@ #include #include +#include namespace cudf { namespace detail { @@ -376,51 +377,37 @@ __device__ __forceinline__ uint64_t calculate_carry_64(uint64_t old_val, */ __forceinline__ __device__ __int128_t atomic_add(__int128_t* address, __int128_t val) { -#if __CUDA_ARCH__ >= 900 // Hopper and newer (compute capability 9.0+) - // Use native 128-bit atomic support on Hopper+ GPUs +#if __CUDA_ARCH__ >= 900 return atomicAdd(address, val); #else - // Fall back to two 64-bit atomic implementation for older GPUs - - // Cast to signed for atomic_cas compatibility and treat as array of two int64_t - auto* target_ptr = reinterpret_cast(address); - auto add_val_unsigned = static_cast<__uint128_t>(val); + uint64_t* const target_ptr = reinterpret_cast(address); + __uint128_t const add_val_unsigned = static_cast<__uint128_t>(val); // Split the 128-bit add value into two 64-bit parts - int64_t add_low = static_cast(static_cast(add_val_unsigned)); - int64_t add_high = static_cast(static_cast(add_val_unsigned >> 64)); + uint64_t const add_low = static_cast(add_val_unsigned); + uint64_t const add_high = static_cast(add_val_unsigned >> 64); + + uint64_t carry = 0; + uint64_t old_parts[2]; - int64_t carry = 0; - int64_t old_parts[2]; + cuda::static_for<0, 2>([&](auto i) { + uint64_t const current_add = (i == 0) ? add_low : add_high; + uint64_t expected_part, new_part; - // Process both 64-bit parts with carry propagation - for (int i = 0; i < 2; ++i) { - int64_t current_add = (i == 0) ? add_low : add_high; - int64_t old_part, new_part, expected_part; + cuda::atomic_ref atomic_part{target_ptr[i]}; - // Atomic compare-and-swap loop for this 64-bit part do { - expected_part = target_ptr[i]; + expected_part = atomic_part.load(); new_part = expected_part + current_add + carry; - old_part = atomicCAS(reinterpret_cast(&target_ptr[i]), - static_cast(expected_part), - static_cast(new_part)); - old_part = static_cast(old_part); - } while (old_part != expected_part); - - // Store the old value for result reconstruction - old_parts[i] = old_part; - - // Calculate carry for next iteration (convert back to unsigned for carry calculation) - carry = static_cast(calculate_carry_64(static_cast(old_part), - static_cast(current_add), - static_cast(carry))); - } + } while ( + !atomic_part.compare_exchange_weak(expected_part, new_part, cuda::memory_order_relaxed)); + + old_parts[i] = expected_part; + carry = calculate_carry_64(expected_part, current_add, carry); + }); - // Reconstruct the original 128-bit value from the old parts - __uint128_t old_val_unsigned = - (static_cast<__uint128_t>(static_cast(old_parts[1])) << 64) | - static_cast(old_parts[0]); + __uint128_t const old_val_unsigned = + (static_cast<__uint128_t>(old_parts[1]) << 64) | old_parts[0]; return static_cast<__int128_t>(old_val_unsigned); #endif } diff --git a/cpp/tests/device_atomics/device_atomics_test.cu b/cpp/tests/device_atomics/device_atomics_test.cu index 08e9c2db3d3..e517e3bfa25 100644 --- a/cpp/tests/device_atomics/device_atomics_test.cu +++ b/cpp/tests/device_atomics/device_atomics_test.cu @@ -288,15 +288,23 @@ TEST_F(Atomic128Test, BasicAddition) { run_atomic_add_test(0, 1, 32 * 256 * 1); TEST_F(Atomic128Test, CarryPropagation) { constexpr int total_threads = 32 * 256; - __int128_t initial_value = cuda::std::numeric_limits::max() - total_threads + 1; - __int128_t expected = static_cast<__int128_t>(cuda::std::numeric_limits::max()) + 1; + __int128_t initial_value = cuda::std::numeric_limits::max() - total_threads + 1; + __int128_t expected = static_cast<__int128_t>(cuda::std::numeric_limits::max()) + 1; run_atomic_add_test(initial_value, 1, expected); } TEST_F(Atomic128Test, NegativeNumbers) { constexpr int total_threads = 32 * 256; - run_atomic_add_test(0 - 50, -total_threads * 50); + run_atomic_add_test(0, -50, -total_threads * 50); +} + +TEST_F(Atomic128Test, NegativeCarryPropagation) +{ + constexpr int total_threads = 32 * 256; + __int128_t initial_value = cuda::std::numeric_limits::min() + total_threads - 1; + __int128_t expected = static_cast<__int128_t>(cuda::std::numeric_limits::min()) - 1; + run_atomic_add_test(initial_value, -1, expected); } CUDF_TEST_PROGRAM_MAIN() From 7a9dc29e09f7cf5c2985a6609583e055d442c9bc Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 21 Oct 2025 16:19:47 -0700 Subject: [PATCH 12/23] Minor fix --- cpp/include/cudf/detail/aggregation/aggregation.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index e13234d80ab..51c31d259a3 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -1419,7 +1419,7 @@ struct target_type_impl - requires(cudf::is_integral_not_bool() && cudf::is_signed() || + requires((cudf::is_integral_not_bool() && cudf::is_signed()) || cudf::is_fixed_point()) struct target_type_impl { using type = struct_view; // SUM_WITH_OVERFLOW outputs a struct with sum and overflow fields From 2710856bdf91319ce6b93b903b4e7a48e9b24d51 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 21 Oct 2025 16:58:01 -0700 Subject: [PATCH 13/23] Fix Hopper atomicAdd --- .../cudf/detail/utilities/device_atomics.cuh | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/device_atomics.cuh b/cpp/include/cudf/detail/utilities/device_atomics.cuh index 2d85da43efe..d9dd163da9e 100644 --- a/cpp/include/cudf/detail/utilities/device_atomics.cuh +++ b/cpp/include/cudf/detail/utilities/device_atomics.cuh @@ -365,11 +365,11 @@ __device__ __forceinline__ uint64_t calculate_carry_64(uint64_t old_val, } /** - * @brief Atomic addition for __int128_t with GPU architecture optimization + * @brief Atomic addition for __int128_t with architecture-specific optimization * - * Uses native atomicAdd on Hopper+ GPUs (compute capability 9.0+) where 128-bit - * atomic operations are supported. Falls back to two 64-bit atomic CAS operations - * with carry propagation on older GPU architectures. + * Uses native 128-bit CAS on Hopper+ GPUs (compute capability 9.0+) for optimal + * performance. Falls back to two 64-bit atomic CAS operations with carry propagation + * on older GPU architectures. * * @param address Pointer to the __int128_t value * @param val Value to add @@ -378,7 +378,14 @@ __device__ __forceinline__ uint64_t calculate_carry_64(uint64_t old_val, __forceinline__ __device__ __int128_t atomic_add(__int128_t* address, __int128_t val) { #if __CUDA_ARCH__ >= 900 - return atomicAdd(address, val); + __int128_t expected, desired; + + do { + expected = *address; + desired = expected + val; + } while (atomicCAS(address, expected, desired) != expected); + + return expected; #else uint64_t* const target_ptr = reinterpret_cast(address); __uint128_t const add_val_unsigned = static_cast<__uint128_t>(val); From 7c0f1dac52213c150dde35c1f191212bacdd86fd Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 21 Oct 2025 17:34:24 -0700 Subject: [PATCH 14/23] Cleanups --- .../detail/aggregation/device_aggregators.cuh | 46 ++----------------- 1 file changed, 4 insertions(+), 42 deletions(-) diff --git a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh index 9b29d69004a..550eecac875 100644 --- a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh +++ b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh @@ -140,9 +140,10 @@ struct update_target_element { }; template - requires((cudf::is_integral_not_bool() && cudf::is_signed()) || - cudf::is_fixed_point() && - cudf::has_atomic_support>()) + requires( + (cudf::is_integral_not_bool() && cudf::is_signed()) || + (cudf::is_fixed_point() && cudf::has_atomic_support>()) || + cuda::std::is_same_v) struct update_target_element { using DeviceType = device_storage_type_t; static constexpr auto type_max = cuda::std::numeric_limits::max(); @@ -177,45 +178,6 @@ struct update_target_element { } }; -/** - * @brief Specialization of update_target_element for SUM_WITH_OVERFLOW with decimal128 - * - * Uses two-64-bit atomic operations to provide atomic addition for __int128_t. - * This enables SUM_WITH_OVERFLOW for decimal128 on all GPU architectures. - */ -template <> -struct update_target_element { - using DeviceType = device_storage_type_t; // __int128_t - static constexpr auto type_max = cuda::std::numeric_limits::max(); - static constexpr auto type_min = cuda::std::numeric_limits::min(); - - __device__ void operator()(mutable_column_device_view target, - size_type target_index, - column_device_view source, - size_type source_index) const noexcept - { - auto sum_column = target.child(0); - auto overflow_column = target.child(1); - - auto const source_value = source.element(source_index); - - // Use the new 128-bit atomic addition implementation - auto const old_sum = - cudf::detail::atomic_add(&sum_column.element(target_index), source_value); - - // Early exit if overflow is already set - auto bool_ref = cuda::atomic_ref{ - *(overflow_column.data() + target_index)}; - if (bool_ref.load(cuda::memory_order_relaxed)) { return; } - - // Check for overflow using the improved logic - auto const overflow = ((old_sum > 0 && source_value > 0 && old_sum > type_max - source_value) || - (old_sum < 0 && source_value < 0 && old_sum < type_min - source_value)); - - if (overflow) { cudf::detail::atomic_max(&overflow_column.element(target_index), true); } - } -}; - /** * @brief Function object to update a single element in a target column using * the dictionary key addressed by the specific index. From 60bde43186f928aa5682c510499d06000fd407f4 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 21 Oct 2025 19:13:01 -0700 Subject: [PATCH 15/23] Fix the stream issue in tests --- cpp/tests/device_atomics/device_atomics_test.cu | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cpp/tests/device_atomics/device_atomics_test.cu b/cpp/tests/device_atomics/device_atomics_test.cu index e517e3bfa25..fa8b5f7f01b 100644 --- a/cpp/tests/device_atomics/device_atomics_test.cu +++ b/cpp/tests/device_atomics/device_atomics_test.cu @@ -275,10 +275,11 @@ class Atomic128Test : public cudf::test::BaseFixture { __int128_t add_value, __int128_t expected_result) { - rmm::device_scalar<__int128_t> d_target(initial_value, cudf::test::get_default_stream()); - test_single_atomic_add_kernel<<<32, 256>>>(d_target.data(), add_value); - CUDF_CHECK_CUDA(cudf::test::get_default_stream().value()); - __int128_t result = d_target.value(cudf::test::get_default_stream()); + rmm::device_scalar<__int128_t> d_target(initial_value, cudf::get_default_stream()); + test_single_atomic_add_kernel<<<32, 256, 0, cudf::get_default_stream().value()>>>( + d_target.data(), add_value); + CUDF_CHECK_CUDA(cudf::get_default_stream().value()); + __int128_t result = d_target.value(cudf::get_default_stream()); EXPECT_EQ(result, expected_result); } }; From 64fe2593f4fecc25e6f1d2c8e3523353a855b93d Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 22 Oct 2025 11:55:38 -0700 Subject: [PATCH 16/23] Minor cleanups --- cpp/include/cudf/detail/aggregation/device_aggregators.cuh | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh index 550eecac875..8b3723b457e 100644 --- a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh +++ b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh @@ -166,13 +166,10 @@ struct update_target_element { *(overflow_column.data() + target_index)}; if (bool_ref.load(cuda::memory_order_relaxed)) { return; } - // Check for overflow before performing the addition to avoid UB - // For positive overflow: old_sum > 0, source_value > 0, and old_sum > max - source_value - // For negative overflow: old_sum < 0, source_value < 0, and old_sum < min - source_value // TODO: to be replaced by CCCL equivalents once https://github.com/NVIDIA/cccl/pull/3755 is // ready - auto const overflow = ((old_sum > 0 && source_value > 0 && old_sum > type_max - source_value) || - (old_sum < 0 && source_value < 0 && old_sum < type_min - source_value)); + auto const overflow = + source_value > 0 ? old_sum > type_max - source_value : old_sum < type_min - source_value; if (overflow) { cudf::detail::atomic_max(&overflow_column.element(target_index), true); } } From a11afbe2889872ffc326d008e981e3e2bffaa574 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 22 Oct 2025 14:36:18 -0700 Subject: [PATCH 17/23] Apply suggestion from @davidwendt Co-authored-by: David Wendt <45795991+davidwendt@users.noreply.github.com> --- cpp/include/cudf/detail/utilities/device_atomics.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/detail/utilities/device_atomics.cuh b/cpp/include/cudf/detail/utilities/device_atomics.cuh index d9dd163da9e..99f6aaa7472 100644 --- a/cpp/include/cudf/detail/utilities/device_atomics.cuh +++ b/cpp/include/cudf/detail/utilities/device_atomics.cuh @@ -361,7 +361,7 @@ __device__ __forceinline__ uint64_t calculate_carry_64(uint64_t old_val, { // Use __uint128_t to detect overflow beyond 64-bit range __uint128_t sum = static_cast<__uint128_t>(old_val) + add_val + carry_in; - return (sum > cuda::std::numeric_limits::max()) ? 1 : 0; + return sum > cuda::std::numeric_limits::max(); } /** From d520a28bfca6a1b0c802c5943c29780d4fa29cd3 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 22 Oct 2025 14:36:35 -0700 Subject: [PATCH 18/23] Apply suggestion from @davidwendt Co-authored-by: David Wendt <45795991+davidwendt@users.noreply.github.com> --- cpp/src/aggregation/aggregation.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/aggregation/aggregation.cu b/cpp/src/aggregation/aggregation.cu index 271ba926159..7be37125dc9 100644 --- a/cpp/src/aggregation/aggregation.cu +++ b/cpp/src/aggregation/aggregation.cu @@ -103,7 +103,7 @@ struct identity_initializer { // Initialize sum column using standard SUM aggregation dispatch dispatch_type_and_aggregation( sum_col.type(), aggregation::SUM, identity_initializer{}, sum_col, stream); - thrust::fill_n( + thrust::uninitialized_fill_n( rmm::exec_policy_nosync(stream), overflow_col.begin(), col.size(), false); } else if constexpr (std::is_same_v) { // This should only happen for SUM_WITH_OVERFLOW, but handle it just in case From 0748f55c0653e5ec02d2eb7e2dc66bc297fd80a4 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 23 Oct 2025 10:02:33 -0700 Subject: [PATCH 19/23] Clean up tests --- cpp/tests/groupby/sum_with_overflow_tests.cpp | 40 +++---------------- 1 file changed, 6 insertions(+), 34 deletions(-) diff --git a/cpp/tests/groupby/sum_with_overflow_tests.cpp b/cpp/tests/groupby/sum_with_overflow_tests.cpp index 863fd72d5ca..9009d2f803e 100644 --- a/cpp/tests/groupby/sum_with_overflow_tests.cpp +++ b/cpp/tests/groupby/sum_with_overflow_tests.cpp @@ -28,8 +28,6 @@ using namespace cudf::test::iterators; using namespace numeric; -using K = int32_t; - // SUM_WITH_OVERFLOW tests - supports signed integer and decimal types template struct groupby_sum_with_overflow_test : public cudf::test::BaseFixture {}; @@ -42,6 +40,7 @@ TYPED_TEST_SUITE(groupby_sum_with_overflow_test, sum_with_overflow_supported_typ TYPED_TEST(groupby_sum_with_overflow_test, basic) { + using K = int32_t; using V = TypeParam; cudf::test::fixed_width_column_wrapper keys{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; @@ -100,6 +99,7 @@ TYPED_TEST(groupby_sum_with_overflow_test, basic) TYPED_TEST(groupby_sum_with_overflow_test, empty_cols) { + using K = int32_t; using V = TypeParam; cudf::test::fixed_width_column_wrapper keys{}; @@ -124,6 +124,7 @@ TYPED_TEST(groupby_sum_with_overflow_test, empty_cols) TYPED_TEST(groupby_sum_with_overflow_test, zero_valid_keys) { + using K = int32_t; using V = TypeParam; cudf::test::fixed_width_column_wrapper keys({1, 2, 3}, cudf::test::iterators::all_nulls()); @@ -148,6 +149,7 @@ TYPED_TEST(groupby_sum_with_overflow_test, zero_valid_keys) TYPED_TEST(groupby_sum_with_overflow_test, zero_valid_values) { + using K = int32_t; using V = TypeParam; cudf::test::fixed_width_column_wrapper keys{1, 1, 1}; @@ -176,6 +178,7 @@ TYPED_TEST(groupby_sum_with_overflow_test, zero_valid_values) TYPED_TEST(groupby_sum_with_overflow_test, null_keys_and_values) { + using K = int32_t; using V = TypeParam; cudf::test::fixed_width_column_wrapper keys( @@ -210,6 +213,7 @@ TYPED_TEST(groupby_sum_with_overflow_test, null_keys_and_values) TYPED_TEST(groupby_sum_with_overflow_test, overflow_detection) { + using K = int32_t; using V = TypeParam; if constexpr (cudf::is_fixed_point()) { @@ -391,38 +395,6 @@ TYPED_TEST(groupby_sum_with_overflow_test, overflow_detection) } // end else block for non-decimal types } -// Test that SUM_WITH_OVERFLOW works with int32_t (which used to be unsupported) -TEST(groupby_sum_with_overflow_compatibility_test, int32_support) -{ - using K = int32_t; - using V = int32_t; // int32_t is now supported by SUM_WITH_OVERFLOW - - cudf::test::fixed_width_column_wrapper keys{1, 1, 1, 2, 2, 2, 3, 3, 3}; - cudf::test::fixed_width_column_wrapper vals{1, 2, 3, 4, 5, 6, 7, 8, 9}; - - cudf::test::fixed_width_column_wrapper expect_keys{1, 2, 3}; - - // Create expected struct column with sum and overflow children - auto sum_col = cudf::test::fixed_width_column_wrapper{6, 15, 24}; - auto overflow_col = cudf::test::fixed_width_column_wrapper{false, false, false}; - std::vector> children; - children.push_back(sum_col.release()); - children.push_back(overflow_col.release()); - auto expect_vals = cudf::create_structs_hierarchy(3, std::move(children), 0, {}); - - auto agg = cudf::make_sum_with_overflow_aggregation(); - - // SUM_WITH_OVERFLOW should now work with int32_t value types - test_single_agg(keys, vals, expect_keys, *expect_vals, std::move(agg), force_use_sort_impl::NO); - - // SUM_WITH_OVERFLOW should throw with sort-based groupby - auto agg_sort = cudf::make_sum_with_overflow_aggregation(); - EXPECT_THROW( - test_single_agg( - keys, vals, expect_keys, *expect_vals, std::move(agg_sort), force_use_sort_impl::YES), - cudf::logic_error); -} - // Test that SUM_WITH_OVERFLOW throws an error for bool type (which is not supported) TEST(groupby_sum_with_overflow_error_test, bool_type_not_supported) { From 2f154ab587c52b897ba9f8bca94951a9fb76bf58 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 23 Oct 2025 12:01:24 -0700 Subject: [PATCH 20/23] Header cleanups --- cpp/tests/groupby/sum_with_overflow_tests.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cpp/tests/groupby/sum_with_overflow_tests.cpp b/cpp/tests/groupby/sum_with_overflow_tests.cpp index 9009d2f803e..87686429e02 100644 --- a/cpp/tests/groupby/sum_with_overflow_tests.cpp +++ b/cpp/tests/groupby/sum_with_overflow_tests.cpp @@ -21,8 +21,11 @@ #include #include +#include #include -#include +#include +#include +#include #include using namespace cudf::test::iterators; From da628af6bc85da0a5942dc8d569dd74b5a9566d9 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 27 Oct 2025 11:41:57 -0700 Subject: [PATCH 21/23] Update copyrights --- cpp/tests/groupby/sum_with_overflow_tests.cpp | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/cpp/tests/groupby/sum_with_overflow_tests.cpp b/cpp/tests/groupby/sum_with_overflow_tests.cpp index 87686429e02..a320f4c1f64 100644 --- a/cpp/tests/groupby/sum_with_overflow_tests.cpp +++ b/cpp/tests/groupby/sum_with_overflow_tests.cpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #include From 3856f1d596a5a083cd973e1adff5a94a07517cdc Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 3 Nov 2025 09:45:16 -0800 Subject: [PATCH 22/23] Update comment --- cpp/src/groupby/hash/output_utils.cu | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/groupby/hash/output_utils.cu b/cpp/src/groupby/hash/output_utils.cu index 6029134ab94..20adec1b0f2 100644 --- a/cpp/src/groupby/hash/output_utils.cu +++ b/cpp/src/groupby/hash/output_utils.cu @@ -63,8 +63,7 @@ struct result_column_creator { auto make_children = [&make_empty_column, col_type = col.type()](size_type size) { std::vector> children; - // Create sum child column (int64_t) - no null mask needed, struct-level mask handles - // nullability + // Create sum child column - no null mask needed, struct-level mask handles nullability children.push_back(make_empty_column(col_type.id(), size, mask_state::UNALLOCATED)); // Create overflow child column (bool) - no null mask needed, only value matters children.push_back(make_empty_column(type_id::BOOL8, size, mask_state::UNALLOCATED)); From 4372f92c9bf44d1923585a3538e78c576e70cf1f Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 3 Nov 2025 14:05:01 -0800 Subject: [PATCH 23/23] Update cpp/include/cudf/detail/aggregation/aggregation.hpp Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com> --- cpp/include/cudf/detail/aggregation/aggregation.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index 6ef51f4af95..24832238981 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -1406,7 +1406,7 @@ struct target_type_impl requires((cudf::is_integral_not_bool() && cudf::is_signed()) || cudf::is_fixed_point())