From 1455bf53cf1320ca2e7ad35e6737b18d7514811e Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Tue, 5 Aug 2025 12:13:37 -0700
Subject: [PATCH 01/23] Add overflow support for signed integers and decimals

---
 .../cudf/detail/aggregation/aggregation.hpp   |   6 +-
 .../detail/aggregation/device_aggregators.cuh |  43 +-
 cpp/src/aggregation/aggregation.cu            |  19 +-
 cpp/src/groupby/groupby.cu                    |  24 +-
 .../hash/create_sparse_results_table.cu       |  10 +-
 cpp/src/groupby/sort/aggregate.cpp            |   5 +
 cpp/tests/CMakeLists.txt                      |   1 +
 cpp/tests/groupby/sum_tests.cpp               | 204 -------
 cpp/tests/groupby/sum_with_overflow_tests.cpp | 498 ++++++++++++++++++
 9 files changed, 579 insertions(+), 231 deletions(-)
 create mode 100644 cpp/tests/groupby/sum_with_overflow_tests.cpp
diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp
index 81084f8bdfb..b1f167fde58 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.hpp
+++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp
@@ -1411,8 +1411,12 @@ struct target_type_impl<Source,
   using type = Source;
 };
 
-// SUM_WITH_OVERFLOW always outputs a struct {sum: int64_t, overflow: bool} regardless of input type
+// SUM_WITH_OVERFLOW outputs a struct {sum: Source, overflow: bool} where sum type matches input
+// type Only supports signed integral types (excluding bool) and decimal types
 template <typename Source>
+  requires((cudf::is_integral<Source>() && !cuda::std::is_same_v<Source, bool> &&
+            cuda::std::is_signed_v<Source>) ||
+           cudf::is_fixed_point<Source>())
 struct target_type_impl<Source, aggregation::SUM_WITH_OVERFLOW> {
   using type = struct_view;  // SUM_WITH_OVERFLOW outputs a struct with sum and overflow fields
 };
diff --git a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh
index 0c5b57c51a7..a3d61b13292 100644
--- a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh
+++ b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh
@@ -23,7 +23,9 @@
 #include <cudf/detail/utilities/device_atomics.cuh>
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/table/table_device_view.cuh>
+#include <cudf/types.hpp>
 #include <cudf/utilities/traits.cuh>
+#include <cudf/utilities/type_dispatcher.hpp>
 
 #include <cuda/std/limits>
 #include <cuda/std/type_traits>
@@ -156,7 +158,10 @@ struct update_target_element<Source, aggregation::SUM> {
 };
 
 template <typename Source>
-  requires(cuda::std::is_same_v<Source, int64_t>)
+  requires(((cudf::is_integral<Source>() && !cuda::std::is_same_v<Source, bool> &&
+             cuda::std::is_signed_v<Source>) ||
+            cudf::is_fixed_point<Source>()) &&
+           cudf::has_atomic_support<device_storage_type_t<Source>>())
 struct update_target_element<Source, aggregation::SUM_WITH_OVERFLOW> {
   __device__ void operator()(mutable_column_device_view target,
                              size_type target_index,
@@ -168,9 +173,20 @@ struct update_target_element<Source, aggregation::SUM_WITH_OVERFLOW> {
     auto sum_column      = target.child(0);
     auto overflow_column = target.child(1);
 
-    auto const source_value = source.element<Source>(source_index);
+    using DeviceType = device_storage_type_t<Source>;
+
+    // Extract source value: for integral types, read as Source then cast; for decimal types, read
+    // as DeviceType directly
+    DeviceType device_source_value;
+    if constexpr (cudf::is_fixed_point<Source>()) {
+      device_source_value = source.element<DeviceType>(source_index);
+    } else {
+      auto const source_value = source.element<Source>(source_index);
+      device_source_value     = static_cast<DeviceType>(source_value);
+    }
+
     auto const old_sum =
-      cudf::detail::atomic_add(&sum_column.element<int64_t>(target_index), source_value);
+      cudf::detail::atomic_add(&sum_column.element<DeviceType>(target_index), device_source_value);
 
     // Early exit if overflow is already set to avoid unnecessary overflow checking
     auto bool_ref = cuda::atomic_ref<bool, cuda::thread_scope_device>{
@@ -178,15 +194,18 @@ struct update_target_element<Source, aggregation::SUM_WITH_OVERFLOW> {
     if (bool_ref.load(cuda::memory_order_relaxed)) { return; }
 
     // Check for overflow before performing the addition to avoid UB
-    // For positive overflow: old_sum > 0, source_value > 0, and old_sum > max - source_value
-    // For negative overflow: old_sum < 0, source_value < 0, and old_sum < min - source_value
-    // TODO: to be replaced by CCCL equivalents once https://github.com/NVIDIA/cccl/pull/3755 is
-    // ready
-    auto constexpr int64_max = cuda::std::numeric_limits<int64_t>::max();
-    auto constexpr int64_min = cuda::std::numeric_limits<int64_t>::min();
-    auto const overflow =
-      ((old_sum > 0 && source_value > 0 && old_sum > int64_max - source_value) ||
-       (old_sum < 0 && source_value < 0 && old_sum < int64_min - source_value));
+    auto constexpr type_max = cuda::std::numeric_limits<DeviceType>::max();
+    auto constexpr type_min = cuda::std::numeric_limits<DeviceType>::min();
+
+    bool overflow = false;
+    // Check for positive overflow (signed types only)
+    if (old_sum > 0 && device_source_value > 0 && old_sum > type_max - device_source_value) {
+      overflow = true;
+    }
+    // Check for negative overflow (signed types only)
+    if (old_sum < 0 && device_source_value < 0 && old_sum < type_min - device_source_value) {
+      overflow = true;
+    }
     if (overflow) {
       // Atomically set overflow flag to true (use atomic_max since true > false)
       cudf::detail::atomic_max(&overflow_column.element<bool>(target_index), true);
diff --git a/cpp/src/aggregation/aggregation.cu b/cpp/src/aggregation/aggregation.cu
index c58d1f7af7c..98f24e3b239 100644
--- a/cpp/src/aggregation/aggregation.cu
+++ b/cpp/src/aggregation/aggregation.cu
@@ -96,17 +96,22 @@ struct identity_initializer {
     requires(is_supported<T, k>())
   {
     if constexpr (k == aggregation::SUM_WITH_OVERFLOW) {
-      // SUM_WITH_OVERFLOW uses a struct with sum (int64_t) and overflow (bool) children
-      // Initialize sum child to 0 and overflow child to false
+      // SUM_WITH_OVERFLOW uses a struct with sum and overflow children
+      // We need to initialize each child separately since we don't know the sum column type at
+      // compile time
       auto sum_col      = col.child(0);
       auto overflow_col = col.child(1);
 
-      auto zip_begin = thrust::make_zip_iterator(
-        thrust::make_tuple(sum_col.begin<int64_t>(), overflow_col.begin<bool>()));
+      // Initialize overflow column to false
       thrust::fill(rmm::exec_policy_nosync(stream),
-                   zip_begin,
-                   zip_begin + col.size(),
-                   thrust::make_tuple(int64_t{0}, false));
+                   overflow_col.begin<bool>(),
+                   overflow_col.end<bool>(),
+                   false);
+
+      // Initialize sum column to 0 based on its actual type
+      // Use SUM aggregation identity initializer which sets to 0
+      dispatch_type_and_aggregation(
+        sum_col.type(), aggregation::SUM, identity_initializer{}, sum_col, stream);
     } else if constexpr (std::is_same_v<T, cudf::struct_view>) {
       // This should only happen for SUM_WITH_OVERFLOW, but handle it just in case
       CUDF_FAIL("Struct columns are only supported for SUM_WITH_OVERFLOW aggregation");
diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu
index bf3ee2a49aa..a67ac823d74 100644
--- a/cpp/src/groupby/groupby.cu
+++ b/cpp/src/groupby/groupby.cu
@@ -125,9 +125,10 @@ struct empty_column_constructor {
     if constexpr (k == aggregation::Kind::MERGE_HISTOGRAM) { return empty_like(values); }
 
     if constexpr (k == aggregation::Kind::SUM_WITH_OVERFLOW) {
-      // SUM_WITH_OVERFLOW returns a struct with sum (int64_t) and overflow (bool) children
+      // SUM_WITH_OVERFLOW returns a struct with sum (same type as input) and overflow (bool)
+      // children
       std::vector<std::unique_ptr<cudf::column>> children;
-      children.push_back(make_empty_column(cudf::data_type{cudf::type_id::INT64}));
+      children.push_back(make_empty_column(values.type()));
       children.push_back(make_empty_column(cudf::data_type{cudf::type_id::BOOL8}));
       return make_structs_column(0, std::move(children), 0, {}, stream, mr);
     }
@@ -212,6 +213,25 @@ void verify_valid_requests(host_span<RequestType const> requests)
           });
       }),
     "Invalid type/aggregation combination.");
+
+  // Additional validation for SUM_WITH_OVERFLOW: only signed integers and decimals are supported
+  for (auto const& request : requests) {
+    for (auto const& agg : request.aggregations) {
+      if (agg->kind == aggregation::SUM_WITH_OVERFLOW) {
+        auto const type_id = request.values.type().id();
+        bool is_supported =
+          (type_id == type_id::INT8 || type_id == type_id::INT16 || type_id == type_id::INT32 ||
+           type_id == type_id::INT64 || type_id == type_id::DECIMAL32 ||
+           type_id == type_id::DECIMAL64 || type_id == type_id::DECIMAL128);
+
+        CUDF_EXPECTS(
+          is_supported,
+          "SUM_WITH_OVERFLOW aggregation only supports signed integer types (int8_t, int16_t, "
+          "int32_t, int64_t) and decimal types (decimal32, decimal64, decimal128). "
+          "Unsigned integers, bool, dictionary columns, and other types are not supported.");
+      }
+    }
+  }
 }
 
 }  // namespace
diff --git a/cpp/src/groupby/hash/create_sparse_results_table.cu b/cpp/src/groupby/hash/create_sparse_results_table.cu
index dd622bbcc8f..d4bf95221b4 100644
--- a/cpp/src/groupby/hash/create_sparse_results_table.cu
+++ b/cpp/src/groupby/hash/create_sparse_results_table.cu
@@ -70,12 +70,12 @@ struct sparse_column_creator {
       };
 
       // Lambda to create children for SUM_WITH_OVERFLOW struct column
-      auto make_children = [&make_empty_column](cudf::size_type size, cudf::mask_state mask_state) {
+      auto make_children = [&make_empty_column, &col_type](cudf::size_type size,
+                                                           cudf::mask_state mask_state) {
         std::vector<std::unique_ptr<cudf::column>> children;
-        // Create sum child column (int64_t) - no null mask needed, struct-level mask handles
-        // nullability
-        children.push_back(
-          make_empty_column(cudf::type_id::INT64, size, cudf::mask_state::UNALLOCATED));
+        // Create sum child column (same type as input column) - no null mask needed, struct-level
+        // mask handles nullability
+        children.push_back(make_empty_column(col_type.id(), size, cudf::mask_state::UNALLOCATED));
         // Create overflow child column (bool) - no null mask needed, only value matters
         children.push_back(
           make_empty_column(cudf::type_id::BOOL8, size, cudf::mask_state::UNALLOCATED));
diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp
index 793461a7b0e..0e6898b7f4e 100644
--- a/cpp/src/groupby/sort/aggregate.cpp
+++ b/cpp/src/groupby/sort/aggregate.cpp
@@ -875,6 +875,11 @@ std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::sort
     auto store_functor =
       detail::aggregate_result_functor(request.values, helper(), cache, stream, mr);
     for (auto const& agg : request.aggregations) {
+      // SUM_WITH_OVERFLOW is only supported with hash-based groupby, not sort-based
+      CUDF_EXPECTS(agg->kind != aggregation::SUM_WITH_OVERFLOW,
+                   "SUM_WITH_OVERFLOW aggregation is only supported with hash-based groupby, not "
+                   "sort-based groupby");
+
       // TODO (dm): single pass compute all supported reductions
       cudf::detail::aggregation_dispatcher(agg->kind, store_functor, *agg);
     }
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 65e3a608660..089897b7d2e 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -160,6 +160,7 @@ ConfigureTest(
   groupby/sum_of_squares_tests.cpp
   groupby/sum_scan_tests.cpp
   groupby/sum_tests.cpp
+  groupby/sum_with_overflow_tests.cpp
   groupby/tdigest_tests.cu
   groupby/var_tests.cpp
   GPUS 1
diff --git a/cpp/tests/groupby/sum_tests.cpp b/cpp/tests/groupby/sum_tests.cpp
index f0ab60faab2..a537af0e08c 100644
--- a/cpp/tests/groupby/sum_tests.cpp
+++ b/cpp/tests/groupby/sum_tests.cpp
@@ -230,207 +230,3 @@ TYPED_TEST(GroupBySumFixedPointTest, GroupByHashSumDecimalAsValue)
     EXPECT_THROW(test_single_agg(keys, vals, expect_keys, {}, std::move(agg8)), cudf::logic_error);
   }
 }
-
-// SUM_WITH_OVERFLOW tests - only supports int64_t input values and outputs int64_t
-template <typename V>
-struct groupby_sum_with_overflow_test : public cudf::test::BaseFixture {};
-
-using sum_with_overflow_supported_types = cudf::test::Types<int64_t>;
-
-TYPED_TEST_SUITE(groupby_sum_with_overflow_test, sum_with_overflow_supported_types);
-
-TYPED_TEST(groupby_sum_with_overflow_test, basic)
-{
-  using V = TypeParam;
-
-  cudf::test::fixed_width_column_wrapper<K> keys{1, 2, 3, 1, 2, 2, 1, 3, 3, 2};
-  cudf::test::fixed_width_column_wrapper<V> vals{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
-
-  cudf::test::fixed_width_column_wrapper<K> expect_keys{1, 2, 3};
-
-  // Create expected struct column with sum and overflow children
-  auto sum_col      = cudf::test::fixed_width_column_wrapper<int64_t>{9, 19, 17};
-  auto overflow_col = cudf::test::fixed_width_column_wrapper<bool>{false, false, false};
-  std::vector<std::unique_ptr<cudf::column>> children;
-  children.push_back(sum_col.release());
-  children.push_back(overflow_col.release());
-  auto expect_vals = cudf::create_structs_hierarchy(3, std::move(children), 0, {});
-
-  auto agg = cudf::make_sum_with_overflow_aggregation<cudf::groupby_aggregation>();
-  test_single_agg(keys, vals, expect_keys, *expect_vals, std::move(agg));
-
-  // Note: SUM_WITH_OVERFLOW only works with hash groupby, not sort groupby
-}
-
-TYPED_TEST(groupby_sum_with_overflow_test, empty_cols)
-{
-  using V = TypeParam;
-
-  cudf::test::fixed_width_column_wrapper<K> keys{};
-  cudf::test::fixed_width_column_wrapper<V> vals{};
-
-  cudf::test::fixed_width_column_wrapper<K> expect_keys{};
-
-  // Create expected empty struct column with sum and overflow children
-  auto sum_col      = cudf::test::fixed_width_column_wrapper<int64_t>{};
-  auto overflow_col = cudf::test::fixed_width_column_wrapper<bool>{};
-  std::vector<std::unique_ptr<cudf::column>> children;
-  children.push_back(sum_col.release());
-  children.push_back(overflow_col.release());
-  auto expect_vals = cudf::create_structs_hierarchy(0, std::move(children), 0, {});
-
-  auto agg = cudf::make_sum_with_overflow_aggregation<cudf::groupby_aggregation>();
-  test_single_agg(keys, vals, expect_keys, *expect_vals, std::move(agg));
-
-  // Note: SUM_WITH_OVERFLOW only works with hash groupby, not sort groupby
-}
-
-TYPED_TEST(groupby_sum_with_overflow_test, zero_valid_keys)
-{
-  using V = TypeParam;
-
-  cudf::test::fixed_width_column_wrapper<K> keys({1, 2, 3}, cudf::test::iterators::all_nulls());
-  cudf::test::fixed_width_column_wrapper<V> vals{3, 4, 5};
-
-  cudf::test::fixed_width_column_wrapper<K> expect_keys{};
-
-  // Create expected empty struct column with sum and overflow children
-  auto sum_col      = cudf::test::fixed_width_column_wrapper<int64_t>{};
-  auto overflow_col = cudf::test::fixed_width_column_wrapper<bool>{};
-  std::vector<std::unique_ptr<cudf::column>> children;
-  children.push_back(sum_col.release());
-  children.push_back(overflow_col.release());
-  auto expect_vals = cudf::create_structs_hierarchy(0, std::move(children), 0, {});
-
-  auto agg = cudf::make_sum_with_overflow_aggregation<cudf::groupby_aggregation>();
-  test_single_agg(keys, vals, expect_keys, *expect_vals, std::move(agg));
-
-  // Note: SUM_WITH_OVERFLOW only works with hash groupby, not sort groupby
-}
-
-TYPED_TEST(groupby_sum_with_overflow_test, zero_valid_values)
-{
-  using V = TypeParam;
-
-  cudf::test::fixed_width_column_wrapper<K> keys{1, 1, 1};
-  cudf::test::fixed_width_column_wrapper<V> vals({3, 4, 5}, cudf::test::iterators::all_nulls());
-
-  cudf::test::fixed_width_column_wrapper<K> expect_keys{1};
-
-  // Create expected struct column with sum and overflow children (null result)
-  // Child columns have no null masks, only struct-level null mask matters
-  auto sum_col      = cudf::test::fixed_width_column_wrapper<int64_t>({0});
-  auto overflow_col = cudf::test::fixed_width_column_wrapper<bool>({false});
-  std::vector<std::unique_ptr<cudf::column>> children;
-  children.push_back(sum_col.release());
-  children.push_back(overflow_col.release());
-  std::vector<int> validity{0};  // null struct
-  auto [validity_mask, null_count] =
-    cudf::test::detail::make_null_mask(validity.begin(), validity.end());
-  auto expect_vals =
-    cudf::create_structs_hierarchy(1, std::move(children), null_count, std::move(validity_mask));
-
-  auto agg = cudf::make_sum_with_overflow_aggregation<cudf::groupby_aggregation>();
-  test_single_agg(keys, vals, expect_keys, *expect_vals, std::move(agg));
-
-  // Note: SUM_WITH_OVERFLOW only works with hash groupby, not sort groupby
-}
-
-TYPED_TEST(groupby_sum_with_overflow_test, null_keys_and_values)
-{
-  using V = TypeParam;
-
-  cudf::test::fixed_width_column_wrapper<K> keys(
-    {1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
-    {true, true, true, true, true, true, true, false, true, true, true});
-  cudf::test::fixed_width_column_wrapper<V> vals({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4},
-                                                 {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0});
-
-  //  { 1, 1,     2, 2, 2,   3, 3,    4}
-  cudf::test::fixed_width_column_wrapper<K> expect_keys({1, 2, 3, 4},
-                                                        cudf::test::iterators::no_nulls());
-
-  // Create expected struct column with sum and overflow children
-  //  { 3, 6,     1, 4, 9,   2, 8,    -}
-  // Child columns have no null masks, only struct-level null mask matters
-  auto sum_col      = cudf::test::fixed_width_column_wrapper<int64_t>({9, 14, 10, 0});
-  auto overflow_col = cudf::test::fixed_width_column_wrapper<bool>({false, false, false, false});
-  std::vector<std::unique_ptr<cudf::column>> children;
-  children.push_back(sum_col.release());
-  children.push_back(overflow_col.release());
-  std::vector<int> validity{1, 1, 1, 0};
-  auto [validity_mask, null_count] =
-    cudf::test::detail::make_null_mask(validity.begin(), validity.end());
-  auto expect_vals =
-    cudf::create_structs_hierarchy(4, std::move(children), null_count, std::move(validity_mask));
-
-  auto agg = cudf::make_sum_with_overflow_aggregation<cudf::groupby_aggregation>();
-  test_single_agg(keys, vals, expect_keys, *expect_vals, std::move(agg));
-
-  // Note: SUM_WITH_OVERFLOW only works with hash groupby, not sort groupby
-}
-
-TYPED_TEST(groupby_sum_with_overflow_test, overflow_detection)
-{
-  using V = TypeParam;
-
-  cudf::test::fixed_width_column_wrapper<K> keys{1, 2, 3, 4, 1, 2, 2, 1, 3, 3, 2, 4, 4};
-  // Mix of values that will cause positive and negative overflow for some groups but not others
-  cudf::test::fixed_width_column_wrapper<V> vals{
-    9223372036854775800L,    // Close to INT64_MAX
-    100L,                    // Small value
-    200L,                    // Small value
-    -9223372036854775800L,   // Close to INT64_MIN
-    20L,                     // Small value that will cause positive overflow when added to first
-    200L,                    // Small value
-    300L,                    // Small value
-    9223372036854775800L,    // Close to INT64_MAX
-    9223372036854775800L,    // Close to INT64_MAX
-    1L,                      // Small value
-    400L,                    // Small value
-    -20L,                    // Small value that will cause negative overflow when added to fourth
-    -9223372036854775800L};  // Close to INT64_MIN
-
-  cudf::test::fixed_width_column_wrapper<K> expect_keys{1, 2, 3, 4};
-
-  // Create expected struct column with sum and overflow children
-  // Group 1: 9223372036854775800 + 20 + 9223372036854775800 = positive overflow
-  // Group 2: 100 + 200 + 300 + 400 = 1000 (no overflow)
-  // Group 3: 200 + 9223372036854775800 + 1 = positive overflow
-  // Group 4: -9223372036854775800 + (-20) + (-9223372036854775800) = negative overflow
-  auto sum_col = cudf::test::fixed_width_column_wrapper<int64_t>{
-    4L,                     // Positive overflow result for group 1
-    1000L,                  // Normal sum for group 2 (no overflow)
-    -9223372036854775615L,  // Positive overflow result for group 3
-    -4L                     // Negative overflow result for group 4
-  };
-  auto overflow_col = cudf::test::fixed_width_column_wrapper<bool>{true, false, true, true};
-  std::vector<std::unique_ptr<cudf::column>> children;
-  children.push_back(sum_col.release());
-  children.push_back(overflow_col.release());
-  auto expect_vals = cudf::create_structs_hierarchy(4, std::move(children), 0, {});
-
-  auto agg = cudf::make_sum_with_overflow_aggregation<cudf::groupby_aggregation>();
-  test_single_agg(keys, vals, expect_keys, *expect_vals, std::move(agg));
-
-  // Note: SUM_WITH_OVERFLOW only works with hash groupby, not sort groupby
-}
-
-// Test that SUM_WITH_OVERFLOW throws an error for invalid value types
-TEST(groupby_sum_with_overflow_error_test, invalid_value_type)
-{
-  using K = int32_t;
-  using V = int32_t;  // Invalid type for SUM_WITH_OVERFLOW, should only support int64_t
-
-  cudf::test::fixed_width_column_wrapper<K> keys{1, 1, 1, 2, 2, 2, 3, 3, 3};
-  cudf::test::fixed_width_column_wrapper<V> vals{1, 2, 3, 4, 5, 6, 7, 8, 9};
-
-  cudf::test::fixed_width_column_wrapper<K> expect_keys{1, 2, 3};
-
-  auto agg = cudf::make_sum_with_overflow_aggregation<cudf::groupby_aggregation>();
-
-  // SUM_WITH_OVERFLOW should throw a logic_error when used with non-int64_t value types
-  EXPECT_THROW(
-    test_single_agg(keys, vals, expect_keys, {}, std::move(agg), force_use_sort_impl::NO),
-    cudf::logic_error);
-}
diff --git a/cpp/tests/groupby/sum_with_overflow_tests.cpp b/cpp/tests/groupby/sum_with_overflow_tests.cpp
new file mode 100644
index 00000000000..1b50659d8f7
--- /dev/null
+++ b/cpp/tests/groupby/sum_with_overflow_tests.cpp
@@ -0,0 +1,498 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <tests/groupby/groupby_test_util.hpp>
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/iterator_utilities.hpp>
+#include <cudf_test/type_lists.hpp>
+
+#include <cudf/column/column_factories.hpp>
+#include <cudf/detail/aggregation/aggregation.hpp>
+#include <cudf/utilities/type_dispatcher.hpp>
+
+using namespace cudf::test::iterators;
+using namespace numeric;
+
+using K = int32_t;
+
+// SUM_WITH_OVERFLOW tests - supports all integer and decimal types
+template <typename V>
+struct groupby_sum_with_overflow_test : public cudf::test::BaseFixture {};
+
+using sum_with_overflow_supported_types =
+  cudf::test::Concat<cudf::test::Types<int8_t, int16_t, int32_t, int64_t>,
+                     cudf::test::FixedPointTypes>;
+
+TYPED_TEST_SUITE(groupby_sum_with_overflow_test, sum_with_overflow_supported_types);
+
+TYPED_TEST(groupby_sum_with_overflow_test, basic)
+{
+  using V = TypeParam;
+
+  cudf::test::fixed_width_column_wrapper<K> keys{1, 2, 3, 1, 2, 2, 1, 3, 3, 2};
+  cudf::test::fixed_width_column_wrapper<K> expect_keys{1, 2, 3};
+
+  if constexpr (cudf::is_fixed_point<V>()) {
+    // For decimal types
+    using RepType    = cudf::device_storage_type_t<V>;
+    auto const scale = scale_type{0};
+    auto vals =
+      cudf::test::fixed_point_column_wrapper<RepType>{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, scale};
+
+    // Create expected struct column with sum and overflow children
+    auto sum_col      = cudf::test::fixed_point_column_wrapper<RepType>{{9, 19, 17}, scale};
+    auto overflow_col = cudf::test::fixed_width_column_wrapper<bool>{false, false, false};
+    std::vector<std::unique_ptr<cudf::column>> children;
+    children.push_back(sum_col.release());
+    children.push_back(overflow_col.release());
+    auto expect_vals = cudf::create_structs_hierarchy(3, std::move(children), 0, {});
+
+    auto agg = cudf::make_sum_with_overflow_aggregation<cudf::groupby_aggregation>();
+    test_single_agg(keys, vals, expect_keys, *expect_vals, std::move(agg));
+
+    // SUM_WITH_OVERFLOW should throw with sort-based groupby
+    auto agg_sort = cudf::make_sum_with_overflow_aggregation<cudf::groupby_aggregation>();
+    EXPECT_THROW(
+      test_single_agg(
+        keys, vals, expect_keys, *expect_vals, std::move(agg_sort), force_use_sort_impl::YES),
+      cudf::logic_error);
+  } else {
+    // For integer types
+    cudf::test::fixed_width_column_wrapper<V> vals{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+
+    // Create expected struct column with sum and overflow children
+    // Sum column type matches input type V
+    auto sum_col      = cudf::test::fixed_width_column_wrapper<V>{9, 19, 17};
+    auto overflow_col = cudf::test::fixed_width_column_wrapper<bool>{false, false, false};
+    std::vector<std::unique_ptr<cudf::column>> children;
+    children.push_back(sum_col.release());
+    children.push_back(overflow_col.release());
+    auto expect_vals = cudf::create_structs_hierarchy(3, std::move(children), 0, {});
+
+    auto agg = cudf::make_sum_with_overflow_aggregation<cudf::groupby_aggregation>();
+    test_single_agg(keys, vals, expect_keys, *expect_vals, std::move(agg));
+
+    // SUM_WITH_OVERFLOW should throw with sort-based groupby
+    auto agg_sort = cudf::make_sum_with_overflow_aggregation<cudf::groupby_aggregation>();
+    EXPECT_THROW(
+      test_single_agg(
+        keys, vals, expect_keys, *expect_vals, std::move(agg_sort), force_use_sort_impl::YES),
+      cudf::logic_error);
+  }
+
+  // Note: SUM_WITH_OVERFLOW only works with hash groupby, not sort groupby
+}
+
+TYPED_TEST(groupby_sum_with_overflow_test, empty_cols)
+{
+  using V = TypeParam;
+
+  cudf::test::fixed_width_column_wrapper<K> keys{};
+  cudf::test::fixed_width_column_wrapper<K> expect_keys{};
+
+  // For integer types
+  cudf::test::fixed_width_column_wrapper<V> vals{};
+
+  // Create expected empty struct column with sum and overflow children
+  auto sum_col      = cudf::test::fixed_width_column_wrapper<V>{};
+  auto overflow_col = cudf::test::fixed_width_column_wrapper<bool>{};
+  std::vector<std::unique_ptr<cudf::column>> children;
+  children.push_back(sum_col.release());
+  children.push_back(overflow_col.release());
+  auto expect_vals = cudf::create_structs_hierarchy(0, std::move(children), 0, {});
+
+  auto agg = cudf::make_sum_with_overflow_aggregation<cudf::groupby_aggregation>();
+  test_single_agg(keys, vals, expect_keys, *expect_vals, std::move(agg));
+
+  // Note: SUM_WITH_OVERFLOW only works with hash groupby, not sort groupby
+}
+
+TYPED_TEST(groupby_sum_with_overflow_test, zero_valid_keys)
+{
+  using V = TypeParam;
+
+  cudf::test::fixed_width_column_wrapper<K> keys({1, 2, 3}, cudf::test::iterators::all_nulls());
+  cudf::test::fixed_width_column_wrapper<K> expect_keys{};
+
+  // For integer types
+  cudf::test::fixed_width_column_wrapper<V> vals{3, 4, 5};
+
+  // Create expected empty struct column with sum and overflow children
+  auto sum_col      = cudf::test::fixed_width_column_wrapper<V>{};
+  auto overflow_col = cudf::test::fixed_width_column_wrapper<bool>{};
+  std::vector<std::unique_ptr<cudf::column>> children;
+  children.push_back(sum_col.release());
+  children.push_back(overflow_col.release());
+  auto expect_vals = cudf::create_structs_hierarchy(0, std::move(children), 0, {});
+
+  auto agg = cudf::make_sum_with_overflow_aggregation<cudf::groupby_aggregation>();
+  test_single_agg(keys, vals, expect_keys, *expect_vals, std::move(agg));
+
+  // Note: SUM_WITH_OVERFLOW only works with hash groupby, not sort groupby
+}
+
+TYPED_TEST(groupby_sum_with_overflow_test, zero_valid_values)
+{
+  using V = TypeParam;
+
+  cudf::test::fixed_width_column_wrapper<K> keys{1, 1, 1};
+  cudf::test::fixed_width_column_wrapper<V> vals({3, 4, 5}, cudf::test::iterators::all_nulls());
+
+  cudf::test::fixed_width_column_wrapper<K> expect_keys{1};
+
+  // Create expected struct column with sum and overflow children (null result)
+  // Child columns have no null masks, only struct-level null mask matters
+  auto sum_col      = cudf::test::fixed_width_column_wrapper<V>({0});
+  auto overflow_col = cudf::test::fixed_width_column_wrapper<bool>({false});
+  std::vector<std::unique_ptr<cudf::column>> children;
+  children.push_back(sum_col.release());
+  children.push_back(overflow_col.release());
+  std::vector<int> validity{0};  // null struct
+  auto [validity_mask, null_count] =
+    cudf::test::detail::make_null_mask(validity.begin(), validity.end());
+  auto expect_vals =
+    cudf::create_structs_hierarchy(1, std::move(children), null_count, std::move(validity_mask));
+
+  auto agg = cudf::make_sum_with_overflow_aggregation<cudf::groupby_aggregation>();
+  test_single_agg(keys, vals, expect_keys, *expect_vals, std::move(agg));
+
+  // Note: SUM_WITH_OVERFLOW only works with hash groupby, not sort groupby
+}
+
+TYPED_TEST(groupby_sum_with_overflow_test, null_keys_and_values)
+{
+  using V = TypeParam;
+
+  cudf::test::fixed_width_column_wrapper<K> keys(
+    {1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
+    {true, true, true, true, true, true, true, false, true, true, true});
+  cudf::test::fixed_width_column_wrapper<V> vals({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4},
+                                                 {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0});
+
+  //  { 1, 1,     2, 2, 2,   3, 3,    4}
+  cudf::test::fixed_width_column_wrapper<K> expect_keys({1, 2, 3, 4},
+                                                        cudf::test::iterators::no_nulls());
+
+  // Create expected struct column with sum and overflow children
+  //  { 3, 6,     1, 4, 9,   2, 8,    -}
+  // Child columns have no null masks, only struct-level null mask matters
+  auto sum_col      = cudf::test::fixed_width_column_wrapper<V>({9, 14, 10, 0});
+  auto overflow_col = cudf::test::fixed_width_column_wrapper<bool>({false, false, false, false});
+  std::vector<std::unique_ptr<cudf::column>> children;
+  children.push_back(sum_col.release());
+  children.push_back(overflow_col.release());
+  std::vector<int> validity{1, 1, 1, 0};
+  auto [validity_mask, null_count] =
+    cudf::test::detail::make_null_mask(validity.begin(), validity.end());
+  auto expect_vals =
+    cudf::create_structs_hierarchy(4, std::move(children), null_count, std::move(validity_mask));
+
+  auto agg = cudf::make_sum_with_overflow_aggregation<cudf::groupby_aggregation>();
+  test_single_agg(keys, vals, expect_keys, *expect_vals, std::move(agg));
+
+  // Note: SUM_WITH_OVERFLOW only works with hash groupby, not sort groupby
+}
+
+TYPED_TEST(groupby_sum_with_overflow_test, overflow_detection)
+{
+  using V = TypeParam;
+
+  if constexpr (cudf::is_fixed_point<V>()) {
+    using namespace numeric;
+    using RepType = cudf::device_storage_type_t<V>;
+
+    // Skip only decimal128 since it doesn't have atomic support
+    if constexpr (std::is_same_v<V, numeric::decimal128>) {
+      GTEST_SKIP()
+        << "decimal128 overflow detection test skipped - no atomic support for __int128_t";
+    } else {
+      // Test decimal32 and decimal64 overflow detection
+      auto constexpr scale = scale_type{0};  // Use scale 0 for simplicity
+
+      // Use type-specific values that will cause overflow for decimal types
+      auto constexpr type_max = cuda::std::numeric_limits<RepType>::max();
+      auto constexpr type_min = cuda::std::numeric_limits<RepType>::min();
+
+      cudf::test::fixed_width_column_wrapper<K> keys{1, 2, 3, 4, 1, 2, 2, 1, 3, 3, 2, 4, 4};
+
+      // Create values that will cause overflow for this specific decimal type
+      RepType large_positive  = type_max - 5;  // Close to max
+      RepType small_increment = 10;            // Will cause overflow when added to large_positive
+      RepType large_negative  = type_min + 5;  // Close to min (decimal types are always signed)
+      RepType small_decrement = -10;           // Will cause underflow when added to large_negative
+
+      // Use values that fit within the type range for non-overflowing groups
+      RepType small_val1 = 10;
+      RepType small_val2 = 20;
+      RepType small_val3 = 30;
+      RepType small_val4 = 40;
+
+      cudf::test::fixed_point_column_wrapper<RepType> vals{
+        {large_positive,   // Group 1: Close to max
+         small_val1,       // Group 2: Small value
+         small_val2,       // Group 3: Small value
+         large_negative,   // Group 4: Close to min
+         small_increment,  // Group 1: Will cause positive overflow
+         small_val2,       // Group 2: Small value
+         small_val3,       // Group 2: Small value
+         large_positive,   // Group 1: Close to max (second occurrence)
+         large_positive,   // Group 3: Close to max
+         1,                // Group 3: Small value
+         small_val4,       // Group 2: Small value
+         small_decrement,  // Group 4: Will cause negative overflow
+         large_negative},  // Group 4: Close to min (second occurrence)
+        scale};
+
+      cudf::test::fixed_width_column_wrapper<K> expect_keys{1, 2, 3, 4};
+
+      // Expected sums (with overflow handled by wrapping)
+      auto overflow_sum_1 = static_cast<RepType>(static_cast<RepType>(large_positive) +
+                                                 static_cast<RepType>(small_increment) +
+                                                 static_cast<RepType>(large_positive));
+      auto normal_sum_2   = static_cast<RepType>(small_val1 + small_val2 + small_val3 + small_val4);
+      auto overflow_sum_3 =
+        static_cast<RepType>(static_cast<RepType>(small_val2) +
+                             static_cast<RepType>(large_positive) + static_cast<RepType>(1));
+      auto overflow_sum_4 = static_cast<RepType>(static_cast<RepType>(large_negative) +
+                                                 static_cast<RepType>(small_decrement) +
+                                                 static_cast<RepType>(large_negative));
+
+      cudf::test::fixed_point_column_wrapper<RepType> expect_sum_vals{
+        {overflow_sum_1, normal_sum_2, overflow_sum_3, overflow_sum_4}, scale};
+      cudf::test::fixed_width_column_wrapper<bool> expect_overflow_vals{true, false, true, true};
+
+      std::vector<std::unique_ptr<cudf::column>> children;
+      children.push_back(expect_sum_vals.release());
+      children.push_back(expect_overflow_vals.release());
+      auto expect_vals = cudf::create_structs_hierarchy(4, std::move(children), 0, {});
+
+      auto agg = cudf::make_sum_with_overflow_aggregation<cudf::groupby_aggregation>();
+      test_single_agg(keys, vals, expect_keys, *expect_vals, std::move(agg));
+
+      // Verify that sort-based groupby throws for decimals
+      auto agg2 = cudf::make_sum_with_overflow_aggregation<cudf::groupby_aggregation>();
+      EXPECT_THROW(
+        test_single_agg(
+          keys, vals, expect_keys, *expect_vals, std::move(agg2), force_use_sort_impl::YES),
+        cudf::logic_error);
+    }
+  } else {
+    using DeviceType = cudf::device_storage_type_t<V>;
+
+    // Use type-specific values that will cause overflow for each integer type
+    auto constexpr type_max = cuda::std::numeric_limits<DeviceType>::max();
+    auto constexpr type_min = cuda::std::numeric_limits<DeviceType>::min();
+
+    cudf::test::fixed_width_column_wrapper<K> keys{1, 2, 3, 4, 1, 2, 2, 1, 3, 3, 2, 4, 4};
+
+    // Create values that will cause overflow for this specific type
+    // Use smaller increments for smaller types to avoid immediate wrapping
+    DeviceType large_positive  = type_max - 5;  // Close to max
+    DeviceType small_increment = 10;            // Will cause overflow when added to large_positive
+    DeviceType large_negative, small_decrement;
+    if constexpr (cuda::std::is_signed_v<DeviceType>) {
+      large_negative  = type_min + 5;  // Close to min (only for signed types)
+      small_decrement = -10;  // Will cause underflow when added to large_negative (signed only)
+    }
+
+    // Use values that fit within the type range for non-overflowing groups
+    DeviceType small_val1 = 10;
+    DeviceType small_val2 = 20;
+    DeviceType small_val3 = 30;
+    DeviceType small_val4 = 40;
+
+    if constexpr (cuda::std::is_signed_v<DeviceType>) {
+      // For signed types: test both positive and negative overflow
+      cudf::test::fixed_width_column_wrapper<V> vals{
+        static_cast<V>(large_positive),   // Group 1: Close to max
+        static_cast<V>(small_val1),       // Group 2: Small value
+        static_cast<V>(small_val2),       // Group 3: Small value
+        static_cast<V>(large_negative),   // Group 4: Close to min
+        static_cast<V>(small_increment),  // Group 1: Will cause positive overflow
+        static_cast<V>(small_val2),       // Group 2: Small value
+        static_cast<V>(small_val3),       // Group 2: Small value
+        static_cast<V>(large_positive),   // Group 1: Close to max (second occurrence)
+        static_cast<V>(large_positive),   // Group 3: Close to max
+        static_cast<V>(1),                // Group 3: Small value
+        static_cast<V>(small_val4),       // Group 2: Small value
+        static_cast<V>(small_decrement),  // Group 4: Will cause negative overflow
+        static_cast<V>(large_negative)};  // Group 4: Close to min (second occurrence)
+
+      cudf::test::fixed_width_column_wrapper<K> expect_keys{1, 2, 3, 4};
+
+      // Expected results: Groups 1, 3, and 4 overflow; Group 2 does not
+      auto sum_col = cudf::test::fixed_width_column_wrapper<V>{
+        static_cast<V>(static_cast<DeviceType>(large_positive) + small_increment +
+                       static_cast<DeviceType>(large_positive)),  // Group 1: overflowed result
+        static_cast<V>(small_val1 + small_val2 + small_val3 + small_val4),  // Group 2: no overflow
+        static_cast<V>(static_cast<DeviceType>(small_val2) +
+                       static_cast<DeviceType>(large_positive) +
+                       static_cast<DeviceType>(1)),  // Group 3: overflowed result
+        static_cast<V>(static_cast<DeviceType>(large_negative) + small_decrement +
+                       static_cast<DeviceType>(large_negative))  // Group 4: overflowed result
+      };
+      auto overflow_col = cudf::test::fixed_width_column_wrapper<bool>{true, false, true, true};
+      std::vector<std::unique_ptr<cudf::column>> children;
+      children.push_back(sum_col.release());
+      children.push_back(overflow_col.release());
+      auto expect_vals = cudf::create_structs_hierarchy(4, std::move(children), 0, {});
+
+      auto agg = cudf::make_sum_with_overflow_aggregation<cudf::groupby_aggregation>();
+      test_single_agg(keys, vals, expect_keys, *expect_vals, std::move(agg));
+    } else {
+      // For unsigned types: only test positive overflow
+      cudf::test::fixed_width_column_wrapper<V> vals{
+        static_cast<V>(large_positive),   // Group 1: Close to max
+        static_cast<V>(small_val1),       // Group 2: Small value
+        static_cast<V>(small_val2),       // Group 3: Small value
+        static_cast<V>(small_val1),       // Group 4: Small value
+        static_cast<V>(small_increment),  // Group 1: Will cause positive overflow
+        static_cast<V>(small_val2),       // Group 2: Small value
+        static_cast<V>(small_val3),       // Group 2: Small value
+        static_cast<V>(large_positive),   // Group 1: Close to max (second occurrence)
+        static_cast<V>(large_positive),   // Group 3: Close to max
+        static_cast<V>(1),                // Group 3: Small value
+        static_cast<V>(small_val4),       // Group 2: Small value
+        static_cast<V>(small_val2),       // Group 4: Small value
+        static_cast<V>(small_val3)};      // Group 4: Small value
+
+      cudf::test::fixed_width_column_wrapper<K> expect_keys{1, 2, 3, 4};
+
+      // Expected results: Groups 1 and 3 overflow; Groups 2 and 4 do not
+      auto sum_col = cudf::test::fixed_width_column_wrapper<V>{
+        static_cast<V>(static_cast<DeviceType>(large_positive) + small_increment +
+                       static_cast<DeviceType>(large_positive)),  // Group 1: overflowed result
+        static_cast<V>(small_val1 + small_val2 + small_val3 + small_val4),  // Group 2: no overflow
+        static_cast<V>(static_cast<DeviceType>(small_val2) +
+                       static_cast<DeviceType>(large_positive) +
+                       static_cast<DeviceType>(1)),           // Group 3: overflowed result
+        static_cast<V>(small_val1 + small_val2 + small_val3)  // Group 4: no overflow
+      };
+      auto overflow_col = cudf::test::fixed_width_column_wrapper<bool>{true, false, true, false};
+      std::vector<std::unique_ptr<cudf::column>> children;
+      children.push_back(sum_col.release());
+      children.push_back(overflow_col.release());
+      auto expect_vals = cudf::create_structs_hierarchy(4, std::move(children), 0, {});
+
+      auto agg = cudf::make_sum_with_overflow_aggregation<cudf::groupby_aggregation>();
+      test_single_agg(keys, vals, expect_keys, *expect_vals, std::move(agg));
+    }
+
+    // Note: SUM_WITH_OVERFLOW only works with hash groupby, not sort groupby
+  }  // end else block for non-decimal types
+}
+
+// Test that SUM_WITH_OVERFLOW works with int32_t (which used to be unsupported)
+TEST(groupby_sum_with_overflow_compatibility_test, int32_support)
+{
+  using K = int32_t;
+  using V = int32_t;  // int32_t is now supported by SUM_WITH_OVERFLOW
+
+  cudf::test::fixed_width_column_wrapper<K> keys{1, 1, 1, 2, 2, 2, 3, 3, 3};
+  cudf::test::fixed_width_column_wrapper<V> vals{1, 2, 3, 4, 5, 6, 7, 8, 9};
+
+  cudf::test::fixed_width_column_wrapper<K> expect_keys{1, 2, 3};
+
+  // Create expected struct column with sum and overflow children
+  auto sum_col      = cudf::test::fixed_width_column_wrapper<V>{6, 15, 24};
+  auto overflow_col = cudf::test::fixed_width_column_wrapper<bool>{false, false, false};
+  std::vector<std::unique_ptr<cudf::column>> children;
+  children.push_back(sum_col.release());
+  children.push_back(overflow_col.release());
+  auto expect_vals = cudf::create_structs_hierarchy(3, std::move(children), 0, {});
+
+  auto agg = cudf::make_sum_with_overflow_aggregation<cudf::groupby_aggregation>();
+
+  // SUM_WITH_OVERFLOW should now work with int32_t value types
+  test_single_agg(keys, vals, expect_keys, *expect_vals, std::move(agg), force_use_sort_impl::NO);
+
+  // SUM_WITH_OVERFLOW should throw with sort-based groupby
+  auto agg_sort = cudf::make_sum_with_overflow_aggregation<cudf::groupby_aggregation>();
+  EXPECT_THROW(
+    test_single_agg(
+      keys, vals, expect_keys, *expect_vals, std::move(agg_sort), force_use_sort_impl::YES),
+    cudf::logic_error);
+}
+
+// Test that SUM_WITH_OVERFLOW throws an error for bool type (which is not supported)
+TEST(groupby_sum_with_overflow_error_test, bool_type_not_supported)
+{
+  using K = int32_t;
+  using V = bool;  // bool type should not be supported by SUM_WITH_OVERFLOW
+
+  cudf::test::fixed_width_column_wrapper<K> keys{1, 1, 1};
+  cudf::test::fixed_width_column_wrapper<V> vals{true, false, true};
+
+  auto agg = cudf::make_sum_with_overflow_aggregation<cudf::groupby_aggregation>();
+
+  std::vector<cudf::groupby::aggregation_request> requests;
+  requests.emplace_back();
+  requests[0].values = vals;
+  requests[0].aggregations.push_back(std::move(agg));
+
+  auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys}));
+
+  // This should throw an exception since bool is not supported
+  EXPECT_THROW(gb_obj.aggregate(requests), cudf::logic_error);
+}
+
+// Test that SUM_WITH_OVERFLOW throws an error for unsigned integer types (which are not supported)
+TEST(groupby_sum_with_overflow_error_test, unsigned_types_not_supported)
+{
+  using K = int32_t;
+
+  // Test uint32_t
+  {
+    using V = uint32_t;  // unsigned types should not be supported by SUM_WITH_OVERFLOW
+
+    cudf::test::fixed_width_column_wrapper<K> keys{1, 1, 1};
+    cudf::test::fixed_width_column_wrapper<V> vals{1, 2, 3};
+
+    auto agg = cudf::make_sum_with_overflow_aggregation<cudf::groupby_aggregation>();
+
+    std::vector<cudf::groupby::aggregation_request> requests;
+    requests.emplace_back();
+    requests[0].values = vals;
+    requests[0].aggregations.push_back(std::move(agg));
+
+    auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys}));
+
+    // This should throw an exception since unsigned types are not supported
+    EXPECT_THROW(gb_obj.aggregate(requests), cudf::logic_error);
+  }
+
+  // Test uint64_t
+  {
+    using V = uint64_t;  // unsigned types should not be supported by SUM_WITH_OVERFLOW
+
+    cudf::test::fixed_width_column_wrapper<K> keys{1, 1, 1};
+    cudf::test::fixed_width_column_wrapper<V> vals{1, 2, 3};
+
+    auto agg = cudf::make_sum_with_overflow_aggregation<cudf::groupby_aggregation>();
+
+    std::vector<cudf::groupby::aggregation_request> requests;
+    requests.emplace_back();
+    requests[0].values = vals;
+    requests[0].aggregations.push_back(std::move(agg));
+
+    auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys}));
+
+    // This should throw an exception since unsigned types are not supported
+    EXPECT_THROW(gb_obj.aggregate(requests), cudf::logic_error);
+  }
+}

From 3badfc789e6f9f07d69b069ffdc4ba635e49da51 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Tue, 21 Oct 2025 10:54:00 -0700
Subject: [PATCH 02/23] Resolve conflicts

---
 cpp/src/groupby/hash/output_utils.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/groupby/hash/output_utils.cu b/cpp/src/groupby/hash/output_utils.cu
index 860845a5313..5a8b0fc2ed0 100644
--- a/cpp/src/groupby/hash/output_utils.cu
+++ b/cpp/src/groupby/hash/output_utils.cu
@@ -72,11 +72,11 @@ struct result_column_creator {
       return make_fixed_width_column(data_type{type_id}, size, mask_state, stream, mr);
     };
 
-    auto make_children = [&make_empty_column](size_type size) {
+    auto make_children = [&make_empty_column, col_type = col.type()](size_type size) {
       std::vector<std::unique_ptr<column>> children;
       // Create sum child column (int64_t) - no null mask needed, struct-level mask handles
       // nullability
-      children.push_back(make_empty_column(type_id::INT64, size, mask_state::UNALLOCATED));
+      children.push_back(make_empty_column(col_type.id(), size, mask_state::UNALLOCATED));
       // Create overflow child column (bool) - no null mask needed, only value matters
       children.push_back(make_empty_column(type_id::BOOL8, size, mask_state::UNALLOCATED));
       return children;

From 8a446391cad95ab119d807fcac663b8eff7e5ac8 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Tue, 21 Oct 2025 12:02:37 -0700
Subject: [PATCH 03/23] Cleanups

---
 .../cudf/detail/aggregation/aggregation.hpp   |  3 +-
 .../detail/aggregation/device_aggregators.cuh | 48 ++++++-------------
 cpp/src/aggregation/aggregation.cu            | 22 ++++-----
 cpp/src/groupby/groupby.cu                    | 13 ++---
 4 files changed, 30 insertions(+), 56 deletions(-)

diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp
index d96a0cd244c..e13234d80ab 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.hpp
+++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp
@@ -1419,8 +1419,7 @@ struct target_type_impl<Source,
 // SUM_WITH_OVERFLOW outputs a struct {sum: Source, overflow: bool} where sum type matches input
 // type Only supports signed integral types (excluding bool) and decimal types
 template <typename Source>
-  requires((cudf::is_integral<Source>() && !cuda::std::is_same_v<Source, bool> &&
-            cuda::std::is_signed_v<Source>) ||
+  requires(cudf::is_integral_not_bool<Source>() && cudf::is_signed<Source>() ||
            cudf::is_fixed_point<Source>())
 struct target_type_impl<Source, aggregation::SUM_WITH_OVERFLOW> {
   using type = struct_view;  // SUM_WITH_OVERFLOW outputs a struct with sum and overflow fields
diff --git a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh
index 4b6d2af5dcf..53b628a4cd1 100644
--- a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh
+++ b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh
@@ -140,58 +140,40 @@ struct update_target_element<Source, aggregation::SUM> {
 };
 
 template <typename Source>
-  requires(((cudf::is_integral<Source>() && !cuda::std::is_same_v<Source, bool> &&
-             cuda::std::is_signed_v<Source>) ||
-            cudf::is_fixed_point<Source>()) &&
-           cudf::has_atomic_support<device_storage_type_t<Source>>())
+  requires((cudf::is_integral_not_bool<Source>() && cudf::is_signed<Source>()) ||
+           cudf::is_fixed_point<Source>() &&
+             cudf::has_atomic_support<device_storage_type_t<Source>>())
 struct update_target_element<Source, aggregation::SUM_WITH_OVERFLOW> {
+  using DeviceType               = device_storage_type_t<Source>;
+  static constexpr auto type_max = cuda::std::numeric_limits<DeviceType>::max();
+  static constexpr auto type_min = cuda::std::numeric_limits<DeviceType>::min();
+
   __device__ void operator()(mutable_column_device_view target,
                              size_type target_index,
                              column_device_view source,
                              size_type source_index) const noexcept
   {
-    // For SUM_WITH_OVERFLOW, target is a struct with sum value at child(0) and overflow flag at
-    // child(1)
     auto sum_column      = target.child(0);
     auto overflow_column = target.child(1);
 
-    using DeviceType = device_storage_type_t<Source>;
-
-    // Extract source value: for integral types, read as Source then cast; for decimal types, read
-    // as DeviceType directly
-    DeviceType device_source_value;
-    if constexpr (cudf::is_fixed_point<Source>()) {
-      device_source_value = source.element<DeviceType>(source_index);
-    } else {
-      auto const source_value = source.element<Source>(source_index);
-      device_source_value     = static_cast<DeviceType>(source_value);
-    }
-
+    auto const source_value = source.element<DeviceType>(source_index);
     auto const old_sum =
-      cudf::detail::atomic_add(&sum_column.element<DeviceType>(target_index), device_source_value);
+      cudf::detail::atomic_add(&sum_column.element<DeviceType>(target_index), source_value);
 
-    // Early exit if overflow is already set to avoid unnecessary overflow checking
+    // Early exit if overflow is already set
     auto bool_ref = cuda::atomic_ref<bool, cuda::thread_scope_device>{
       *(overflow_column.data<bool>() + target_index)};
     if (bool_ref.load(cuda::memory_order_relaxed)) { return; }
 
-    // Check for overflow before performing the addition to avoid UB
-    auto constexpr type_max = cuda::std::numeric_limits<DeviceType>::max();
-    auto constexpr type_min = cuda::std::numeric_limits<DeviceType>::min();
-
+    // Check for overflow
     bool overflow = false;
-    // Check for positive overflow (signed types only)
-    if (old_sum > 0 && device_source_value > 0 && old_sum > type_max - device_source_value) {
+    if (source_value > 0 && old_sum > type_max - source_value) {
       overflow = true;
-    }
-    // Check for negative overflow (signed types only)
-    if (old_sum < 0 && device_source_value < 0 && old_sum < type_min - device_source_value) {
+    } else if (source_value < 0 && old_sum < type_min - source_value) {
       overflow = true;
     }
-    if (overflow) {
-      // Atomically set overflow flag to true (use atomic_max since true > false)
-      cudf::detail::atomic_max(&overflow_column.element<bool>(target_index), true);
-    }
+
+    if (overflow) { cudf::detail::atomic_max(&overflow_column.element<bool>(target_index), true); }
   }
 };
 
diff --git a/cpp/src/aggregation/aggregation.cu b/cpp/src/aggregation/aggregation.cu
index 652f8037de8..bf795c24d9b 100644
--- a/cpp/src/aggregation/aggregation.cu
+++ b/cpp/src/aggregation/aggregation.cu
@@ -97,21 +97,19 @@ struct identity_initializer {
   {
     if constexpr (k == aggregation::SUM_WITH_OVERFLOW) {
       // SUM_WITH_OVERFLOW uses a struct with sum and overflow children
-      // We need to initialize each child separately since we don't know the sum column type at
-      // compile time
       auto sum_col      = col.child(0);
       auto overflow_col = col.child(1);
 
-      // Initialize overflow column to false
-      thrust::fill(rmm::exec_policy_nosync(stream),
-                   overflow_col.begin<bool>(),
-                   overflow_col.end<bool>(),
-                   false);
-
-      // Initialize sum column to 0 based on its actual type
-      // Use SUM aggregation identity initializer which sets to 0
-      dispatch_type_and_aggregation(
-        sum_col.type(), aggregation::SUM, identity_initializer{}, sum_col, stream);
+      // Initialize both columns using device storage type
+      cudf::type_dispatcher(sum_col.type(), [&]<typename SumType>() {
+        using DeviceType = device_storage_type_t<SumType>;
+        auto zip_it =
+          thrust::make_zip_iterator(sum_col.begin<DeviceType>(), overflow_col.begin<bool>());
+        thrust::fill_n(rmm::exec_policy_nosync(stream),
+                       zip_it,
+                       col.size(),
+                       thrust::make_tuple(DeviceType{0}, false));
+      });
     } else if constexpr (std::is_same_v<T, cudf::struct_view>) {
       // This should only happen for SUM_WITH_OVERFLOW, but handle it just in case
       CUDF_FAIL("Struct columns are only supported for SUM_WITH_OVERFLOW aggregation");
diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu
index a67ac823d74..90cf9fefd06 100644
--- a/cpp/src/groupby/groupby.cu
+++ b/cpp/src/groupby/groupby.cu
@@ -218,17 +218,12 @@ void verify_valid_requests(host_span<RequestType const> requests)
   for (auto const& request : requests) {
     for (auto const& agg : request.aggregations) {
       if (agg->kind == aggregation::SUM_WITH_OVERFLOW) {
-        auto const type_id = request.values.type().id();
-        bool is_supported =
-          (type_id == type_id::INT8 || type_id == type_id::INT16 || type_id == type_id::INT32 ||
-           type_id == type_id::INT64 || type_id == type_id::DECIMAL32 ||
-           type_id == type_id::DECIMAL64 || type_id == type_id::DECIMAL128);
-
         CUDF_EXPECTS(
-          is_supported,
+          cudf::detail::is_valid_aggregation(request.values.type(), aggregation::SUM_WITH_OVERFLOW),
           "SUM_WITH_OVERFLOW aggregation only supports signed integer types (int8_t, int16_t, "
-          "int32_t, int64_t) and decimal types (decimal32, decimal64, decimal128). "
-          "Unsigned integers, bool, dictionary columns, and other types are not supported.");
+          "int32_t, int64_t) and decimal types (decimal32, decimal64). "
+          "decimal128, unsigned integers, bool, dictionary columns, and other types are not "
+          "supported.");
       }
     }
   }

From e0616f42f613894a70c990e0bfec6e1bdc07272c Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Tue, 21 Oct 2025 12:18:40 -0700
Subject: [PATCH 04/23] Fix

---
 cpp/src/aggregation/aggregation.cu | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/cpp/src/aggregation/aggregation.cu b/cpp/src/aggregation/aggregation.cu
index bf795c24d9b..4f5a731979d 100644
--- a/cpp/src/aggregation/aggregation.cu
+++ b/cpp/src/aggregation/aggregation.cu
@@ -100,16 +100,15 @@ struct identity_initializer {
       auto sum_col      = col.child(0);
       auto overflow_col = col.child(1);
 
-      // Initialize both columns using device storage type
-      cudf::type_dispatcher(sum_col.type(), [&]<typename SumType>() {
-        using DeviceType = device_storage_type_t<SumType>;
-        auto zip_it =
-          thrust::make_zip_iterator(sum_col.begin<DeviceType>(), overflow_col.begin<bool>());
-        thrust::fill_n(rmm::exec_policy_nosync(stream),
-                       zip_it,
-                       col.size(),
-                       thrust::make_tuple(DeviceType{0}, false));
-      });
+      // Initialize overflow column to false
+      thrust::fill(rmm::exec_policy_nosync(stream),
+                   overflow_col.begin<bool>(),
+                   overflow_col.end<bool>(),
+                   false);
+
+      // Initialize sum column based on its actual type
+      dispatch_type_and_aggregation(
+        sum_col.type(), aggregation::SUM, identity_initializer{}, sum_col, stream);
     } else if constexpr (std::is_same_v<T, cudf::struct_view>) {
       // This should only happen for SUM_WITH_OVERFLOW, but handle it just in case
       CUDF_FAIL("Struct columns are only supported for SUM_WITH_OVERFLOW aggregation");

From e5fd7ee8bb28ef3197dd4170385dcd1e29e8f8be Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Tue, 21 Oct 2025 12:47:54 -0700
Subject: [PATCH 05/23] Cleanups

---
 cpp/src/aggregation/aggregation.cu | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/cpp/src/aggregation/aggregation.cu b/cpp/src/aggregation/aggregation.cu
index 4f5a731979d..e761a9fa817 100644
--- a/cpp/src/aggregation/aggregation.cu
+++ b/cpp/src/aggregation/aggregation.cu
@@ -44,6 +44,7 @@ namespace {
  *
  * Only works on columns of fixed-width types.
  */
+
 struct identity_initializer {
  private:
   template <typename T, aggregation::Kind k>
@@ -100,15 +101,12 @@ struct identity_initializer {
       auto sum_col      = col.child(0);
       auto overflow_col = col.child(1);
 
-      // Initialize overflow column to false
-      thrust::fill(rmm::exec_policy_nosync(stream),
-                   overflow_col.begin<bool>(),
-                   overflow_col.end<bool>(),
-                   false);
-
-      // Initialize sum column based on its actual type
+      // Initialize sum column using SUM aggregation dispatch (same supported types)
+      // and overflow column separately
       dispatch_type_and_aggregation(
         sum_col.type(), aggregation::SUM, identity_initializer{}, sum_col, stream);
+      thrust::fill_n(
+        rmm::exec_policy_nosync(stream), overflow_col.begin<bool>(), col.size(), false);
     } else if constexpr (std::is_same_v<T, cudf::struct_view>) {
       // This should only happen for SUM_WITH_OVERFLOW, but handle it just in case
       CUDF_FAIL("Struct columns are only supported for SUM_WITH_OVERFLOW aggregation");

From 11f716297c63b5e2c51efb8b5ce63adc62a3bc35 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Tue, 21 Oct 2025 13:52:06 -0700
Subject: [PATCH 06/23] Disable decimal128 support for now

---
 cpp/include/cudf/detail/aggregation/aggregation.hpp | 5 +++--
 cpp/src/aggregation/aggregation.cu                  | 3 +--
 cpp/src/groupby/groupby.cu                          | 6 ++----
 cpp/tests/groupby/sum_with_overflow_tests.cpp       | 4 ++--
 4 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp
index e13234d80ab..7ef2299c13c 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.hpp
+++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp
@@ -1417,10 +1417,11 @@ struct target_type_impl<Source,
 };
 
 // SUM_WITH_OVERFLOW outputs a struct {sum: Source, overflow: bool} where sum type matches input
-// type Only supports signed integral types (excluding bool) and decimal types
+// type Only supports signed integral types (excluding bool) and decimal types (excluding
+// decimal128)
 template <typename Source>
   requires(cudf::is_integral_not_bool<Source>() && cudf::is_signed<Source>() ||
-           cudf::is_fixed_point<Source>())
+           (cudf::is_fixed_point<Source>() && !std::is_same_v<Source, numeric::decimal128>))
 struct target_type_impl<Source, aggregation::SUM_WITH_OVERFLOW> {
   using type = struct_view;  // SUM_WITH_OVERFLOW outputs a struct with sum and overflow fields
 };
diff --git a/cpp/src/aggregation/aggregation.cu b/cpp/src/aggregation/aggregation.cu
index e761a9fa817..f27df722412 100644
--- a/cpp/src/aggregation/aggregation.cu
+++ b/cpp/src/aggregation/aggregation.cu
@@ -101,8 +101,7 @@ struct identity_initializer {
       auto sum_col      = col.child(0);
       auto overflow_col = col.child(1);
 
-      // Initialize sum column using SUM aggregation dispatch (same supported types)
-      // and overflow column separately
+      // Initialize sum column using standard SUM aggregation dispatch
       dispatch_type_and_aggregation(
         sum_col.type(), aggregation::SUM, identity_initializer{}, sum_col, stream);
       thrust::fill_n(
diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu
index 90cf9fefd06..3397e79ebf7 100644
--- a/cpp/src/groupby/groupby.cu
+++ b/cpp/src/groupby/groupby.cu
@@ -220,10 +220,8 @@ void verify_valid_requests(host_span<RequestType const> requests)
       if (agg->kind == aggregation::SUM_WITH_OVERFLOW) {
         CUDF_EXPECTS(
           cudf::detail::is_valid_aggregation(request.values.type(), aggregation::SUM_WITH_OVERFLOW),
-          "SUM_WITH_OVERFLOW aggregation only supports signed integer types (int8_t, int16_t, "
-          "int32_t, int64_t) and decimal types (decimal32, decimal64). "
-          "decimal128, unsigned integers, bool, dictionary columns, and other types are not "
-          "supported.");
+          "SUM_WITH_OVERFLOW aggregation only supports signed integer types and decimal types. "
+          "Unsigned integers, bool, dictionary columns, and other types are not supported.");
       }
     }
   }
diff --git a/cpp/tests/groupby/sum_with_overflow_tests.cpp b/cpp/tests/groupby/sum_with_overflow_tests.cpp
index 1b50659d8f7..d0bef40b479 100644
--- a/cpp/tests/groupby/sum_with_overflow_tests.cpp
+++ b/cpp/tests/groupby/sum_with_overflow_tests.cpp
@@ -30,13 +30,13 @@ using namespace numeric;
 
 using K = int32_t;
 
-// SUM_WITH_OVERFLOW tests - supports all integer and decimal types
+// SUM_WITH_OVERFLOW tests - supports signed integer and decimal types (excluding decimal128)
 template <typename V>
 struct groupby_sum_with_overflow_test : public cudf::test::BaseFixture {};
 
 using sum_with_overflow_supported_types =
   cudf::test::Concat<cudf::test::Types<int8_t, int16_t, int32_t, int64_t>,
-                     cudf::test::FixedPointTypes>;
+                     cudf::test::Types<numeric::decimal32, numeric::decimal64>>;
 
 TYPED_TEST_SUITE(groupby_sum_with_overflow_test, sum_with_overflow_supported_types);
 

From a9149907e81f23ca2dcb2d180d80c3133517b91e Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Tue, 21 Oct 2025 14:19:40 -0700
Subject: [PATCH 07/23] Minor cleanups

---
 .../cudf/detail/aggregation/device_aggregators.cuh       | 9 ++-------
 cpp/src/aggregation/aggregation.cu                       | 1 -
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh
index 53b628a4cd1..55f45a23873 100644
--- a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh
+++ b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh
@@ -165,13 +165,8 @@ struct update_target_element<Source, aggregation::SUM_WITH_OVERFLOW> {
       *(overflow_column.data<bool>() + target_index)};
     if (bool_ref.load(cuda::memory_order_relaxed)) { return; }
 
-    // Check for overflow
-    bool overflow = false;
-    if (source_value > 0 && old_sum > type_max - source_value) {
-      overflow = true;
-    } else if (source_value < 0 && old_sum < type_min - source_value) {
-      overflow = true;
-    }
+    auto const overflow = ((old_sum > 0 && source_value > 0 && old_sum > type_max - source_value) ||
+                           (old_sum < 0 && source_value < 0 && old_sum < type_min - source_value));
 
     if (overflow) { cudf::detail::atomic_max(&overflow_column.element<bool>(target_index), true); }
   }
diff --git a/cpp/src/aggregation/aggregation.cu b/cpp/src/aggregation/aggregation.cu
index f27df722412..271ba926159 100644
--- a/cpp/src/aggregation/aggregation.cu
+++ b/cpp/src/aggregation/aggregation.cu
@@ -44,7 +44,6 @@ namespace {
  *
  * Only works on columns of fixed-width types.
  */
-
 struct identity_initializer {
  private:
   template <typename T, aggregation::Kind k>

From 512701b05b95955d9994e1beb2b8f0fca057f301 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Tue, 21 Oct 2025 14:23:21 -0700
Subject: [PATCH 08/23] Update comments

---
 cpp/include/cudf/detail/aggregation/device_aggregators.cuh | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh
index 55f45a23873..0861bd50057 100644
--- a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh
+++ b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh
@@ -165,6 +165,11 @@ struct update_target_element<Source, aggregation::SUM_WITH_OVERFLOW> {
       *(overflow_column.data<bool>() + target_index)};
     if (bool_ref.load(cuda::memory_order_relaxed)) { return; }
 
+    // Check for overflow before performing the addition to avoid UB
+    // For positive overflow: old_sum > 0, source_value > 0, and old_sum > max - source_value
+    // For negative overflow: old_sum < 0, source_value < 0, and old_sum < min - source_value
+    // TODO: to be replaced by CCCL equivalents once https://github.com/NVIDIA/cccl/pull/3755 is
+    // ready
     auto const overflow = ((old_sum > 0 && source_value > 0 && old_sum > type_max - source_value) ||
                            (old_sum < 0 && source_value < 0 && old_sum < type_min - source_value));
 

From 2de441774137f971751d89ff104970a495182cdc Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Tue, 21 Oct 2025 15:53:32 -0700
Subject: [PATCH 09/23] Add decimal128 support

---
 .../cudf/detail/aggregation/aggregation.hpp   |   5 +-
 .../detail/aggregation/device_aggregators.cuh |  39 +++++
 .../cudf/detail/utilities/device_atomics.cuh  |  79 ++++++++++
 .../device_atomics/device_atomics_test.cu     |  40 ++++-
 cpp/tests/groupby/sum_with_overflow_tests.cpp | 138 +++++++++---------
 5 files changed, 225 insertions(+), 76 deletions(-)

diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp
index 7ef2299c13c..e13234d80ab 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.hpp
+++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp
@@ -1417,11 +1417,10 @@ struct target_type_impl<Source,
 };
 
 // SUM_WITH_OVERFLOW outputs a struct {sum: Source, overflow: bool} where sum type matches input
-// type Only supports signed integral types (excluding bool) and decimal types (excluding
-// decimal128)
+// type Only supports signed integral types (excluding bool) and decimal types
 template <typename Source>
   requires(cudf::is_integral_not_bool<Source>() && cudf::is_signed<Source>() ||
-           (cudf::is_fixed_point<Source>() && !std::is_same_v<Source, numeric::decimal128>))
+           cudf::is_fixed_point<Source>())
 struct target_type_impl<Source, aggregation::SUM_WITH_OVERFLOW> {
   using type = struct_view;  // SUM_WITH_OVERFLOW outputs a struct with sum and overflow fields
 };
diff --git a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh
index 0861bd50057..9b29d69004a 100644
--- a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh
+++ b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh
@@ -177,6 +177,45 @@ struct update_target_element<Source, aggregation::SUM_WITH_OVERFLOW> {
   }
 };
 
+/**
+ * @brief Specialization of update_target_element for SUM_WITH_OVERFLOW with decimal128
+ *
+ * Uses two-64-bit atomic operations to provide atomic addition for __int128_t.
+ * This enables SUM_WITH_OVERFLOW for decimal128 on all GPU architectures.
+ */
+template <>
+struct update_target_element<numeric::decimal128, aggregation::SUM_WITH_OVERFLOW> {
+  using DeviceType               = device_storage_type_t<numeric::decimal128>;  // __int128_t
+  static constexpr auto type_max = cuda::std::numeric_limits<DeviceType>::max();
+  static constexpr auto type_min = cuda::std::numeric_limits<DeviceType>::min();
+
+  __device__ void operator()(mutable_column_device_view target,
+                             size_type target_index,
+                             column_device_view source,
+                             size_type source_index) const noexcept
+  {
+    auto sum_column      = target.child(0);
+    auto overflow_column = target.child(1);
+
+    auto const source_value = source.element<DeviceType>(source_index);
+
+    // Use the new 128-bit atomic addition implementation
+    auto const old_sum =
+      cudf::detail::atomic_add(&sum_column.element<DeviceType>(target_index), source_value);
+
+    // Early exit if overflow is already set
+    auto bool_ref = cuda::atomic_ref<bool, cuda::thread_scope_device>{
+      *(overflow_column.data<bool>() + target_index)};
+    if (bool_ref.load(cuda::memory_order_relaxed)) { return; }
+
+    // Check for overflow using the improved logic
+    auto const overflow = ((old_sum > 0 && source_value > 0 && old_sum > type_max - source_value) ||
+                           (old_sum < 0 && source_value < 0 && old_sum < type_min - source_value));
+
+    if (overflow) { cudf::detail::atomic_max(&overflow_column.element<bool>(target_index), true); }
+  }
+};
+
 /**
  * @brief Function object to update a single element in a target column using
  * the dictionary key addressed by the specific index.
diff --git a/cpp/include/cudf/detail/utilities/device_atomics.cuh b/cpp/include/cudf/detail/utilities/device_atomics.cuh
index 9eff232fa96..3ec86d9b7a0 100644
--- a/cpp/include/cudf/detail/utilities/device_atomics.cuh
+++ b/cpp/include/cudf/detail/utilities/device_atomics.cuh
@@ -346,5 +346,84 @@ __forceinline__ __device__ T atomic_cas(T* address, T compare, T val)
   return cudf::detail::typesAtomicCASImpl<T>()(address, compare, val);
 }
 
+/**
+ * @brief Helper function to calculate carry for 64-bit addition
+ *
+ * @param old_val The original 64-bit value
+ * @param add_val The value being added
+ * @param carry_in Carry from previous addition
+ * @return 1 if carry is generated, 0 otherwise
+ */
+__device__ __forceinline__ uint64_t calculate_carry_64(uint64_t old_val,
+                                                       uint64_t add_val,
+                                                       uint64_t carry_in)
+{
+  // Use __uint128_t to detect overflow beyond 64-bit range
+  __uint128_t sum = static_cast<__uint128_t>(old_val) + add_val + carry_in;
+  return (sum > cuda::std::numeric_limits<uint64_t>::max()) ? 1 : 0;
+}
+
+/**
+ * @brief Atomic addition for __int128_t with GPU architecture optimization
+ *
+ * Uses native atomicAdd on Hopper+ GPUs (compute capability 9.0+) where 128-bit
+ * atomic operations are supported. Falls back to two 64-bit atomic CAS operations
+ * with carry propagation on older GPU architectures.
+ *
+ * @param address Pointer to the __int128_t value
+ * @param val Value to add
+ * @return The old value before addition
+ */
+__forceinline__ __device__ __int128_t atomic_add(__int128_t* address, __int128_t val)
+{
+#if __CUDA_ARCH__ >= 900  // Hopper and newer (compute capability 9.0+)
+  // Use native 128-bit atomic support on Hopper+ GPUs
+  return atomicAdd(address, val);
+#else
+  // Fall back to two 64-bit atomic implementation for older GPUs
+
+  // Cast to signed for atomic_cas compatibility and treat as array of two int64_t
+  auto* target_ptr      = reinterpret_cast<int64_t*>(address);
+  auto add_val_unsigned = static_cast<__uint128_t>(val);
+
+  // Split the 128-bit add value into two 64-bit parts
+  int64_t add_low  = static_cast<int64_t>(static_cast<uint64_t>(add_val_unsigned));
+  int64_t add_high = static_cast<int64_t>(static_cast<uint64_t>(add_val_unsigned >> 64));
+
+  int64_t carry = 0;
+  int64_t old_parts[2];
+
+  // Process both 64-bit parts with carry propagation
+  for (int i = 0; i < 2; ++i) {
+    int64_t current_add = (i == 0) ? add_low : add_high;
+    int64_t old_part, new_part, expected_part;
+
+    // Atomic compare-and-swap loop for this 64-bit part
+    do {
+      expected_part = target_ptr[i];
+      new_part      = expected_part + current_add + carry;
+      old_part      = atomicCAS(reinterpret_cast<unsigned long long*>(&target_ptr[i]),
+                           static_cast<unsigned long long>(expected_part),
+                           static_cast<unsigned long long>(new_part));
+      old_part      = static_cast<int64_t>(old_part);
+    } while (old_part != expected_part);
+
+    // Store the old value for result reconstruction
+    old_parts[i] = old_part;
+
+    // Calculate carry for next iteration (convert back to unsigned for carry calculation)
+    carry = static_cast<int64_t>(calculate_carry_64(static_cast<uint64_t>(old_part),
+                                                    static_cast<uint64_t>(current_add),
+                                                    static_cast<uint64_t>(carry)));
+  }
+
+  // Reconstruct the original 128-bit value from the old parts
+  __uint128_t old_val_unsigned =
+    (static_cast<__uint128_t>(static_cast<uint64_t>(old_parts[1])) << 64) |
+    static_cast<uint64_t>(old_parts[0]);
+  return static_cast<__int128_t>(old_val_unsigned);
+#endif
+}
+
 }  // namespace detail
 }  // namespace cudf
diff --git a/cpp/tests/device_atomics/device_atomics_test.cu b/cpp/tests/device_atomics/device_atomics_test.cu
index 2d0079f875b..b74b25c90a8 100644
--- a/cpp/tests/device_atomics/device_atomics_test.cu
+++ b/cpp/tests/device_atomics/device_atomics_test.cu
@@ -23,11 +23,14 @@
 #include <cudf/detail/utilities/device_atomics.cuh>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/error.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/wrappers/timestamps.hpp>
 
-#include <thrust/host_vector.h>
+#include <rmm/device_scalar.hpp>
+
+#include <cuda/std/limits>
 
 #include <algorithm>
 
@@ -261,4 +264,39 @@ TYPED_TEST(AtomicsTest, atomicCASRandom)
   this->atomic_test(input_array, is_cas_test, block_size, grid_size);
 }
 
+__global__ void test_single_atomic_add_kernel(__int128_t* target, __int128_t value)
+{
+  cudf::detail::atomic_add(target, value);
+}
+
+class Atomic128Test : public cudf::test::BaseFixture {
+ public:
+  void run_atomic_add_test(__int128_t initial_value,
+                           __int128_t add_value,
+                           __int128_t expected_result)
+  {
+    rmm::device_scalar<__int128_t> d_target(initial_value, cudf::test::get_default_stream());
+    test_single_atomic_add_kernel<<<32, 256>>>(d_target.data(), add_value);
+    CUDF_CHECK_CUDA(cudf::test::get_default_stream().value());
+    __int128_t result = d_target.value(cudf::test::get_default_stream());
+    EXPECT_EQ(result, expected_result);
+  }
+};
+
+TEST_F(Atomic128Test, BasicAddition) { run_atomic_add_test(0, 1, 32 * 256 * 1); }
+
+TEST_F(Atomic128Test, CarryPropagation)
+{
+  constexpr int total_threads = 32 * 256;
+  __int128_t initial_value    = cuda::std::numeric_limits<uint64_t>::max() - total_threads + 1;
+  __int128_t expected = static_cast<__int128_t>(cuda::std::numeric_limits<uint64_t>::max()) + 1;
+  run_atomic_add_test(initial_value, 1, expected);
+}
+
+TEST_F(Atomic128Test, NegativeNumbers)
+{
+  constexpr int total_threads = 32 * 256;
+  run_atomic_add_test(total_threads * 100, -50, total_threads * 50);
+}
+
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/groupby/sum_with_overflow_tests.cpp b/cpp/tests/groupby/sum_with_overflow_tests.cpp
index d0bef40b479..863fd72d5ca 100644
--- a/cpp/tests/groupby/sum_with_overflow_tests.cpp
+++ b/cpp/tests/groupby/sum_with_overflow_tests.cpp
@@ -30,13 +30,13 @@ using namespace numeric;
 
 using K = int32_t;
 
-// SUM_WITH_OVERFLOW tests - supports signed integer and decimal types (excluding decimal128)
+// SUM_WITH_OVERFLOW tests - supports signed integer and decimal types
 template <typename V>
 struct groupby_sum_with_overflow_test : public cudf::test::BaseFixture {};
 
 using sum_with_overflow_supported_types =
   cudf::test::Concat<cudf::test::Types<int8_t, int16_t, int32_t, int64_t>,
-                     cudf::test::Types<numeric::decimal32, numeric::decimal64>>;
+                     cudf::test::FixedPointTypes>;
 
 TYPED_TEST_SUITE(groupby_sum_with_overflow_test, sum_with_overflow_supported_types);
 
@@ -216,81 +216,75 @@ TYPED_TEST(groupby_sum_with_overflow_test, overflow_detection)
     using namespace numeric;
     using RepType = cudf::device_storage_type_t<V>;
 
-    // Skip only decimal128 since it doesn't have atomic support
-    if constexpr (std::is_same_v<V, numeric::decimal128>) {
-      GTEST_SKIP()
-        << "decimal128 overflow detection test skipped - no atomic support for __int128_t";
-    } else {
-      // Test decimal32 and decimal64 overflow detection
-      auto constexpr scale = scale_type{0};  // Use scale 0 for simplicity
-
-      // Use type-specific values that will cause overflow for decimal types
-      auto constexpr type_max = cuda::std::numeric_limits<RepType>::max();
-      auto constexpr type_min = cuda::std::numeric_limits<RepType>::min();
-
-      cudf::test::fixed_width_column_wrapper<K> keys{1, 2, 3, 4, 1, 2, 2, 1, 3, 3, 2, 4, 4};
-
-      // Create values that will cause overflow for this specific decimal type
-      RepType large_positive  = type_max - 5;  // Close to max
-      RepType small_increment = 10;            // Will cause overflow when added to large_positive
-      RepType large_negative  = type_min + 5;  // Close to min (decimal types are always signed)
-      RepType small_decrement = -10;           // Will cause underflow when added to large_negative
-
-      // Use values that fit within the type range for non-overflowing groups
-      RepType small_val1 = 10;
-      RepType small_val2 = 20;
-      RepType small_val3 = 30;
-      RepType small_val4 = 40;
-
-      cudf::test::fixed_point_column_wrapper<RepType> vals{
-        {large_positive,   // Group 1: Close to max
-         small_val1,       // Group 2: Small value
-         small_val2,       // Group 3: Small value
-         large_negative,   // Group 4: Close to min
-         small_increment,  // Group 1: Will cause positive overflow
-         small_val2,       // Group 2: Small value
-         small_val3,       // Group 2: Small value
-         large_positive,   // Group 1: Close to max (second occurrence)
-         large_positive,   // Group 3: Close to max
-         1,                // Group 3: Small value
-         small_val4,       // Group 2: Small value
-         small_decrement,  // Group 4: Will cause negative overflow
-         large_negative},  // Group 4: Close to min (second occurrence)
-        scale};
+    // Test decimal overflow detection for all decimal types
+    auto constexpr scale = scale_type{0};  // Use scale 0 for simplicity
 
-      cudf::test::fixed_width_column_wrapper<K> expect_keys{1, 2, 3, 4};
+    // Use type-specific values that will cause overflow for decimal types
+    auto constexpr type_max = cuda::std::numeric_limits<RepType>::max();
+    auto constexpr type_min = cuda::std::numeric_limits<RepType>::min();
 
-      // Expected sums (with overflow handled by wrapping)
-      auto overflow_sum_1 = static_cast<RepType>(static_cast<RepType>(large_positive) +
-                                                 static_cast<RepType>(small_increment) +
-                                                 static_cast<RepType>(large_positive));
-      auto normal_sum_2   = static_cast<RepType>(small_val1 + small_val2 + small_val3 + small_val4);
-      auto overflow_sum_3 =
-        static_cast<RepType>(static_cast<RepType>(small_val2) +
-                             static_cast<RepType>(large_positive) + static_cast<RepType>(1));
-      auto overflow_sum_4 = static_cast<RepType>(static_cast<RepType>(large_negative) +
-                                                 static_cast<RepType>(small_decrement) +
-                                                 static_cast<RepType>(large_negative));
-
-      cudf::test::fixed_point_column_wrapper<RepType> expect_sum_vals{
-        {overflow_sum_1, normal_sum_2, overflow_sum_3, overflow_sum_4}, scale};
-      cudf::test::fixed_width_column_wrapper<bool> expect_overflow_vals{true, false, true, true};
+    cudf::test::fixed_width_column_wrapper<K> keys{1, 2, 3, 4, 1, 2, 2, 1, 3, 3, 2, 4, 4};
 
-      std::vector<std::unique_ptr<cudf::column>> children;
-      children.push_back(expect_sum_vals.release());
-      children.push_back(expect_overflow_vals.release());
-      auto expect_vals = cudf::create_structs_hierarchy(4, std::move(children), 0, {});
+    // Create values that will cause overflow for this specific decimal type
+    RepType large_positive  = type_max - 5;  // Close to max
+    RepType small_increment = 10;            // Will cause overflow when added to large_positive
+    RepType large_negative  = type_min + 5;  // Close to min (decimal types are always signed)
+    RepType small_decrement = -10;           // Will cause underflow when added to large_negative
 
-      auto agg = cudf::make_sum_with_overflow_aggregation<cudf::groupby_aggregation>();
-      test_single_agg(keys, vals, expect_keys, *expect_vals, std::move(agg));
+    // Use values that fit within the type range for non-overflowing groups
+    RepType small_val1 = 10;
+    RepType small_val2 = 20;
+    RepType small_val3 = 30;
+    RepType small_val4 = 40;
+
+    cudf::test::fixed_point_column_wrapper<RepType> vals{
+      {large_positive,   // Group 1: Close to max
+       small_val1,       // Group 2: Small value
+       small_val2,       // Group 3: Small value
+       large_negative,   // Group 4: Close to min
+       small_increment,  // Group 1: Will cause positive overflow
+       small_val2,       // Group 2: Small value
+       small_val3,       // Group 2: Small value
+       large_positive,   // Group 1: Close to max (second occurrence)
+       large_positive,   // Group 3: Close to max
+       1,                // Group 3: Small value
+       small_val4,       // Group 2: Small value
+       small_decrement,  // Group 4: Will cause negative overflow
+       large_negative},  // Group 4: Close to min (second occurrence)
+      scale};
+
+    cudf::test::fixed_width_column_wrapper<K> expect_keys{1, 2, 3, 4};
+
+    // Expected sums (with overflow handled by wrapping)
+    auto overflow_sum_1 = static_cast<RepType>(static_cast<RepType>(large_positive) +
+                                               static_cast<RepType>(small_increment) +
+                                               static_cast<RepType>(large_positive));
+    auto normal_sum_2   = static_cast<RepType>(small_val1 + small_val2 + small_val3 + small_val4);
+    auto overflow_sum_3 =
+      static_cast<RepType>(static_cast<RepType>(small_val2) + static_cast<RepType>(large_positive) +
+                           static_cast<RepType>(1));
+    auto overflow_sum_4 = static_cast<RepType>(static_cast<RepType>(large_negative) +
+                                               static_cast<RepType>(small_decrement) +
+                                               static_cast<RepType>(large_negative));
+
+    cudf::test::fixed_point_column_wrapper<RepType> expect_sum_vals{
+      {overflow_sum_1, normal_sum_2, overflow_sum_3, overflow_sum_4}, scale};
+    cudf::test::fixed_width_column_wrapper<bool> expect_overflow_vals{true, false, true, true};
 
-      // Verify that sort-based groupby throws for decimals
-      auto agg2 = cudf::make_sum_with_overflow_aggregation<cudf::groupby_aggregation>();
-      EXPECT_THROW(
-        test_single_agg(
-          keys, vals, expect_keys, *expect_vals, std::move(agg2), force_use_sort_impl::YES),
-        cudf::logic_error);
-    }
+    std::vector<std::unique_ptr<cudf::column>> children;
+    children.push_back(expect_sum_vals.release());
+    children.push_back(expect_overflow_vals.release());
+    auto expect_vals = cudf::create_structs_hierarchy(4, std::move(children), 0, {});
+
+    auto agg = cudf::make_sum_with_overflow_aggregation<cudf::groupby_aggregation>();
+    test_single_agg(keys, vals, expect_keys, *expect_vals, std::move(agg));
+
+    // Verify that sort-based groupby throws for decimals
+    auto agg2 = cudf::make_sum_with_overflow_aggregation<cudf::groupby_aggregation>();
+    EXPECT_THROW(
+      test_single_agg(
+        keys, vals, expect_keys, *expect_vals, std::move(agg2), force_use_sort_impl::YES),
+      cudf::logic_error);
   } else {
     using DeviceType = cudf::device_storage_type_t<V>;
 

From 7ae723ab56ee4861a76d9295dcddf2583a061ac5 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Tue, 21 Oct 2025 15:56:06 -0700
Subject: [PATCH 10/23] Update tests

---
 cpp/tests/device_atomics/device_atomics_test.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/tests/device_atomics/device_atomics_test.cu b/cpp/tests/device_atomics/device_atomics_test.cu
index b74b25c90a8..08e9c2db3d3 100644
--- a/cpp/tests/device_atomics/device_atomics_test.cu
+++ b/cpp/tests/device_atomics/device_atomics_test.cu
@@ -296,7 +296,7 @@ TEST_F(Atomic128Test, CarryPropagation)
 TEST_F(Atomic128Test, NegativeNumbers)
 {
   constexpr int total_threads = 32 * 256;
-  run_atomic_add_test(total_threads * 100, -50, total_threads * 50);
+  run_atomic_add_test(0 - 50, -total_threads * 50);
 }
 
 CUDF_TEST_PROGRAM_MAIN()

From 0429b9e9242498fe7ac211e93cda5caea7fe949e Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Tue, 21 Oct 2025 16:16:09 -0700
Subject: [PATCH 11/23] Cleanups

---
 .../cudf/detail/utilities/device_atomics.cuh  | 57 +++++++------------
 .../device_atomics/device_atomics_test.cu     | 14 ++++-
 2 files changed, 33 insertions(+), 38 deletions(-)

diff --git a/cpp/include/cudf/detail/utilities/device_atomics.cuh b/cpp/include/cudf/detail/utilities/device_atomics.cuh
index 3ec86d9b7a0..2d85da43efe 100644
--- a/cpp/include/cudf/detail/utilities/device_atomics.cuh
+++ b/cpp/include/cudf/detail/utilities/device_atomics.cuh
@@ -39,6 +39,7 @@
 
 #include <cuda/atomic>
 #include <cuda/std/type_traits>
+#include <cuda/utility>
 
 namespace cudf {
 namespace detail {
@@ -376,51 +377,37 @@ __device__ __forceinline__ uint64_t calculate_carry_64(uint64_t old_val,
  */
 __forceinline__ __device__ __int128_t atomic_add(__int128_t* address, __int128_t val)
 {
-#if __CUDA_ARCH__ >= 900  // Hopper and newer (compute capability 9.0+)
-  // Use native 128-bit atomic support on Hopper+ GPUs
+#if __CUDA_ARCH__ >= 900
   return atomicAdd(address, val);
 #else
-  // Fall back to two 64-bit atomic implementation for older GPUs
-
-  // Cast to signed for atomic_cas compatibility and treat as array of two int64_t
-  auto* target_ptr      = reinterpret_cast<int64_t*>(address);
-  auto add_val_unsigned = static_cast<__uint128_t>(val);
+  uint64_t* const target_ptr         = reinterpret_cast<uint64_t*>(address);
+  __uint128_t const add_val_unsigned = static_cast<__uint128_t>(val);
 
   // Split the 128-bit add value into two 64-bit parts
-  int64_t add_low  = static_cast<int64_t>(static_cast<uint64_t>(add_val_unsigned));
-  int64_t add_high = static_cast<int64_t>(static_cast<uint64_t>(add_val_unsigned >> 64));
+  uint64_t const add_low  = static_cast<uint64_t>(add_val_unsigned);
+  uint64_t const add_high = static_cast<uint64_t>(add_val_unsigned >> 64);
+
+  uint64_t carry = 0;
+  uint64_t old_parts[2];
 
-  int64_t carry = 0;
-  int64_t old_parts[2];
+  cuda::static_for<0, 2>([&](auto i) {
+    uint64_t const current_add = (i == 0) ? add_low : add_high;
+    uint64_t expected_part, new_part;
 
-  // Process both 64-bit parts with carry propagation
-  for (int i = 0; i < 2; ++i) {
-    int64_t current_add = (i == 0) ? add_low : add_high;
-    int64_t old_part, new_part, expected_part;
+    cuda::atomic_ref<uint64_t, cuda::thread_scope_device> atomic_part{target_ptr[i]};
 
-    // Atomic compare-and-swap loop for this 64-bit part
     do {
-      expected_part = target_ptr[i];
+      expected_part = atomic_part.load();
       new_part      = expected_part + current_add + carry;
-      old_part      = atomicCAS(reinterpret_cast<unsigned long long*>(&target_ptr[i]),
-                           static_cast<unsigned long long>(expected_part),
-                           static_cast<unsigned long long>(new_part));
-      old_part      = static_cast<int64_t>(old_part);
-    } while (old_part != expected_part);
-
-    // Store the old value for result reconstruction
-    old_parts[i] = old_part;
-
-    // Calculate carry for next iteration (convert back to unsigned for carry calculation)
-    carry = static_cast<int64_t>(calculate_carry_64(static_cast<uint64_t>(old_part),
-                                                    static_cast<uint64_t>(current_add),
-                                                    static_cast<uint64_t>(carry)));
-  }
+    } while (
+      !atomic_part.compare_exchange_weak(expected_part, new_part, cuda::memory_order_relaxed));
+
+    old_parts[i] = expected_part;
+    carry        = calculate_carry_64(expected_part, current_add, carry);
+  });
 
-  // Reconstruct the original 128-bit value from the old parts
-  __uint128_t old_val_unsigned =
-    (static_cast<__uint128_t>(static_cast<uint64_t>(old_parts[1])) << 64) |
-    static_cast<uint64_t>(old_parts[0]);
+  __uint128_t const old_val_unsigned =
+    (static_cast<__uint128_t>(old_parts[1]) << 64) | old_parts[0];
   return static_cast<__int128_t>(old_val_unsigned);
 #endif
 }
diff --git a/cpp/tests/device_atomics/device_atomics_test.cu b/cpp/tests/device_atomics/device_atomics_test.cu
index 08e9c2db3d3..e517e3bfa25 100644
--- a/cpp/tests/device_atomics/device_atomics_test.cu
+++ b/cpp/tests/device_atomics/device_atomics_test.cu
@@ -288,15 +288,23 @@ TEST_F(Atomic128Test, BasicAddition) { run_atomic_add_test(0, 1, 32 * 256 * 1);
 TEST_F(Atomic128Test, CarryPropagation)
 {
   constexpr int total_threads = 32 * 256;
-  __int128_t initial_value    = cuda::std::numeric_limits<uint64_t>::max() - total_threads + 1;
-  __int128_t expected = static_cast<__int128_t>(cuda::std::numeric_limits<uint64_t>::max()) + 1;
+  __int128_t initial_value    = cuda::std::numeric_limits<int64_t>::max() - total_threads + 1;
+  __int128_t expected = static_cast<__int128_t>(cuda::std::numeric_limits<int64_t>::max()) + 1;
   run_atomic_add_test(initial_value, 1, expected);
 }
 
 TEST_F(Atomic128Test, NegativeNumbers)
 {
   constexpr int total_threads = 32 * 256;
-  run_atomic_add_test(0 - 50, -total_threads * 50);
+  run_atomic_add_test(0, -50, -total_threads * 50);
+}
+
+TEST_F(Atomic128Test, NegativeCarryPropagation)
+{
+  constexpr int total_threads = 32 * 256;
+  __int128_t initial_value    = cuda::std::numeric_limits<int64_t>::min() + total_threads - 1;
+  __int128_t expected = static_cast<__int128_t>(cuda::std::numeric_limits<int64_t>::min()) - 1;
+  run_atomic_add_test(initial_value, -1, expected);
 }
 
 CUDF_TEST_PROGRAM_MAIN()

From 7a9dc29e09f7cf5c2985a6609583e055d442c9bc Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Tue, 21 Oct 2025 16:19:47 -0700
Subject: [PATCH 12/23] Minor fix

---
 cpp/include/cudf/detail/aggregation/aggregation.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp
index e13234d80ab..51c31d259a3 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.hpp
+++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp
@@ -1419,7 +1419,7 @@ struct target_type_impl<Source,
 // SUM_WITH_OVERFLOW outputs a struct {sum: Source, overflow: bool} where sum type matches input
 // type Only supports signed integral types (excluding bool) and decimal types
 template <typename Source>
-  requires(cudf::is_integral_not_bool<Source>() && cudf::is_signed<Source>() ||
+  requires((cudf::is_integral_not_bool<Source>() && cudf::is_signed<Source>()) ||
            cudf::is_fixed_point<Source>())
 struct target_type_impl<Source, aggregation::SUM_WITH_OVERFLOW> {
   using type = struct_view;  // SUM_WITH_OVERFLOW outputs a struct with sum and overflow fields

From 2710856bdf91319ce6b93b903b4e7a48e9b24d51 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Tue, 21 Oct 2025 16:58:01 -0700
Subject: [PATCH 13/23] Fix Hopper atomicAdd

---
 .../cudf/detail/utilities/device_atomics.cuh    | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/cpp/include/cudf/detail/utilities/device_atomics.cuh b/cpp/include/cudf/detail/utilities/device_atomics.cuh
index 2d85da43efe..d9dd163da9e 100644
--- a/cpp/include/cudf/detail/utilities/device_atomics.cuh
+++ b/cpp/include/cudf/detail/utilities/device_atomics.cuh
@@ -365,11 +365,11 @@ __device__ __forceinline__ uint64_t calculate_carry_64(uint64_t old_val,
 }
 
 /**
- * @brief Atomic addition for __int128_t with GPU architecture optimization
+ * @brief Atomic addition for __int128_t with architecture-specific optimization
  *
- * Uses native atomicAdd on Hopper+ GPUs (compute capability 9.0+) where 128-bit
- * atomic operations are supported. Falls back to two 64-bit atomic CAS operations
- * with carry propagation on older GPU architectures.
+ * Uses native 128-bit CAS on Hopper+ GPUs (compute capability 9.0+) for optimal
+ * performance. Falls back to two 64-bit atomic CAS operations with carry propagation
+ * on older GPU architectures.
  *
  * @param address Pointer to the __int128_t value
  * @param val Value to add
@@ -378,7 +378,14 @@ __device__ __forceinline__ uint64_t calculate_carry_64(uint64_t old_val,
 __forceinline__ __device__ __int128_t atomic_add(__int128_t* address, __int128_t val)
 {
 #if __CUDA_ARCH__ >= 900
-  return atomicAdd(address, val);
+  __int128_t expected, desired;
+
+  do {
+    expected = *address;
+    desired  = expected + val;
+  } while (atomicCAS(address, expected, desired) != expected);
+
+  return expected;
 #else
   uint64_t* const target_ptr         = reinterpret_cast<uint64_t*>(address);
   __uint128_t const add_val_unsigned = static_cast<__uint128_t>(val);

From 7c0f1dac52213c150dde35c1f191212bacdd86fd Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Tue, 21 Oct 2025 17:34:24 -0700
Subject: [PATCH 14/23] Cleanups

---
 .../detail/aggregation/device_aggregators.cuh | 46 ++-----------------
 1 file changed, 4 insertions(+), 42 deletions(-)

diff --git a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh
index 9b29d69004a..550eecac875 100644
--- a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh
+++ b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh
@@ -140,9 +140,10 @@ struct update_target_element<Source, aggregation::SUM> {
 };
 
 template <typename Source>
-  requires((cudf::is_integral_not_bool<Source>() && cudf::is_signed<Source>()) ||
-           cudf::is_fixed_point<Source>() &&
-             cudf::has_atomic_support<device_storage_type_t<Source>>())
+  requires(
+    (cudf::is_integral_not_bool<Source>() && cudf::is_signed<Source>()) ||
+    (cudf::is_fixed_point<Source>() && cudf::has_atomic_support<device_storage_type_t<Source>>()) ||
+    cuda::std::is_same_v<Source, numeric::decimal128>)
 struct update_target_element<Source, aggregation::SUM_WITH_OVERFLOW> {
   using DeviceType               = device_storage_type_t<Source>;
   static constexpr auto type_max = cuda::std::numeric_limits<DeviceType>::max();
@@ -177,45 +178,6 @@ struct update_target_element<Source, aggregation::SUM_WITH_OVERFLOW> {
   }
 };
 
-/**
- * @brief Specialization of update_target_element for SUM_WITH_OVERFLOW with decimal128
- *
- * Uses two-64-bit atomic operations to provide atomic addition for __int128_t.
- * This enables SUM_WITH_OVERFLOW for decimal128 on all GPU architectures.
- */
-template <>
-struct update_target_element<numeric::decimal128, aggregation::SUM_WITH_OVERFLOW> {
-  using DeviceType               = device_storage_type_t<numeric::decimal128>;  // __int128_t
-  static constexpr auto type_max = cuda::std::numeric_limits<DeviceType>::max();
-  static constexpr auto type_min = cuda::std::numeric_limits<DeviceType>::min();
-
-  __device__ void operator()(mutable_column_device_view target,
-                             size_type target_index,
-                             column_device_view source,
-                             size_type source_index) const noexcept
-  {
-    auto sum_column      = target.child(0);
-    auto overflow_column = target.child(1);
-
-    auto const source_value = source.element<DeviceType>(source_index);
-
-    // Use the new 128-bit atomic addition implementation
-    auto const old_sum =
-      cudf::detail::atomic_add(&sum_column.element<DeviceType>(target_index), source_value);
-
-    // Early exit if overflow is already set
-    auto bool_ref = cuda::atomic_ref<bool, cuda::thread_scope_device>{
-      *(overflow_column.data<bool>() + target_index)};
-    if (bool_ref.load(cuda::memory_order_relaxed)) { return; }
-
-    // Check for overflow using the improved logic
-    auto const overflow = ((old_sum > 0 && source_value > 0 && old_sum > type_max - source_value) ||
-                           (old_sum < 0 && source_value < 0 && old_sum < type_min - source_value));
-
-    if (overflow) { cudf::detail::atomic_max(&overflow_column.element<bool>(target_index), true); }
-  }
-};
-
 /**
  * @brief Function object to update a single element in a target column using
  * the dictionary key addressed by the specific index.

From 60bde43186f928aa5682c510499d06000fd407f4 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Tue, 21 Oct 2025 19:13:01 -0700
Subject: [PATCH 15/23] Fix the stream issue in tests

---
 cpp/tests/device_atomics/device_atomics_test.cu | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/cpp/tests/device_atomics/device_atomics_test.cu b/cpp/tests/device_atomics/device_atomics_test.cu
index e517e3bfa25..fa8b5f7f01b 100644
--- a/cpp/tests/device_atomics/device_atomics_test.cu
+++ b/cpp/tests/device_atomics/device_atomics_test.cu
@@ -275,10 +275,11 @@ class Atomic128Test : public cudf::test::BaseFixture {
                            __int128_t add_value,
                            __int128_t expected_result)
   {
-    rmm::device_scalar<__int128_t> d_target(initial_value, cudf::test::get_default_stream());
-    test_single_atomic_add_kernel<<<32, 256>>>(d_target.data(), add_value);
-    CUDF_CHECK_CUDA(cudf::test::get_default_stream().value());
-    __int128_t result = d_target.value(cudf::test::get_default_stream());
+    rmm::device_scalar<__int128_t> d_target(initial_value, cudf::get_default_stream());
+    test_single_atomic_add_kernel<<<32, 256, 0, cudf::get_default_stream().value()>>>(
+      d_target.data(), add_value);
+    CUDF_CHECK_CUDA(cudf::get_default_stream().value());
+    __int128_t result = d_target.value(cudf::get_default_stream());
     EXPECT_EQ(result, expected_result);
   }
 };

From 64fe2593f4fecc25e6f1d2c8e3523353a855b93d Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Wed, 22 Oct 2025 11:55:38 -0700
Subject: [PATCH 16/23] Minor cleanups

---
 cpp/include/cudf/detail/aggregation/device_aggregators.cuh | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh
index 550eecac875..8b3723b457e 100644
--- a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh
+++ b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh
@@ -166,13 +166,10 @@ struct update_target_element<Source, aggregation::SUM_WITH_OVERFLOW> {
       *(overflow_column.data<bool>() + target_index)};
     if (bool_ref.load(cuda::memory_order_relaxed)) { return; }
 
-    // Check for overflow before performing the addition to avoid UB
-    // For positive overflow: old_sum > 0, source_value > 0, and old_sum > max - source_value
-    // For negative overflow: old_sum < 0, source_value < 0, and old_sum < min - source_value
     // TODO: to be replaced by CCCL equivalents once https://github.com/NVIDIA/cccl/pull/3755 is
     // ready
-    auto const overflow = ((old_sum > 0 && source_value > 0 && old_sum > type_max - source_value) ||
-                           (old_sum < 0 && source_value < 0 && old_sum < type_min - source_value));
+    auto const overflow =
+      source_value > 0 ? old_sum > type_max - source_value : old_sum < type_min - source_value;
 
     if (overflow) { cudf::detail::atomic_max(&overflow_column.element<bool>(target_index), true); }
   }

From a11afbe2889872ffc326d008e981e3e2bffaa574 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Wed, 22 Oct 2025 14:36:18 -0700
Subject: [PATCH 17/23] Apply suggestion from @davidwendt

Co-authored-by: David Wendt <45795991+davidwendt@users.noreply.github.com>
---
 cpp/include/cudf/detail/utilities/device_atomics.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/cudf/detail/utilities/device_atomics.cuh b/cpp/include/cudf/detail/utilities/device_atomics.cuh
index d9dd163da9e..99f6aaa7472 100644
--- a/cpp/include/cudf/detail/utilities/device_atomics.cuh
+++ b/cpp/include/cudf/detail/utilities/device_atomics.cuh
@@ -361,7 +361,7 @@ __device__ __forceinline__ uint64_t calculate_carry_64(uint64_t old_val,
 {
   // Use __uint128_t to detect overflow beyond 64-bit range
   __uint128_t sum = static_cast<__uint128_t>(old_val) + add_val + carry_in;
-  return (sum > cuda::std::numeric_limits<uint64_t>::max()) ? 1 : 0;
+  return sum > cuda::std::numeric_limits<uint64_t>::max();
 }
 
 /**

From d520a28bfca6a1b0c802c5943c29780d4fa29cd3 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Wed, 22 Oct 2025 14:36:35 -0700
Subject: [PATCH 18/23] Apply suggestion from @davidwendt

Co-authored-by: David Wendt <45795991+davidwendt@users.noreply.github.com>
---
 cpp/src/aggregation/aggregation.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/aggregation/aggregation.cu b/cpp/src/aggregation/aggregation.cu
index 271ba926159..7be37125dc9 100644
--- a/cpp/src/aggregation/aggregation.cu
+++ b/cpp/src/aggregation/aggregation.cu
@@ -103,7 +103,7 @@ struct identity_initializer {
       // Initialize sum column using standard SUM aggregation dispatch
       dispatch_type_and_aggregation(
         sum_col.type(), aggregation::SUM, identity_initializer{}, sum_col, stream);
-      thrust::fill_n(
+      thrust::uninitialized_fill_n(
         rmm::exec_policy_nosync(stream), overflow_col.begin<bool>(), col.size(), false);
     } else if constexpr (std::is_same_v<T, cudf::struct_view>) {
       // This should only happen for SUM_WITH_OVERFLOW, but handle it just in case

From 0748f55c0653e5ec02d2eb7e2dc66bc297fd80a4 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Thu, 23 Oct 2025 10:02:33 -0700
Subject: [PATCH 19/23] Clean up tests

---
 cpp/tests/groupby/sum_with_overflow_tests.cpp | 40 +++----------------
 1 file changed, 6 insertions(+), 34 deletions(-)

diff --git a/cpp/tests/groupby/sum_with_overflow_tests.cpp b/cpp/tests/groupby/sum_with_overflow_tests.cpp
index 863fd72d5ca..9009d2f803e 100644
--- a/cpp/tests/groupby/sum_with_overflow_tests.cpp
+++ b/cpp/tests/groupby/sum_with_overflow_tests.cpp
@@ -28,8 +28,6 @@
 using namespace cudf::test::iterators;
 using namespace numeric;
 
-using K = int32_t;
-
 // SUM_WITH_OVERFLOW tests - supports signed integer and decimal types
 template <typename V>
 struct groupby_sum_with_overflow_test : public cudf::test::BaseFixture {};
@@ -42,6 +40,7 @@ TYPED_TEST_SUITE(groupby_sum_with_overflow_test, sum_with_overflow_supported_typ
 
 TYPED_TEST(groupby_sum_with_overflow_test, basic)
 {
+  using K = int32_t;
   using V = TypeParam;
 
   cudf::test::fixed_width_column_wrapper<K> keys{1, 2, 3, 1, 2, 2, 1, 3, 3, 2};
@@ -100,6 +99,7 @@ TYPED_TEST(groupby_sum_with_overflow_test, basic)
 
 TYPED_TEST(groupby_sum_with_overflow_test, empty_cols)
 {
+  using K = int32_t;
   using V = TypeParam;
 
   cudf::test::fixed_width_column_wrapper<K> keys{};
@@ -124,6 +124,7 @@ TYPED_TEST(groupby_sum_with_overflow_test, empty_cols)
 
 TYPED_TEST(groupby_sum_with_overflow_test, zero_valid_keys)
 {
+  using K = int32_t;
   using V = TypeParam;
 
   cudf::test::fixed_width_column_wrapper<K> keys({1, 2, 3}, cudf::test::iterators::all_nulls());
@@ -148,6 +149,7 @@ TYPED_TEST(groupby_sum_with_overflow_test, zero_valid_keys)
 
 TYPED_TEST(groupby_sum_with_overflow_test, zero_valid_values)
 {
+  using K = int32_t;
   using V = TypeParam;
 
   cudf::test::fixed_width_column_wrapper<K> keys{1, 1, 1};
@@ -176,6 +178,7 @@ TYPED_TEST(groupby_sum_with_overflow_test, zero_valid_values)
 
 TYPED_TEST(groupby_sum_with_overflow_test, null_keys_and_values)
 {
+  using K = int32_t;
   using V = TypeParam;
 
   cudf::test::fixed_width_column_wrapper<K> keys(
@@ -210,6 +213,7 @@ TYPED_TEST(groupby_sum_with_overflow_test, null_keys_and_values)
 
 TYPED_TEST(groupby_sum_with_overflow_test, overflow_detection)
 {
+  using K = int32_t;
   using V = TypeParam;
 
   if constexpr (cudf::is_fixed_point<V>()) {
@@ -391,38 +395,6 @@ TYPED_TEST(groupby_sum_with_overflow_test, overflow_detection)
   }  // end else block for non-decimal types
 }
 
-// Test that SUM_WITH_OVERFLOW works with int32_t (which used to be unsupported)
-TEST(groupby_sum_with_overflow_compatibility_test, int32_support)
-{
-  using K = int32_t;
-  using V = int32_t;  // int32_t is now supported by SUM_WITH_OVERFLOW
-
-  cudf::test::fixed_width_column_wrapper<K> keys{1, 1, 1, 2, 2, 2, 3, 3, 3};
-  cudf::test::fixed_width_column_wrapper<V> vals{1, 2, 3, 4, 5, 6, 7, 8, 9};
-
-  cudf::test::fixed_width_column_wrapper<K> expect_keys{1, 2, 3};
-
-  // Create expected struct column with sum and overflow children
-  auto sum_col      = cudf::test::fixed_width_column_wrapper<V>{6, 15, 24};
-  auto overflow_col = cudf::test::fixed_width_column_wrapper<bool>{false, false, false};
-  std::vector<std::unique_ptr<cudf::column>> children;
-  children.push_back(sum_col.release());
-  children.push_back(overflow_col.release());
-  auto expect_vals = cudf::create_structs_hierarchy(3, std::move(children), 0, {});
-
-  auto agg = cudf::make_sum_with_overflow_aggregation<cudf::groupby_aggregation>();
-
-  // SUM_WITH_OVERFLOW should now work with int32_t value types
-  test_single_agg(keys, vals, expect_keys, *expect_vals, std::move(agg), force_use_sort_impl::NO);
-
-  // SUM_WITH_OVERFLOW should throw with sort-based groupby
-  auto agg_sort = cudf::make_sum_with_overflow_aggregation<cudf::groupby_aggregation>();
-  EXPECT_THROW(
-    test_single_agg(
-      keys, vals, expect_keys, *expect_vals, std::move(agg_sort), force_use_sort_impl::YES),
-    cudf::logic_error);
-}
-
 // Test that SUM_WITH_OVERFLOW throws an error for bool type (which is not supported)
 TEST(groupby_sum_with_overflow_error_test, bool_type_not_supported)
 {

From 2f154ab587c52b897ba9f8bca94951a9fb76bf58 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Thu, 23 Oct 2025 12:01:24 -0700
Subject: [PATCH 20/23] Header cleanups

---
 cpp/tests/groupby/sum_with_overflow_tests.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/cpp/tests/groupby/sum_with_overflow_tests.cpp b/cpp/tests/groupby/sum_with_overflow_tests.cpp
index 9009d2f803e..87686429e02 100644
--- a/cpp/tests/groupby/sum_with_overflow_tests.cpp
+++ b/cpp/tests/groupby/sum_with_overflow_tests.cpp
@@ -21,8 +21,11 @@
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
+#include <cudf/aggregation.hpp>
 #include <cudf/column/column_factories.hpp>
-#include <cudf/detail/aggregation/aggregation.hpp>
+#include <cudf/fixed_point/fixed_point.hpp>
+#include <cudf/groupby.hpp>
+#include <cudf/table/table_view.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 using namespace cudf::test::iterators;

From da628af6bc85da0a5942dc8d569dd74b5a9566d9 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Mon, 27 Oct 2025 11:41:57 -0700
Subject: [PATCH 21/23] Update copyrights

---
 cpp/tests/groupby/sum_with_overflow_tests.cpp | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/cpp/tests/groupby/sum_with_overflow_tests.cpp b/cpp/tests/groupby/sum_with_overflow_tests.cpp
index 87686429e02..a320f4c1f64 100644
--- a/cpp/tests/groupby/sum_with_overflow_tests.cpp
+++ b/cpp/tests/groupby/sum_with_overflow_tests.cpp
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
  */
 
 #include <tests/groupby/groupby_test_util.hpp>

From 3856f1d596a5a083cd973e1adff5a94a07517cdc Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Mon, 3 Nov 2025 09:45:16 -0800
Subject: [PATCH 22/23] Update comment

---
 cpp/src/groupby/hash/output_utils.cu | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/cpp/src/groupby/hash/output_utils.cu b/cpp/src/groupby/hash/output_utils.cu
index 6029134ab94..20adec1b0f2 100644
--- a/cpp/src/groupby/hash/output_utils.cu
+++ b/cpp/src/groupby/hash/output_utils.cu
@@ -63,8 +63,7 @@ struct result_column_creator {
 
     auto make_children = [&make_empty_column, col_type = col.type()](size_type size) {
       std::vector<std::unique_ptr<column>> children;
-      // Create sum child column (int64_t) - no null mask needed, struct-level mask handles
-      // nullability
+      // Create sum child column - no null mask needed, struct-level mask handles nullability
       children.push_back(make_empty_column(col_type.id(), size, mask_state::UNALLOCATED));
       // Create overflow child column (bool) - no null mask needed, only value matters
       children.push_back(make_empty_column(type_id::BOOL8, size, mask_state::UNALLOCATED));

From 4372f92c9bf44d1923585a3538e78c576e70cf1f Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Mon, 3 Nov 2025 14:05:01 -0800
Subject: [PATCH 23/23] Update
 cpp/include/cudf/detail/aggregation/aggregation.hpp

Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com>
---
 cpp/include/cudf/detail/aggregation/aggregation.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp
index 6ef51f4af95..24832238981 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.hpp
+++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp
@@ -1406,7 +1406,7 @@ struct target_type_impl<Source,
 };
 
 // SUM_WITH_OVERFLOW outputs a struct {sum: Source, overflow: bool} where sum type matches input
-// type Only supports signed integral types (excluding bool) and decimal types
+// type, only supports signed integral types (excluding bool) and decimal types
 template <typename Source>
   requires((cudf::is_integral_not_bool<Source>() && cudf::is_signed<Source>()) ||
            cudf::is_fixed_point<Source>())