Skip to content

Optimizations for tdigest generation. #19140

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 14 commits into
base: branch-25.08
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ ConfigureBench(QUANTILES_BENCH quantiles/quantiles.cpp)
# ##################################################################################################
# * tdigest benchmark
# --------------------------------------------------------------------------------
ConfigureNVBench(TDIGEST_NVBENCH quantiles/tdigest.cu)
ConfigureNVBench(TDIGEST_NVBENCH quantiles/tdigest.cpp)

# ##################################################################################################
# * type_dispatcher benchmark ---------------------------------------------------------------------
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -17,14 +17,9 @@
#include <cudf_test/column_wrapper.hpp>

#include <cudf/detail/tdigest/tdigest.hpp>
#include <cudf/filling.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <rmm/exec_policy.hpp>

#include <cuda/functional>
#include <thrust/copy.h>
#include <thrust/execution_policy.h>

#include <nvbench/nvbench.cuh>

void bm_tdigest_merge(nvbench::state& state)
Expand Down Expand Up @@ -77,47 +72,113 @@ void bm_tdigest_merge(nvbench::state& state)
tdigest_children.push_back(maxes.release());
cudf::test::structs_column_wrapper tdigest(std::move(tdigest_children));

rmm::device_uvector<cudf::size_type> group_offsets(num_groups + 1, stream, mr);
rmm::device_uvector<cudf::size_type> group_labels(num_tdigests, stream, mr);
auto group_offset_iter = cudf::detail::make_counting_transform_iterator(
0,
cuda::proclaim_return_type<cudf::size_type>(
[tdigests_per_group] __device__(cudf::size_type i) { return i * tdigests_per_group; }));
thrust::copy(rmm::exec_policy_nosync(stream, mr),
group_offset_iter,
group_offset_iter + num_groups + 1,
group_offsets.begin());
auto group_label_iter = cudf::detail::make_counting_transform_iterator(
0,
cuda::proclaim_return_type<cudf::size_type>(
[tdigests_per_group] __device__(cudf::size_type i) { return i / tdigests_per_group; }));
thrust::copy(rmm::exec_policy_nosync(stream, mr),
group_label_iter,
group_label_iter + num_tdigests,
group_labels.begin());
// group offsets, labels
auto zero = cudf::numeric_scalar<cudf::size_type>(0);
auto indices = cudf::sequence(num_tdigests, zero);
auto tpg_scalar = cudf::numeric_scalar<cudf::size_type>(tdigests_per_group);

auto group_offsets = cudf::sequence(num_groups + 1, zero, tpg_scalar, stream, mr);
// expand 0, 1, 2, 3, 4, into 0, 0, 0, 1, 1, 1, 2, 2, 2, etc
auto group_labels = std::move(
cudf::repeat(cudf::table_view({cudf::slice(indices->view(), {0, num_groups}).front()}),
tdigests_per_group,
stream,
mr)
->release()
.front());

stream.synchronize();

state.add_element_count(total_centroids);

state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
state.exec(nvbench::exec_tag::timer | nvbench::exec_tag::sync,
[&](nvbench::launch& launch, auto& timer) {
timer.start();
auto result = cudf::tdigest::detail::group_merge_tdigest(
tdigest, group_offsets, group_labels, num_groups, max_centroids, stream, mr);
timer.stop();
});
state.exec(
nvbench::exec_tag::timer | nvbench::exec_tag::sync, [&](nvbench::launch& launch, auto& timer) {
timer.start();
auto result = cudf::tdigest::detail::group_merge_tdigest(
tdigest,
{group_offsets->view().begin<cudf::size_type>(),
static_cast<size_t>(group_offsets->size())},
{group_labels->view().begin<cudf::size_type>(), static_cast<size_t>(group_labels->size())},
Comment on lines +100 to +102
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe `column_view` has a conversion operator to `device_span`, so this may work:

Suggested change
{group_offsets->view().begin<cudf::size_type>(),
static_cast<size_t>(group_offsets->size())},
{group_labels->view().begin<cudf::size_type>(), static_cast<size_t>(group_labels->size())},
group_offsets->view(),
group_labels->view(),

num_groups,
max_centroids,
stream,
mr);
timer.stop();
});
}

/**
 * @brief Benchmark body that times `cudf::tdigest::detail::group_tdigest`.
 *
 * Reads the `rows_per_group`, `num_groups` and `max_centroids` axes from `state`, builds
 * `rows_per_group * num_groups` input values together with the matching group offsets, group
 * labels and per-group valid counts, and then measures one `group_tdigest` call per timed run.
 *
 * @param state nvbench state that supplies the axis values and drives the timed execution
 */
void bm_tdigest_reduce(nvbench::state& state)
{
  auto const rows_per_group = static_cast<cudf::size_type>(state.get_int64("rows_per_group"));
  auto const num_groups     = static_cast<cudf::size_type>(state.get_int64("num_groups"));
  auto const max_centroids  = static_cast<cudf::size_type>(state.get_int64("max_centroids"));
  auto const num_rows       = rows_per_group * num_groups;

  auto stream = cudf::get_default_stream();
  auto mr     = rmm::mr::get_current_device_resource();

  // Input values: a sequence of num_rows elements starting at 0.
  auto start_at_zero = cudf::numeric_scalar<cudf::size_type>(0);
  auto values        = cudf::sequence(num_rows, start_at_zero);

  // Group offsets: num_groups + 1 entries starting at 0 and stepping by rows_per_group.
  auto group_size = cudf::numeric_scalar<cudf::size_type>(rows_per_group);
  auto offsets    = cudf::sequence(num_groups + 1, start_at_zero, group_size, stream, mr);

  // Group labels: reuse the first num_groups input values (0, 1, 2, ...) as group ids and
  // repeat each one rows_per_group times, i.e. 0, 0, 0, 1, 1, 1, 2, 2, 2, ...
  auto const group_ids = cudf::slice(values->view(), {0, num_groups}).front();
  auto repeated_ids    = cudf::repeat(cudf::table_view({group_ids}), rows_per_group, stream, mr);
  auto repeated_cols   = repeated_ids->release();
  auto labels          = std::move(repeated_cols.front());

  // Valid counts: num_groups entries starting at rows_per_group with a step of 0.
  auto valid_counts = cudf::sequence(num_groups, group_size, start_at_zero);

  // Wait for the setup work queued on the stream before any timing starts.
  stream.synchronize();

  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
  state.exec(
    nvbench::exec_tag::timer | nvbench::exec_tag::sync, [&](nvbench::launch&, auto& timer) {
      timer.start();
      // Each braced {pointer, length} pair builds the span argument expected by group_tdigest.
      auto result = cudf::tdigest::detail::group_tdigest(
        *values,
        {offsets->view().begin<cudf::size_type>(), static_cast<size_t>(offsets->size())},
        {labels->view().begin<cudf::size_type>(), static_cast<size_t>(labels->size())},
        {valid_counts->view().begin<cudf::size_type>(),
         static_cast<size_t>(valid_counts->size())},
        num_groups,
        max_centroids,
        stream,
        mr);
      timer.stop();
    });
}

NVBENCH_BENCH(bm_tdigest_merge)
.set_name("TDigest many tiny groups")
.set_name("TDigest merge many tiny groups")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using these names on a command-line to select a specific benchmark may be cumbersome.
Can we shorten them a bit and remove the spaces?

merge-many-tiny
merge-many-small
reduce-many-small
reduce-few-large

The `TDigest` prefix is redundant since the benchmark executable's name already includes it.

.add_int64_axis("num_tdigests", {500'000})
.add_int64_axis("tdigest_size", {1, 1000})
.add_int64_axis("tdigests_per_group", {1})
.add_int64_axis("max_centroids", {10000, 1000});

NVBENCH_BENCH(bm_tdigest_merge)
.set_name("TDigest many small groups")
.set_name("TDigest merge many small groups")
.add_int64_axis("num_tdigests", {500'000})
.add_int64_axis("tdigest_size", {1, 1000})
.add_int64_axis("tdigests_per_group", {3})
.add_int64_axis("max_centroids", {10000, 1000});

// Registers bm_tdigest_reduce as "TDigest reduce many small groups": a fixed 2000 groups,
// sweeping rows_per_group over {1, 32, 100} and max_centroids over {10000, 1000}.
NVBENCH_BENCH(bm_tdigest_reduce)
.set_name("TDigest reduce many small groups")
.add_int64_axis("num_groups", {2000})
.add_int64_axis("rows_per_group", {1, 32, 100})
.add_int64_axis("max_centroids", {10000, 1000});

// Registers bm_tdigest_reduce as "TDigest reduce few large groups": num_groups swept over
// {1, 16, 64}, rows_per_group over {5'000'000, 1'000'000} and max_centroids over {10000, 1000}.
NVBENCH_BENCH(bm_tdigest_reduce)
.set_name("TDigest reduce few large groups")
.add_int64_axis("num_groups", {1, 16, 64})
.add_int64_axis("rows_per_group", {5'000'000, 1'000'000})
.add_int64_axis("max_centroids", {10000, 1000});
5 changes: 4 additions & 1 deletion cpp/include/cudf/detail/tdigest/tdigest.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
* Copyright (c) 2021-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -27,6 +27,9 @@
namespace CUDF_EXPORT cudf {
namespace tdigest::detail {

// For testing purposes only. Declared here and defined in another translation unit.
// NOTE(review): judging by the name, setting this to true presumably turns off a CPU-side
// cluster computation path in tdigest generation -- confirm against the definition.
extern bool disable_cpu_cluster_computation;

/**
* @brief Generate a tdigest column from a grouped, sorted set of numeric input values.
*
Expand Down
Loading