Skip to content

Commit 13c898c

Browse files
authored
GH-45676: [C++][Python][Compute] Add skew and kurtosis functions (#45677)
### What changes are included in this PR? * Add "skew" and "kurtosis" scalar aggregate functions * Add "hash_skew" and "hash_kurtosis" grouped aggregated functions * Add support for decimal32/decimal64 to "variance", "stddev", "hash_variance" and "hash_stddev" * Small optimization for "hash_variance" and "hash_stddev": avoid allocating the identity group id mapping when consuming an ExecBatch ### Are these changes tested? Yes, by included unit tests. ### Are there any user-facing changes? No, except new APIs. * GitHub Issue: #45676 Authored-by: Antoine Pitrou <[email protected]> Signed-off-by: Antoine Pitrou <[email protected]>
1 parent 8332cbf commit 13c898c

17 files changed

+1120
-520
lines changed

cpp/src/arrow/acero/hash_aggregate_test.cc

+189-190
Large diffs are not rendered by default.

cpp/src/arrow/compute/api_aggregate.cc

+17
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,9 @@ static auto kVarianceOptionsType = GetFunctionOptionsType<VarianceOptions>(
109109
DataMember("ddof", &VarianceOptions::ddof),
110110
DataMember("skip_nulls", &VarianceOptions::skip_nulls),
111111
DataMember("min_count", &VarianceOptions::min_count));
112+
static auto kSkewOptionsType = GetFunctionOptionsType<SkewOptions>(
113+
DataMember("skip_nulls", &SkewOptions::skip_nulls),
114+
DataMember("min_count", &SkewOptions::min_count));
112115
static auto kQuantileOptionsType = GetFunctionOptionsType<QuantileOptions>(
113116
DataMember("q", &QuantileOptions::q),
114117
DataMember("interpolation", &QuantileOptions::interpolation),
@@ -151,6 +154,11 @@ VarianceOptions::VarianceOptions(int ddof, bool skip_nulls, uint32_t min_count)
151154
min_count(min_count) {}
152155
constexpr char VarianceOptions::kTypeName[];
153156

157+
SkewOptions::SkewOptions(bool skip_nulls, uint32_t min_count)
158+
: FunctionOptions(internal::kSkewOptionsType),
159+
skip_nulls(skip_nulls),
160+
min_count(min_count) {}
161+
154162
QuantileOptions::QuantileOptions(double q, enum Interpolation interpolation,
155163
bool skip_nulls, uint32_t min_count)
156164
: FunctionOptions(internal::kQuantileOptionsType),
@@ -203,6 +211,7 @@ void RegisterAggregateOptions(FunctionRegistry* registry) {
203211
DCHECK_OK(registry->AddFunctionOptionsType(kCountOptionsType));
204212
DCHECK_OK(registry->AddFunctionOptionsType(kModeOptionsType));
205213
DCHECK_OK(registry->AddFunctionOptionsType(kVarianceOptionsType));
214+
DCHECK_OK(registry->AddFunctionOptionsType(kSkewOptionsType));
206215
DCHECK_OK(registry->AddFunctionOptionsType(kQuantileOptionsType));
207216
DCHECK_OK(registry->AddFunctionOptionsType(kTDigestOptionsType));
208217
DCHECK_OK(registry->AddFunctionOptionsType(kPivotOptionsType));
@@ -271,6 +280,14 @@ Result<Datum> Variance(const Datum& value, const VarianceOptions& options,
271280
return CallFunction("variance", {value}, &options, ctx);
272281
}
273282

283+
Result<Datum> Skew(const Datum& value, const SkewOptions& options, ExecContext* ctx) {
284+
return CallFunction("skew", {value}, &options, ctx);
285+
}
286+
287+
Result<Datum> Kurtosis(const Datum& value, const SkewOptions& options, ExecContext* ctx) {
288+
return CallFunction("kurtosis", {value}, &options, ctx);
289+
}
290+
274291
Result<Datum> Quantile(const Datum& value, const QuantileOptions& options,
275292
ExecContext* ctx) {
276293
return CallFunction("quantile", {value}, &options, ctx);

cpp/src/arrow/compute/api_aggregate.h

+42
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,20 @@ class ARROW_EXPORT VarianceOptions : public FunctionOptions {
114114
uint32_t min_count;
115115
};
116116

117+
/// \brief Control Skew and Kurtosis kernel behavior
118+
class ARROW_EXPORT SkewOptions : public FunctionOptions {
119+
public:
120+
explicit SkewOptions(bool skip_nulls = true, uint32_t min_count = 0);
121+
static constexpr char const kTypeName[] = "SkewOptions";
122+
static SkewOptions Defaults() { return SkewOptions{}; }
123+
124+
/// If true (the default), null values are ignored. Otherwise, if any value is null,
125+
/// emit null.
126+
bool skip_nulls;
127+
/// If less than this many non-null values are observed, emit null.
128+
uint32_t min_count;
129+
};
130+
117131
/// \brief Control Quantile kernel behavior
118132
///
119133
/// By default, returns the median value.
@@ -503,6 +517,34 @@ Result<Datum> Variance(const Datum& value,
503517
const VarianceOptions& options = VarianceOptions::Defaults(),
504518
ExecContext* ctx = NULLPTR);
505519

520+
/// \brief Calculate the skewness of a numeric array
521+
///
522+
/// \param[in] value input datum, expecting Array or ChunkedArray
523+
/// \param[in] options see SkewOptions for more information
524+
/// \param[in] ctx the function execution context, optional
525+
/// \return datum of the computed skewness as a DoubleScalar
526+
///
527+
/// \since 20.0.0
528+
/// \note API not yet finalized
529+
ARROW_EXPORT
530+
Result<Datum> Skew(const Datum& value,
531+
const SkewOptions& options = SkewOptions::Defaults(),
532+
ExecContext* ctx = NULLPTR);
533+
534+
/// \brief Calculate the kurtosis of a numeric array
535+
///
536+
/// \param[in] value input datum, expecting Array or ChunkedArray
537+
/// \param[in] options see SkewOptions for more information
538+
/// \param[in] ctx the function execution context, optional
539+
/// \return datum of the computed kurtosis as a DoubleScalar
540+
///
541+
/// \since 20.0.0
542+
/// \note API not yet finalized
543+
ARROW_EXPORT
544+
Result<Datum> Kurtosis(const Datum& value,
545+
const SkewOptions& options = SkewOptions::Defaults(),
546+
ExecContext* ctx = NULLPTR);
547+
506548
/// \brief Calculate the quantiles of a numeric array
507549
///
508550
/// \param[in] value input datum, expecting Array or ChunkedArray

cpp/src/arrow/compute/kernels/aggregate_internal.h

+5-6
Original file line numberDiff line numberDiff line change
@@ -17,16 +17,17 @@
1717

1818
#pragma once
1919

20+
#include <cmath>
21+
#include <initializer_list>
22+
2023
#include "arrow/compute/kernels/util_internal.h"
2124
#include "arrow/type.h"
2225
#include "arrow/type_traits.h"
2326
#include "arrow/util/bit_run_reader.h"
2427
#include "arrow/util/int128_internal.h"
2528
#include "arrow/util/logging.h"
2629

27-
namespace arrow {
28-
namespace compute {
29-
namespace internal {
30+
namespace arrow::compute::internal {
3031

3132
// Find the largest compatible primitive type for a primitive type.
3233
template <typename I, typename Enable = void>
@@ -254,6 +255,4 @@ SumType SumArray(const ArraySpan& data) {
254255
data, [](ValueType v) { return static_cast<SumType>(v); });
255256
}
256257

257-
} // namespace internal
258-
} // namespace compute
259-
} // namespace arrow
258+
} // namespace arrow::compute::internal

cpp/src/arrow/compute/kernels/aggregate_test.cc

+168-6
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242

4343
#include "arrow/testing/gtest_util.h"
4444
#include "arrow/testing/matchers.h"
45+
#include "arrow/testing/math.h"
4546
#include "arrow/testing/random.h"
4647
#include "arrow/util/logging.h"
4748

@@ -3386,6 +3387,9 @@ TEST_F(TestVarStdKernelMergeStability, Basics) {
33863387
#ifndef __MINGW32__ // MinGW has precision issues
33873388
// XXX: The reference value from numpy is actually wrong due to floating
33883389
// point limits. The correct result should equals variance(90, 0) = 4050.
3390+
// The problem is that the mean is not exactly representable as floating-point,
3391+
// and that small inaccuracy produces a large deviation when plugged into the M2
3392+
// calculation.
33893393
std::vector<std::string> chunks = {"[40000008000000490]", "[40000008000000400]"};
33903394
this->AssertVarStdIs(chunks, options, 3904.0);
33913395
#endif
@@ -3430,12 +3434,21 @@ TEST_F(TestVarStdKernelUInt32, Basics) {
34303434
this->AssertVarStdIs("[0, 0, 4294967295]", options, 6.148914688373205e+18);
34313435
}
34323436

3433-
// https://en.wikipedia.org/wiki/Kahan_summation_algorithm
34343437
void KahanSum(double& sum, double& adjust, double addend) {
3435-
double y = addend - adjust;
3436-
double t = sum + y;
3437-
adjust = (t - sum) - y;
3438-
sum = t;
3438+
// Backported enhancement from Neumaier's algorithm: consider case where
3439+
// sum is small compared to addend.
3440+
// https://en.wikipedia.org/wiki/Kahan_summation_algorithm#Further_enhancements
3441+
if (abs(sum) >= abs(addend)) {
3442+
double y = addend - adjust;
3443+
double t = sum + y;
3444+
adjust = (t - sum) - y;
3445+
sum = t;
3446+
} else {
3447+
double y = sum - adjust;
3448+
double t = addend + y;
3449+
adjust = (t - addend) - y;
3450+
sum = t;
3451+
}
34393452
}
34403453

34413454
// Calculate reference variance with Welford's online algorithm + Kahan summation
@@ -3534,7 +3547,8 @@ TEST_F(TestVarStdKernelIntegerLength, Basics) {
35343547

35353548
TEST(TestVarStdKernel, Decimal) {
35363549
// Effectively treated as double, sanity check results here
3537-
for (const auto& ty : {decimal128(3, 2), decimal256(3, 2)}) {
3550+
for (const auto& ty :
3551+
{decimal32(3, 2), decimal64(3, 2), decimal128(3, 2), decimal256(3, 2)}) {
35383552
CheckVarStd(ArrayFromJSON(ty, R"(["1.00"])"), VarianceOptions(), 0);
35393553
CheckVarStd(ArrayFromJSON(ty, R"([null, "1.00", "2.00", "3.00"])"), VarianceOptions(),
35403554
0.6666666666666666);
@@ -3544,6 +3558,154 @@ TEST(TestVarStdKernel, Decimal) {
35443558
}
35453559
}
35463560

3561+
//
3562+
// Skew and Kurtosis
3563+
//
3564+
3565+
constexpr int kSkewUlps = 3;
3566+
constexpr int kKurtosisUlps = 6;
3567+
3568+
void CheckSkewKurtosis(const Datum& array, const SkewOptions& options,
3569+
double expected_skew, double expected_kurtosis, int n_ulps = -1) {
3570+
ARROW_SCOPED_TRACE("type = ", *array.type());
3571+
ASSERT_OK_AND_ASSIGN(Datum out_skew, Skew(array, options));
3572+
ASSERT_OK_AND_ASSIGN(Datum out_kurtosis, Kurtosis(array, options));
3573+
const auto& skew = checked_cast<const DoubleScalar&>(*out_skew.scalar());
3574+
const auto& kurtosis = checked_cast<const DoubleScalar&>(*out_kurtosis.scalar());
3575+
ASSERT_TRUE(skew.is_valid && kurtosis.is_valid);
3576+
AssertWithinUlp(expected_skew, skew.value, n_ulps >= 0 ? n_ulps : kSkewUlps);
3577+
AssertWithinUlp(expected_kurtosis, kurtosis.value,
3578+
n_ulps >= 0 ? n_ulps : kKurtosisUlps);
3579+
}
3580+
3581+
class TestSkewKurtosis : public ::testing::Test {
3582+
public:
3583+
void AssertSkewKurtosisAre(const Array& array, const SkewOptions& options,
3584+
double expected_skew, double expected_kurtosis,
3585+
int n_ulps = -1) {
3586+
CheckSkewKurtosis(array, options, expected_skew, expected_kurtosis, n_ulps);
3587+
}
3588+
3589+
void AssertSkewKurtosisAre(const std::shared_ptr<ChunkedArray>& array,
3590+
const SkewOptions& options, double expected_skew,
3591+
double expected_kurtosis, int n_ulps = -1) {
3592+
CheckSkewKurtosis(array, options, expected_skew, expected_kurtosis, n_ulps);
3593+
}
3594+
3595+
void AssertSkewKurtosisAre(const std::shared_ptr<DataType>& type, std::string_view json,
3596+
const SkewOptions& options, double expected_skew,
3597+
double expected_kurtosis, int n_ulps = -1) {
3598+
auto array = ArrayFromJSON(type, json);
3599+
CheckSkewKurtosis(array, options, expected_skew, expected_kurtosis, n_ulps);
3600+
}
3601+
3602+
void AssertSkewKurtosisAre(const std::shared_ptr<DataType>& type,
3603+
const std::vector<std::string>& json,
3604+
const SkewOptions& options, double expected_skew,
3605+
double expected_kurtosis, int n_ulps = -1) {
3606+
auto array = ChunkedArrayFromJSON(type, json);
3607+
CheckSkewKurtosis(array, options, expected_skew, expected_kurtosis, n_ulps);
3608+
}
3609+
3610+
void AssertSkewKurtosisInvalid(const Array& array, const SkewOptions& options) {
3611+
AssertSkewKurtosisInvalidInternal(array, options);
3612+
}
3613+
3614+
void AssertSkewKurtosisInvalid(const std::shared_ptr<ChunkedArray>& array,
3615+
const SkewOptions& options) {
3616+
AssertSkewKurtosisInvalidInternal(array, options);
3617+
}
3618+
3619+
void AssertSkewKurtosisInvalid(const std::shared_ptr<DataType>& type,
3620+
std::string_view json, const SkewOptions& options) {
3621+
auto array = ArrayFromJSON(type, json);
3622+
AssertSkewKurtosisInvalidInternal(array, options);
3623+
}
3624+
3625+
void AssertSkewKurtosisInvalid(const std::shared_ptr<DataType>& type,
3626+
const std::vector<std::string>& json,
3627+
const SkewOptions& options) {
3628+
auto array = ChunkedArrayFromJSON(type, json);
3629+
AssertSkewKurtosisInvalidInternal(array, options);
3630+
}
3631+
3632+
private:
3633+
void AssertSkewKurtosisInvalidInternal(const Datum& array, const SkewOptions& options) {
3634+
ASSERT_OK_AND_ASSIGN(Datum out_skew, Skew(array, options));
3635+
ASSERT_OK_AND_ASSIGN(Datum out_kurtosis, Kurtosis(array, options));
3636+
const auto& skew = checked_cast<const DoubleScalar&>(*out_skew.scalar());
3637+
const auto& kurtosis = checked_cast<const DoubleScalar&>(*out_kurtosis.scalar());
3638+
ASSERT_FALSE(skew.is_valid || kurtosis.is_valid);
3639+
}
3640+
};
3641+
3642+
TEST_F(TestSkewKurtosis, Basics) {
3643+
// Test sample from SciPy, with results obtained using numpy.float128
3644+
auto options = SkewOptions::Defaults();
3645+
AssertSkewKurtosisAre(float64(), "[1.165, 0.6268, 0.0751, 0.3516, -0.6965]", options,
3646+
-0.29322304336607355496, -0.83411431970273759);
3647+
// Results are slightly different because the input doesn't losslessly convert
3648+
// to float32.
3649+
AssertSkewKurtosisAre(float32(), "[1.165, 0.6268, 0.0751, 0.3516, -0.6965]", options,
3650+
-0.2932230870440958164, -0.8341143229437093939);
3651+
}
3652+
3653+
TEST_F(TestSkewKurtosis, Chunked) {
3654+
auto options = SkewOptions::Defaults();
3655+
AssertSkewKurtosisAre(float64(), {"[1.165, 0.6268]", "[]", "[0.0751, 0.3516, -0.6965]"},
3656+
options, -0.29322304336607355496, -0.83411431970273759);
3657+
AssertSkewKurtosisAre(float32(), {"[1.165, 0.6268]", "[]", "[0.0751, 0.3516, -0.6965]"},
3658+
options, -0.2932230870440958164, -0.8341143229437093939);
3659+
}
3660+
3661+
TEST_F(TestSkewKurtosis, Decimal) {
3662+
auto options = SkewOptions::Defaults();
3663+
for (auto type :
3664+
{decimal32(5, 4), decimal64(5, 4), decimal128(5, 4), decimal256(5, 4)}) {
3665+
AssertSkewKurtosisAre(type, R"(["1.1650", "0.6268", "0.0751", "0.3516", "-0.6965"])",
3666+
options, -0.29322304336607355496, -0.83411431970273759);
3667+
}
3668+
}
3669+
3670+
TEST_F(TestSkewKurtosis, Integral) {
3671+
auto options = SkewOptions::Defaults();
3672+
for (auto type : IntTypes()) {
3673+
AssertSkewKurtosisAre(type, "[1, 2, 3, 5]", options, 0.4346507595746657,
3674+
-1.1542857142857144);
3675+
}
3676+
}
3677+
3678+
TEST_F(TestSkewKurtosis, SpecialCases) {
3679+
auto options = SkewOptions::Defaults();
3680+
for (auto type : {float64(), float32()}) {
3681+
AssertSkewKurtosisAre(type, "[0, 1, 2]", options, 0.0, -1.5, /*n_ulps=*/0);
3682+
AssertSkewKurtosisAre(type, "[1]", options, std::nan(""), std::nan(""));
3683+
AssertSkewKurtosisAre(type, "[1, 1, 1, 1, 1, 1]", options, std::nan(""),
3684+
std::nan(""));
3685+
}
3686+
}
3687+
3688+
TEST_F(TestSkewKurtosis, Options) {
3689+
for (auto type : {float64(), float32()}) {
3690+
auto options = SkewOptions::Defaults();
3691+
AssertSkewKurtosisInvalid(type, "[]", options);
3692+
AssertSkewKurtosisInvalid(type, std::vector<std::string>{}, options);
3693+
AssertSkewKurtosisInvalid(type, {"[]", "[]", "[]"}, options);
3694+
AssertSkewKurtosisAre(type, "[0, 1, null, 2]", options, 0.0, -1.5);
3695+
AssertSkewKurtosisAre(type, {"[0, 1]", "[]", "[null, 2]"}, options, 0.0, -1.5);
3696+
options.min_count = 3;
3697+
AssertSkewKurtosisAre(type, "[0, 1, null, 2]", options, 0.0, -1.5);
3698+
AssertSkewKurtosisAre(type, {"[0, 1]", "[]", "[null, 2]"}, options, 0.0, -1.5);
3699+
options.skip_nulls = false;
3700+
AssertSkewKurtosisInvalid(type, "[0, 1, null, 2]", options);
3701+
AssertSkewKurtosisInvalid(type, {"[0, 1]", "[]", "[null, 2]"}, options);
3702+
options.skip_nulls = true;
3703+
options.min_count = 4;
3704+
AssertSkewKurtosisInvalid(type, "[0, 1, null, 2]", options);
3705+
AssertSkewKurtosisInvalid(type, {"[0, 1]", "[]", "[null, 2]"}, options);
3706+
}
3707+
}
3708+
35473709
//
35483710
// Quantile
35493711
//

0 commit comments

Comments
 (0)