Skip to content

Commit 82c121c

Browse files
committed
GH-45733: [C++][Python] Add biased/unbiased toggle to skew and kurtosis functions
1 parent c3e399a commit 82c121c

File tree

7 files changed

+30
-10
lines changed

7 files changed

+30
-10
lines changed

cpp/src/arrow/compute/api_aggregate.cc

+3-1
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ static auto kVarianceOptionsType = GetFunctionOptionsType<VarianceOptions>(
111111
DataMember("min_count", &VarianceOptions::min_count));
112112
// Builds the FunctionOptionsType for SkewOptions from its named data members
// (skip_nulls, bias, min_count), each paired with a pointer to the member.
static auto kSkewOptionsType = GetFunctionOptionsType<SkewOptions>(
113113
DataMember("skip_nulls", &SkewOptions::skip_nulls),
114+
DataMember("bias", &SkewOptions::bias),
114115
DataMember("min_count", &SkewOptions::min_count));
115116
static auto kQuantileOptionsType = GetFunctionOptionsType<QuantileOptions>(
116117
DataMember("q", &QuantileOptions::q),
@@ -154,9 +155,10 @@ VarianceOptions::VarianceOptions(int ddof, bool skip_nulls, uint32_t min_count)
154155
min_count(min_count) {}
155156
constexpr char VarianceOptions::kTypeName[];
156157

157-
// Constructs SkewOptions: passes internal::kSkewOptionsType to the FunctionOptions
// base and stores skip_nulls, bias and min_count unchanged.
// NOTE(review): `bias` is inserted before `min_count`, so a pre-existing positional
// call such as SkewOptions(true, 5) would now bind 5 to `bias` instead of
// `min_count` -- confirm no caller relies on the old two-argument order.
SkewOptions::SkewOptions(bool skip_nulls, uint32_t min_count)
158+
SkewOptions::SkewOptions(bool skip_nulls, bool bias, uint32_t min_count)
158159
: FunctionOptions(internal::kSkewOptionsType),
159160
skip_nulls(skip_nulls),
161+
bias(bias),
160162
min_count(min_count) {}
161163

162164
QuantileOptions::QuantileOptions(double q, enum Interpolation interpolation,

cpp/src/arrow/compute/api_aggregate.h

+2-1
Original file line numberDiff line numberDiff line change
@@ -117,13 +117,14 @@ class ARROW_EXPORT VarianceOptions : public FunctionOptions {
117117
/// \brief Control Skew and Kurtosis kernel behavior
118118
class ARROW_EXPORT SkewOptions : public FunctionOptions {
119119
public:
120-
explicit SkewOptions(bool skip_nulls = true, uint32_t min_count = 0);
120+
explicit SkewOptions(bool skip_nulls = true, bool bias = true, uint32_t min_count = 0);
121121
static constexpr char const kTypeName[] = "SkewOptions";
122122
static SkewOptions Defaults() { return SkewOptions{}; }
123123

124124
/// If true (the default), null values are ignored. Otherwise, if any value is null,
125125
/// emit null.
126126
bool skip_nulls;
127+
/// If true (the default), the biased estimator is computed. If false, a
/// sample-size correction factor is applied to reduce bias.
/// NOTE(review): in the visible kernel change only Skew() receives this flag;
/// Kurtosis() is still called without it -- confirm that is intended.
bool bias;
127128
/// If less than this many non-null values are observed, emit null.
128129
uint32_t min_count;
129130
};

cpp/src/arrow/compute/kernels/aggregate_var_std.cc

+3-1
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,7 @@ struct StatisticImpl : public ScalarAggregator {
176176
: out_type(out_type),
177177
stat_type(stat_type),
178178
skip_nulls(options.skip_nulls),
179+
bias(options.bias),
179180
min_count(options.min_count),
180181
ddof(0),
181182
state(moments_level_for_statistic(stat_type), decimal_scale, skip_nulls) {}
@@ -208,7 +209,7 @@ struct StatisticImpl : public ScalarAggregator {
208209
out->value = std::make_shared<DoubleScalar>(state.moments.Variance(ddof));
209210
break;
210211
case StatisticType::Skew:
211-
out->value = std::make_shared<DoubleScalar>(state.moments.Skew());
212+
out->value = std::make_shared<DoubleScalar>(state.moments.Skew(bias));
212213
break;
213214
case StatisticType::Kurtosis:
214215
out->value = std::make_shared<DoubleScalar>(state.moments.Kurtosis());
@@ -224,6 +225,7 @@ struct StatisticImpl : public ScalarAggregator {
224225
std::shared_ptr<DataType> out_type;
225226
StatisticType stat_type;
226227
bool skip_nulls;
228+
bool bias;
227229
uint32_t min_count;
228230
int ddof = 0;
229231
MomentsState<ArrowType> state;

cpp/src/arrow/compute/kernels/aggregate_var_std_internal.h

+9-2
Original file line numberDiff line numberDiff line change
@@ -84,9 +84,16 @@ struct Moments {
8484

8585
double Stddev(int ddof) const { return sqrt(Variance(ddof)); }
8686

87-
/// Skewness computed from the accumulated `count`, `m2` and `m3` members.
///
/// \param bias if true (the default), return sqrt(count) * m3 / m2^1.5; if false,
///   return that quantity scaled by sqrt(count * (count - 1)) / (count - 2).
/// \return the skewness as a double; may be NaN (see the comment in the body).
double Skew() const {
87+
double Skew(bool bias = true) const {
88+
double result;
8889
// This may return NaN for m2 == 0 and m3 == 0, which is expected
89-
return sqrt(count) * m3 / sqrt(m2 * m2 * m2);
90+
if (bias) {
91+
result = sqrt(count) * m3 / sqrt(m2 * m2 * m2);
92+
} else {
// Algebraically, (m3 / count) / (m2 / count)^1.5 equals the biased value above,
// so this branch is the biased skew times sqrt(count * (count - 1)) / (count - 2).
// NOTE(review): the (count - 2) divisor is zero for count == 2 and negative for
// count < 2, so very small inputs produce inf/NaN here -- confirm callers guard
// on count (or that NaN is the intended result).
// NOTE(review): if `count` is an integer type, count * (count - 1) is evaluated
// as an integer product before sqrt -- TODO confirm it cannot overflow.
93+
result =
94+
sqrt(count * (count - 1)) / (count - 2) * (m3 / count) / pow((m2 / count), 1.5);
95+
}
96+
return result;
9097
}
9198

9299
double Kurtosis() const {

python/pyarrow/_compute.pyx

+4-4
Original file line numberDiff line numberDiff line change
@@ -1909,8 +1909,8 @@ class VarianceOptions(_VarianceOptions):
19091909

19101910

19111911
# Cython-level base for SkewOptions: _set_options replaces the wrapped C++ object
# with a new CSkewOptions built from skip_nulls, bias and min_count (in that order).
cdef class _SkewOptions(FunctionOptions):
1912-
def _set_options(self, skip_nulls, min_count):
1913-
self.wrapped.reset(new CSkewOptions(skip_nulls, min_count))
1912+
def _set_options(self, skip_nulls, bias, min_count):
1913+
self.wrapped.reset(new CSkewOptions(skip_nulls, bias, min_count))
19141914

19151915

19161916
class SkewOptions(_SkewOptions):
@@ -1923,8 +1923,8 @@ class SkewOptions(_SkewOptions):
19231923
{_min_count_doc(default=0)}
19241924
"""
19251925

1926-
# Keyword-only constructor; forwards skip_nulls, bias and min_count to _set_options.
# NOTE(review): no docstring lines change alongside the new ``bias`` keyword in this
# hunk -- confirm the class docstring's parameter list documents it.
def __init__(self, *, skip_nulls=True, min_count=0):
1927-
self._set_options(skip_nulls, min_count)
1926+
def __init__(self, *, skip_nulls=True, bias=True, min_count=0):
1927+
self._set_options(skip_nulls, bias, min_count)
19281928

19291929

19301930
cdef class _SplitOptions(FunctionOptions):

python/pyarrow/includes/libarrow.pxd

+2-1
Original file line numberDiff line numberDiff line change
@@ -2628,8 +2628,9 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil:
26282628

26292629
# Cython declaration of arrow::compute::SkewOptions: the three-argument constructor
# (skip_nulls, bias, min_count) and the matching public data members.
cdef cppclass CSkewOptions \
26302630
"arrow::compute::SkewOptions"(CFunctionOptions):
2631-
CSkewOptions(c_bool skip_nulls, uint32_t min_count)
2631+
CSkewOptions(c_bool skip_nulls, c_bool bias, uint32_t min_count)
26322632
c_bool skip_nulls
2633+
c_bool bias
26332634
uint32_t min_count
26342635

26352636
cdef cppclass CScalarAggregateOptions \

python/pyarrow/tests/test_compute.py

+7
Original file line numberDiff line numberDiff line change
@@ -3838,3 +3838,10 @@ def test_pivot_wider():
38383838
with pytest.raises(ValueError, match="Encountered more than one non-null value"):
38393839
result = pc.pivot_wider(["height", "width", "height"], [10, None, 11],
38403840
key_names=key_names)
3841+
3842+
3843+
# Checks pc.skew(..., bias=False) against pandas' Series.skew on the same four
# values plus one null/NaN, with nulls skipped on both sides.
# NOTE(review): despite the name, this exercises bias=False (the bias-corrected
# path); consider renaming to test_unbiased_skew.
# NOTE(review): the final assert compares floating-point results with exact
# equality -- confirm this is stable across platforms or use an approximate check.
@pytest.mark.pandas
3844+
def test_biased_skew():
3845+
arrow_skew = pc.skew([1.0, 2.0, 3.0, 40.0, None], skip_nulls=True, bias=False)
3846+
pandas_skew = pd.Series(np.array([1.0, 2.0, 3.0, 40.0, np.nan])).skew(skipna=True)
3847+
assert arrow_skew == pa.scalar(pandas_skew)

0 commit comments

Comments
 (0)