42
42
43
43
#include " arrow/testing/gtest_util.h"
44
44
#include " arrow/testing/matchers.h"
45
+ #include " arrow/testing/math.h"
45
46
#include " arrow/testing/random.h"
46
47
#include " arrow/util/logging.h"
47
48
@@ -3386,6 +3387,9 @@ TEST_F(TestVarStdKernelMergeStability, Basics) {
3386
3387
#ifndef __MINGW32__ // MinGW has precision issues
3387
3388
// XXX: The reference value from numpy is actually wrong due to floating
3388
3389
// point limits. The correct result should equal variance(90, 0) = 4050.
3390
+ // The problem is that the mean is not exactly representable as floating-point,
3391
+ // and that small inaccuracy produces a large deviation when plugged into the M2
3392
+ // calculation.
3389
3393
std::vector<std::string> chunks = {" [40000008000000490]" , " [40000008000000400]" };
3390
3394
this ->AssertVarStdIs (chunks, options, 3904.0 );
3391
3395
#endif
@@ -3430,12 +3434,21 @@ TEST_F(TestVarStdKernelUInt32, Basics) {
3430
3434
this ->AssertVarStdIs (" [0, 0, 4294967295]" , options, 6.148914688373205e+18 );
3431
3435
}
3432
3436
3433
- // https://en.wikipedia.org/wiki/Kahan_summation_algorithm
3434
3437
// Add `addend` to the compensated running total `sum`, keeping the rounding
// error of the addition in `adjust`. Both `sum` and `adjust` are updated in
// place.
//
// This is Kahan summation with an enhancement backported from Neumaier's
// algorithm: the operand with the larger magnitude is always treated as the
// accumulator, so precision is also preserved when `sum` is small compared
// to `addend`.
// https://en.wikipedia.org/wiki/Kahan_summation_algorithm#Further_enhancements
void KahanSum(double& sum, double& adjust, double addend) {
  // std::abs is spelled out on purpose: an unqualified abs() may bind to the
  // C `int abs(int)` function and silently truncate the operands.
  const bool sum_dominates = std::abs(sum) >= std::abs(addend);
  const double larger = sum_dominates ? sum : addend;
  const double smaller = sum_dominates ? addend : sum;

  // Apply the pending correction to the smaller operand before adding it.
  const double y = smaller - adjust;
  const double t = larger + y;
  // (t - larger) is what was actually added; subtracting y leaves the
  // rounding error, which is carried over to the next call.
  adjust = (t - larger) - y;
  sum = t;
}
3440
3453
3441
3454
// Calculate reference variance with Welford's online algorithm + Kahan summation
@@ -3534,7 +3547,8 @@ TEST_F(TestVarStdKernelIntegerLength, Basics) {
3534
3547
3535
3548
TEST (TestVarStdKernel, Decimal) {
3536
3549
// Effectively treated as double, sanity check results here
3537
- for (const auto & ty : {decimal128 (3 , 2 ), decimal256 (3 , 2 )}) {
3550
+ for (const auto & ty :
3551
+ {decimal32 (3 , 2 ), decimal64 (3 , 2 ), decimal128 (3 , 2 ), decimal256 (3 , 2 )}) {
3538
3552
CheckVarStd (ArrayFromJSON (ty, R"( ["1.00"])" ), VarianceOptions (), 0 );
3539
3553
CheckVarStd (ArrayFromJSON (ty, R"( [null, "1.00", "2.00", "3.00"])" ), VarianceOptions (),
3540
3554
0.6666666666666666 );
@@ -3544,6 +3558,154 @@ TEST(TestVarStdKernel, Decimal) {
3544
3558
}
3545
3559
}
3546
3560
3561
+ //
3562
+ // Skew and Kurtosis
3563
+ //
3564
+
3565
+ constexpr int kSkewUlps = 3 ;
3566
+ constexpr int kKurtosisUlps = 6 ;
3567
+
3568
+ void CheckSkewKurtosis (const Datum& array, const SkewOptions& options,
3569
+ double expected_skew, double expected_kurtosis, int n_ulps = -1 ) {
3570
+ ARROW_SCOPED_TRACE (" type = " , *array.type ());
3571
+ ASSERT_OK_AND_ASSIGN (Datum out_skew, Skew (array, options));
3572
+ ASSERT_OK_AND_ASSIGN (Datum out_kurtosis, Kurtosis (array, options));
3573
+ const auto & skew = checked_cast<const DoubleScalar&>(*out_skew.scalar ());
3574
+ const auto & kurtosis = checked_cast<const DoubleScalar&>(*out_kurtosis.scalar ());
3575
+ ASSERT_TRUE (skew.is_valid && kurtosis.is_valid );
3576
+ AssertWithinUlp (expected_skew, skew.value , n_ulps >= 0 ? n_ulps : kSkewUlps );
3577
+ AssertWithinUlp (expected_kurtosis, kurtosis.value ,
3578
+ n_ulps >= 0 ? n_ulps : kKurtosisUlps );
3579
+ }
3580
+
3581
+ class TestSkewKurtosis : public ::testing::Test {
3582
+ public:
3583
+ void AssertSkewKurtosisAre (const Array& array, const SkewOptions& options,
3584
+ double expected_skew, double expected_kurtosis,
3585
+ int n_ulps = -1 ) {
3586
+ CheckSkewKurtosis (array, options, expected_skew, expected_kurtosis, n_ulps);
3587
+ }
3588
+
3589
+ void AssertSkewKurtosisAre (const std::shared_ptr<ChunkedArray>& array,
3590
+ const SkewOptions& options, double expected_skew,
3591
+ double expected_kurtosis, int n_ulps = -1 ) {
3592
+ CheckSkewKurtosis (array, options, expected_skew, expected_kurtosis, n_ulps);
3593
+ }
3594
+
3595
+ void AssertSkewKurtosisAre (const std::shared_ptr<DataType>& type, std::string_view json,
3596
+ const SkewOptions& options, double expected_skew,
3597
+ double expected_kurtosis, int n_ulps = -1 ) {
3598
+ auto array = ArrayFromJSON (type, json);
3599
+ CheckSkewKurtosis (array, options, expected_skew, expected_kurtosis, n_ulps);
3600
+ }
3601
+
3602
+ void AssertSkewKurtosisAre (const std::shared_ptr<DataType>& type,
3603
+ const std::vector<std::string>& json,
3604
+ const SkewOptions& options, double expected_skew,
3605
+ double expected_kurtosis, int n_ulps = -1 ) {
3606
+ auto array = ChunkedArrayFromJSON (type, json);
3607
+ CheckSkewKurtosis (array, options, expected_skew, expected_kurtosis, n_ulps);
3608
+ }
3609
+
3610
+ void AssertSkewKurtosisInvalid (const Array& array, const SkewOptions& options) {
3611
+ AssertSkewKurtosisInvalidInternal (array, options);
3612
+ }
3613
+
3614
+ void AssertSkewKurtosisInvalid (const std::shared_ptr<ChunkedArray>& array,
3615
+ const SkewOptions& options) {
3616
+ AssertSkewKurtosisInvalidInternal (array, options);
3617
+ }
3618
+
3619
+ void AssertSkewKurtosisInvalid (const std::shared_ptr<DataType>& type,
3620
+ std::string_view json, const SkewOptions& options) {
3621
+ auto array = ArrayFromJSON (type, json);
3622
+ AssertSkewKurtosisInvalidInternal (array, options);
3623
+ }
3624
+
3625
+ void AssertSkewKurtosisInvalid (const std::shared_ptr<DataType>& type,
3626
+ const std::vector<std::string>& json,
3627
+ const SkewOptions& options) {
3628
+ auto array = ChunkedArrayFromJSON (type, json);
3629
+ AssertSkewKurtosisInvalidInternal (array, options);
3630
+ }
3631
+
3632
+ private:
3633
+ void AssertSkewKurtosisInvalidInternal (const Datum& array, const SkewOptions& options) {
3634
+ ASSERT_OK_AND_ASSIGN (Datum out_skew, Skew (array, options));
3635
+ ASSERT_OK_AND_ASSIGN (Datum out_kurtosis, Kurtosis (array, options));
3636
+ const auto & skew = checked_cast<const DoubleScalar&>(*out_skew.scalar ());
3637
+ const auto & kurtosis = checked_cast<const DoubleScalar&>(*out_kurtosis.scalar ());
3638
+ ASSERT_FALSE (skew.is_valid || kurtosis.is_valid );
3639
+ }
3640
+ };
3641
+
3642
+ TEST_F (TestSkewKurtosis, Basics) {
3643
+ // Test sample from SciPy, with results obtained using numpy.float128
3644
+ auto options = SkewOptions::Defaults ();
3645
+ AssertSkewKurtosisAre (float64 (), " [1.165, 0.6268, 0.0751, 0.3516, -0.6965]" , options,
3646
+ -0.29322304336607355496 , -0.83411431970273759 );
3647
+ // Results are slightly different because the input doesn't losslessly convert
3648
+ // to float32.
3649
+ AssertSkewKurtosisAre (float32 (), " [1.165, 0.6268, 0.0751, 0.3516, -0.6965]" , options,
3650
+ -0.2932230870440958164 , -0.8341143229437093939 );
3651
+ }
3652
+
3653
+ TEST_F (TestSkewKurtosis, Chunked) {
3654
+ auto options = SkewOptions::Defaults ();
3655
+ AssertSkewKurtosisAre (float64 (), {" [1.165, 0.6268]" , " []" , " [0.0751, 0.3516, -0.6965]" },
3656
+ options, -0.29322304336607355496 , -0.83411431970273759 );
3657
+ AssertSkewKurtosisAre (float32 (), {" [1.165, 0.6268]" , " []" , " [0.0751, 0.3516, -0.6965]" },
3658
+ options, -0.2932230870440958164 , -0.8341143229437093939 );
3659
+ }
3660
+
3661
+ TEST_F (TestSkewKurtosis, Decimal) {
3662
+ auto options = SkewOptions::Defaults ();
3663
+ for (auto type :
3664
+ {decimal32 (5 , 4 ), decimal64 (5 , 4 ), decimal128 (5 , 4 ), decimal256 (5 , 4 )}) {
3665
+ AssertSkewKurtosisAre (type, R"( ["1.1650", "0.6268", "0.0751", "0.3516", "-0.6965"])" ,
3666
+ options, -0.29322304336607355496 , -0.83411431970273759 );
3667
+ }
3668
+ }
3669
+
3670
+ TEST_F (TestSkewKurtosis, Integral) {
3671
+ auto options = SkewOptions::Defaults ();
3672
+ for (auto type : IntTypes ()) {
3673
+ AssertSkewKurtosisAre (type, " [1, 2, 3, 5]" , options, 0.4346507595746657 ,
3674
+ -1.1542857142857144 );
3675
+ }
3676
+ }
3677
+
3678
+ TEST_F (TestSkewKurtosis, SpecialCases) {
3679
+ auto options = SkewOptions::Defaults ();
3680
+ for (auto type : {float64 (), float32 ()}) {
3681
+ AssertSkewKurtosisAre (type, " [0, 1, 2]" , options, 0.0 , -1.5 , /* n_ulps=*/ 0 );
3682
+ AssertSkewKurtosisAre (type, " [1]" , options, std::nan (" " ), std::nan (" " ));
3683
+ AssertSkewKurtosisAre (type, " [1, 1, 1, 1, 1, 1]" , options, std::nan (" " ),
3684
+ std::nan (" " ));
3685
+ }
3686
+ }
3687
+
3688
+ TEST_F (TestSkewKurtosis, Options) {
3689
+ for (auto type : {float64 (), float32 ()}) {
3690
+ auto options = SkewOptions::Defaults ();
3691
+ AssertSkewKurtosisInvalid (type, " []" , options);
3692
+ AssertSkewKurtosisInvalid (type, std::vector<std::string>{}, options);
3693
+ AssertSkewKurtosisInvalid (type, {" []" , " []" , " []" }, options);
3694
+ AssertSkewKurtosisAre (type, " [0, 1, null, 2]" , options, 0.0 , -1.5 );
3695
+ AssertSkewKurtosisAre (type, {" [0, 1]" , " []" , " [null, 2]" }, options, 0.0 , -1.5 );
3696
+ options.min_count = 3 ;
3697
+ AssertSkewKurtosisAre (type, " [0, 1, null, 2]" , options, 0.0 , -1.5 );
3698
+ AssertSkewKurtosisAre (type, {" [0, 1]" , " []" , " [null, 2]" }, options, 0.0 , -1.5 );
3699
+ options.skip_nulls = false ;
3700
+ AssertSkewKurtosisInvalid (type, " [0, 1, null, 2]" , options);
3701
+ AssertSkewKurtosisInvalid (type, {" [0, 1]" , " []" , " [null, 2]" }, options);
3702
+ options.skip_nulls = true ;
3703
+ options.min_count = 4 ;
3704
+ AssertSkewKurtosisInvalid (type, " [0, 1, null, 2]" , options);
3705
+ AssertSkewKurtosisInvalid (type, {" [0, 1]" , " []" , " [null, 2]" }, options);
3706
+ }
3707
+ }
3708
+
3547
3709
//
3548
3710
// Quantile
3549
3711
//
0 commit comments