diff --git a/tree/dataframe/inc/ROOT/RDF/ActionHelpers.hxx b/tree/dataframe/inc/ROOT/RDF/ActionHelpers.hxx index 0c892b00b9297..57fa1e8e6bd5d 100644 --- a/tree/dataframe/inc/ROOT/RDF/ActionHelpers.hxx +++ b/tree/dataframe/inc/ROOT/RDF/ActionHelpers.hxx @@ -44,7 +44,7 @@ #include // std::index_sequence #include #include // std::accumulate in MeanHelper - +#include // std::sqrt, std::pow class TCollection; class TStatistic; class TTreeReader; @@ -1316,6 +1316,211 @@ public: } }; +// Implements Welford's Online Algorithm for Skewness +// Ref: https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm +class R__CLING_PTRCHECK(off) SkewnessHelper : public RActionImpl { + unsigned int fNSlots; + std::shared_ptr fResultSkewness; + + // Welford's Algorithm Accumulators per slot + std::vector fCounts; + std::vector fMeans; + std::vector fM2; // Sum of squares of differences + std::vector fM3; // Sum of cubed differences + +public: + SkewnessHelper(const std::shared_ptr &resPtr, const unsigned int nSlots) + : fNSlots(nSlots), + fResultSkewness(resPtr), + fCounts(fNSlots, 0), + fMeans(fNSlots, 0.), + fM2(fNSlots, 0.), + fM3(fNSlots, 0.) + { + } + + // Rule of Five: + ~SkewnessHelper() = default; // Destructor + SkewnessHelper(const SkewnessHelper &) = delete; // Copy Ctor + SkewnessHelper &operator=(const SkewnessHelper &) = delete; // Copy Assignment + SkewnessHelper(SkewnessHelper &&) = default; // Move Ctor + SkewnessHelper &operator=(SkewnessHelper &&) = default; // Move Assignment + + void InitTask(TTreeReader *, unsigned int) {} + + void Exec(unsigned int slot, double val) + { + ULong64_t n = ++fCounts[slot]; + double delta = val - fMeans[slot]; + double delta_n = delta / n; + double term1 = delta * delta_n * (n - 1); + + fM3[slot] += term1 * delta_n * (n - 2) - 3 * delta_n * fM2[slot]; + fM2[slot] += term1; + fMeans[slot] += delta_n; + } + + template ::value, int> = 0> + void Exec(unsigned int slot, const T &vs) + { + for (auto &&v : vs) { + Exec(slot, v); + } + } + + void Initialize() { /* noop */ } + + void Finalize() + { + for (unsigned int i = 1; i < fNSlots; ++i) { + if (fCounts[i] == 0) + continue; + + double n1 = static_cast(fCounts[0]); + double n2 = static_cast(fCounts[i]); + double n = n1 + n2; + + double delta = fMeans[i] - fMeans[0]; + double delta2 = delta * delta; + double delta3 = delta * delta2; + + fM3[0] += fM3[i] + delta3 * n1 * n2 * (n1 - n2) / (n * n * n) + + 3.0 * delta * (n1 * fM2[i] - n2 * fM2[0]) / n; + + fM2[0] += fM2[i] + delta2 * n1 * n2 / n; + fMeans[0] += delta * n2 / n; + fCounts[0] += fCounts[i]; + } + + if (fCounts[0] > 2 && fM2[0] > 0) { + *fResultSkewness = (std::sqrt(static_cast(fCounts[0])) * fM3[0]) / std::pow(fM2[0], 1.5); + } else { + *fResultSkewness = 0.0; + } + } + + std::string GetActionName() { return "Skewness"; } + + SkewnessHelper MakeNew(void *newResult, std::string_view /*variation*/ = "nominal") + { + auto &result = *static_cast *>(newResult); + return SkewnessHelper(result, fCounts.size()); + } + + // Helper functions for RMergeableValue + std::unique_ptr GetMergeableValue() const final { + // TODO: Implement mergeable value to support distributed RDataFrame + return nullptr; + } +}; + +// Implements Welford's Online Algorithm for Kurtosis +// Ref: https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm +class R__CLING_PTRCHECK(off) KurtosisHelper : public RActionImpl { + unsigned int fNSlots; + std::shared_ptr fResultKurtosis; + + // Accumulators per slot + std::vector fCounts; + std::vector fMeans; + std::vector fM2; + std::vector fM3; + std::vector fM4; + +public: + KurtosisHelper(const std::shared_ptr &resPtr, const unsigned int nSlots) + : fNSlots(nSlots), + fResultKurtosis(resPtr), + fCounts(fNSlots, 0), + fMeans(fNSlots, 0.), + fM2(fNSlots, 0.), + fM3(fNSlots, 0.), + fM4(fNSlots, 0.) + { + } + + // Rule of Five: + ~KurtosisHelper() = default; // 1. Destructor + KurtosisHelper(const KurtosisHelper &) = delete; // 2. Copy Constructor + KurtosisHelper &operator=(const KurtosisHelper &) = delete; // 3. Copy Assignment + KurtosisHelper(KurtosisHelper &&) = default; // 4. Move Constructor + KurtosisHelper &operator=(KurtosisHelper &&) = default; // 5. Move Assignment + + void InitTask(TTreeReader *, unsigned int) {} + + void Exec(unsigned int slot, double val) + { + ULong64_t n = ++fCounts[slot]; + double delta = val - fMeans[slot]; + double delta_n = delta / n; + double term1 = delta * delta_n * (n - 1); + + fM4[slot] += + term1 * delta_n * delta_n * (n * n - 3 * n + 3) + 6 * delta_n * delta_n * fM2[slot] - 4 * delta_n * fM3[slot]; + fM3[slot] += term1 * delta_n * (n - 2) - 3 * delta_n * fM2[slot]; + fM2[slot] += term1; + fMeans[slot] += delta_n; + } + + template ::value, int> = 0> + void Exec(unsigned int slot, const T &vs) + { + for (auto &&v : vs) { + Exec(slot, v); + } + } + + void Initialize() { /* noop */ } + +void Finalize() + { + for (unsigned int i = 1; i < fNSlots; ++i) { + if (fCounts[i] == 0) continue; + + double n1 = static_cast(fCounts[0]); + double n2 = static_cast(fCounts[i]); + double n = n1 + n2; + + double delta = fMeans[i] - fMeans[0]; + double delta2 = delta * delta; + double delta3 = delta * delta2; + double delta4 = delta2 * delta2; + + fM4[0] += fM4[i] + delta4 * n1 * n2 * (n1 * n1 - n1 * n2 + n2 * n2) / (n * n * n) + + 6.0 * delta2 * (n1 * n1 * fM2[i] + n2 * n2 * fM2[0]) / (n * n) + + 4.0 * delta * (n1 * fM3[i] - n2 * fM3[0]) / n; + + fM3[0] += fM3[i] + delta3 * n1 * n2 * (n1 - n2) / (n * n) + + 3.0 * delta * (n1 * fM2[i] - n2 * fM2[0]) / n; + + fM2[0] += fM2[i] + delta2 * n1 * n2 / n; + + fMeans[0] += delta * n2 / n; + fCounts[0] += fCounts[i]; + } + + if (fCounts[0] > 3 && fM2[0] > 0) { + double n = static_cast(fCounts[0]); + *fResultKurtosis = (n * fM4[0]) / (fM2[0] * fM2[0]) - 3.0; + } else { + *fResultKurtosis = -3.0; + } + } + + std::string GetActionName() { return "Kurtosis"; } + + KurtosisHelper MakeNew(void *newResult, std::string_view /*variation*/ = "nominal") + { + auto &result = *static_cast *>(newResult); + return KurtosisHelper(result, fCounts.size()); + } + // Helper functions for RMergeableValue + std::unique_ptr GetMergeableValue() const final { + // TODO: Implement mergeable value to support distributed RDataFrame + return nullptr; + } +}; + template class R__CLING_PTRCHECK(off) DisplayHelper : public RActionImpl> { private: diff --git a/tree/dataframe/inc/ROOT/RDF/InterfaceUtils.hxx b/tree/dataframe/inc/ROOT/RDF/InterfaceUtils.hxx index 6f4a428640a6f..538441f27ddfa 100644 --- a/tree/dataframe/inc/ROOT/RDF/InterfaceUtils.hxx +++ b/tree/dataframe/inc/ROOT/RDF/InterfaceUtils.hxx @@ -100,6 +100,8 @@ struct Sum{}; struct Mean{}; struct Fill{}; struct StdDev{}; +struct Skewness{}; +struct Kurtosis{}; struct Display{}; struct Snapshot{}; struct Book{}; @@ -246,6 +248,28 @@ BuildAction(const ColumnNames_t &bl, const std::shared_ptr &stdDeviation return std::make_unique(Helper_t(stdDeviationV, nSlots), bl, prevNode, colRegister); } +// Skewness action +template +std::unique_ptr +BuildAction(const ColumnNames_t &bl, const std::shared_ptr &skewnessV, const unsigned int nSlots, + std::shared_ptr prevNode, ActionTags::Skewness, const RColumnRegister &colRegister) +{ + using Helper_t = SkewnessHelper; + using Action_t = RAction>; + return std::make_unique(Helper_t(skewnessV, nSlots), bl, prevNode, colRegister); +} + +// Kurtosis action +template +std::unique_ptr +BuildAction(const ColumnNames_t &bl, const std::shared_ptr &kurtosisV, const unsigned int nSlots, + std::shared_ptr prevNode, ActionTags::Kurtosis, const RColumnRegister &colRegister) +{ + using Helper_t = KurtosisHelper; + using Action_t = RAction>; + return std::make_unique(Helper_t(kurtosisV, nSlots), bl, prevNode, colRegister); +} + using displayHelperArgs_t = std::pair>; // Display action diff --git a/tree/dataframe/inc/ROOT/RDF/RInterface.hxx b/tree/dataframe/inc/ROOT/RDF/RInterface.hxx index 242a34bd94f67..d2817e45c55a1 100644 --- a/tree/dataframe/inc/ROOT/RDF/RInterface.hxx +++ b/tree/dataframe/inc/ROOT/RDF/RInterface.hxx @@ -2906,7 +2906,60 @@ public: return CreateAction(userColumns, stdDeviationV, stdDeviationV, fProxiedPtr); } - // clang-format off + //////////////////////////////////////////////////////////////////////////// + /// \brief Return the skewness of processed column values (*lazy action*). + /// \tparam T The type of the branch/column. + /// \param[in] columnName The name of the branch/column to be treated. + /// \return the skewness of the selected column wrapped in a RResultPtr. + /// + /// If T is not specified, RDataFrame will infer it from the data and just-in-time compile the correct + /// template specialization of this method. + /// + /// This action is *lazy*: upon invocation of this method the calculation is + /// booked but not executed. Also see RResultPtr. + /// + /// ### Example usage: + /// ~~~{.cpp} + /// // Deduce column type (this invocation needs jitting internally) + /// auto skew = df.Skewness("values"); + /// ~~~ + /// + /// \ingroup dataframe_actions + template + RResultPtr Skewness(std::string_view columnName = "") + { + const auto userColumns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)}); + auto skewnessV = std::make_shared(0); + return CreateAction(userColumns, skewnessV, skewnessV, fProxiedPtr); + } + + //////////////////////////////////////////////////////////////////////////// + /// \brief Return the excess kurtosis of processed column values (*lazy action*). + /// \tparam T The type of the branch/column. + /// \param[in] columnName The name of the branch/column to be treated. + /// \return the excess kurtosis of the selected column wrapped in a RResultPtr. + /// + /// If T is not specified, RDataFrame will infer it from the data and just-in-time compile the correct + /// template specialization of this method. + /// + /// This action is *lazy*: upon invocation of this method the calculation is + /// booked but not executed. Also see RResultPtr. + /// + /// ### Example usage: + /// ~~~{.cpp} + /// // Deduce column type (this invocation needs jitting internally) + /// auto kurt = df.Kurtosis("values"); + /// ~~~ + /// + /// \ingroup dataframe_actions + template + RResultPtr Kurtosis(std::string_view columnName = "") + { + const auto userColumns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)}); + auto kurtosisV = std::make_shared(0); + return CreateAction(userColumns, kurtosisV, kurtosisV, fProxiedPtr); + } + //////////////////////////////////////////////////////////////////////////// /// \brief Return the sum of processed column values (*lazy action*). /// \tparam T The type of the branch/column. diff --git a/tree/dataframe/test/dataframe_simple.cxx b/tree/dataframe/test/dataframe_simple.cxx index 6e1461e3f9a7b..c425cab6291ba 100644 --- a/tree/dataframe/test/dataframe_simple.cxx +++ b/tree/dataframe/test/dataframe_simple.cxx @@ -1085,6 +1085,22 @@ TEST_P(RDFSimpleTests, ReadStdArray) } } +TEST_P(RDFSimpleTests, SkewnessAndKurtosis) +{ + auto df = ROOT::RDataFrame(5); + auto df_x = df.Define("x", [](ULong64_t i) { return (double)i; }, {"rdfentry_"}); + + auto skewness = df_x.Skewness("x"); + auto kurtosis = df_x.Kurtosis("x"); + + double s_val = *skewness; + double k_val = *kurtosis; + + EXPECT_NEAR(s_val, 0.0, 1e-9); + + EXPECT_LT(k_val, 0.0); +} + // run single-thread tests INSTANTIATE_TEST_SUITE_P(Seq, RDFSimpleTests, ::testing::Values(false));