Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
207 changes: 206 additions & 1 deletion tree/dataframe/inc/ROOT/RDF/ActionHelpers.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
#include <utility> // std::index_sequence
#include <vector>
#include <numeric> // std::accumulate in MeanHelper

#include <cmath> // std::sqrt, std::pow
class TCollection;
class TStatistic;
class TTreeReader;
Expand Down Expand Up @@ -1316,6 +1316,211 @@ public:
}
};

// Implements Welford's Online Algorithm for Skewness
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just for reference, I would add a link to the explanation of the algorithm, e.g. https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm

// Ref: https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm
class R__CLING_PTRCHECK(off) SkewnessHelper : public RActionImpl<SkewnessHelper> {
unsigned int fNSlots;
std::shared_ptr<double> fResultSkewness;

// Welford's Algorithm Accumulators per slot
std::vector<ULong64_t> fCounts;
std::vector<double> fMeans;
std::vector<double> fM2; // Sum of squares of differences
std::vector<double> fM3; // Sum of cubed differences

public:
SkewnessHelper(const std::shared_ptr<double> &resPtr, const unsigned int nSlots)
: fNSlots(nSlots),
fResultSkewness(resPtr),
fCounts(fNSlots, 0),
fMeans(fNSlots, 0.),
fM2(fNSlots, 0.),
fM3(fNSlots, 0.)
{
}

// Rule of Five:
~SkewnessHelper() = default; // Destructor
SkewnessHelper(const SkewnessHelper &) = delete; // Copy Ctor
SkewnessHelper &operator=(const SkewnessHelper &) = delete; // Copy Assignment
SkewnessHelper(SkewnessHelper &&) = default; // Move Ctor
SkewnessHelper &operator=(SkewnessHelper &&) = default; // Move Assignment

void InitTask(TTreeReader *, unsigned int) {}

void Exec(unsigned int slot, double val)
{
ULong64_t n = ++fCounts[slot];
double delta = val - fMeans[slot];
double delta_n = delta / n;
double term1 = delta * delta_n * (n - 1);

fM3[slot] += term1 * delta_n * (n - 2) - 3 * delta_n * fM2[slot];
fM2[slot] += term1;
fMeans[slot] += delta_n;
}

template <typename T, std::enable_if_t<IsDataContainer<T>::value, int> = 0>
void Exec(unsigned int slot, const T &vs)
{
for (auto &&v : vs) {
Exec(slot, v);
}
}

void Initialize() { /* noop */ }

void Finalize()
{
for (unsigned int i = 1; i < fNSlots; ++i) {
if (fCounts[i] == 0)
continue;

double n1 = static_cast<double>(fCounts[0]);
double n2 = static_cast<double>(fCounts[i]);
double n = n1 + n2;

double delta = fMeans[i] - fMeans[0];
double delta2 = delta * delta;
double delta3 = delta * delta2;

fM3[0] += fM3[i] + delta3 * n1 * n2 * (n1 - n2) / (n * n * n) +
3.0 * delta * (n1 * fM2[i] - n2 * fM2[0]) / n;

fM2[0] += fM2[i] + delta2 * n1 * n2 / n;
fMeans[0] += delta * n2 / n;
fCounts[0] += fCounts[i];
}

if (fCounts[0] > 2 && fM2[0] > 0) {
*fResultSkewness = (std::sqrt(static_cast<double>(fCounts[0])) * fM3[0]) / std::pow(fM2[0], 1.5);
} else {
*fResultSkewness = 0.0;
}
}

std::string GetActionName() { return "Skewness"; }

SkewnessHelper MakeNew(void *newResult, std::string_view /*variation*/ = "nominal")
{
auto &result = *static_cast<std::shared_ptr<double> *>(newResult);
return SkewnessHelper(result, fCounts.size());
}

// Helper functions for RMergeableValue
std::unique_ptr<RMergeableValueBase> GetMergeableValue() const final {
// TODO: Implement mergeable value to support distributed RDataFrame
return nullptr;
}
};

// Implements Welford's Online Algorithm for Kurtosis
// Ref: https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm
class R__CLING_PTRCHECK(off) KurtosisHelper : public RActionImpl<KurtosisHelper> {
unsigned int fNSlots;
std::shared_ptr<double> fResultKurtosis;

// Accumulators per slot
std::vector<ULong64_t> fCounts;
std::vector<double> fMeans;
std::vector<double> fM2;
std::vector<double> fM3;
std::vector<double> fM4;

public:
KurtosisHelper(const std::shared_ptr<double> &resPtr, const unsigned int nSlots)
: fNSlots(nSlots),
fResultKurtosis(resPtr),
fCounts(fNSlots, 0),
fMeans(fNSlots, 0.),
fM2(fNSlots, 0.),
fM3(fNSlots, 0.),
fM4(fNSlots, 0.)
{
}

// Rule of Five:
~KurtosisHelper() = default; // 1. Destructor
KurtosisHelper(const KurtosisHelper &) = delete; // 2. Copy Constructor
KurtosisHelper &operator=(const KurtosisHelper &) = delete; // 3. Copy Assignment
KurtosisHelper(KurtosisHelper &&) = default; // 4. Move Constructor
KurtosisHelper &operator=(KurtosisHelper &&) = default; // 5. Move Assignment

void InitTask(TTreeReader *, unsigned int) {}

void Exec(unsigned int slot, double val)
{
ULong64_t n = ++fCounts[slot];
double delta = val - fMeans[slot];
double delta_n = delta / n;
double term1 = delta * delta_n * (n - 1);

fM4[slot] +=
term1 * delta_n * delta_n * (n * n - 3 * n + 3) + 6 * delta_n * delta_n * fM2[slot] - 4 * delta_n * fM3[slot];
fM3[slot] += term1 * delta_n * (n - 2) - 3 * delta_n * fM2[slot];
fM2[slot] += term1;
fMeans[slot] += delta_n;
}

template <typename T, std::enable_if_t<IsDataContainer<T>::value, int> = 0>
void Exec(unsigned int slot, const T &vs)
{
for (auto &&v : vs) {
Exec(slot, v);
}
}

void Initialize() { /* noop */ }

void Finalize()
{
for (unsigned int i = 1; i < fNSlots; ++i) {
if (fCounts[i] == 0) continue;

double n1 = static_cast<double>(fCounts[0]);
double n2 = static_cast<double>(fCounts[i]);
double n = n1 + n2;

double delta = fMeans[i] - fMeans[0];
double delta2 = delta * delta;
double delta3 = delta * delta2;
double delta4 = delta2 * delta2;

fM4[0] += fM4[i] + delta4 * n1 * n2 * (n1 * n1 - n1 * n2 + n2 * n2) / (n * n * n) +
6.0 * delta2 * (n1 * n1 * fM2[i] + n2 * n2 * fM2[0]) / (n * n) +
4.0 * delta * (n1 * fM3[i] - n2 * fM3[0]) / n;

fM3[0] += fM3[i] + delta3 * n1 * n2 * (n1 - n2) / (n * n) +
3.0 * delta * (n1 * fM2[i] - n2 * fM2[0]) / n;

fM2[0] += fM2[i] + delta2 * n1 * n2 / n;

fMeans[0] += delta * n2 / n;
fCounts[0] += fCounts[i];
}

if (fCounts[0] > 3 && fM2[0] > 0) {
double n = static_cast<double>(fCounts[0]);
*fResultKurtosis = (n * fM4[0]) / (fM2[0] * fM2[0]) - 3.0;
} else {
*fResultKurtosis = -3.0;
}
}

std::string GetActionName() { return "Kurtosis"; }

KurtosisHelper MakeNew(void *newResult, std::string_view /*variation*/ = "nominal")
{
auto &result = *static_cast<std::shared_ptr<double> *>(newResult);
return KurtosisHelper(result, fCounts.size());
}
// Helper functions for RMergeableValue
std::unique_ptr<RMergeableValueBase> GetMergeableValue() const final {
// TODO: Implement mergeable value to support distributed RDataFrame
return nullptr;
}
};

template <typename PrevNodeType>
class R__CLING_PTRCHECK(off) DisplayHelper : public RActionImpl<DisplayHelper<PrevNodeType>> {
private:
Expand Down
24 changes: 24 additions & 0 deletions tree/dataframe/inc/ROOT/RDF/InterfaceUtils.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@ struct Sum{};
struct Mean{};
struct Fill{};
struct StdDev{};
struct Skewness{};
struct Kurtosis{};
struct Display{};
struct Snapshot{};
struct Book{};
Expand Down Expand Up @@ -246,6 +248,28 @@ BuildAction(const ColumnNames_t &bl, const std::shared_ptr<double> &stdDeviation
return std::make_unique<Action_t>(Helper_t(stdDeviationV, nSlots), bl, prevNode, colRegister);
}

// Skewness action
template <typename ColType, typename PrevNodeType>
std::unique_ptr<RActionBase>
BuildAction(const ColumnNames_t &bl, const std::shared_ptr<double> &skewnessV, const unsigned int nSlots,
std::shared_ptr<PrevNodeType> prevNode, ActionTags::Skewness, const RColumnRegister &colRegister)
{
using Helper_t = SkewnessHelper;
using Action_t = RAction<Helper_t, PrevNodeType, TTraits::TypeList<ColType>>;
return std::make_unique<Action_t>(Helper_t(skewnessV, nSlots), bl, prevNode, colRegister);
}

// Kurtosis action
template <typename ColType, typename PrevNodeType>
std::unique_ptr<RActionBase>
BuildAction(const ColumnNames_t &bl, const std::shared_ptr<double> &kurtosisV, const unsigned int nSlots,
std::shared_ptr<PrevNodeType> prevNode, ActionTags::Kurtosis, const RColumnRegister &colRegister)
{
using Helper_t = KurtosisHelper;
using Action_t = RAction<Helper_t, PrevNodeType, TTraits::TypeList<ColType>>;
return std::make_unique<Action_t>(Helper_t(kurtosisV, nSlots), bl, prevNode, colRegister);
}

using displayHelperArgs_t = std::pair<size_t, std::shared_ptr<ROOT::RDF::RDisplay>>;

// Display action
Expand Down
55 changes: 54 additions & 1 deletion tree/dataframe/inc/ROOT/RDF/RInterface.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -2906,7 +2906,60 @@ public:
return CreateAction<RDFInternal::ActionTags::StdDev, T>(userColumns, stdDeviationV, stdDeviationV, fProxiedPtr);
}

// clang-format off
////////////////////////////////////////////////////////////////////////////
/// \brief Return the skewness of processed column values (*lazy action*).
/// \tparam T The type of the branch/column.
/// \param[in] columnName The name of the branch/column to be treated.
/// \return the skewness of the selected column wrapped in a RResultPtr.
///
/// If T is not specified, RDataFrame will infer it from the data and just-in-time compile the correct
/// template specialization of this method.
///
/// This action is *lazy*: upon invocation of this method the calculation is
/// booked but not executed. Also see RResultPtr.
///
/// ### Example usage:
/// ~~~{.cpp}
/// // Deduce column type (this invocation needs jitting internally)
/// auto skew = df.Skewness("values");
/// ~~~
///
/// \ingroup dataframe_actions
template <typename T = RDFDetail::RInferredType>
RResultPtr<double> Skewness(std::string_view columnName = "")
{
const auto userColumns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)});
auto skewnessV = std::make_shared<double>(0);
return CreateAction<RDFInternal::ActionTags::Skewness, T>(userColumns, skewnessV, skewnessV, fProxiedPtr);
}

////////////////////////////////////////////////////////////////////////////
/// \brief Return the excess kurtosis of processed column values (*lazy action*).
/// \tparam T The type of the branch/column.
/// \param[in] columnName The name of the branch/column to be treated.
/// \return the excess kurtosis of the selected column wrapped in a RResultPtr.
///
/// If T is not specified, RDataFrame will infer it from the data and just-in-time compile the correct
/// template specialization of this method.
///
/// This action is *lazy*: upon invocation of this method the calculation is
/// booked but not executed. Also see RResultPtr.
///
/// ### Example usage:
/// ~~~{.cpp}
/// // Deduce column type (this invocation needs jitting internally)
/// auto kurt = df.Kurtosis("values");
/// ~~~
///
/// \ingroup dataframe_actions
template <typename T = RDFDetail::RInferredType>
RResultPtr<double> Kurtosis(std::string_view columnName = "")
{
const auto userColumns = columnName.empty() ? ColumnNames_t() : ColumnNames_t({std::string(columnName)});
auto kurtosisV = std::make_shared<double>(0);
return CreateAction<RDFInternal::ActionTags::Kurtosis, T>(userColumns, kurtosisV, kurtosisV, fProxiedPtr);
}

////////////////////////////////////////////////////////////////////////////
/// \brief Return the sum of processed column values (*lazy action*).
/// \tparam T The type of the branch/column.
Expand Down
16 changes: 16 additions & 0 deletions tree/dataframe/test/dataframe_simple.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -1085,6 +1085,22 @@ TEST_P(RDFSimpleTests, ReadStdArray)
}
}

TEST_P(RDFSimpleTests, SkewnessAndKurtosis)
{
auto df = ROOT::RDataFrame(5);
auto df_x = df.Define("x", [](ULong64_t i) { return (double)i; }, {"rdfentry_"});

auto skewness = df_x.Skewness("x");
auto kurtosis = df_x.Kurtosis("x");

double s_val = *skewness;
double k_val = *kurtosis;

EXPECT_NEAR(s_val, 0.0, 1e-9);

EXPECT_LT(k_val, 0.0);
}

// run single-thread tests
INSTANTIATE_TEST_SUITE_P(Seq, RDFSimpleTests, ::testing::Values(false));

Expand Down