Skip to content

Commit

Permalink
[util] Kolmogorov-Smirnov test approximated over T-Digests (grpc#38245)
Browse files Browse the repository at this point in the history
This will be used to detect change/no-change over some statistics in a future iteration of the chaotic good autoscaler.

Closes grpc#38245

COPYBARA_INTEGRATE_REVIEW=grpc#38245 from ctiller:kolmogorov-smirnov 9052420
PiperOrigin-RevId: 704358901
  • Loading branch information
ctiller authored and copybara-github committed Dec 9, 2024
1 parent 8df11e2 commit f36c6ae
Show file tree
Hide file tree
Showing 5 changed files with 212 additions and 0 deletions.
12 changes: 12 additions & 0 deletions fuzztest/core/util/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,15 @@ grpc_fuzz_test(
],
deps = ["//src/core:tdigest"],
)

# Fuzz tests for the Kolmogorov-Smirnov helpers provided by
# //src/core:kolmogorov_smirnov.
grpc_fuzz_test(
name = "kolmogorov_smirnov_fuzztest",
srcs = ["kolmogorov_smirnov_fuzztest.cc"],
external_deps = [
# NOTE(review): kolmogorov_smirnov_fuzztest.cc does not appear to include
# any absl variant header - confirm whether this dependency is needed.
"absl/types:variant",
"fuzztest",
"fuzztest_main",
"gtest",
],
deps = ["//src/core:kolmogorov_smirnov"],
)
96 changes: 96 additions & 0 deletions fuzztest/core/util/kolmogorov_smirnov_fuzztest.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
// Copyright 2024 gRPC authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <string>
#include <utility>
#include <vector>

#include "fuzztest/fuzztest.h"
#include "gtest/gtest.h"
#include "src/core/util/kolmogorov_smirnov.h"

using fuzztest::VectorOf;
using fuzztest::InRange;

namespace grpc_core {
namespace {

// Property: for fixed sample counts, raising the significance level by `delta`
// must strictly lower the threshold returned by KolmogorovSmirnovThreshold.
void TestThresholdSensitivityAlpha(double alpha, double a_count, double b_count,
                                   double delta) {
  const double threshold_at_alpha =
      KolmogorovSmirnovThreshold(alpha, a_count, b_count);
  const double threshold_at_raised_alpha =
      KolmogorovSmirnovThreshold(alpha + delta, a_count, b_count);
  EXPECT_GT(threshold_at_alpha, threshold_at_raised_alpha);
}
FUZZ_TEST(KolmogorovSmirnov, TestThresholdSensitivityAlpha)
    .WithDomains(InRange(0.001, 0.2), InRange(1.0, 100000.0),
                 InRange(1.0, 100000.0), InRange(0.001, 0.1));

// Property: the threshold does not depend on which sample is called "a" and
// which is called "b"; swapping the two counts must give (nearly) the same
// value.
void TestThresholdSensitivityReversedCount(double alpha, double a_count,
                                           double b_count) {
  const double forward = KolmogorovSmirnovThreshold(alpha, a_count, b_count);
  const double swapped = KolmogorovSmirnovThreshold(alpha, b_count, a_count);
  EXPECT_NEAR(forward, swapped, 0.00001);
}
FUZZ_TEST(KolmogorovSmirnov, TestThresholdSensitivityReversedCount)
    .WithDomains(InRange(0.001, 0.2), InRange(1.0, 100000.0),
                 InRange(1.0, 100000.0));

void TestThresholdSensitivityCount(double alpha, double a_count, double b_count, double delta) {
EXPECT_LT(
KolmogorovSmirnovThreshold(alpha, a_count, b_count),
KolmogorovSmirnovThreshold(alpha, a_count + delta, b_count)
);
}
FUZZ_TEST(KolmogorovSmirnov, TestThresholdSensitivityCount)
.WithDomains(InRange(0.001, 0.2), InRange(1.0, 100000.0), InRange(1.0, 100000.0), InRange(1.0, 1000.0));

// Computes the exact two-sample Kolmogorov-Smirnov statistic: the largest
// absolute difference between the empirical CDFs of `a` and `b`.
//
// Both vectors are sorted in place. Returns 0 if either vector is empty.
double ExactStatistic(std::vector<double>& a, std::vector<double>& b) {
  std::sort(a.begin(), a.end());
  std::sort(b.begin(), b.end());
  const double a_size = static_cast<double>(a.size());
  const double b_size = static_cast<double>(b.size());
  double max_diff = 0.0;
  size_t i = 0;
  size_t j = 0;
  while (i < a.size() && j < b.size()) {
    // Evaluate both empirical CDFs at the next distinct value x. Every sample
    // equal to x (in either vector) is consumed before the gap is measured, so
    // ties never produce a spurious intermediate difference.
    const double x = std::min(a[i], b[j]);
    while (i < a.size() && a[i] <= x) ++i;
    while (j < b.size() && b[j] <= x) ++j;
    const double a_cdf = static_cast<double>(i) / a_size;
    const double b_cdf = static_cast<double>(j) / b_size;
    max_diff = std::max(max_diff, std::abs(a_cdf - b_cdf));
  }
  // Once one vector is exhausted its CDF is 1 and the other CDF can only move
  // toward 1, so the gap cannot grow past what was already recorded.
  return max_diff;
}

// Property: the statistic approximated from two TDigests stays within 0.5 of
// the statistic computed directly from the raw samples.
void TestStatistic(std::vector<double> a, std::vector<double> b,
                   double a_compression, double b_compression,
                   uint32_t num_samples) {
  // Feed every raw sample into a digest built with the fuzzed compression.
  TDigest a_digest(a_compression);
  for (const double sample : a) {
    a_digest.Add(sample);
  }
  TDigest b_digest(b_compression);
  for (const double sample : b) {
    b_digest.Add(sample);
  }
  const double approximate =
      KolmogorovSmirnovStatistic(a_digest, b_digest, num_samples);
  // ExactStatistic sorts `a` and `b` in place; both are local copies here.
  const double exact = ExactStatistic(a, b);
  EXPECT_NEAR(approximate, exact, 0.5);
}
FUZZ_TEST(KolmogorovSmirnov, TestStatistic)
    .WithDomains(VectorOf(InRange(0.0, 1000.0)).WithMinSize(100),
                 VectorOf(InRange(0.0, 1000.0)).WithMinSize(100),
                 InRange(50.0, 1000.0), InRange(50.0, 1000.0),
                 InRange(10, 100));

}
}
14 changes: 14 additions & 0 deletions src/core/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -4021,6 +4021,20 @@ grpc_cc_library(
deps = ["//:gpr_platform"],
)

# Kolmogorov-Smirnov test approximated over TDigests; depends only on the
# tdigest target.
grpc_cc_library(
name = "kolmogorov_smirnov",
srcs = [
"util/kolmogorov_smirnov.cc",
],
hdrs = [
"util/kolmogorov_smirnov.h",
],
external_deps = [
],
language = "c++",
deps = ["tdigest"],
)

grpc_cc_library(
name = "certificate_provider_factory",
hdrs = [
Expand Down
48 changes: 48 additions & 0 deletions src/core/util/kolmogorov_smirnov.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
// Copyright 2024 gRPC authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "src/core/util/kolmogorov_smirnov.h"

namespace grpc_core {

// Returns true when the approximated Kolmogorov-Smirnov statistic of `a` and
// `b` is strictly greater than the threshold computed for `alpha` and the two
// digests' sample counts.
bool KolmogorovSmirnovTest(TDigest& a, TDigest& b, double alpha,
                           uint32_t num_samples) {
  const double statistic = KolmogorovSmirnovStatistic(a, b, num_samples);
  const double threshold =
      KolmogorovSmirnovThreshold(alpha, a.Count(), b.Count());
  return statistic > threshold;
}

// Approximates the Kolmogorov-Smirnov statistic of two digests by sampling
// both CDFs at `num_samples` evenly spaced points and returning the largest
// absolute difference seen. Returns 0 when `num_samples` is 0.
double KolmogorovSmirnovStatistic(TDigest& a, TDigest& b,
                                  uint32_t num_samples) {
  const double range_begin = std::min(a.Min(), b.Min());
  const double range_end = std::max(a.Max(), b.Max());
  // Dividing by num_samples + 1 keeps every sample point strictly inside the
  // range: at range_end both CDFs are known to be 1, so a sample there would
  // be wasted. All samples land where the CDFs can actually differ.
  const double spacing = (range_end - range_begin) / (num_samples + 1);
  double largest_gap = 0;
  for (size_t k = 0; k < num_samples; ++k) {
    const double x = range_begin + (k + 1) * spacing;
    const double cdf_a = a.Cdf(x);
    const double cdf_b = b.Cdf(x);
    const double gap = std::abs(cdf_a - cdf_b);
    if (gap > largest_gap) {
      largest_gap = gap;
    }
  }
  return largest_gap;
}

// Returns the two-sample Kolmogorov-Smirnov critical value
//   c(alpha) * sqrt((n + m) / (n * m)),  with c(alpha) = sqrt(-ln(alpha/2) / 2)
// where n = a_count and m = b_count are the sizes of the two samples.
//
// A statistic strictly greater than this value rejects, at significance level
// `alpha`, the hypothesis that both samples come from the same distribution.
// The value shrinks as either sample grows (more data makes smaller CDF gaps
// significant) and grows as `alpha` shrinks.
//
// If either count is zero the result is infinite or NaN, so no statistic
// compares greater than it.
double KolmogorovSmirnovThreshold(double alpha, double a_count,
                                  double b_count) {
  const double c_alpha_squared = -0.5 * std::log(alpha / 2);
  const double sample_scaling = (a_count + b_count) / (a_count * b_count);
  return std::sqrt(c_alpha_squared * sample_scaling);
}

} // namespace grpc_core
42 changes: 42 additions & 0 deletions src/core/util/kolmogorov_smirnov.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
// Copyright 2024 gRPC authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef GRPC_SRC_CORE_UTIL_KOLMOGOROV_SMIRNOV_H
#define GRPC_SRC_CORE_UTIL_KOLMOGOROV_SMIRNOV_H

#include "src/core/util/tdigest.h"

namespace grpc_core {

// Perform a Kolmogorov-Smirnov test to determine whether two TDigests are
// significantly different (returns true) or not (returns false).
//
// alpha is a real number between 0 and 1 giving the significance level of
// the test; a smaller alpha yields a larger threshold, so a bigger CDF gap is
// needed before the digests are reported as different.
//
// num_samples is the number of CDF samples to take from each TDigest.
//
// Computational complexity is roughly num_samples * (a.NumCentroids() +
// b.NumCentroids()).
bool KolmogorovSmirnovTest(TDigest& a, TDigest& b, double alpha,
uint32_t num_samples = 10);

// Approximate the Kolmogorov-Smirnov statistic between two TDigests.
//
// Both CDFs are evaluated at num_samples evenly spaced points lying strictly
// between the smaller of the two Min() values and the larger of the two Max()
// values; the result is the largest absolute difference between a.Cdf(x) and
// b.Cdf(x) over those points. Returns 0 when num_samples is 0.
double KolmogorovSmirnovStatistic(TDigest& a, TDigest& b,
uint32_t num_samples = 10);

// Compute the threshold that KolmogorovSmirnovTest compares the statistic
// against, given the significance level alpha and the sample counts of the
// two distributions being compared. Swapping a_count and b_count does not
// change the result.
double KolmogorovSmirnovThreshold(double alpha, double a_count, double b_count);

} // namespace grpc_core

#endif // GRPC_SRC_CORE_UTIL_KOLMOGOROV_SMIRNOV_H

0 comments on commit f36c6ae

Please sign in to comment.