-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[util] Kolmogorov-Smirnov test approximated over T-Digests (grpc#38245)
This will be used to detect change/no-change over some statistics in a future iteration of the chaotic good autoscaler. Closes grpc#38245 COPYBARA_INTEGRATE_REVIEW=grpc#38245 from ctiller:kolmogorov-smirnov 9052420 PiperOrigin-RevId: 704358901
- Loading branch information
1 parent
8df11e2
commit f36c6ae
Showing
5 changed files
with
212 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
// Copyright 2024 gRPC authors. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
#include <string> | ||
#include <utility> | ||
#include <vector> | ||
|
||
#include "fuzztest/fuzztest.h" | ||
#include "gtest/gtest.h" | ||
#include "src/core/util/kolmogorov_smirnov.h" | ||
|
||
using fuzztest::VectorOf; | ||
using fuzztest::InRange; | ||
|
||
namespace grpc_core { | ||
namespace { | ||
|
||
void TestThresholdSensitivityAlpha(double alpha, double a_count, double b_count, double delta) { | ||
EXPECT_GT( | ||
KolmogorovSmirnovThreshold(alpha, a_count, b_count), | ||
KolmogorovSmirnovThreshold(alpha + delta, a_count, b_count) | ||
); | ||
} | ||
FUZZ_TEST(KolmogorovSmirnov, TestThresholdSensitivityAlpha) | ||
.WithDomains(InRange(0.001, 0.2), InRange(1.0, 100000.0), InRange(1.0, 100000.0), InRange(0.001, 0.1)); | ||
|
||
void TestThresholdSensitivityReversedCount(double alpha, double a_count, double b_count) { | ||
EXPECT_NEAR( | ||
KolmogorovSmirnovThreshold(alpha, a_count, b_count), | ||
KolmogorovSmirnovThreshold(alpha, b_count, a_count), | ||
0.00001 | ||
); | ||
} | ||
FUZZ_TEST(KolmogorovSmirnov, TestThresholdSensitivityReversedCount) | ||
.WithDomains(InRange(0.001, 0.2), InRange(1.0, 100000.0), InRange(1.0, 100000.0)); | ||
|
||
void TestThresholdSensitivityCount(double alpha, double a_count, double b_count, double delta) { | ||
EXPECT_LT( | ||
KolmogorovSmirnovThreshold(alpha, a_count, b_count), | ||
KolmogorovSmirnovThreshold(alpha, a_count + delta, b_count) | ||
); | ||
} | ||
FUZZ_TEST(KolmogorovSmirnov, TestThresholdSensitivityCount) | ||
.WithDomains(InRange(0.001, 0.2), InRange(1.0, 100000.0), InRange(1.0, 100000.0), InRange(1.0, 1000.0)); | ||
|
||
// Computes the exact two-sample Kolmogorov-Smirnov statistic: the largest
// absolute gap between the empirical CDFs of `a` and `b`.
//
// Both vectors are sorted in place (callers see the reordering). Returns 0 if
// either vector is empty.
double ExactStatistic(std::vector<double>& a, std::vector<double>& b) {
  std::sort(a.begin(), a.end());
  std::sort(b.begin(), b.end());
  const double a_size = static_cast<double>(a.size());
  const double b_size = static_cast<double>(b.size());
  double max_diff = 0.0;
  size_t i = 0;
  size_t j = 0;
  while (i < a.size() && j < b.size()) {
    // Evaluate both empirical CDFs at the next distinct sample value. All
    // entries equal to that value are consumed from BOTH inputs before
    // measuring, so ties never produce a spurious mid-jump gap.
    const double x = std::min(a[i], b[j]);
    while (i < a.size() && a[i] <= x) ++i;
    while (j < b.size() && b[j] <= x) ++j;
    // Measured after advancing, so the gap at the last consumed value is
    // included as well.
    const double diff =
        std::abs(static_cast<double>(i) / a_size -
                 static_cast<double>(j) / b_size);
    max_diff = std::max(max_diff, diff);
  }
  // Once one input is exhausted its CDF is 1 and the other's only rises
  // towards 1, so the gap cannot grow any further.
  return max_diff;
}
|
||
void TestStatistic(std::vector<double> a, std::vector<double> b, double a_compression, double b_compression, uint32_t num_samples) { | ||
TDigest a_digest(a_compression); | ||
for (double x : a) { | ||
a_digest.Add(x); | ||
} | ||
TDigest b_digest(b_compression); | ||
for (double x : b) { | ||
b_digest.Add(x); | ||
} | ||
EXPECT_NEAR( | ||
KolmogorovSmirnovStatistic(a_digest, b_digest, num_samples), | ||
ExactStatistic(a, b), | ||
0.5 | ||
); | ||
} | ||
FUZZ_TEST(KolmogorovSmirnov, TestStatistic) | ||
.WithDomains(VectorOf(InRange(0.0, 1000.0)).WithMinSize(100), VectorOf(InRange(0.0, 1000.0)).WithMinSize(100), InRange(50.0, 1000.0), InRange(50.0, 1000.0), InRange(10, 100)); | ||
|
||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
// Copyright 2024 gRPC authors. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
#include "src/core/util/kolmogorov_smirnov.h" | ||
|
||
namespace grpc_core { | ||
|
||
// Reports whether the sampled CDF gap between the two digests exceeds the
// rejection threshold for significance level `alpha` and the digests' sample
// counts.
bool KolmogorovSmirnovTest(TDigest& a, TDigest& b, double alpha,
                           uint32_t num_samples) {
  const double statistic = KolmogorovSmirnovStatistic(a, b, num_samples);
  const double threshold =
      KolmogorovSmirnovThreshold(alpha, a.Count(), b.Count());
  return statistic > threshold;
}
|
||
// Approximates the Kolmogorov-Smirnov statistic of two digests by sampling
// both CDFs at `num_samples` evenly spaced points and returning the largest
// absolute difference observed.
double KolmogorovSmirnovStatistic(TDigest& a, TDigest& b,
                                  uint32_t num_samples) {
  const double range_lo = std::min(a.Min(), b.Min());
  const double range_hi = std::max(a.Max(), b.Max());
  // Dividing by num_samples + 1 keeps every sample strictly inside the range:
  // both CDFs are known to be 1 at range_hi, so a sample there would be
  // wasted. All samples go where the curves can actually differ.
  const double spacing = (range_hi - range_lo) / (num_samples + 1);
  double widest_gap = 0;
  for (size_t k = 0; k < num_samples; ++k) {
    const double x = range_lo + (k + 1) * spacing;
    const double cdf_a = a.Cdf(x);
    const double cdf_b = b.Cdf(x);
    const double gap = std::abs(cdf_a - cdf_b);
    if (gap > widest_gap) {
      widest_gap = gap;
    }
  }
  return widest_gap;
}
|
||
// Returns the two-sample Kolmogorov-Smirnov critical value
//
//   D_crit = sqrt(-0.5 * ln(alpha / 2)) * sqrt((n + m) / (n * m))
//
// where n = a_count and m = b_count. A statistic above this value rejects the
// hypothesis that both samples share a distribution at significance level
// alpha.
//
// alpha must lie in (0, 2) for the logarithm to be negative (in practice it is
// a significance level in (0, 1)); both counts must be positive.
double KolmogorovSmirnovThreshold(double alpha, double a_count,
                                  double b_count) {
  // (n + m) / (n * m) == 1/n + 1/m: the threshold tightens as either sample
  // grows. Using the reciprocal would make it grow like sqrt(n) and exceed 1,
  // the largest value a CDF difference can take.
  const double sample_scaling = (a_count + b_count) / (a_count * b_count);
  return std::sqrt(-0.5 * std::log(alpha / 2) * sample_scaling);
}
|
||
} // namespace grpc_core |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
// Copyright 2024 gRPC authors. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
#ifndef GRPC_SRC_CORE_UTIL_KOLMOGOROV_SMIRNOV_H | ||
#define GRPC_SRC_CORE_UTIL_KOLMOGOROV_SMIRNOV_H | ||
|
||
#include "src/core/util/tdigest.h" | ||
|
||
namespace grpc_core { | ||
|
||
// Perform a Kolmogorov-Smirnov test to determine if two TDigests are | ||
// significantly different (returns true), or not (returns false). | ||
// | ||
// The result is true exactly when KolmogorovSmirnovStatistic(a, b, | ||
// num_samples) is greater than KolmogorovSmirnovThreshold(alpha, a.Count(), | ||
// b.Count()). | ||
// | ||
// alpha is a real-valued number between 0 and 1, representing the | ||
// significance level of the test. | ||
// | ||
// num_samples is the number of cdf samples to take from each TDigest. | ||
// | ||
// Computational complexity roughly num_samples * (a.NumCentroids() + | ||
// b.NumCentroids()). | ||
bool KolmogorovSmirnovTest(TDigest& a, TDigest& b, double alpha, | ||
uint32_t num_samples = 10); | ||
|
||
// Returns the largest absolute difference between a.Cdf(x) and b.Cdf(x) over | ||
// num_samples points x, spaced evenly between the smaller of the two Min() | ||
// values and the larger of the two Max() values (both endpoints excluded). | ||
// Returns 0 when num_samples is 0. | ||
double KolmogorovSmirnovStatistic(TDigest& a, TDigest& b, | ||
uint32_t num_samples = 10); | ||
|
||
// Returns the rejection threshold for significance level alpha and the two | ||
// sample counts. KolmogorovSmirnovTest reports a difference when the | ||
// statistic is greater than this value. | ||
double KolmogorovSmirnovThreshold(double alpha, double a_count, double b_count); | ||
|
||
} // namespace grpc_core | ||
|
||
#endif // GRPC_SRC_CORE_UTIL_KOLMOGOROV_SMIRNOV_H |