Fixed plotting bug when all samples' measurements have the same value #832

Open
wants to merge 11 commits into base: master
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.

## [Unreleased]
- MSRV bumped to 1.70
- Fixed plotting bug when all samples' measurements have the same value. See [#720](https://github.com/bheisler/criterion.rs/issues/720).

### Changed
- The `real_blackbox` feature no longer has any impact. Criterion always uses `std::hint::black_box()` now.
79 changes: 46 additions & 33 deletions src/plot/gnuplot_backend/distributions.rs
@@ -21,41 +21,54 @@ fn abs_distribution(
let mut ci_values = [ci.lower_bound, ci.upper_bound, estimate.point_estimate];
let unit = formatter.scale_values(typical, &mut ci_values);
let (lb, ub, point) = (ci_values[0], ci_values[1], ci_values[2]);

let start = lb - (ub - lb) / 9.;
let end = ub + (ub - lb) / 9.;
let mut scaled_xs: Vec<f64> = distribution.iter().cloned().collect();
let _ = formatter.scale_values(typical, &mut scaled_xs);
let scaled_xs_sample = Sample::new(&scaled_xs);
let (kde_xs, ys) = kde::sweep(scaled_xs_sample, KDE_POINTS, Some((start, end)));

// interpolate between two points of the KDE sweep to find the Y position at the point estimate.
let n_point = kde_xs
.iter()
.position(|&x| x >= point)
.unwrap_or(kde_xs.len() - 1)
.max(1); // Must be at least the second element or this will panic
let slope = (ys[n_point] - ys[n_point - 1]) / (kde_xs[n_point] - kde_xs[n_point - 1]);
let y_point = ys[n_point - 1] + (slope * (point - kde_xs[n_point - 1]));

let zero = iter::repeat(0);

let start = kde_xs
.iter()
.enumerate()
.find(|&(_, &x)| x >= lb)
.unwrap()
.0;
let end = kde_xs
.iter()
.enumerate()
.rev()
.find(|&(_, &x)| x <= ub)
.unwrap()
.0;
let len = end - start;
let (kde_xs, ys, start, len, y_point) =
// If lower_bound == upper_bound, all samples have the same value.
// Build substitute coordinates so the charts still render something meaningful.
if lb == ub {
let delta = if lb == 0. { 0.01 } else { lb * 0.01 };
// The first two (x, y) pairs draw the line/bar of the distribution at `x`;
// the last two draw the rectangle of the confidence interval.
let kde_xs = Vec::from([lb, lb, lb - delta, lb + delta]).into_boxed_slice();
let ys = Vec::from([0_f64, point, point, point]).into_boxed_slice();
// `start` and `len` mark the index range of the confidence-interval rectangle coordinates.
let start = 2_usize;
let len = 2_usize;
(kde_xs, ys, start, len, point)
} else {
let start = lb - (ub - lb) / 9.;
let end = ub + (ub - lb) / 9.;
let mut scaled_xs: Vec<f64> = distribution.iter().cloned().collect();
let _ = formatter.scale_values(typical, &mut scaled_xs);
let scaled_xs_sample = Sample::new(&scaled_xs);
let (kde_xs, ys) = kde::sweep(scaled_xs_sample, KDE_POINTS, Some((start, end)));

let kde_xs_sample = Sample::new(&kde_xs);
// interpolate between two points of the KDE sweep to find the Y position at the point estimate.
let n_point = kde_xs
.iter()
.position(|&x| x >= point)
.unwrap_or(kde_xs.len() - 1)
.max(1); // Must be at least the second element or this will panic
let slope = (ys[n_point] - ys[n_point - 1]) / (kde_xs[n_point] - kde_xs[n_point - 1]);
let y_point = ys[n_point - 1] + (slope * (point - kde_xs[n_point - 1]));

let start = kde_xs
.iter()
.enumerate()
.find(|&(_, &x)| x >= lb)
.unwrap()
.0;
let end = kde_xs
.iter()
.enumerate()
.rev()
.find(|&(_, &x)| x <= ub)
.unwrap()
.0;
let len = end - start;
(kde_xs, ys, start, len, y_point)
};

let mut figure = Figure::new();
figure
@@ -68,7 +81,7 @@
)))
.configure(Axis::BottomX, |a| {
a.set(Label(format!("Average time ({})", unit)))
.set(Range::Limits(kde_xs_sample.min(), kde_xs_sample.max()))
.set(Range::Auto)
})
.configure(Axis::LeftY, |a| a.set(Label("Density (a.u.)")))
.configure(Key, |k| {
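To make the new `lb == ub` branch easier to follow, here is a minimal standalone sketch of the coordinates it builds. The function name and free-standing form are hypothetical; only the constants and layout mirror the hunk above.

```rust
/// Hypothetical standalone sketch of the coordinates built when the
/// confidence interval collapses to a single value (lb == ub == point).
fn degenerate_distribution_coords(lb: f64, point: f64) -> (Vec<f64>, Vec<f64>, usize, usize) {
    // Give the collapsed confidence interval a visible width: 1% of the
    // value, or an absolute 0.01 when the value itself is zero.
    let delta = if lb == 0.0 { 0.01 } else { lb * 0.01 };
    // The first two (x, y) pairs draw the vertical line/bar of the
    // distribution at `lb`; the last two are the corners of the
    // confidence-interval rectangle.
    let xs = vec![lb, lb, lb - delta, lb + delta];
    let ys = vec![0.0, point, point, point];
    // The rectangle coordinates start at index 2 and span 2 points.
    (xs, ys, 2, 2)
}
```

With lb == point == 5.0, for instance, this returns xs ≈ [5.0, 5.0, 4.95, 5.05] and ys = [0.0, 5.0, 5.0, 5.0]: a single bar at the measured value plus a narrow rectangle standing in for the collapsed confidence interval.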
3 changes: 1 addition & 2 deletions src/plot/gnuplot_backend/pdf.rs
@@ -237,7 +237,6 @@ pub(crate) fn pdf_small(
let mean = scaled_avg_times.mean();

let (xs, ys, mean_y) = kde::sweep_and_estimate(scaled_avg_times, KDE_POINTS, None, mean);
let xs_ = Sample::new(&xs);
let ys_ = Sample::new(&ys);

let y_limit = ys_.max() * 1.1;
@@ -249,7 +248,7 @@
.set(size.unwrap_or(SIZE))
.configure(Axis::BottomX, |a| {
a.set(Label(format!("Average time ({})", unit)))
.set(Range::Limits(xs_.min(), xs_.max()))
.set(Range::Auto)
})
.configure(Axis::LeftY, |a| {
a.set(Label("Density (a.u.)"))
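The switch from `Range::Limits` to `Range::Auto` in these plots sidesteps the degenerate axis range that appears when every measurement is identical, the same condition the old routine.rs workaround tried to avoid. The helper below is purely illustrative (it is not part of criterion-plot); it only shows the failure mode.

```rust
// Illustrative sketch only: the explicit axis limits the old code derived
// from the sample, and why they break for constant samples.
fn explicit_axis_limits(xs: &[f64]) -> Option<(f64, f64)> {
    let min = xs.iter().cloned().fold(f64::INFINITY, f64::min);
    let max = xs.iter().cloned().fold(f64::NEG_INFINITY, f64::max);
    // When every sample has the same value, min == max and the explicit
    // range has zero width. Returning None here stands in for letting the
    // backend pick and pad the range itself (Range::Auto).
    if min < max { Some((min, max)) } else { None }
}
```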
72 changes: 44 additions & 28 deletions src/plot/plotters_backend/distributions.rs
@@ -19,36 +19,52 @@ fn abs_distribution(
let unit = formatter.scale_values(typical, &mut ci_values);
let (lb, ub, point) = (ci_values[0], ci_values[1], ci_values[2]);

let start = lb - (ub - lb) / 9.;
let end = ub + (ub - lb) / 9.;
let mut scaled_xs: Vec<f64> = distribution.iter().cloned().collect();
let _ = formatter.scale_values(typical, &mut scaled_xs);
let scaled_xs_sample = Sample::new(&scaled_xs);
let (kde_xs, ys) = kde::sweep(scaled_xs_sample, KDE_POINTS, Some((start, end)));
let (kde_xs, ys, start, len, y_point) =
// If lower_bound == upper_bound, all samples have the same value.
// Build substitute coordinates so the charts still render something meaningful.
if lb == ub {
let delta = if lb == 0. { 0.01 } else { lb * 0.01 };
// The first two (x, y) pairs draw the line/bar of the distribution at `x`;
// the last two draw the rectangle of the confidence interval.
let kde_xs = Vec::from([lb, lb, lb - delta, lb + delta]).into_boxed_slice();
let ys = Vec::from([0_f64, point, point, point]).into_boxed_slice();
// `start` and `len` mark the index range of the confidence-interval rectangle coordinates.
let start = 2_usize;
let len = 2_usize;
(kde_xs, ys, start, len, point)
} else {
let start = lb - (ub - lb) / 9.;
let end = ub + (ub - lb) / 9.;
let mut scaled_xs: Vec<f64> = distribution.iter().cloned().collect();
let _ = formatter.scale_values(typical, &mut scaled_xs);
let scaled_xs_sample = Sample::new(&scaled_xs);
let (kde_xs, ys) = kde::sweep(scaled_xs_sample, KDE_POINTS, Some((start, end)));

// interpolate between two points of the KDE sweep to find the Y position at the point estimate.
let n_point = kde_xs
.iter()
.position(|&x| x >= point)
.unwrap_or(kde_xs.len() - 1)
.max(1); // Must be at least the second element or this will panic
let slope = (ys[n_point] - ys[n_point - 1]) / (kde_xs[n_point] - kde_xs[n_point - 1]);
let y_point = ys[n_point - 1] + (slope * (point - kde_xs[n_point - 1]));
// interpolate between two points of the KDE sweep to find the Y position at the point estimate.
let n_point = kde_xs
.iter()
.position(|&x| x >= point)
.unwrap_or(kde_xs.len() - 1)
.max(1); // Must be at least the second element or this will panic
let slope = (ys[n_point] - ys[n_point - 1]) / (kde_xs[n_point] - kde_xs[n_point - 1]);
let y_point = ys[n_point - 1] + (slope * (point - kde_xs[n_point - 1]));

let start = kde_xs
.iter()
.enumerate()
.find(|&(_, &x)| x >= lb)
.unwrap()
.0;
let end = kde_xs
.iter()
.enumerate()
.rev()
.find(|&(_, &x)| x <= ub)
.unwrap()
.0;
let len = end - start;
let start = kde_xs
.iter()
.enumerate()
.find(|&(_, &x)| x >= lb)
.unwrap()
.0;
let end = kde_xs
.iter()
.enumerate()
.rev()
.find(|&(_, &x)| x <= ub)
.unwrap()
.0;
let len = end - start;
(kde_xs, ys, start, len, y_point)
};

let kde_xs_sample = Sample::new(&kde_xs);

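The interpolation block duplicated in both backends can be read as the following free-standing sketch (hypothetical function name): it finds the y position on the KDE curve at the point estimate by linearly interpolating between the two surrounding sweep points.

```rust
// Hypothetical standalone version of the interpolation used above to place
// the point-estimate marker on the KDE curve. Assumes at least two sweep points.
fn y_at_point(kde_xs: &[f64], ys: &[f64], point: f64) -> f64 {
    // Index of the first sweep x at or beyond `point`, clamped to at least 1
    // so that `i - 1` is always a valid index.
    let i = kde_xs
        .iter()
        .position(|&x| x >= point)
        .unwrap_or(kde_xs.len() - 1)
        .max(1);
    let slope = (ys[i] - ys[i - 1]) / (kde_xs[i] - kde_xs[i - 1]);
    ys[i - 1] + slope * (point - kde_xs[i - 1])
}
```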
3 changes: 1 addition & 2 deletions src/routine.rs
@@ -104,8 +104,7 @@ pub(crate) trait Routine<M: Measurement, T: ?Sized> {
// Early exit for extremely long-running benchmarks:
if time_start.elapsed() > maximum_bench_duration {
let iters = vec![n as f64, n as f64].into_boxed_slice();
// prevent gnuplot bug when all values are equal
let elapsed = vec![t_prev, t_prev + 0.000001].into_boxed_slice();
let elapsed = vec![t_prev, t_prev].into_boxed_slice();
return (ActualSamplingMode::Flat, iters, elapsed);
}

72 changes: 61 additions & 11 deletions src/stats/univariate/kde/mod.rs
@@ -17,6 +17,7 @@ where
bandwidth: A,
kernel: K,
sample: &'a Sample<A>,
are_samples_the_same: bool,
}

impl<'a, A, K> Kde<'a, A, K>
@@ -27,10 +28,14 @@ where
/// Creates a new kernel density estimator from the `sample`, using a kernel and estimating
/// the bandwidth using the method `bw`
pub fn new(sample: &'a Sample<A>, kernel: K, bw: Bandwidth) -> Kde<'a, A, K> {
// We check if all samples have the same value just once
// during initialization and store it in `are_samples_the_same`.
// This is used in the `estimate(x)` method.
Kde {
bandwidth: bw.estimate(sample),
kernel,
sample,
are_samples_the_same: sample.min() == sample.max(),
}
}

@@ -56,16 +61,22 @@

/// Estimates the probability density of `x`
pub fn estimate(&self, x: A) -> A {
let _0 = A::cast(0);
let slice = self.sample;
let h = self.bandwidth;
let n = A::cast(slice.len());

let sum = slice
.iter()
.fold(_0, |acc, &x_i| acc + self.kernel.evaluate((x - x_i) / h));

sum / (h * n)
if self.are_samples_the_same {
// The probability density at `x` is taken to be 1
// when all samples equal `x`.
A::cast(1)
} else {
let _0 = A::cast(0);
let slice = self.sample;
let h = self.bandwidth;
let n = A::cast(slice.len());

let sum = slice
.iter()
.fold(_0, |acc, &x_i| acc + self.kernel.evaluate((x - x_i) / h));

sum / (h * n)
}
}
}
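For context, `estimate()` computes the standard kernel density estimate f(x) = (1 / (n * h)) * Σ K((x - x_i) / h). A minimal sketch, as a hypothetical free function specialised to f64 and a Gaussian kernel:

```rust
// Minimal sketch of the kernel density estimate computed in the else branch
// above, specialised to f64 and a Gaussian kernel.
fn gaussian_kde_estimate(sample: &[f64], h: f64, x: f64) -> f64 {
    let n = sample.len() as f64;
    // Standard Gaussian kernel K(u) = exp(-u^2 / 2) / sqrt(2 * pi).
    let gaussian = |u: f64| (-0.5 * u * u).exp() / (2.0 * std::f64::consts::PI).sqrt();
    sample.iter().map(|&x_i| gaussian((x - x_i) / h)).sum::<f64>() / (h * n)
}
```

When every x_i equals x, each kernel term is K(0) = 1 / sqrt(2π) ≈ 0.399, so the estimate collapses to roughly 0.399 / h (about 399 for the 0.001 fallback bandwidth introduced below), which is presumably why the constant-sample case is short-circuited to a flat density of 1 for plotting.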

@@ -84,7 +95,19 @@ impl Bandwidth {
let n = A::cast(sample.len());
let sigma = sample.std_dev(None);

sigma * (factor / n).powf(exponent)
// When all samples have the same value (rare, but not impossible),
// the standard deviation is zero and Silverman's method would
// produce a zero bandwidth.
// The bandwidth must always be a positive (non-zero) number, so in
// that case we fall back to a very small bandwidth, which creates a
// sharp peak at the single data point and accurately reflects that
// all of the data is concentrated at that single value.
if sigma.is_zero() {
A::cast(0.001)
} else {
sigma * (factor / n).powf(exponent)
}
}
}
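Silverman's rule of thumb computes the bandwidth as h = σ · (4 / (3n))^(1/5). The sketch below is a hypothetical standalone version of the logic above with its new zero-variance fallback; the real `factor` and `exponent` constants are defined outside the visible hunk, so the 4/3 and 1/5 here are the textbook values.

```rust
// Hypothetical standalone sketch of Silverman's rule with the zero-variance
// fallback introduced in this PR; assumes at least two samples.
fn silverman_bandwidth(sample: &[f64]) -> f64 {
    let n = sample.len() as f64;
    let mean = sample.iter().sum::<f64>() / n;
    // Sample standard deviation (Bessel-corrected).
    let sigma =
        (sample.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / (n - 1.0)).sqrt();
    if sigma == 0.0 {
        // All samples identical: clamp to a small positive bandwidth so the
        // KDE keeps a sharp peak at the single value instead of dividing by zero.
        0.001
    } else {
        // Textbook Silverman constants; the crate keeps them in `factor` / `exponent`.
        sigma * (4.0 / (3.0 * n)).powf(0.2)
    }
}
```

For a constant sample such as `&[1.0, 1.0]` this returns 0.001, matching the new `positive_bandwidth` test further down.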
}
Expand All @@ -102,6 +125,33 @@ macro_rules! test {
use crate::stats::univariate::kde::{Bandwidth, Kde};
use crate::stats::univariate::Sample;

// When every sample value equals some X, the estimated probability
// density at X should be 1.
#[test]
fn constant_sample_measurements() {
const CONSTANT_MEASUREMENT: $ty = 1.;
let measurements = &[CONSTANT_MEASUREMENT, CONSTANT_MEASUREMENT];
let sample = Sample::new(measurements);
let kde = Kde::new(sample, Gaussian, Bandwidth::Silverman);
for x in measurements {
let prob_density = kde.estimate(*x);
assert_eq!(prob_density, 1., "The probability density of X when all samples are X should be 1.");
}
}

// The bandwidth should be a positive (non-zero) number, even when
// the samples are all the same (constant measurements).
// It's very unlikely that all samples will have the same value, but
// not impossible, so we should handle that case gracefully.
#[test]
fn positive_bandwidth() {
const CONSTANT_MEASUREMENT: $ty = 1.;
let sample = Sample::new(&[CONSTANT_MEASUREMENT, CONSTANT_MEASUREMENT]);
let bandwidth_estimator = Bandwidth::Silverman;
let h = bandwidth_estimator.estimate(&sample);
assert!(h > 0., "The bandwidth should be > 0, even when all samples have the same value, but it was 0.");
}

// The [-inf inf] integral of the estimated PDF should be one
quickcheck! {
fn integral(size: u8, start: u8) -> TestResult {