Skip to content

Commit 58d4b2b

Browse files
davinci26, PardhuKonakanchi, eshitachandwani
authored
outlierdetection: add metrics specified in gRFC A91 (#8644)
Implements gRFC A91: https://github.com/grpc/proposal/blob/master/A91-outlier-detection-metrics.md ### Notable implementation details * `grpc.lb.backend_service` is not implemented yet (marked as optional in the gRFC) * modifies the tests to make sure we can cover all the cases for `enforced`/`unenforced` without repeating the test setup. RELEASE NOTES: * outlierdetection: add metrics for enforced (grpc.lb.outlier_detection.ejections_enforced) and unenforced (grpc.lb.outlier_detection.ejections_unenforced) outlier ejections. --------- Signed-off-by: sotiris <[email protected]> Co-authored-by: Pardhu Konakanchi <[email protected]> Co-authored-by: Pardhu Konakanchi <[email protected]> Co-authored-by: eshitachandwani <[email protected]>
1 parent 2b35fa5 commit 58d4b2b

File tree

2 files changed

+389
-246
lines changed

2 files changed

+389
-246
lines changed

internal/xds/balancer/outlierdetection/balancer.go

Lines changed: 60 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ import (
3333

3434
"google.golang.org/grpc/balancer"
3535
"google.golang.org/grpc/connectivity"
36+
estats "google.golang.org/grpc/experimental/stats"
3637
"google.golang.org/grpc/internal/balancer/gracefulswitch"
3738
"google.golang.org/grpc/internal/buffer"
3839
"google.golang.org/grpc/internal/channelz"
@@ -52,6 +53,24 @@ var (
5253
// Name is the name of the outlier detection balancer.
5354
const Name = "outlier_detection_experimental"
5455

56+
var (
57+
ejectionsEnforcedMetric = estats.RegisterInt64Count(estats.MetricDescriptor{
58+
Name: "grpc.lb.outlier_detection.ejections_enforced",
59+
Description: "EXPERIMENTAL. Number of outlier ejections enforced by detection method",
60+
Unit: "{ejection}",
61+
Labels: []string{"grpc.target", "grpc.lb.outlier_detection.detection_method"},
62+
Default: false,
63+
})
64+
65+
ejectionsUnenforcedMetric = estats.RegisterInt64Count(estats.MetricDescriptor{
66+
Name: "grpc.lb.outlier_detection.ejections_unenforced",
67+
Description: "EXPERIMENTAL. Number of unenforced outlier ejections due to either `max_ejection_percentage` or `enforcement_percentage`",
68+
Unit: "{ejection}",
69+
Labels: []string{"grpc.target", "grpc.lb.outlier_detection.detection_method", "grpc.lb.outlier_detection.unenforced_reason"},
70+
Default: false,
71+
})
72+
)
73+
5574
func init() {
5675
balancer.Register(bb{})
5776
}
@@ -60,14 +79,16 @@ type bb struct{}
6079

6180
func (bb) Build(cc balancer.ClientConn, bOpts balancer.BuildOptions) balancer.Balancer {
6281
b := &outlierDetectionBalancer{
63-
ClientConn: cc,
64-
closed: grpcsync.NewEvent(),
65-
done: grpcsync.NewEvent(),
66-
addrs: make(map[string]*endpointInfo),
67-
scUpdateCh: buffer.NewUnbounded(),
68-
pickerUpdateCh: buffer.NewUnbounded(),
69-
channelzParent: bOpts.ChannelzParent,
70-
endpoints: resolver.NewEndpointMap[*endpointInfo](),
82+
ClientConn: cc,
83+
closed: grpcsync.NewEvent(),
84+
done: grpcsync.NewEvent(),
85+
addrs: make(map[string]*endpointInfo),
86+
scUpdateCh: buffer.NewUnbounded(),
87+
pickerUpdateCh: buffer.NewUnbounded(),
88+
channelzParent: bOpts.ChannelzParent,
89+
endpoints: resolver.NewEndpointMap[*endpointInfo](),
90+
metricsRecorder: cc.MetricsRecorder(), // we use an explicit field instead of using cc.MetricsRecorder() so we can override the metric recorder in tests.
91+
target: bOpts.Target.String(),
7192
}
7293
b.logger = prefixLogger(b)
7394
b.logger.Infof("Created")
@@ -169,10 +190,12 @@ type outlierDetectionBalancer struct {
169190
// to suppress redundant picker updates.
170191
recentPickerNoop bool
171192

172-
closed *grpcsync.Event
173-
done *grpcsync.Event
174-
logger *grpclog.PrefixLogger
175-
channelzParent channelz.Identifier
193+
closed *grpcsync.Event
194+
done *grpcsync.Event
195+
logger *grpclog.PrefixLogger
196+
channelzParent channelz.Identifier
197+
metricsRecorder estats.MetricsRecorder
198+
target string
176199

177200
child synchronizingBalancerWrapper
178201

@@ -788,18 +811,24 @@ func (b *outlierDetectionBalancer) successRateAlgorithm() {
788811
return
789812
}
790813
mean, stddev := b.meanAndStdDev(endpointsToConsider)
814+
ejectionCfg := b.cfg.SuccessRateEjection
791815
for _, epInfo := range endpointsToConsider {
792816
bucket := epInfo.callCounter.inactiveBucket
793-
ejectionCfg := b.cfg.SuccessRateEjection
794-
if float64(b.numEndpointsEjected)/float64(b.endpoints.Len())*100 >= float64(b.cfg.MaxEjectionPercent) {
795-
return
796-
}
797817
successRate := float64(bucket.numSuccesses) / float64(bucket.numSuccesses+bucket.numFailures)
798818
requiredSuccessRate := mean - stddev*(float64(ejectionCfg.StdevFactor)/1000)
799819
if successRate < requiredSuccessRate {
800820
channelz.Infof(logger, b.channelzParent, "SuccessRate algorithm detected outlier: %s. Parameters: successRate=%f, mean=%f, stddev=%f, requiredSuccessRate=%f", epInfo, successRate, mean, stddev, requiredSuccessRate)
821+
// Check if max ejection percentage would prevent ejection.
822+
if float64(b.numEndpointsEjected)/float64(b.endpoints.Len())*100 >= float64(b.cfg.MaxEjectionPercent) {
823+
// Record unenforced ejection due to max ejection percentage.
824+
ejectionsUnenforcedMetric.Record(b.metricsRecorder, 1, b.target, "success_rate", "max_ejection_overflow")
825+
continue
826+
}
801827
if uint32(rand.Int32N(100)) < ejectionCfg.EnforcementPercentage {
802-
b.ejectEndpoint(epInfo)
828+
b.ejectEndpoint(epInfo, "success_rate")
829+
} else {
830+
// Record unenforced ejection due to enforcement percentage.
831+
ejectionsUnenforcedMetric.Record(b.metricsRecorder, 1, b.target, "success_rate", "enforcement_percentage")
803832
}
804833
}
805834
}
@@ -816,24 +845,30 @@ func (b *outlierDetectionBalancer) failurePercentageAlgorithm() {
816845
return
817846
}
818847

848+
ejectionCfg := b.cfg.FailurePercentageEjection
819849
for _, epInfo := range endpointsToConsider {
820850
bucket := epInfo.callCounter.inactiveBucket
821-
ejectionCfg := b.cfg.FailurePercentageEjection
822-
if float64(b.numEndpointsEjected)/float64(b.endpoints.Len())*100 >= float64(b.cfg.MaxEjectionPercent) {
823-
return
824-
}
825851
failurePercentage := (float64(bucket.numFailures) / float64(bucket.numSuccesses+bucket.numFailures)) * 100
826852
if failurePercentage > float64(b.cfg.FailurePercentageEjection.Threshold) {
827853
channelz.Infof(logger, b.channelzParent, "FailurePercentage algorithm detected outlier: %s, failurePercentage=%f", epInfo, failurePercentage)
854+
// Check if max ejection percentage would prevent ejection.
855+
if float64(b.numEndpointsEjected)/float64(b.endpoints.Len())*100 >= float64(b.cfg.MaxEjectionPercent) {
856+
// Record unenforced ejection due to max ejection percentage.
857+
ejectionsUnenforcedMetric.Record(b.metricsRecorder, 1, b.target, "failure_percentage", "max_ejection_overflow")
858+
continue
859+
}
828860
if uint32(rand.Int32N(100)) < ejectionCfg.EnforcementPercentage {
829-
b.ejectEndpoint(epInfo)
861+
b.ejectEndpoint(epInfo, "failure_percentage")
862+
} else {
863+
// Record unenforced ejection due to enforcement percentage.
864+
ejectionsUnenforcedMetric.Record(b.metricsRecorder, 1, b.target, "failure_percentage", "enforcement_percentage")
830865
}
831866
}
832867
}
833868
}
834869

835870
// Caller must hold b.mu.
836-
func (b *outlierDetectionBalancer) ejectEndpoint(epInfo *endpointInfo) {
871+
func (b *outlierDetectionBalancer) ejectEndpoint(epInfo *endpointInfo, detectionMethod string) {
837872
b.numEndpointsEjected++
838873
epInfo.latestEjectionTimestamp = b.timerStartTime
839874
epInfo.ejectionTimeMultiplier++
@@ -842,6 +877,8 @@ func (b *outlierDetectionBalancer) ejectEndpoint(epInfo *endpointInfo) {
842877
channelz.Infof(logger, b.channelzParent, "Subchannel ejected: %s", sbw)
843878
}
844879

880+
// Record the enforced ejection metric.
881+
ejectionsEnforcedMetric.Record(b.metricsRecorder, 1, b.target, detectionMethod)
845882
}
846883

847884
// Caller must hold b.mu.

0 commit comments

Comments
 (0)