@@ -33,6 +33,7 @@ import (
3333
3434 "google.golang.org/grpc/balancer"
3535 "google.golang.org/grpc/connectivity"
36+ estats "google.golang.org/grpc/experimental/stats"
3637 "google.golang.org/grpc/internal/balancer/gracefulswitch"
3738 "google.golang.org/grpc/internal/buffer"
3839 "google.golang.org/grpc/internal/channelz"
5253// Name is the name of the outlier detection balancer.
5354const Name = "outlier_detection_experimental"
5455
// Metrics emitted by the outlier detection balancer. Both are registered as
// Int64 counters whose unit is "{ejection}" and whose first label is the
// channel target.
// NOTE(review): Default is false for both; presumably this means the metrics
// are not enabled unless explicitly requested — confirm against the
// estats.MetricDescriptor documentation.
56+ var (
// ejectionsEnforcedMetric is recorded by ejectEndpoint each time an endpoint
// is actually ejected. The detection_method label carries the algorithm that
// triggered the ejection ("success_rate" or "failure_percentage").
57+ ejectionsEnforcedMetric = estats .RegisterInt64Count (estats.MetricDescriptor {
58+ Name : "grpc.lb.outlier_detection.ejections_enforced" ,
59+ Description : "EXPERIMENTAL. Number of outlier ejections enforced by detection method" ,
60+ Unit : "{ejection}" ,
61+ Labels : []string {"grpc.target" , "grpc.lb.outlier_detection.detection_method" },
62+ Default : false ,
63+ })
64+
// ejectionsUnenforcedMetric is recorded by the success-rate and
// failure-percentage algorithms when an endpoint is detected as an outlier
// but is not ejected. The unenforced_reason label is "max_ejection_overflow"
// when the max ejection percentage has already been reached, and
// "enforcement_percentage" when the random enforcement check did not select
// the endpoint.
65+ ejectionsUnenforcedMetric = estats .RegisterInt64Count (estats.MetricDescriptor {
66+ Name : "grpc.lb.outlier_detection.ejections_unenforced" ,
67+ Description : "EXPERIMENTAL. Number of unenforced outlier ejections due to either `max_ejection_percentage` or `enforcement_percentage`" ,
68+ Unit : "{ejection}" ,
69+ Labels : []string {"grpc.target" , "grpc.lb.outlier_detection.detection_method" , "grpc.lb.outlier_detection.unenforced_reason" },
70+ Default : false ,
71+ })
72+ )
73+
// init registers the outlier detection balancer builder (bb) with the
// balancer package via balancer.Register.
5574func init () {
5675 balancer .Register (bb {})
5776}
@@ -60,14 +79,16 @@ type bb struct{}
6079
6180func (bb ) Build (cc balancer.ClientConn , bOpts balancer.BuildOptions ) balancer.Balancer {
6281 b := & outlierDetectionBalancer {
63- ClientConn : cc ,
64- closed : grpcsync .NewEvent (),
65- done : grpcsync .NewEvent (),
66- addrs : make (map [string ]* endpointInfo ),
67- scUpdateCh : buffer .NewUnbounded (),
68- pickerUpdateCh : buffer .NewUnbounded (),
69- channelzParent : bOpts .ChannelzParent ,
70- endpoints : resolver .NewEndpointMap [* endpointInfo ](),
82+ ClientConn : cc ,
83+ closed : grpcsync .NewEvent (),
84+ done : grpcsync .NewEvent (),
85+ addrs : make (map [string ]* endpointInfo ),
86+ scUpdateCh : buffer .NewUnbounded (),
87+ pickerUpdateCh : buffer .NewUnbounded (),
88+ channelzParent : bOpts .ChannelzParent ,
89+ endpoints : resolver .NewEndpointMap [* endpointInfo ](),
90+ metricsRecorder : cc .MetricsRecorder (), // we use an explicit field instead of using cc.MetricsRecorder() so we can override the metric recorder in tests.
91+ target : bOpts .Target .String (),
7192 }
7293 b .logger = prefixLogger (b )
7394 b .logger .Infof ("Created" )
@@ -169,10 +190,12 @@ type outlierDetectionBalancer struct {
169190 // to suppress redundant picker updates.
170191 recentPickerNoop bool
171192
172- closed * grpcsync.Event
173- done * grpcsync.Event
174- logger * grpclog.PrefixLogger
175- channelzParent channelz.Identifier
193+ closed * grpcsync.Event
194+ done * grpcsync.Event
195+ logger * grpclog.PrefixLogger
196+ channelzParent channelz.Identifier
197+ metricsRecorder estats.MetricsRecorder
198+ target string
176199
177200 child synchronizingBalancerWrapper
178201
@@ -788,18 +811,24 @@ func (b *outlierDetectionBalancer) successRateAlgorithm() {
788811 return
789812 }
790813 mean , stddev := b .meanAndStdDev (endpointsToConsider )
814+ ejectionCfg := b .cfg .SuccessRateEjection
791815 for _ , epInfo := range endpointsToConsider {
792816 bucket := epInfo .callCounter .inactiveBucket
793- ejectionCfg := b .cfg .SuccessRateEjection
794- if float64 (b .numEndpointsEjected )/ float64 (b .endpoints .Len ())* 100 >= float64 (b .cfg .MaxEjectionPercent ) {
795- return
796- }
797817 successRate := float64 (bucket .numSuccesses ) / float64 (bucket .numSuccesses + bucket .numFailures )
798818 requiredSuccessRate := mean - stddev * (float64 (ejectionCfg .StdevFactor )/ 1000 )
799819 if successRate < requiredSuccessRate {
800820 channelz .Infof (logger , b .channelzParent , "SuccessRate algorithm detected outlier: %s. Parameters: successRate=%f, mean=%f, stddev=%f, requiredSuccessRate=%f" , epInfo , successRate , mean , stddev , requiredSuccessRate )
821+ // Check if max ejection percentage would prevent ejection.
822+ if float64 (b .numEndpointsEjected )/ float64 (b .endpoints .Len ())* 100 >= float64 (b .cfg .MaxEjectionPercent ) {
823+ // Record unenforced ejection due to max ejection percentage.
824+ ejectionsUnenforcedMetric .Record (b .metricsRecorder , 1 , b .target , "success_rate" , "max_ejection_overflow" )
825+ continue
826+ }
801827 if uint32 (rand .Int32N (100 )) < ejectionCfg .EnforcementPercentage {
802- b .ejectEndpoint (epInfo )
828+ b .ejectEndpoint (epInfo , "success_rate" )
829+ } else {
830+ // Record unenforced ejection due to enforcement percentage.
831+ ejectionsUnenforcedMetric .Record (b .metricsRecorder , 1 , b .target , "success_rate" , "enforcement_percentage" )
803832 }
804833 }
805834 }
@@ -816,24 +845,30 @@ func (b *outlierDetectionBalancer) failurePercentageAlgorithm() {
816845 return
817846 }
818847
848+ ejectionCfg := b .cfg .FailurePercentageEjection
819849 for _ , epInfo := range endpointsToConsider {
820850 bucket := epInfo .callCounter .inactiveBucket
821- ejectionCfg := b .cfg .FailurePercentageEjection
822- if float64 (b .numEndpointsEjected )/ float64 (b .endpoints .Len ())* 100 >= float64 (b .cfg .MaxEjectionPercent ) {
823- return
824- }
825851 failurePercentage := (float64 (bucket .numFailures ) / float64 (bucket .numSuccesses + bucket .numFailures )) * 100
826852 if failurePercentage > float64 (b .cfg .FailurePercentageEjection .Threshold ) {
827853 channelz .Infof (logger , b .channelzParent , "FailurePercentage algorithm detected outlier: %s, failurePercentage=%f" , epInfo , failurePercentage )
854+ // Check if max ejection percentage would prevent ejection.
855+ if float64 (b .numEndpointsEjected )/ float64 (b .endpoints .Len ())* 100 >= float64 (b .cfg .MaxEjectionPercent ) {
856+ // Record unenforced ejection due to max ejection percentage.
857+ ejectionsUnenforcedMetric .Record (b .metricsRecorder , 1 , b .target , "failure_percentage" , "max_ejection_overflow" )
858+ continue
859+ }
828860 if uint32 (rand .Int32N (100 )) < ejectionCfg .EnforcementPercentage {
829- b .ejectEndpoint (epInfo )
861+ b .ejectEndpoint (epInfo , "failure_percentage" )
862+ } else {
863+ // Record unenforced ejection due to enforcement percentage.
864+ ejectionsUnenforcedMetric .Record (b .metricsRecorder , 1 , b .target , "failure_percentage" , "enforcement_percentage" )
830865 }
831866 }
832867 }
833868}
834869
835870// Caller must hold b.mu.
836- func (b * outlierDetectionBalancer ) ejectEndpoint (epInfo * endpointInfo ) {
871+ func (b * outlierDetectionBalancer ) ejectEndpoint (epInfo * endpointInfo , detectionMethod string ) {
837872 b .numEndpointsEjected ++
838873 epInfo .latestEjectionTimestamp = b .timerStartTime
839874 epInfo .ejectionTimeMultiplier ++
@@ -842,6 +877,8 @@ func (b *outlierDetectionBalancer) ejectEndpoint(epInfo *endpointInfo) {
842877 channelz .Infof (logger , b .channelzParent , "Subchannel ejected: %s" , sbw )
843878 }
844879
880+ // Record the enforced ejection metric.
881+ ejectionsEnforcedMetric .Record (b .metricsRecorder , 1 , b .target , detectionMethod )
845882}
846883
847884// Caller must hold b.mu.
0 commit comments