@@ -509,6 +509,23 @@ func sortTargetCandidateSetAndPick(
509509 }
510510 slices .SortFunc (cands .candidates , func (a , b candidateInfo ) int {
511511 if diversityScoresAlmostEqual (a .diversityScore , b .diversityScore ) {
512+ // Note: Consider the case where the current leaseholder's LPI is
513+ // 3 (lower is better) and we have the following candidates:
514+ // - LPI=1 SLS=normal
515+ // - LPI=2 SLS=low
516+ // Currently we consider the low-SLS candidate first. This is in
517+ // contrast to the single-metric allocator, which only considers
518+ // candidates in the lowest-LPI class (i.e. wouldn't even consider
519+ // the low-SLS candidate since we have a candidate at LPI=1). If we
520+ // make the corresponding change in candidateToMoveLease, we would
521+ // match the single-metric allocator's behavior, but it's unclear
522+ // that that would be better. A good middle ground could be sorting
523+ // here by LPI first, then SLS. That should result in mma preferring
524+ // improving the lease preference, but if that is not possible, it
525+ // would settle for not making it worse (than the current
526+ // leaseholder), which the single-metric allocator won't.
527+ //
528+ // TODO(tbg): consider changing this to sort by LPI first, then SLS.
512529 return cmp .Or (cmp .Compare (a .sls , b .sls ),
513530 cmp .Compare (a .leasePreferenceIndex , b .leasePreferenceIndex ),
514531 cmp .Compare (a .StoreID , b .StoreID ))
@@ -537,6 +554,9 @@ func sortTargetCandidateSetAndPick(
537554 }
538555 }
539556 // Diversity is the same. Include if not reaching disk capacity.
557+ // TODO(tbg): remove highDiskSpaceUtilization check here. These candidates
558+ // should instead be filtered out by retainReadyLeaseTargetStoresOnly (which
559+ // filters down the initial candidate set before computing the mean).
540560 if ! cand .highDiskSpaceUtilization {
541561 cands .candidates [j ] = cand
542562 j ++
@@ -796,10 +816,10 @@ func (cs *clusterState) ensureAnalyzedConstraints(rstate *rangeState) {
796816// - Need diversity change for each candidate.
797817//
798818// The first 3 bullets are encapsulated in the helper function
799- // computeCandidatesForRange . It works for both replica additions and
819+ // computeCandidatesForReplicaTransfer . It works for both replica additions and
800820// rebalancing.
801821//
802- // For the last bullet (diversity), the caller of computeCandidatesForRange
822+ // For the last bullet (diversity), the caller of computeCandidatesForReplicaTransfer
803823// needs to populate candidateInfo.diversityScore for each candidate in
804824// candidateSet. It does so via diversityScoringMemo. Then the (loadSummary,
805825// diversityScore) pair can be used to order candidates for attempts to add.
@@ -827,41 +847,125 @@ func (cs *clusterState) ensureAnalyzedConstraints(rstate *rangeState) {
827847
828848// loadSheddingStore is only specified if this candidate computation is
829849// happening because of overload.
830- func (cs * clusterState ) computeCandidatesForRange (
850+ //
851+ // postMeansExclusions are filtered post-means: their load is included in the
852+ // mean (they're viable locations in principle) but they're not candidates for
853+ // this specific transfer (the classic case: already have a replica).
854+ func (cs * clusterState ) computeCandidatesForReplicaTransfer (
831855 ctx context.Context ,
832- expr constraintsDisj ,
833- storesToExclude storeSet ,
856+ conj constraintsConj ,
857+ existingReplicas storeSet ,
858+ postMeansExclusions storeSet ,
834859 loadSheddingStore roachpb.StoreID ,
835860 passObs * rebalancingPassMetricsAndLogger ,
836861) (_ candidateSet , sheddingSLS storeLoadSummary ) {
837- means := cs .meansMemo .getMeans (expr )
838- if loadSheddingStore > 0 {
839- sheddingSS := cs .stores [loadSheddingStore ]
840- sheddingSLS = cs .meansMemo .getStoreLoadSummary (ctx , means , loadSheddingStore , sheddingSS .loadSeqNum )
841- if sheddingSLS .sls <= loadNoChange && sheddingSLS .nls <= loadNoChange {
842- // In this set of stores, this store no longer looks overloaded.
843- passObs .replicaShed (notOverloaded )
844- return candidateSet {}, sheddingSLS
845- }
862+ // Start by computing the stores (and corresponding means) that satisfy
863+ // the constraint conjunction. If none of these stores needs to be filtered
864+ // out before computing the means, we use the cached means verbatim;
865+ // otherwise we recompute the means over the filtered set below.
866+ cs .scratchDisj [0 ] = conj
867+ means := cs .meansMemo .getMeans (cs .scratchDisj [:1 ])
868+
869+ // Pre-means filtering: copy to scratch, then filter in place.
870+ // Filter out stores with a non-OK replica disposition or high disk utilization.
871+ cs .scratchStoreSet = append (cs .scratchStoreSet [:0 ], means .stores ... )
872+ filteredStores := retainReadyReplicaTargetStoresOnly (ctx , cs .scratchStoreSet , cs .stores , existingReplicas )
873+
874+ // Determine which means to use.
875+ //
876+ // TODO(tbg): unit testing.
877+ var effectiveMeans * meansLoad
878+ if len (filteredStores ) == len (means .stores ) {
879+ // Common case: nothing was filtered, use cached means.
880+ effectiveMeans = & means .meansLoad
881+ } else if len (filteredStores ) == 0 {
882+ // No viable candidates at all; sheddingSLS is still its zero value here.
883+ return candidateSet {}, sheddingSLS
884+ } else {
885+ // Some stores were filtered; recompute means over filtered set.
886+ cs .scratchMeans = computeMeansForStoreSet (
887+ cs , filteredStores , cs .meansMemo .scratchNodes , cs .meansMemo .scratchStores )
888+ effectiveMeans = & cs .scratchMeans
889+ log .KvDistribution .VEventf (ctx , 2 ,
890+ "pre-means filtered %d stores → remaining %v, means: store=%v node=%v" ,
891+ len (means .stores )- len (filteredStores ), filteredStores ,
892+ effectiveMeans .storeLoad , effectiveMeans .nodeLoad )
846893 }
847- // We only filter out stores that are not fdOK. The rest of the filtering
848- // happens later.
894+
895+ sheddingSLS = cs .computeLoadSummary (ctx , loadSheddingStore , & effectiveMeans .storeLoad , & effectiveMeans .nodeLoad ) // NOTE(review): unlike the removed code, not guarded by loadSheddingStore > 0 — confirm callers always pass a store.
896+ if sheddingSLS .sls <= loadNoChange && sheddingSLS .nls <= loadNoChange {
897+ // In this set of stores, this store no longer looks overloaded.
898+ passObs .replicaShed (notOverloaded )
899+ return candidateSet {}, sheddingSLS
900+ }
901+
849902 var cset candidateSet
850- for _ , storeID := range means .stores {
851- if storesToExclude .contains (storeID ) {
903+ for _ , storeID := range filteredStores {
904+ if postMeansExclusions .contains (storeID ) {
905+ // This store's load is included in the mean, but it's not a viable
906+ // target for this specific transfer (e.g. it already has a replica).
852907 continue
853908 }
854- ss := cs .stores [storeID ]
855- csls := cs .meansMemo .getStoreLoadSummary (ctx , means , storeID , ss .loadSeqNum )
909+ csls := cs .computeLoadSummary (ctx , storeID , & effectiveMeans .storeLoad , & effectiveMeans .nodeLoad )
856910 cset .candidates = append (cset .candidates , candidateInfo {
857911 StoreID : storeID ,
858912 storeLoadSummary : csls ,
859913 })
860914 }
861- cset .means = & means . meansLoad
915+ cset .means = effectiveMeans
862916 return cset , sheddingSLS
863917}
864918
919+ // retainReadyReplicaTargetStoresOnly filters the input set to only those stores
920+ // that are ready to accept a replica. A store is not ready if it has a non-OK
921+ // replica disposition or high disk space utilization. In practice, the input
922+ // set is already filtered by constraints.
923+ //
924+ // Stores already housing a replica (on top of being in the input storeSet)
925+ // bypass this disposition check since they already have the replica - their
926+ // load should be in the mean regardless of their disposition, as we'll pick
927+ // candidates based on improving clustering around the mean.
928+ //
929+ // The input storeSet is mutated: the result reuses its backing array.
930+ func retainReadyReplicaTargetStoresOnly (
931+ ctx context.Context ,
932+ in storeSet ,
933+ stores map [roachpb.StoreID ]* storeState ,
934+ existingReplicas storeSet ,
935+ ) storeSet {
936+ out := in [:0 ]
937+ for _ , storeID := range in {
938+ if existingReplicas .contains (storeID ) {
939+ // Stores with existing replicas already have the load and we want to
940+ // include them in the mean, even if they are not accepting new replicas
941+ // or are even trying to shed.
942+ //
943+ // TODO(tbg): health might play into this, though. For example, when
944+ // a store is dead, whatever load we have from it is stale and we
945+ // are better off not including it. For now, we ignore this problem
946+ // because the mma only handles rebalancing, whereas a replica on a
947+ // dead store would be removed by the single-metric allocator after
948+ // the TimeUntilStoreDead and so would disappear from our view.
949+ out = append (out , storeID )
950+ continue
951+ }
952+ ss := stores [storeID ] // NOTE(review): a storeID missing from stores yields nil, which is dereferenced below — confirm callers guarantee presence.
953+ switch {
954+ case ss .status .Disposition .Replica != ReplicaDispositionOK :
955+ log .KvDistribution .VEventf (ctx , 2 , "skipping s%d for replica transfer: replica disposition %v (health %v)" , storeID , ss .status .Disposition .Replica , ss .status .Health )
956+ case highDiskSpaceUtilization (ss .reportedLoad [ByteSize ], ss .capacity [ByteSize ]):
957+ // TODO(tbg): remove this from mma and just let the caller set this
958+ // disposition based on the following cluster settings:
959+ // - kv.allocator.max_disk_utilization_threshold
960+ // - kv.allocator.rebalance_to_max_disk_utilization_threshold
961+ log .KvDistribution .VEventf (ctx , 2 , "skipping s%d for replica transfer: high disk utilization (health %v)" , storeID , ss .status .Health )
962+ default :
963+ out = append (out , storeID )
964+ }
965+ }
966+ return out
967+ }
968+
865969// Diversity scoring is very amenable to caching, since the set of unique
866970// locality tiers for range replicas is likely to be small. And the cache does
867971// not need to be cleared after every allocator pass. This caching is done via
0 commit comments