diff --git a/pkg/kv/kvserver/allocator/mmaprototype/allocator_state.go b/pkg/kv/kvserver/allocator/mmaprototype/allocator_state.go
index d6ef1b2cf327..05e11661ea05 100644
--- a/pkg/kv/kvserver/allocator/mmaprototype/allocator_state.go
+++ b/pkg/kv/kvserver/allocator/mmaprototype/allocator_state.go
@@ -698,8 +698,43 @@ func sortTargetCandidateSetAndPick(
 				cmp.Compare(a.StoreID, b.StoreID))
 		})
 	}
-	// Candidates are sorted by non-decreasing leasePreferenceIndex. Eliminate
-	// ones that have notMatchedLeasePreferenceIndex.
+	// Candidates are sorted by non-decreasing leasePreferenceIndex (lower is
+	// better). Eliminate ones that have notMatchedLeasePreferenceIndex (i.e.,
+	// candidates that don't match ANY lease preference). This filter only has an
+	// effect when the span config has lease preferences configured; otherwise all
+	// candidates have leasePreferenceIndex=0 (see matchedLeasePreferenceIndex).
+	//
+	// For lease transfers: This check is usually redundant because
+	// candidatesToMoveLease already filters candidates to those with
+	// leasePreferenceIndex <= the current leaseholder's index. However, if the
+	// current leaseholder matches NO preference (LPI =
+	// notMatchedLeasePreferenceIndex), then candidatesToMoveLease returns all
+	// voters (since all LPIs <= MaxInt32), and this filter becomes necessary to
+	// remove candidates that also match no preference.
+	//
+	// TODO(wenyihu6): Is this expected behavior? Another thought is that
+	// rebalancing should not need to improve lease preference satisfaction: we
+	// are here because the source is overloaded or wants to shed leases for
+	// some reason. Doing that work without making the lease preference worse is
+	// sufficient. Improving the lease preference should be the job of the lease
+	// queue. If the current leaseholder matches no preference, perhaps we should
+	// allow transferring to any candidate (even those also matching no
+	// preference) rather than requiring candidates to match some preference.
+	//
+	// For replica transfers from the leaseholder: This is the only lease
+	// preference filtering applied. matchedLeasePreferenceIndex is called
+	// directly on candidates in rebalanceReplicas.
+	//
+	// Note: There's an asymmetry between lease transfers and replica transfers.
+	// Lease transfers enforce "don't make lease preference worse" (via
+	// candidatesToMoveLease filtering to LPI <= current). Replica transfers from
+	// the leaseholder only enforce "must match some preference" (via this
+	// filter), meaning a replica transfer could move the leaseholder's replica to
+	// a store with a worse lease preference (e.g., LPI=0 -> LPI=2). This is
+	// consistent with SMA behavior: SMA's rankedCandidateListForRebalancing (used
+	// by RebalanceTarget) doesn't consider lease preferences for replica
+	// transfers. The rationale is likely that the lease queue will fix any lease
+	// preference violations after the replica transfer completes.
 	j = 0
 	for _, cand := range cands.candidates {
 		if cand.leasePreferenceIndex == notMatchedLeasePreferenceIndex {
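The filter described in the new sortTargetCandidateSetAndPick comment boils down to a sort-then-compact pass over the candidates. Below is a minimal standalone sketch of that step, not the mmaprototype code itself: the type and constant names are simplified stand-ins, and the "matches no preference" sentinel is assumed to be MaxInt32, as the "all LPIs <= MaxInt32" remark implies.

package main

import (
	"cmp"
	"fmt"
	"math"
	"slices"
)

// notMatchedLPI is the assumed sentinel for "matches no lease preference".
const notMatchedLPI = int32(math.MaxInt32)

type candidate struct {
	storeID int32
	lpi     int32 // lease preference index; lower is better
}

// sortAndFilterByLPI sorts candidates by non-decreasing LPI (ties broken by
// store ID) and then drops every candidate that matches no lease preference,
// mirroring the sort-then-eliminate step the comment describes.
func sortAndFilterByLPI(cands []candidate) []candidate {
	slices.SortFunc(cands, func(a, b candidate) int {
		return cmp.Or(cmp.Compare(a.lpi, b.lpi), cmp.Compare(a.storeID, b.storeID))
	})
	j := 0
	for _, c := range cands {
		if c.lpi == notMatchedLPI {
			continue
		}
		cands[j] = c
		j++
	}
	return cands[:j]
}

func main() {
	// When the current leaseholder matches no preference, the upstream
	// "LPI <= current" filter keeps every voter, so this pass is what removes
	// candidates that also match no preference (s4 here).
	cands := []candidate{
		{storeID: 4, lpi: notMatchedLPI},
		{storeID: 3, lpi: 1},
		{storeID: 2, lpi: 0},
	}
	fmt.Println(sortAndFilterByLPI(cands)) // [{2 0} {3 1}]
}

When no lease preferences are configured, every candidate carries LPI=0 and the compaction is a no-op, which matches the "only has an effect when the span config has lease preferences" note above.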
diff --git a/pkg/kv/kvserver/allocator/mmaprototype/cluster_state.go b/pkg/kv/kvserver/allocator/mmaprototype/cluster_state.go
index ca86c471079e..a37e8aa70bfc 100644
--- a/pkg/kv/kvserver/allocator/mmaprototype/cluster_state.go
+++ b/pkg/kv/kvserver/allocator/mmaprototype/cluster_state.go
@@ -2443,12 +2443,15 @@ func (cs *clusterState) canShedAndAddLoad(
 		// In other words, whenever a node level summary was "bumped up" beyond
 		// the target's by some other local store, we reject the change.
 		//
-		// TODO(tbg): While the example illustrates that "something had to be
-		// done", I don't understand why it makes sense to solve this exactly
-		// as it was done. The node level summary is based on node-wide CPU
-		// utilization as well as its distance from the mean (across the
-		// candidate set). Store summaries a) reflect the worst dimension, and
-		// b) on the CPU dimension are based on the store-apportioned capacity.
+		// TODO(tbg,wenyihu6): There's a conceptual mismatch in this comparison: nls
+		// is CPU-specific while sls is max over all dimensions. A more consistent
+		// approach would compare nls with dimSummary[CPURate]. However, using sls
+		// is more permissive (since dimSummary[CPURate] <= sls). If sls is elevated
+		// due to a non-CPU dimension (e.g., write bandwidth), the check passes more
+		// easily, allowing more transfers. This helps avoid situations where the
+		// source is overloaded and needs to shed load, but all potential targets
+		// are rejected by overly strict checks despite having capacity. For now,
+		// we lean toward permissiveness, but it's unclear if this is optimal.
 		targetSLS.nls <= targetSLS.sls
 	if canAddLoad {
 		return true
diff --git a/pkg/kv/kvserver/allocator/mmaprototype/constraint.go b/pkg/kv/kvserver/allocator/mmaprototype/constraint.go
index 59185e1dd6dc..083089bd3659 100644
--- a/pkg/kv/kvserver/allocator/mmaprototype/constraint.go
+++ b/pkg/kv/kvserver/allocator/mmaprototype/constraint.go
@@ -2060,6 +2060,34 @@ type storeAndLeasePreference struct {
 // https://github.com/sumeerbhola/cockroach/blob/c4c1dcdeda2c0f38c38270e28535f2139a077ec7/pkg/kv/kvserver/allocator/allocatorimpl/allocator.go#L2980-L2980
 // This may be unnecessarily strict and not essential, since it seems many
 // users only set one lease preference.
+//
+// TODO(wenyihu6): There's a gap in MMA's filtering order compared to SMA. MMA
+// filters by lease preference first (LPI <= current via candidatesToMoveLease),
+// then by health (retainReadyLeaseTargetStoresOnly), and finally filters out
+// candidates matching no preference (notMatchedLeasePreferenceIndex check in
+// sortTargetCandidateSetAndPick). SMA filters by health first
+// (ValidLeaseTargets calls LiveAndDeadReplicas) then finds the best LPI among
+// healthy survivors (PreferredLeaseholders). If no healthy store satisfies any
+// configured lease preference, then SMA falls back to considering all healthy
+// stores as candidates.
+//
+// This means MMA can fail to find candidates when SMA would succeed. Example:
+// - Current leaseholder: s1 with LPI=0 (best), unhealthy
+// - Voters: s1 (LPI=0, unhealthy), s2 (LPI=0, unhealthy), s3 (LPI=1, healthy)
+// - MMA: candidatesToMoveLease returns [s2] (LPI <= 0), health filter removes
+//   s2, result: NO candidates.
+// - SMA: health filter leaves [s3], PreferredLeaseholders finds s3 (best
+//   among healthy), result: s3 is a candidate.
+//
+// To fix this, consider filtering by health first, then:
+// - If the current leaseholder is healthy: apply LPI <= current among the
+//   healthy stores.
+// - If it is unhealthy: find the best LPI among healthy survivors (like SMA).
+// - (Up for discussion) Fall back to all healthy stores if no preference
+//   matches.
+//
+// This would require restructuring to pass a health-filtered store set into
+// this function.
 func (rac *rangeAnalyzedConstraints) candidatesToMoveLease() (
 	cands []storeAndLeasePreference,
 	curLeasePreferenceIndex int32,
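On the canShedAndAddLoad TODO: the permissiveness argument can be seen with a toy example. The sketch below uses invented names and an assumed "higher means more loaded" ordering rather than the mmaprototype loadSummary types; it only shows that nls <= sls can pass where nls <= dimSummary[CPURate] would not, precisely because sls is the max over all dimensions.

package main

import "fmt"

type loadLevel int // higher means more loaded (assumed ordering for this sketch)

const (
	loadLow loadLevel = iota
	loadNormal
	loadHigh
)

type dim int

const (
	cpuRate dim = iota
	writeBandwidth
	numDims
)

type targetSummary struct {
	nls        loadLevel          // node-level summary (CPU-based)
	dimSummary [numDims]loadLevel // per-dimension store summaries
}

// sls is the worst (max) dimension, per the comment in the diff.
func (t targetSummary) sls() loadLevel {
	s := t.dimSummary[0]
	for _, d := range t.dimSummary {
		if d > s {
			s = d
		}
	}
	return s
}

func main() {
	// A target whose store is elevated on write bandwidth but not on CPU,
	// while the node-level (CPU) summary sits in between.
	t := targetSummary{
		nls:        loadNormal,
		dimSummary: [numDims]loadLevel{cpuRate: loadLow, writeBandwidth: loadHigh},
	}
	fmt.Println("permissive (nls <= sls):            ", t.nls <= t.sls())              // true
	fmt.Println("strict (nls <= dimSummary[CPURate]):", t.nls <= t.dimSummary[cpuRate]) // false
}

The permissive form accepts this target even though its node-level CPU summary is worse than its store-level CPU summary, which is the behavior the TODO describes as helpful when the source badly needs to shed load.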
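On the candidatesToMoveLease TODO: a sketch of the proposed ordering (health filter first, then the LPI rule), run against the s1/s2/s3 example from the comment. The types and helper here are invented for illustration; the real candidatesToMoveLease is a method on rangeAnalyzedConstraints returning storeAndLeasePreference values, and the "fall back to all healthy stores" variant is left out since it is still up for discussion.

package main

import "fmt"

type store struct {
	id      int32
	lpi     int32 // lease preference index; lower is better
	healthy bool
}

// proposedCandidates mirrors the suggested order: drop unhealthy stores first,
// then either enforce "LPI <= current leaseholder's LPI" (healthy leaseholder)
// or fall back to the best LPI among the healthy survivors (unhealthy
// leaseholder, which is effectively what SMA does).
func proposedCandidates(leaseholder store, voters []store) []store {
	var healthy []store
	for _, v := range voters {
		if v.healthy && v.id != leaseholder.id {
			healthy = append(healthy, v)
		}
	}
	if len(healthy) == 0 {
		return nil
	}
	if leaseholder.healthy {
		var out []store
		for _, v := range healthy {
			if v.lpi <= leaseholder.lpi {
				out = append(out, v)
			}
		}
		return out
	}
	// Unhealthy leaseholder: keep the stores with the best (lowest) LPI among
	// the healthy survivors.
	best := healthy[0].lpi
	for _, v := range healthy[1:] {
		if v.lpi < best {
			best = v.lpi
		}
	}
	var out []store
	for _, v := range healthy {
		if v.lpi == best {
			out = append(out, v)
		}
	}
	return out
}

func main() {
	// The example from the comment: s1 (current leaseholder, LPI=0) and s2
	// (LPI=0) are unhealthy, s3 (LPI=1) is healthy. Filtering by LPI first
	// yields no candidates; filtering by health first yields s3.
	s1 := store{id: 1, lpi: 0, healthy: false}
	voters := []store{s1, {id: 2, lpi: 0, healthy: false}, {id: 3, lpi: 1, healthy: true}}
	fmt.Println(proposedCandidates(s1, voters)) // [{3 1 true}]
}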