From 7e30eada1822327516588fedc349c18ec04451f7 Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Fri, 5 Dec 2025 12:12:35 +0100 Subject: [PATCH 1/3] mmaprototype: sprinkle some comments about high disk util --- .../kvserver/allocator/mmaprototype/allocator_state.go | 3 +++ .../kvserver/allocator/mmaprototype/cluster_state.go | 10 ++++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/pkg/kv/kvserver/allocator/mmaprototype/allocator_state.go b/pkg/kv/kvserver/allocator/mmaprototype/allocator_state.go index 62d4bb600754..7fe378fb90a3 100644 --- a/pkg/kv/kvserver/allocator/mmaprototype/allocator_state.go +++ b/pkg/kv/kvserver/allocator/mmaprototype/allocator_state.go @@ -537,6 +537,9 @@ func sortTargetCandidateSetAndPick( } } // Diversity is the same. Include if not reaching disk capacity. + // TODO(tbg): remove highDiskSpaceUtilization check here. These candidates + // should instead be filtered out by retainReadyLeaseTargetStoresOnly (which + // filters down the initial candidate set before computing the mean). if !cand.highDiskSpaceUtilization { cands.candidates[j] = cand j++ diff --git a/pkg/kv/kvserver/allocator/mmaprototype/cluster_state.go b/pkg/kv/kvserver/allocator/mmaprototype/cluster_state.go index 7cf3dcca3216..5e61460fc092 100644 --- a/pkg/kv/kvserver/allocator/mmaprototype/cluster_state.go +++ b/pkg/kv/kvserver/allocator/mmaprototype/cluster_state.go @@ -1623,6 +1623,7 @@ func (cs *clusterState) processStoreLeaseholderMsgInternal( topk.dim = WriteBandwidth } if sls.highDiskSpaceUtilization { + // If disk space is running out, shedding bytes becomes the top priority. topk.dim = ByteSize } else if sls.sls > loadNoChange { // If multiple dimensions are contributing the same loadSummary, we will pick @@ -2451,10 +2452,11 @@ func computeLoadSummary( } nls := loadSummaryForDimension(ctx, storeIDForLogging, ns.NodeID, CPURate, ns.adjustedCPU, ns.CapacityCPU, mnl.loadCPU, mnl.utilCPU) return storeLoadSummary{ - worstDim: worstDim, - sls: sls, - nls: nls, - dimSummary: dimSummary, + worstDim: worstDim, + sls: sls, + nls: nls, + dimSummary: dimSummary, + // TODO(tbg): remove highDiskSpaceUtilization. 
highDiskSpaceUtilization: highDiskSpaceUtil, maxFractionPendingIncrease: ss.maxFractionPendingIncrease, maxFractionPendingDecrease: ss.maxFractionPendingDecrease, From d0ae816ef3f138c9e41ec7343b44bbaacf4a9e85 Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Fri, 5 Dec 2025 12:12:06 +0100 Subject: [PATCH 2/3] mmaprototype: improve lease rebalancing - Improve documentation and TODOs - Skip leaseholder for disposition checks --- .../allocator/mmaprototype/allocator_state.go | 17 +++ .../allocator/mmaprototype/cluster_state.go | 104 ++++++++++++------ .../cluster_state_rebalance_stores.go | 70 ++++++++++-- .../mmaprototype/cluster_state_test.go | 3 +- .../cluster_state/lease_disposition.txt | 9 ++ .../rebalance_stores_capacity_mismatch.txt | 6 +- ...alance_stores_cpu_lease_frac_threshold.txt | 4 +- ...lance_stores_cpu_lease_refusing_target.txt | 2 +- ...ance_stores_cpu_lease_replica_refusing.txt | 2 +- ...alance_stores_cpu_lease_transfer_count.txt | 4 +- ...ce_stores_cpu_lease_transfer_unbounded.txt | 8 +- 11 files changed, 171 insertions(+), 58 deletions(-) diff --git a/pkg/kv/kvserver/allocator/mmaprototype/allocator_state.go b/pkg/kv/kvserver/allocator/mmaprototype/allocator_state.go index 7fe378fb90a3..677a2068d786 100644 --- a/pkg/kv/kvserver/allocator/mmaprototype/allocator_state.go +++ b/pkg/kv/kvserver/allocator/mmaprototype/allocator_state.go @@ -509,6 +509,23 @@ func sortTargetCandidateSetAndPick( } slices.SortFunc(cands.candidates, func(a, b candidateInfo) int { if diversityScoresAlmostEqual(a.diversityScore, b.diversityScore) { + // Note: Consider the case where the current leaseholder's LPI is + // 3 (lower is better) and we have the following candidates: + // - LPI=1 SLS=normal + // - LPI=2 SLS=low + // Currently we consider the low-SLS candidate first. This is in + // contrast to the single-metric allocator, which only considers + // candidates in the lowest-SLS class (i.e. wouldn't even consider + // the low-SLS candidate since we have a candidate at LPI=1). If we + // make the corresponding change in candidateToMoveLease, we would + // match the single-metric allocator's behavior, but it's unclear + // that that would be better. A good middle ground could be sorting + // here by LPI first, then SLS. That should result in mma preferring + // improving the lease preference, but if that is not possible, it + // would settle for not making it worse (than the current + // leaseholder), which the single-metric allocator won't. + // + // TODO(tbg): consider changing this to sort by LPI first, then SLS. return cmp.Or(cmp.Compare(a.sls, b.sls), cmp.Compare(a.leasePreferenceIndex, b.leasePreferenceIndex), cmp.Compare(a.StoreID, b.StoreID)) diff --git a/pkg/kv/kvserver/allocator/mmaprototype/cluster_state.go b/pkg/kv/kvserver/allocator/mmaprototype/cluster_state.go index 5e61460fc092..c330106267a8 100644 --- a/pkg/kv/kvserver/allocator/mmaprototype/cluster_state.go +++ b/pkg/kv/kvserver/allocator/mmaprototype/cluster_state.go @@ -2192,15 +2192,42 @@ func (cs *clusterState) getNodeReportedLoad(nodeID roachpb.NodeID) *NodeLoad { return nil } -// canShedAndAddLoad returns true if the delta can be added to the target -// store and removed from the src store, such that the relative load summaries -// will not get worse. +// canShedAndAddLoad returns true if the delta can be added to the target store +// and removed from the src store. It does not change any state between the call +// and return. // -// It does not change any state between the call and return. 
+// overloadDim represents the dimension that is overloaded in the source and the
+// function requires that along that dimension, the target is < loadNoChange and
+// the source is > loadNoChange.
 //
-// overloadDim represents the dimension that is overloaded in the source and
-// the function requires that the target must be currently < loadNoChange
-// along that dimension.
+// Broadly speaking, the method tries to ascertain that the target wouldn't be
+// worse off than the source following the transfer. To do this, the method
+// looks at a load summary for the target that would result from the load
+// transfer (targetLoadSummary).
+//
+// When onlyConsiderTargetCPUSummary is true, the targetLoadSummary derives from
+// the target's post-transfer CPU dimension only. This is appropriate when a lease is
+// transferred, as this should only affect the CPU dimension, and we don't want
+// lease transfers to be subject to stricter checks related to other dimensions.
+// When onlyConsiderTargetCPUSummary is false, targetLoadSummary is the target's
+// worst post-transfer load summary. In both cases, the node load summary is also
+// considered.
+//
+// TODO(tbg): understand and explain why the node load summary is in the mix here.
+//
+// In either case, if the targetLoadSummary is < loadNoChange, the change is
+// permitted right away. Otherwise, stricter checks apply: after the transfer,
+// - the target must not be overloadUrgent,
+// - the target must have no pending changes (to delay making a potentially
+//   non-ideal choice of the target),
+// - the target's overloaded dimension's summary must not be worse than the
+//   source's ("overloadedDimPermitsChange"),
+// - along each of the other (!=overloadedDim) dimensions, the percentage
+//   increase in load is at most a third of that of the overloaded dimension
+//   (e.g. if CPU goes up by 30%, WriteBandwidth can go up by at most 10%),
+// - the target's node load summary must not be worse than the target's store
+//   load summary. See inline comment for more details.
+
 func (cs *clusterState) canShedAndAddLoad(
 	ctx context.Context,
 	srcSS *storeState,
@@ -2217,6 +2244,11 @@ func (cs *clusterState) canShedAndAddLoad(
 	// the load delta addition flips the loadSummary for either the target or the
 	// source, which suggests it might be useful to add this to verbose logging.
 
+	// Compute srcSLS and targetSLS, which are the load summaries of the source
+	// and target that would result from moving the lease.
+	//
+	// TODO(tbg): extract this into a helper and set it up so that it doesn't
+	// temporarily modify the cluster state.
 	targetNS := cs.nodes[targetSS.NodeID]
 	// Add the delta.
 	deltaToAdd := loadVectorToAdd(delta)
@@ -2255,28 +2287,16 @@ func (cs *clusterState) canShedAndAddLoad(
 		reason.WriteString("targetSLS.highDiskSpaceUtilization")
 		return false
 	}
-	// We define targetSummary as a summarization across all dimensions of the
-	// target. A targetSummary < loadNoChange always accepts the change. When
-	// the targetSummary >= loadNoChange, we are stricter and require both that
-	// there are no pending changes in the target, and the target is "not worse"
-	// in a way that will cause thrashing, where the details are defined below.
-	// The no pending changes requirement is to delay making a potentially
-	// non-ideal choice of the target.
-	//
-	// NB: The target's overload dimension summary must have been <
-	// loadNoChange, and the source must have been > loadNoChange.
+
+	// We define targetSummary as the "worst" of the considered load dimensions
+	// (only CPU, or all).
 	var targetSummary loadSummary
 	if onlyConsiderTargetCPUSummary {
 		targetSummary = targetSLS.dimSummary[CPURate]
-		if targetSummary < targetSLS.nls {
-			targetSummary = targetSLS.nls
-		}
 	} else {
 		targetSummary = targetSLS.sls
-		if targetSummary < targetSLS.nls {
-			targetSummary = targetSLS.nls
-		}
 	}
+	targetSummary = max(targetSummary, targetSLS.nls)
 
 	if targetSummary < loadNoChange {
 		return true
@@ -2285,6 +2305,7 @@
 		reason.WriteString("overloadUrgent")
 		return false
 	}
+
 	// Need to consider additional factors.
 	//
 	// It is possible that both are overloadSlow in aggregate. We want to make
@@ -2313,7 +2334,7 @@
 	// That boolean predicate can also be too strict, in that we should permit
 	// transitions to overloadSlow along one dimension, to allow for an
 	// exchange.
-	overloadedDimFractionIncrease := math.MaxFloat64
+	var overloadedDimFractionIncrease float64
 	if targetSS.adjusted.load[overloadedDim] > 0 {
 		overloadedDimFractionIncrease = float64(deltaToAdd[overloadedDim]) /
 			float64(targetSS.adjusted.load[overloadedDim])
@@ -2354,14 +2375,33 @@
 		targetSLS.maxFractionPendingIncrease < epsilon &&
 		targetSLS.maxFractionPendingDecrease < epsilon &&
 		// NB: targetSLS.nls <= targetSLS.sls is not a typo, in that we are
-		// comparing targetSLS with itself. The nls only captures node-level
-		// CPU, so if a store that is overloaded wrt WriteBandwidth wants to
-		// shed to a store that is overloaded wrt CPURate, we need to permit
-		// that. However, the nls of the former will be less than the that of
-		// the latter. By looking at the nls of the target here, we are making
-		// sure that it is no worse than the sls of the target, since if it
-		// is, the node is overloaded wrt CPU due to some other store on that
-		// node, and we should be shedding that load first.
+		// comparing targetSLS with itself.
+		//
+		// Consider a node that has two stores:
+		// - s1 is low on CPU
+		// - s2 is very high on CPU, resulting in a node load summary of
+		//   overloadSlow or overloadUrgent.
+		//
+		// In this code path, targetSLS is >= loadNoChange, so there must be
+		// some overload dimension in targetSLS. If it comes from write bandwidth
+		// (or any other non-CPU dimension), without this check, s1 might be
+		// considered an acceptable target for adding CPU load. But it is clearly
+		// not a good target, since the node housing s1 is CPU overloaded - s2
+		// should be shedding CPU load first.
+		// This example motivates the condition below. If we reach this code,
+		// we know that targetSLS >= loadNoChange, and we decide:
+		// - at sls=loadNoChange, we require nls <= loadNoChange
+		// - at sls=overloadSlow, we require nls <= overloadSlow
+		// - at sls=overloadUrgent, we require nls <= overloadUrgent.
+		// In other words, whenever a node-level summary was "bumped up" beyond
+		// the target's by some other local store, we reject the change.
+		//
+		// TODO(tbg): While the example illustrates that "something had to be
+		// done", I don't understand why it makes sense to solve this exactly
+		// as it was done. The node-level summary is based on node-wide CPU
+		// utilization as well as its distance from the mean (across the
+		// candidate set). Store summaries a) reflect the worst dimension, and
+		// b) on the CPU dimension are based on the store-apportioned capacity.
 		targetSLS.nls <= targetSLS.sls
 	if canAddLoad {
 		return true
diff --git a/pkg/kv/kvserver/allocator/mmaprototype/cluster_state_rebalance_stores.go b/pkg/kv/kvserver/allocator/mmaprototype/cluster_state_rebalance_stores.go
index c51505ee21f6..bde3fae5c925 100644
--- a/pkg/kv/kvserver/allocator/mmaprototype/cluster_state_rebalance_stores.go
+++ b/pkg/kv/kvserver/allocator/mmaprototype/cluster_state_rebalance_stores.go
@@ -700,23 +700,47 @@ func (re *rebalanceEnv) rebalanceLeasesFromLocalStoreID(
 				"store=%v range_id=%v should be leaseholder but isn't", store.StoreID, rangeID))
 		}
+
+		// Get the stores from the replica set that are at least as good as the
+		// current leaseholder wrt satisfaction of lease preferences. This means
+		// that mma will never make lease preference violations worse when
+		// moving the lease.
+		//
+		// Example:
+		// s1 and s2 in us-east, s3 in us-central, lease preference for us-east.
+		// - if s3 has the lease: candsPL = [s1, s2, s3]
+		// - if s1 has the lease: candsPL = [s2, s1] (s3 filtered out)
+		// - if s2 has the lease: candsPL = [s1, s2] (s3 filtered out)
+		//
+		// In effect, we interpret each replica whose store is worse than the current
+		// leaseholder as ill-disposed for the lease and (pre-means) filter them out.
 		cands, _ := rstate.constraints.candidatesToMoveLease()
-		var candsPL storeSet
+		// candsPL is the set of stores over which the mean is computed. This
+		// should include the current leaseholder, so we add it in, but only
+		// after logging the candidates below.
+		var candsPL storeSet // TODO(tbg): avoid allocation
 		for _, cand := range cands {
 			candsPL.insert(cand.storeID)
 		}
-		// Always consider the local store (which already holds the lease) as a
-		// candidate, so that we don't move the lease away if keeping it would be
-		// the better option overall.
-		// TODO(tbg): is this really needed? We intentionally exclude the leaseholder
-		// in candidatesToMoveLease, so why reinsert it now?
-		candsPL.insert(store.StoreID)
-		if len(candsPL) <= 1 {
+		if len(candsPL) == 0 {
+			// No candidates to move the lease to. We bail early to avoid some
+			// logging below that is not helpful if we didn't have any real
+			// candidates to begin with.
 			re.passObs.leaseShed(noCandidate)
-			continue // leaseholder is the only candidate
+			continue
 		}
+		// NB: intentionally log before re-adding the current leaseholder so
+		// we don't list it as a candidate.
+		log.KvDistribution.VEventf(ctx, 2, "considering lease-transfer r%v from s%v: candidates are %v", rangeID, store.StoreID, candsPL)
+		// Now candsPL is ready for computing the means.
+		candsPL.insert(store.StoreID)
+
+		// Filter by disposition. Note that we pass the shedding store in to
+		// make sure that its disposition does not matter. In other words, the
+		// leaseholder is always going to include itself in the mean, even if it
+		// is ill-disposed towards leases.
+ candsPL = retainReadyLeaseTargetStoresOnly(ctx, candsPL, re.stores, rangeID, store.StoreID) - candsPL = retainReadyLeaseTargetStoresOnly(ctx, candsPL, re.stores, rangeID) // INVARIANT: candsPL - {store.StoreID} \subset cands if len(candsPL) == 0 || (len(candsPL) == 1 && candsPL[0] == store.StoreID) { re.passObs.leaseShed(noHealthyCandidate) @@ -731,7 +755,6 @@ func (re *rebalanceEnv) rebalanceLeasesFromLocalStoreID( clear(re.scratch.nodes) means := computeMeansForStoreSet(re, candsPL, re.scratch.nodes, re.scratch.stores) sls := re.computeLoadSummary(ctx, store.StoreID, &means.storeLoad, &means.nodeLoad) - log.KvDistribution.VEventf(ctx, 2, "considering lease-transfer r%v from s%v: candidates are %v", rangeID, store.StoreID, candsPL) if sls.dimSummary[CPURate] < overloadSlow { // This store is not cpu overloaded relative to these candidates for // this range. @@ -741,6 +764,9 @@ func (re *rebalanceEnv) rebalanceLeasesFromLocalStoreID( } var candsSet candidateSet for _, cand := range cands { + if cand.storeID == store.StoreID { + panic(errors.AssertionFailedf("current leaseholder can't be a candidate: %v", cand)) + } if !candsPL.contains(cand.storeID) { // Skip candidates that are filtered out by // retainReadyLeaseTargetStoresOnly. @@ -822,10 +848,30 @@ func (re *rebalanceEnv) rebalanceLeasesFromLocalStoreID( // // The input storeSet is mutated (and used to for the returned result). func retainReadyLeaseTargetStoresOnly( - ctx context.Context, in storeSet, stores map[roachpb.StoreID]*storeState, rangeID roachpb.RangeID, + ctx context.Context, + in storeSet, + stores map[roachpb.StoreID]*storeState, + rangeID roachpb.RangeID, + existingLeaseholder roachpb.StoreID, ) storeSet { out := in[:0] for _, storeID := range in { + if storeID == existingLeaseholder { + // The existing leaseholder is always included in the mean, even if + // it is ill-disposed towards leases. Because it is holding the lease, + // we know that its load is recent. + // + // Example: Consider a range with leaseholder on s1 and voters on s2 + // and s3. All stores have CPU capacity of 100 units. s1 has load 40, + // s2 has load 80, s3 has load 80. The mean CPU utilization (total + // load / total capacity) is (40+80+80)/(100+100+100) = 66% if we + // include s1 and (80+80)/(100+100) = 80% if we don't. + // If we filtered out s1 just because it is ill-disposed towards + // leases, s2 and s3 would be exactly on the mean and we might + // consider transferring the lease to them, but we should not. 
+ out = append(out, storeID) + continue + } s := stores[storeID].status switch { case s.Disposition.Lease != LeaseDispositionOK: diff --git a/pkg/kv/kvserver/allocator/mmaprototype/cluster_state_test.go b/pkg/kv/kvserver/allocator/mmaprototype/cluster_state_test.go index e25af0004929..6e7258e0fee3 100644 --- a/pkg/kv/kvserver/allocator/mmaprototype/cluster_state_test.go +++ b/pkg/kv/kvserver/allocator/mmaprototype/cluster_state_test.go @@ -630,7 +630,8 @@ func TestClusterState(t *testing.T) { case "retain-ready-lease-target-stores-only": in := dd.ScanArg[[]roachpb.StoreID](t, d, "in") rangeID := dd.ScanArg[roachpb.RangeID](t, d, "range-id") - out := retainReadyLeaseTargetStoresOnly(ctx, storeSet(in), cs.stores, rangeID) + lh, _ := dd.ScanArgOpt[roachpb.StoreID](t, d, "leaseholder") + out := retainReadyLeaseTargetStoresOnly(ctx, storeSet(in), cs.stores, rangeID, lh) rec := finishAndGet() var sb redact.StringBuilder rec.SafeFormatMinimal(&sb) diff --git a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/lease_disposition.txt b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/lease_disposition.txt index d4ed37ba94ec..61c77a69a356 100644 --- a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/lease_disposition.txt +++ b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/lease_disposition.txt @@ -124,6 +124,11 @@ retain-ready-lease-target-stores-only in=(1,2,3) range-id=1 skipping s2 for lease transfer: replica lease disposition refusing (health ok) [1 3] +# The leaseholder is exempt from the disposition check. +retain-ready-lease-target-stores-only in=(1,2,3) range-id=1 leaseholder=2 +---- +[1 2 3] + # Restore s2's replica disposition. store-leaseholder-msg store-id=1 @@ -137,3 +142,7 @@ store-id=1 retain-ready-lease-target-stores-only in=(1,2,3) range-id=1 ---- [1 2 3] + +retain-ready-lease-target-stores-only in=(1,2,3) range-id=1 leaseholder=2 +---- +[1 2 3] diff --git a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_capacity_mismatch.txt b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_capacity_mismatch.txt index 3f1e9da46fc9..94903429af4f 100644 --- a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_capacity_mismatch.txt +++ b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_capacity_mismatch.txt @@ -181,11 +181,11 @@ rebalance-stores store-id=1 max-range-move-count=999 fraction-pending-decrease-t [mmaid=1] start processing shedding store s1: cpu node load overloadSlow, store load overloadSlow, worst dim CPURate [mmaid=1] top-K[CPURate] ranges for s1 with lease on local s1: r3:[cpu:200ns/s, write-bandwidth:0 B/s, byte-size:0 B] r2:[cpu:200ns/s, write-bandwidth:0 B/s, byte-size:0 B] r1:[cpu:200ns/s, write-bandwidth:0 B/s, byte-size:0 B] [mmaid=1] local store s1 is CPU overloaded (overloadSlow >= overloadSlow), attempting lease transfers first +[mmaid=1] considering lease-transfer r3 from s1: candidates are [3 5] [mmaid=1] load summary for dim=CPURate (s1): overloadSlow, reason: load is >10% above mean [load=900 meanLoad=693 fractionUsed=90.00% meanUtil=83.20% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0] [mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=CPURate (n1): overloadSlow, reason: load is 
>10% above mean [load=900 meanLoad=693 fractionUsed=90.00% meanUtil=83.20% capacity=1000] -[mmaid=1] considering lease-transfer r3 from s1: candidates are [1 3 5] [mmaid=1] load summary for dim=CPURate (s3): overloadSlow, reason: load is >10% above mean [load=900 meanLoad=693 fractionUsed=90.00% meanUtil=83.20% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0] [mmaid=1] load summary for dim=ByteSize (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] @@ -207,11 +207,11 @@ rebalance-stores store-id=1 max-range-move-count=999 fraction-pending-decrease-t [mmaid=1] cannot add load to n5s5: due to overloadUrgent [mmaid=1] [target_sls:(store=overloadUrgent worst=CPURate cpu=overloadUrgent writes=loadNormal bytes=loadNormal node=overloadUrgent high_disk=false frac_pending=0.00,0.00(true)),src_sls:(store=loadNormal worst=CPURate cpu=loadNormal writes=loadNormal bytes=loadNormal node=loadNormal high_disk=false frac_pending=0.00,0.00(true))] [mmaid=1] result(failed): cannot shed from s1 to s5 for r3: delta load [cpu:180ns/s, write-bandwidth:0 B/s, byte-size:0 B] +[mmaid=1] considering lease-transfer r2 from s1: candidates are [2 4] [mmaid=1] load summary for dim=CPURate (s1): overloadSlow, reason: load is >10% above mean [load=900 meanLoad=700 fractionUsed=90.00% meanUtil=84.00% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0] [mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=CPURate (n1): overloadSlow, reason: load is >10% above mean [load=900 meanLoad=700 fractionUsed=90.00% meanUtil=84.00% capacity=1000] -[mmaid=1] considering lease-transfer r2 from s1: candidates are [1 2 4] [mmaid=1] load summary for dim=CPURate (s2): overloadSlow, reason: load is >10% above mean [load=900 meanLoad=700 fractionUsed=90.00% meanUtil=84.00% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0] [mmaid=1] load summary for dim=ByteSize (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] @@ -233,11 +233,11 @@ rebalance-stores store-id=1 max-range-move-count=999 fraction-pending-decrease-t [mmaid=1] cannot add load to n4s4: due to overloadUrgent [mmaid=1] [target_sls:(store=overloadUrgent worst=CPURate cpu=overloadUrgent writes=loadNormal bytes=loadNormal node=overloadUrgent high_disk=false frac_pending=0.00,0.00(true)),src_sls:(store=loadNormal worst=CPURate cpu=loadNormal writes=loadNormal bytes=loadNormal node=loadNormal high_disk=false frac_pending=0.00,0.00(true))] [mmaid=1] result(failed): cannot shed from s1 to s4 for r2: delta load [cpu:180ns/s, write-bandwidth:0 B/s, byte-size:0 B] +[mmaid=1] considering lease-transfer r1 from s1: candidates are [2 3] [mmaid=1] load summary for dim=CPURate (s1): loadNormal, reason: load is within 5% of mean [load=900 meanLoad=900 fractionUsed=90.00% meanUtil=90.00% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0] [mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] 
load summary for dim=CPURate (n1): loadNormal, reason: load is within 5% of mean [load=900 meanLoad=900 fractionUsed=90.00% meanUtil=90.00% capacity=1000] -[mmaid=1] considering lease-transfer r1 from s1: candidates are [1 2 3] [mmaid=1] result(failed): skipping r1 since store not overloaded relative to candidates [mmaid=1] attempting to shed replicas next [mmaid=1] load summary for dim=CPURate (s1): overloadSlow, reason: fractionUsed > 75% [load=900 meanLoad=541 fractionUsed=90.00% meanUtil=75.80% capacity=1000] diff --git a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_lease_frac_threshold.txt b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_lease_frac_threshold.txt index d403b8024c39..192a62144629 100644 --- a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_lease_frac_threshold.txt +++ b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_lease_frac_threshold.txt @@ -29,11 +29,11 @@ rebalance-stores store-id=1 fraction-pending-decrease-threshold=0.4 [mmaid=1] start processing shedding store s1: cpu node load overloadUrgent, store load overloadUrgent, worst dim CPURate [mmaid=1] top-K[CPURate] ranges for s1 with lease on local s1: r4:[cpu:250ns/s, write-bandwidth:0 B/s, byte-size:0 B] r3:[cpu:250ns/s, write-bandwidth:0 B/s, byte-size:0 B] r2:[cpu:250ns/s, write-bandwidth:0 B/s, byte-size:0 B] r1:[cpu:250ns/s, write-bandwidth:0 B/s, byte-size:0 B] [mmaid=1] local store s1 is CPU overloaded (overloadUrgent >= overloadSlow), attempting lease transfers first +[mmaid=1] considering lease-transfer r4 from s1: candidates are [2 3] [mmaid=1] load summary for dim=CPURate (s1): overloadUrgent, reason: fractionUsed > 90% [load=1000 meanLoad=400 fractionUsed=100.00% meanUtil=40.00% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=CPURate (n1): overloadUrgent, reason: fractionUsed > 90% [load=1000 meanLoad=400 fractionUsed=100.00% meanUtil=40.00% capacity=1000] -[mmaid=1] considering lease-transfer r4 from s1: candidates are [1 2 3] [mmaid=1] load summary for dim=CPURate (s2): loadLow, reason: load is >10% below mean [load=100 meanLoad=400 fractionUsed=10.00% meanUtil=40.00% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=ByteSize (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] @@ -53,11 +53,11 @@ rebalance-stores store-id=1 fraction-pending-decrease-threshold=0.4 [mmaid=1] load summary for dim=CPURate (n1): overloadUrgent, reason: fractionUsed > 75% and >1.5x meanUtil [load=770 meanLoad=400 fractionUsed=77.00% meanUtil=40.00% capacity=1000] [mmaid=1] can add load to n2s2: true targetSLS[(store=loadNormal worst=WriteBandwidth cpu=loadLow writes=loadNormal bytes=loadNormal node=loadLow high_disk=false frac_pending=0.00,0.00(true))] srcSLS[(store=overloadUrgent worst=CPURate cpu=overloadUrgent writes=loadNormal bytes=loadNormal node=overloadUrgent high_disk=false frac_pending=0.00,0.00(true))] [mmaid=1] 
result(success): shedding r4 lease from s1 to s2 [change:r4=[transfer_to=2 cids=1,2]] with resulting loads source:[cpu:770ns/s, write-bandwidth:0 B/s, byte-size:0 B] target:[cpu:353ns/s, write-bandwidth:0 B/s, byte-size:0 B] (means: [cpu:400ns/s, write-bandwidth:0 B/s, byte-size:0 B]) (frac_pending: (src:0.00,target:0.23) (src:2.53,target:0.00)) +[mmaid=1] considering lease-transfer r3 from s1: candidates are [2 3] [mmaid=1] load summary for dim=CPURate (s1): overloadUrgent, reason: fractionUsed > 75% and >1.5x meanUtil [load=770 meanLoad=400 fractionUsed=77.00% meanUtil=40.00% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=CPURate (n1): overloadUrgent, reason: fractionUsed > 75% and >1.5x meanUtil [load=770 meanLoad=400 fractionUsed=77.00% meanUtil=40.00% capacity=1000] -[mmaid=1] considering lease-transfer r3 from s1: candidates are [1 2 3] [mmaid=1] load summary for dim=CPURate (s2): loadLow, reason: load is >10% below mean [load=353 meanLoad=400 fractionUsed=35.30% meanUtil=40.00% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=ByteSize (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] diff --git a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_lease_refusing_target.txt b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_lease_refusing_target.txt index 1da24fd04dbe..0f59952fae42 100644 --- a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_lease_refusing_target.txt +++ b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_lease_refusing_target.txt @@ -69,12 +69,12 @@ rebalance-stores store-id=1 [mmaid=1] start processing shedding store s1: cpu node load overloadUrgent, store load overloadUrgent, worst dim CPURate [mmaid=1] top-K[CPURate] ranges for s1 with lease on local s1: r1:[cpu:100ns/s, write-bandwidth:0 B/s, byte-size:0 B] [mmaid=1] local store s1 is CPU overloaded (overloadUrgent >= overloadSlow), attempting lease transfers first +[mmaid=1] considering lease-transfer r1 from s1: candidates are [2 3] [mmaid=1] skipping s2 for lease transfer: lease disposition refusing (health ok) [mmaid=1] load summary for dim=CPURate (s1): loadNormal, reason: load is within 5% of mean [load=1000 meanLoad=1000 fractionUsed=100.00% meanUtil=100.00% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=CPURate (n1): loadNormal, reason: load is within 5% of mean [load=1000 meanLoad=1000 fractionUsed=100.00% meanUtil=100.00% capacity=1000] -[mmaid=1] considering lease-transfer r1 from s1: candidates are [1 3] [mmaid=1] result(failed): skipping r1 since store not overloaded relative to candidates [mmaid=1] 
attempting to shed replicas next [mmaid=1] excluding all stores on n1 due to overload/fd status diff --git a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_lease_replica_refusing.txt b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_lease_replica_refusing.txt index 58597c4f4f00..6c671b32b6bb 100644 --- a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_lease_replica_refusing.txt +++ b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_lease_replica_refusing.txt @@ -77,12 +77,12 @@ rebalance-stores store-id=1 [mmaid=1] start processing shedding store s1: cpu node load overloadUrgent, store load overloadUrgent, worst dim CPURate [mmaid=1] top-K[CPURate] ranges for s1 with lease on local s1: r1:[cpu:100ns/s, write-bandwidth:0 B/s, byte-size:0 B] [mmaid=1] local store s1 is CPU overloaded (overloadUrgent >= overloadSlow), attempting lease transfers first +[mmaid=1] considering lease-transfer r1 from s1: candidates are [2 3] [mmaid=1] skipping s2 for lease transfer: replica lease disposition refusing (health ok) [mmaid=1] load summary for dim=CPURate (s1): overloadUrgent, reason: fractionUsed > 90% [load=1000 meanLoad=900 fractionUsed=100.00% meanUtil=90.00% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=CPURate (n1): overloadUrgent, reason: fractionUsed > 90% [load=1000 meanLoad=900 fractionUsed=100.00% meanUtil=90.00% capacity=1000] -[mmaid=1] considering lease-transfer r1 from s1: candidates are [1 3] [mmaid=1] load summary for dim=CPURate (s3): loadLow, reason: load is >10% below mean [load=800 meanLoad=900 fractionUsed=80.00% meanUtil=90.00% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=ByteSize (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] diff --git a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_lease_transfer_count.txt b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_lease_transfer_count.txt index 31fe7defbd37..ad88ed7c56fb 100644 --- a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_lease_transfer_count.txt +++ b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_lease_transfer_count.txt @@ -29,11 +29,11 @@ rebalance-stores store-id=1 fraction-pending-decrease-threshold=1.0 max-lease-tr [mmaid=1] start processing shedding store s1: cpu node load overloadUrgent, store load overloadUrgent, worst dim CPURate [mmaid=1] top-K[CPURate] ranges for s1 with lease on local s1: r4:[cpu:250ns/s, write-bandwidth:0 B/s, byte-size:0 B] r3:[cpu:250ns/s, write-bandwidth:0 B/s, byte-size:0 B] r2:[cpu:250ns/s, write-bandwidth:0 B/s, byte-size:0 B] r1:[cpu:250ns/s, write-bandwidth:0 B/s, byte-size:0 B] [mmaid=1] local store s1 is CPU overloaded (overloadUrgent >= overloadSlow), attempting lease transfers first +[mmaid=1] considering lease-transfer r4 from s1: candidates are [2 
3] [mmaid=1] load summary for dim=CPURate (s1): overloadUrgent, reason: fractionUsed > 90% [load=1000 meanLoad=400 fractionUsed=100.00% meanUtil=40.00% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=CPURate (n1): overloadUrgent, reason: fractionUsed > 90% [load=1000 meanLoad=400 fractionUsed=100.00% meanUtil=40.00% capacity=1000] -[mmaid=1] considering lease-transfer r4 from s1: candidates are [1 2 3] [mmaid=1] load summary for dim=CPURate (s2): loadLow, reason: load is >10% below mean [load=100 meanLoad=400 fractionUsed=10.00% meanUtil=40.00% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=ByteSize (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] @@ -53,11 +53,11 @@ rebalance-stores store-id=1 fraction-pending-decrease-threshold=1.0 max-lease-tr [mmaid=1] load summary for dim=CPURate (n1): overloadUrgent, reason: fractionUsed > 75% and >1.5x meanUtil [load=770 meanLoad=400 fractionUsed=77.00% meanUtil=40.00% capacity=1000] [mmaid=1] can add load to n2s2: true targetSLS[(store=loadNormal worst=WriteBandwidth cpu=loadLow writes=loadNormal bytes=loadNormal node=loadLow high_disk=false frac_pending=0.00,0.00(true))] srcSLS[(store=overloadUrgent worst=CPURate cpu=overloadUrgent writes=loadNormal bytes=loadNormal node=overloadUrgent high_disk=false frac_pending=0.00,0.00(true))] [mmaid=1] result(success): shedding r4 lease from s1 to s2 [change:r4=[transfer_to=2 cids=1,2]] with resulting loads source:[cpu:770ns/s, write-bandwidth:0 B/s, byte-size:0 B] target:[cpu:353ns/s, write-bandwidth:0 B/s, byte-size:0 B] (means: [cpu:400ns/s, write-bandwidth:0 B/s, byte-size:0 B]) (frac_pending: (src:0.00,target:0.23) (src:2.53,target:0.00)) +[mmaid=1] considering lease-transfer r3 from s1: candidates are [2 3] [mmaid=1] load summary for dim=CPURate (s1): overloadUrgent, reason: fractionUsed > 75% and >1.5x meanUtil [load=770 meanLoad=400 fractionUsed=77.00% meanUtil=40.00% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=CPURate (n1): overloadUrgent, reason: fractionUsed > 75% and >1.5x meanUtil [load=770 meanLoad=400 fractionUsed=77.00% meanUtil=40.00% capacity=1000] -[mmaid=1] considering lease-transfer r3 from s1: candidates are [1 2 3] [mmaid=1] load summary for dim=CPURate (s2): loadLow, reason: load is >10% below mean [load=353 meanLoad=400 fractionUsed=35.30% meanUtil=40.00% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=ByteSize (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] diff --git 
a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_lease_transfer_unbounded.txt b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_lease_transfer_unbounded.txt index 5e2a8bb03e8b..19832e40e1c2 100644 --- a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_lease_transfer_unbounded.txt +++ b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_lease_transfer_unbounded.txt @@ -28,11 +28,11 @@ rebalance-stores store-id=1 fraction-pending-decrease-threshold=10.0 max-lease-t [mmaid=1] start processing shedding store s1: cpu node load overloadUrgent, store load overloadUrgent, worst dim CPURate [mmaid=1] top-K[CPURate] ranges for s1 with lease on local s1: r4:[cpu:250ns/s, write-bandwidth:0 B/s, byte-size:0 B] r3:[cpu:250ns/s, write-bandwidth:0 B/s, byte-size:0 B] r2:[cpu:250ns/s, write-bandwidth:0 B/s, byte-size:0 B] r1:[cpu:250ns/s, write-bandwidth:0 B/s, byte-size:0 B] [mmaid=1] local store s1 is CPU overloaded (overloadUrgent >= overloadSlow), attempting lease transfers first +[mmaid=1] considering lease-transfer r4 from s1: candidates are [2 3] [mmaid=1] load summary for dim=CPURate (s1): overloadUrgent, reason: fractionUsed > 90% [load=1000 meanLoad=400 fractionUsed=100.00% meanUtil=40.00% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=CPURate (n1): overloadUrgent, reason: fractionUsed > 90% [load=1000 meanLoad=400 fractionUsed=100.00% meanUtil=40.00% capacity=1000] -[mmaid=1] considering lease-transfer r4 from s1: candidates are [1 2 3] [mmaid=1] load summary for dim=CPURate (s2): loadLow, reason: load is >10% below mean [load=100 meanLoad=400 fractionUsed=10.00% meanUtil=40.00% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=ByteSize (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] @@ -52,11 +52,11 @@ rebalance-stores store-id=1 fraction-pending-decrease-threshold=10.0 max-lease-t [mmaid=1] load summary for dim=CPURate (n1): overloadUrgent, reason: fractionUsed > 75% and >1.5x meanUtil [load=770 meanLoad=400 fractionUsed=77.00% meanUtil=40.00% capacity=1000] [mmaid=1] can add load to n2s2: true targetSLS[(store=loadNormal worst=WriteBandwidth cpu=loadLow writes=loadNormal bytes=loadNormal node=loadLow high_disk=false frac_pending=0.00,0.00(true))] srcSLS[(store=overloadUrgent worst=CPURate cpu=overloadUrgent writes=loadNormal bytes=loadNormal node=overloadUrgent high_disk=false frac_pending=0.00,0.00(true))] [mmaid=1] result(success): shedding r4 lease from s1 to s2 [change:r4=[transfer_to=2 cids=1,2]] with resulting loads source:[cpu:770ns/s, write-bandwidth:0 B/s, byte-size:0 B] target:[cpu:353ns/s, write-bandwidth:0 B/s, byte-size:0 B] (means: [cpu:400ns/s, write-bandwidth:0 B/s, byte-size:0 B]) (frac_pending: (src:0.00,target:0.23) (src:2.53,target:0.00)) +[mmaid=1] considering lease-transfer r3 from s1: candidates are [2 3] [mmaid=1] load summary for dim=CPURate (s1): overloadUrgent, reason: fractionUsed > 
75% and >1.5x meanUtil [load=770 meanLoad=400 fractionUsed=77.00% meanUtil=40.00% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=CPURate (n1): overloadUrgent, reason: fractionUsed > 75% and >1.5x meanUtil [load=770 meanLoad=400 fractionUsed=77.00% meanUtil=40.00% capacity=1000] -[mmaid=1] considering lease-transfer r3 from s1: candidates are [1 2 3] [mmaid=1] load summary for dim=CPURate (s2): loadLow, reason: load is >10% below mean [load=353 meanLoad=400 fractionUsed=35.30% meanUtil=40.00% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=ByteSize (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] @@ -77,11 +77,11 @@ rebalance-stores store-id=1 fraction-pending-decrease-threshold=10.0 max-lease-t [mmaid=1] cannot add load to n2s2: due to target_summary(overloadSlow)>=loadNoChange,targetSLS.frac_pending(2.53or0.00>=epsilon) [mmaid=1] [target_sls:(store=overloadSlow worst=CPURate cpu=overloadSlow writes=loadNormal bytes=loadNormal node=overloadSlow high_disk=false frac_pending=2.53,0.00(false)),src_sls:(store=overloadSlow worst=CPURate cpu=overloadSlow writes=loadNormal bytes=loadNormal node=overloadSlow high_disk=false frac_pending=0.00,0.23(false))] [mmaid=1] result(failed): cannot shed from s1 to s2 for r3: delta load [cpu:230ns/s, write-bandwidth:0 B/s, byte-size:0 B] +[mmaid=1] considering lease-transfer r2 from s1: candidates are [2 3] [mmaid=1] load summary for dim=CPURate (s1): overloadUrgent, reason: fractionUsed > 75% and >1.5x meanUtil [load=770 meanLoad=400 fractionUsed=77.00% meanUtil=40.00% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=CPURate (n1): overloadUrgent, reason: fractionUsed > 75% and >1.5x meanUtil [load=770 meanLoad=400 fractionUsed=77.00% meanUtil=40.00% capacity=1000] -[mmaid=1] considering lease-transfer r2 from s1: candidates are [1 2 3] [mmaid=1] load summary for dim=CPURate (s2): loadLow, reason: load is >10% below mean [load=353 meanLoad=400 fractionUsed=35.30% meanUtil=40.00% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=ByteSize (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] @@ -101,11 +101,11 @@ rebalance-stores store-id=1 fraction-pending-decrease-threshold=10.0 max-lease-t [mmaid=1] load summary for dim=CPURate (n1): overloadSlow, reason: fractionUsed < 75% [load=540 meanLoad=400 fractionUsed=54.00% meanUtil=40.00% capacity=1000] [mmaid=1] can add load to n3s3: true targetSLS[(store=loadNormal worst=WriteBandwidth cpu=loadLow writes=loadNormal 
bytes=loadNormal node=loadLow high_disk=false frac_pending=0.00,0.00(true))] srcSLS[(store=overloadSlow worst=CPURate cpu=overloadSlow writes=loadNormal bytes=loadNormal node=overloadSlow high_disk=false frac_pending=0.00,0.23(false))] [mmaid=1] result(success): shedding r2 lease from s1 to s3 [change:r2=[transfer_to=3 cids=3,4]] with resulting loads source:[cpu:540ns/s, write-bandwidth:0 B/s, byte-size:0 B] target:[cpu:353ns/s, write-bandwidth:0 B/s, byte-size:0 B] (means: [cpu:400ns/s, write-bandwidth:0 B/s, byte-size:0 B]) (frac_pending: (src:0.00,target:0.46) (src:2.53,target:0.00)) +[mmaid=1] considering lease-transfer r1 from s1: candidates are [2 3] [mmaid=1] load summary for dim=CPURate (s1): overloadSlow, reason: fractionUsed < 75% [load=540 meanLoad=400 fractionUsed=54.00% meanUtil=40.00% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=CPURate (n1): overloadSlow, reason: fractionUsed < 75% [load=540 meanLoad=400 fractionUsed=54.00% meanUtil=40.00% capacity=1000] -[mmaid=1] considering lease-transfer r1 from s1: candidates are [1 2 3] [mmaid=1] load summary for dim=CPURate (s2): loadLow, reason: load is >10% below mean [load=353 meanLoad=400 fractionUsed=35.30% meanUtil=40.00% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=ByteSize (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] From 15843b645b44b730ac9903696a06da2108590f2c Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Fri, 28 Nov 2025 16:51:08 +0100 Subject: [PATCH 3/3] mmaprototype: add pre-means filtering for replica transfers Previously, computeCandidatesForRange (now computeCandidatesForReplicaTransfer) computed the mean over all constraint-satisfying stores, then applied post-means exclusions before returning candidates. These exclusions were: - Stores on nodes already housing replicas of the range - The shedding store itself (handled via a separate check in the loop) - If the shedding store's node was CPU overloaded, all other stores on that node The last bullet was ad-hoc and asymmetric: we wouldn't apply the same logic to other overloaded nodes that happened to have replicas. It also wasn't exercised, and the downstream candidate selection logic wouldn't choose these stores anyway (they'd be rejected for being on an overloaded node). This commit makes two changes: 1. Simplified post-means exclusions: The parameter is renamed from storesToExclude to postMeansExclusions to clarify its role. All exclusions are now handled uniformly in this set: - The shedding store (we're moving away from it) - Stores on nodes with other existing replicas The ad-hoc "exclude other stores on shedding node if node is overloaded" logic is removed. Within-node rebalance (to another store on the shedding node) is now permitted at this stage. 2. Pre-means filtering: This is new. Before computing the mean, we now filter out stores that aren't ready to receive replicas based on disposition. 
As a legacy case, we manually filter on high disk utilization, but there is a TODO to fold this into the disposition as well - this is tracked and will be done separately. This is done by retainReadyReplicaTargetStoresOnly. The shedding store bypasses the disposition check since it already has the replica - its load should be in the mean regardless of its disposition for NEW replicas. When filtering occurs, we recompute the mean over the filtered set. The common case (nothing filtered) still uses the cached mean for efficiency. Other cleanup: - Fixed logging (VEventf instead of mixing V() and Infof) - Added scratch fields to clusterState for the new filtering but also left a TODO to get out of scratch fields entirely. Part of #156776. Epic: CRDB-55052 --- .../allocator/mmaprototype/allocator_state.go | 126 ++++++++-- .../allocator/mmaprototype/cluster_state.go | 3 + .../cluster_state_rebalance_stores.go | 73 +++--- .../mmaprototype/cluster_state_test.go | 18 ++ .../rebalance_stores_capacity_mismatch.txt | 47 ++++ ...lance_stores_cpu_lease_refusing_target.txt | 2 +- .../rebalance_stores_cpu_replica_count.txt | 3 +- ...ance_stores_cpu_replica_frac_threshold.txt | 3 +- .../rebalance_stores_cpu_replica_lateral.txt | 222 ++++++++++++++++++ ...nce_stores_cpu_replica_refusing_target.txt | 106 +++++++++ ...rebalance_stores_cpu_replica_unbounded.txt | 7 +- .../cluster_state/replica_disposition.txt | 181 ++++++++++++++ 12 files changed, 732 insertions(+), 59 deletions(-) create mode 100644 pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_replica_lateral.txt create mode 100644 pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_replica_refusing_target.txt create mode 100644 pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/replica_disposition.txt diff --git a/pkg/kv/kvserver/allocator/mmaprototype/allocator_state.go b/pkg/kv/kvserver/allocator/mmaprototype/allocator_state.go index 677a2068d786..53635198120e 100644 --- a/pkg/kv/kvserver/allocator/mmaprototype/allocator_state.go +++ b/pkg/kv/kvserver/allocator/mmaprototype/allocator_state.go @@ -835,10 +835,10 @@ func (cs *clusterState) ensureAnalyzedConstraints(rstate *rangeState) { // - Need diversity change for each candidate. // // The first 3 bullets are encapsulated in the helper function -// computeCandidatesForRange. It works for both replica additions and +// computeCandidatesForReplicaTransfer. It works for both replica additions and // rebalancing. // -// For the last bullet (diversity), the caller of computeCandidatesForRange +// For the last bullet (diversity), the caller of computeCandidatesForReplicaTransfer // needs to populate candidateInfo.diversityScore for each candidate in // candidateSet. It does so via diversityScoringMemo. Then the (loadSummary, // diversityScore) pair can be used to order candidates for attempts to add. @@ -866,41 +866,125 @@ func (cs *clusterState) ensureAnalyzedConstraints(rstate *rangeState) { // loadSheddingStore is only specified if this candidate computation is // happening because of overload. -func (cs *clusterState) computeCandidatesForRange( +// +// postMeansExclusions are filtered post-means: their load is included in the +// mean (they're viable locations in principle) but they're not candidates for +// this specific transfer (the classic case: already have a replica). 
+func (cs *clusterState) computeCandidatesForReplicaTransfer( ctx context.Context, - expr constraintsDisj, - storesToExclude storeSet, + conj constraintsConj, + existingReplicas storeSet, + postMeansExclusions storeSet, loadSheddingStore roachpb.StoreID, passObs *rebalancingPassMetricsAndLogger, ) (_ candidateSet, sheddingSLS storeLoadSummary) { - means := cs.meansMemo.getMeans(expr) - if loadSheddingStore > 0 { - sheddingSS := cs.stores[loadSheddingStore] - sheddingSLS = cs.meansMemo.getStoreLoadSummary(ctx, means, loadSheddingStore, sheddingSS.loadSeqNum) - if sheddingSLS.sls <= loadNoChange && sheddingSLS.nls <= loadNoChange { - // In this set of stores, this store no longer looks overloaded. - passObs.replicaShed(notOverloaded) - return candidateSet{}, sheddingSLS - } + // Start with computing the stores (and corresponding means) that satisfy + // the constraint expression. If we don't see a need to filter out any of + // these stores before computing the means, we can use it verbatim, otherwise + // we will recompute the means again below. + cs.scratchDisj[0] = conj + means := cs.meansMemo.getMeans(cs.scratchDisj[:1]) + + // Pre-means filtering: copy to scratch, then filter in place. + // Filter out stores that have a non-OK replica disposition. + cs.scratchStoreSet = append(cs.scratchStoreSet[:0], means.stores...) + filteredStores := retainReadyReplicaTargetStoresOnly(ctx, cs.scratchStoreSet, cs.stores, existingReplicas) + + // Determine which means to use. + // + // TODO(tbg): unit testing. + var effectiveMeans *meansLoad + if len(filteredStores) == len(means.stores) { + // Common case: nothing was filtered, use cached means. + effectiveMeans = &means.meansLoad + } else if len(filteredStores) == 0 { + // No viable candidates at all. + return candidateSet{}, sheddingSLS + } else { + // Some stores were filtered; recompute means over filtered set. + cs.scratchMeans = computeMeansForStoreSet( + cs, filteredStores, cs.meansMemo.scratchNodes, cs.meansMemo.scratchStores) + effectiveMeans = &cs.scratchMeans + log.KvDistribution.VEventf(ctx, 2, + "pre-means filtered %d stores → remaining %v, means: store=%v node=%v", + len(means.stores)-len(filteredStores), filteredStores, + effectiveMeans.storeLoad, effectiveMeans.nodeLoad) } - // We only filter out stores that are not fdOK. The rest of the filtering - // happens later. + + sheddingSLS = cs.computeLoadSummary(ctx, loadSheddingStore, &effectiveMeans.storeLoad, &effectiveMeans.nodeLoad) + if sheddingSLS.sls <= loadNoChange && sheddingSLS.nls <= loadNoChange { + // In this set of stores, this store no longer looks overloaded. + passObs.replicaShed(notOverloaded) + return candidateSet{}, sheddingSLS + } + var cset candidateSet - for _, storeID := range means.stores { - if storesToExclude.contains(storeID) { + for _, storeID := range filteredStores { + if postMeansExclusions.contains(storeID) { + // This store's load is included in the mean, but it's not a viable + // target for this specific transfer (e.g. it already has a replica). 
 			continue
 		}
-		ss := cs.stores[storeID]
-		csls := cs.meansMemo.getStoreLoadSummary(ctx, means, storeID, ss.loadSeqNum)
+		csls := cs.computeLoadSummary(ctx, storeID, &effectiveMeans.storeLoad, &effectiveMeans.nodeLoad)
 		cset.candidates = append(cset.candidates, candidateInfo{
 			StoreID:          storeID,
 			storeLoadSummary: csls,
 		})
 	}
-	cset.means = &means.meansLoad
+	cset.means = effectiveMeans
 	return cset, sheddingSLS
 }
 
+// retainReadyReplicaTargetStoresOnly filters the input set to only those stores
+// that are ready to accept a replica. A store is not ready if it has a non-OK
+// replica disposition. In practice, the input set is already filtered by
+// constraints.
+//
+// Stores already housing a replica (on top of being in the input storeSet)
+// bypass this disposition check since they already have the replica - their
+// load should be in the mean regardless of their disposition, as we'll pick
+// candidates based on improving clustering around the mean.
+//
+// The input storeSet is mutated (and returned as the result).
+func retainReadyReplicaTargetStoresOnly(
+	ctx context.Context,
+	in storeSet,
+	stores map[roachpb.StoreID]*storeState,
+	existingReplicas storeSet,
+) storeSet {
+	out := in[:0]
+	for _, storeID := range in {
+		if existingReplicas.contains(storeID) {
+			// Stores holding existing replicas already have the load and we want to
+			// include them in the mean, even if they are not accepting new replicas
+			// or are even trying to shed.
+			//
+			// TODO(tbg): health might play into this, though. For example, when
+			// a store is dead, whatever load we have from it is stale and we
+			// are better off not including it. For now, we ignore this problem
+			// because the mma only handles rebalancing, whereas a replica on a
+			// dead store would be removed by the single-metric allocator after
+			// the TimeUntilStoreDead and so would disappear from our view.
+			out = append(out, storeID)
+			continue
+		}
+		ss := stores[storeID]
+		switch {
+		case ss.status.Disposition.Replica != ReplicaDispositionOK:
+			log.KvDistribution.VEventf(ctx, 2, "skipping s%d for replica transfer: replica disposition %v (health %v)", storeID, ss.status.Disposition.Replica, ss.status.Health)
+		case highDiskSpaceUtilization(ss.reportedLoad[ByteSize], ss.capacity[ByteSize]):
+			// TODO(tbg): remove this from mma and just let the caller set this
+			// disposition based on the following cluster settings:
+			// - kv.allocator.max_disk_utilization_threshold
+			// - kv.allocator.rebalance_to_max_disk_utilization_threshold
+			log.KvDistribution.VEventf(ctx, 2, "skipping s%d for replica transfer: high disk utilization (health %v)", storeID, ss.status.Health)
+		default:
+			out = append(out, storeID)
+		}
+	}
+	return out
+}
+
 // Diversity scoring is very amenable to caching, since the set of unique
 // locality tiers for range replicas is likely to be small. And the cache does
 // not need to be cleared after every allocator pass.
This caching is done via diff --git a/pkg/kv/kvserver/allocator/mmaprototype/cluster_state.go b/pkg/kv/kvserver/allocator/mmaprototype/cluster_state.go index c330106267a8..0426082d9beb 100644 --- a/pkg/kv/kvserver/allocator/mmaprototype/cluster_state.go +++ b/pkg/kv/kvserver/allocator/mmaprototype/cluster_state.go @@ -1268,6 +1268,9 @@ type clusterState struct { ranges map[roachpb.RangeID]*rangeState scratchRangeMap map[roachpb.RangeID]struct{} + scratchStoreSet storeSet // scratch space for pre-means filtering + scratchMeans meansLoad // scratch space for recomputed means + scratchDisj [1]constraintsConj // scratch space for getMeans call // Added to when a change is proposed. Will also add to corresponding // rangeState.pendingChanges and to the affected storeStates. diff --git a/pkg/kv/kvserver/allocator/mmaprototype/cluster_state_rebalance_stores.go b/pkg/kv/kvserver/allocator/mmaprototype/cluster_state_rebalance_stores.go index bde3fae5c925..7dec874fbafd 100644 --- a/pkg/kv/kvserver/allocator/mmaprototype/cluster_state_rebalance_stores.go +++ b/pkg/kv/kvserver/allocator/mmaprototype/cluster_state_rebalance_stores.go @@ -76,12 +76,13 @@ type rebalanceEnv struct { // pass. Can be nil. passObs *rebalancingPassMetricsAndLogger // Scratch variables reused across iterations. + // TODO(tbg): these are a potential source of errors (imagine two nested + // calls using the same scratch variable). Just make a global variable + // that wraps a bunch of sync.Pools for the types we need. scratch struct { - disj [1]constraintsConj - storesToExclude storeSet - storesToExcludeForRange storeSet - nodes map[roachpb.NodeID]*NodeLoad - stores map[roachpb.StoreID]struct{} + postMeansExclusions storeSet + nodes map[roachpb.NodeID]*NodeLoad + stores map[roachpb.StoreID]struct{} } } @@ -429,22 +430,11 @@ func (re *rebalanceEnv) rebalanceReplicas( localStoreID roachpb.StoreID, ignoreLevel ignoreLevel, ) { - // If the node is cpu overloaded, or the store/node is not fdOK, exclude - // the other stores on this node from receiving replicas shed by this - // store. - excludeStoresOnNode := store.nls > overloadSlow - re.scratch.storesToExclude = re.scratch.storesToExclude[:0] - if excludeStoresOnNode { - nodeID := ss.NodeID - for _, storeID := range re.nodes[nodeID].stores { - re.scratch.storesToExclude.insert(storeID) - } - log.KvDistribution.VEventf(ctx, 2, "excluding all stores on n%d due to overload/fd status", nodeID) - } else { - // This store is excluded of course. - re.scratch.storesToExclude.insert(store.StoreID) + if store.StoreID != localStoreID && store.dimSummary[CPURate] >= overloadSlow && + re.now.Sub(ss.overloadStartTime) < remoteStoreLeaseSheddingGraceDuration { + log.KvDistribution.VEventf(ctx, 2, "skipping remote store s%d: in lease shedding grace period", store.StoreID) + return } - // Iterate over top-K ranges first and try to move them. topKRanges := ss.adjusted.topKRanges[localStoreID] n := topKRanges.len() @@ -498,6 +488,8 @@ func (re *rebalanceEnv) rebalanceReplicas( "rstate_replicas=%v rstate_constraints=%v", store.StoreID, rangeID, rstate.pendingChanges, rstate.replicas, rstate.constraints)) } + // Get the constraint conjunction which will allow us to look up stores + // that could replace the shedding store. 
var conj constraintsConj var err error if isVoter { @@ -512,32 +504,43 @@ func (re *rebalanceEnv) rebalanceReplicas( log.KvDistribution.VEventf(ctx, 2, "skipping r%d: constraint violation needs fixing first: %v", rangeID, err) continue } - re.scratch.disj[0] = conj - re.scratch.storesToExcludeForRange = append(re.scratch.storesToExcludeForRange[:0], re.scratch.storesToExclude...) - // Also exclude all stores on nodes that have existing replicas. + // Build post-means exclusions: stores whose load is included in the mean + // (they're viable locations in principle) but aren't valid targets for + // this specific transfer. + // + // NB: to prevent placing replicas on multiple CRDB nodes sharing a + // host, we'd need to make changes here. + // See: https://github.com/cockroachdb/cockroach/issues/153863 + re.scratch.postMeansExclusions = re.scratch.postMeansExclusions[:0] + existingReplicas := storeSet{} // TODO(tbg): avoid allocation for _, replica := range rstate.replicas { storeID := replica.StoreID + existingReplicas.insert(storeID) if storeID == store.StoreID { - // We don't exclude other stores on this node, since we are allowed to - // transfer the range to them. If the node is overloaded or not fdOK, - // we have already excluded those stores above. + // Exclude the shedding store (we're moving away from it), but not + // other stores on its node (within-node rebalance is allowed). + re.scratch.postMeansExclusions.insert(storeID) continue } + // Exclude all stores on nodes with other existing replicas. nodeID := re.stores[storeID].NodeID for _, storeID := range re.nodes[nodeID].stores { - re.scratch.storesToExcludeForRange.insert(storeID) + re.scratch.postMeansExclusions.insert(storeID) } } + + // Compute the candidates. These are already filtered down to only those stores + // that we'll actually be happy to transfer the range to. + // Note that existingReplicas is a subset of postMeansExclusions, so they'll + // be included in the mean, but are never considered as candidates. + // // TODO(sumeer): eliminate cands allocations by passing a scratch slice. - cands, ssSLS := re.computeCandidatesForRange( - ctx, re.scratch.disj[:], re.scratch.storesToExcludeForRange, store.StoreID, re.passObs) + cands, ssSLS := re.computeCandidatesForReplicaTransfer(ctx, conj, existingReplicas, re.scratch.postMeansExclusions, store.StoreID, re.passObs) log.KvDistribution.VEventf(ctx, 2, "considering replica-transfer r%v from s%v: store load %v", rangeID, store.StoreID, ss.adjusted.load) - if log.V(2) { - log.KvDistribution.Infof(ctx, "candidates are:") - for _, c := range cands.candidates { - log.KvDistribution.Infof(ctx, " s%d: %s", c.StoreID, c.storeLoadSummary) - } + log.KvDistribution.VEventf(ctx, 3, "candidates are:") + for _, c := range cands.candidates { + log.KvDistribution.VEventf(ctx, 3, " s%d: %s", c.StoreID, c.storeLoadSummary) } if len(cands.candidates) == 0 { @@ -753,6 +756,8 @@ func (re *rebalanceEnv) rebalanceLeasesFromLocalStoreID( // which is also in cands. clear(re.scratch.nodes) + // NB: candsPL is not empty - it includes at least the current leaseholder + // and one additional candidate. 
means := computeMeansForStoreSet(re, candsPL, re.scratch.nodes, re.scratch.stores) sls := re.computeLoadSummary(ctx, store.StoreID, &means.storeLoad, &means.nodeLoad) if sls.dimSummary[CPURate] < overloadSlow { diff --git a/pkg/kv/kvserver/allocator/mmaprototype/cluster_state_test.go b/pkg/kv/kvserver/allocator/mmaprototype/cluster_state_test.go index 6e7258e0fee3..92c7db61b480 100644 --- a/pkg/kv/kvserver/allocator/mmaprototype/cluster_state_test.go +++ b/pkg/kv/kvserver/allocator/mmaprototype/cluster_state_test.go @@ -419,6 +419,11 @@ func TestClusterState(t *testing.T) { ctx, finishAndGet := tracing.ContextWithRecordingSpan( context.Background(), tr, d.Cmd, ) + if d.HasArg("breakpoint") { + // You can set a debugger breakpoint here and use `breakpoint=true` + // in a datadriven command to hit it. + t.Log("hit breakpoint") + } switch d.Cmd { case "include": loc := dd.ScanArg[string](t, d, "path") @@ -637,6 +642,19 @@ func TestClusterState(t *testing.T) { rec.SafeFormatMinimal(&sb) return fmt.Sprintf("%s%v\n", sb.String(), out) + case "retain-ready-replica-target-stores-only": + in := dd.ScanArg[[]roachpb.StoreID](t, d, "in") + replicas, _ := dd.ScanArgOpt[[]roachpb.StoreID](t, d, "replicas") + var replicasSet storeSet + for _, replica := range replicas { + replicasSet.insert(replica) + } + out := retainReadyReplicaTargetStoresOnly(ctx, storeSet(in), cs.stores, replicasSet) + rec := finishAndGet() + var sb redact.StringBuilder + rec.SafeFormatMinimal(&sb) + return fmt.Sprintf("%s%v\n", sb.String(), out) + default: panic(fmt.Sprintf("unknown command: %v", d.Cmd)) } diff --git a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_capacity_mismatch.txt b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_capacity_mismatch.txt index 94903429af4f..bc8de2a9a72c 100644 --- a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_capacity_mismatch.txt +++ b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_capacity_mismatch.txt @@ -261,6 +261,11 @@ rebalance-stores store-id=1 max-range-move-count=999 fraction-pending-decrease-t [mmaid=1] load summary for dim=ByteSize (s7): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=500] [mmaid=1] load summary for dim=CPURate (n7): loadLow, reason: load is >10% below mean [load=250 meanLoad=541 fractionUsed=50.00% meanUtil=75.80% capacity=500] [mmaid=1] considering replica-transfer r3 from s1: store load [cpu:900ns/s, write-bandwidth:0 B/s, byte-size:0 B] +[mmaid=1] candidates are: +[mmaid=1] s2: (store=overloadSlow worst=CPURate cpu=overloadSlow writes=loadNormal bytes=loadNormal node=overloadSlow high_disk=false frac_pending=0.00,0.00(true)) +[mmaid=1] s4: (store=loadNormal worst=WriteBandwidth cpu=loadLow writes=loadNormal bytes=loadNormal node=loadLow high_disk=false frac_pending=0.00,0.00(true)) +[mmaid=1] s6: (store=loadNormal worst=WriteBandwidth cpu=loadLow writes=loadNormal bytes=loadNormal node=loadLow high_disk=false frac_pending=0.00,0.00(true)) +[mmaid=1] s7: (store=loadNormal worst=WriteBandwidth cpu=loadLow writes=loadNormal bytes=loadNormal node=loadLow high_disk=false frac_pending=0.00,0.00(true)) [mmaid=1] discarding candidates with higher load than lowestLoadSet(loadNormal): s2(SLS:overloadSlow, overloadedDimLoadSummary:overloadSlow), overloadedDim:CPURate [mmaid=1] sortTargetCandidateSetAndPick: candidates: s4(SLS:loadNormal, overloadedDimLoadSummary:loadLow) 
s6(SLS:loadNormal, overloadedDimLoadSummary:loadLow) s7(SLS:loadNormal, overloadedDimLoadSummary:loadLow), overloadedDim:CPURate, picked s6 [mmaid=1] load summary for dim=CPURate (s6): overloadUrgent, reason: fractionUsed > 90% [load=480 meanLoad=541 fractionUsed=96.00% meanUtil=75.80% capacity=500] @@ -274,6 +279,10 @@ rebalance-stores store-id=1 max-range-move-count=999 fraction-pending-decrease-t [mmaid=1] cannot add load to n6s6: due to overloadUrgent [mmaid=1] [target_sls:(store=overloadUrgent worst=CPURate cpu=overloadUrgent writes=loadNormal bytes=loadNormal node=overloadUrgent high_disk=false frac_pending=0.00,0.00(true)),src_sls:(store=overloadSlow worst=CPURate cpu=overloadSlow writes=loadNormal bytes=loadNormal node=overloadSlow high_disk=false frac_pending=0.00,0.00(true))] [mmaid=1] result(failed): cannot shed from s1 to s6 for r3: delta load [cpu:200ns/s, write-bandwidth:0 B/s, byte-size:0 B] +[mmaid=1] load summary for dim=CPURate (s1): overloadSlow, reason: fractionUsed > 75% [load=900 meanLoad=541 fractionUsed=90.00% meanUtil=75.80% capacity=1000] +[mmaid=1] load summary for dim=WriteBandwidth (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0] +[mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=CPURate (n1): overloadSlow, reason: fractionUsed > 75% [load=900 meanLoad=541 fractionUsed=90.00% meanUtil=75.80% capacity=1000] [mmaid=1] load summary for dim=CPURate (s3): overloadSlow, reason: fractionUsed > 75% [load=900 meanLoad=541 fractionUsed=90.00% meanUtil=75.80% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0] [mmaid=1] load summary for dim=ByteSize (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] @@ -282,7 +291,20 @@ rebalance-stores store-id=1 max-range-move-count=999 fraction-pending-decrease-t [mmaid=1] load summary for dim=WriteBandwidth (s5): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=500] [mmaid=1] load summary for dim=ByteSize (s5): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=500] [mmaid=1] load summary for dim=CPURate (n5): loadLow, reason: load is >10% below mean [load=280 meanLoad=541 fractionUsed=56.00% meanUtil=75.80% capacity=500] +[mmaid=1] load summary for dim=CPURate (s6): loadLow, reason: load is >10% below mean [load=260 meanLoad=541 fractionUsed=52.00% meanUtil=75.80% capacity=500] +[mmaid=1] load summary for dim=WriteBandwidth (s6): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=500] +[mmaid=1] load summary for dim=ByteSize (s6): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=500] +[mmaid=1] load summary for dim=CPURate (n6): loadLow, reason: load is >10% below mean [load=260 meanLoad=541 fractionUsed=52.00% meanUtil=75.80% capacity=500] +[mmaid=1] load summary for dim=CPURate (s7): loadLow, reason: load is >10% below mean [load=250 meanLoad=541 fractionUsed=50.00% meanUtil=75.80% capacity=500] +[mmaid=1] load summary for dim=WriteBandwidth (s7): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=500] +[mmaid=1] load summary for 
dim=ByteSize (s7): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=500] +[mmaid=1] load summary for dim=CPURate (n7): loadLow, reason: load is >10% below mean [load=250 meanLoad=541 fractionUsed=50.00% meanUtil=75.80% capacity=500] [mmaid=1] considering replica-transfer r2 from s1: store load [cpu:900ns/s, write-bandwidth:0 B/s, byte-size:0 B] +[mmaid=1] candidates are: +[mmaid=1] s3: (store=overloadSlow worst=CPURate cpu=overloadSlow writes=loadNormal bytes=loadNormal node=overloadSlow high_disk=false frac_pending=0.00,0.00(true)) +[mmaid=1] s5: (store=loadNormal worst=WriteBandwidth cpu=loadLow writes=loadNormal bytes=loadNormal node=loadLow high_disk=false frac_pending=0.00,0.00(true)) +[mmaid=1] s6: (store=loadNormal worst=WriteBandwidth cpu=loadLow writes=loadNormal bytes=loadNormal node=loadLow high_disk=false frac_pending=0.00,0.00(true)) +[mmaid=1] s7: (store=loadNormal worst=WriteBandwidth cpu=loadLow writes=loadNormal bytes=loadNormal node=loadLow high_disk=false frac_pending=0.00,0.00(true)) [mmaid=1] discarding candidates with higher load than lowestLoadSet(loadNormal): s3(SLS:overloadSlow, overloadedDimLoadSummary:overloadSlow), overloadedDim:CPURate [mmaid=1] sortTargetCandidateSetAndPick: candidates: s5(SLS:loadNormal, overloadedDimLoadSummary:loadLow) s6(SLS:loadNormal, overloadedDimLoadSummary:loadLow) s7(SLS:loadNormal, overloadedDimLoadSummary:loadLow), overloadedDim:CPURate, picked s6 [mmaid=1] load summary for dim=CPURate (s6): overloadUrgent, reason: fractionUsed > 90% [load=480 meanLoad=541 fractionUsed=96.00% meanUtil=75.80% capacity=500] @@ -296,7 +318,32 @@ rebalance-stores store-id=1 max-range-move-count=999 fraction-pending-decrease-t [mmaid=1] cannot add load to n6s6: due to overloadUrgent [mmaid=1] [target_sls:(store=overloadUrgent worst=CPURate cpu=overloadUrgent writes=loadNormal bytes=loadNormal node=overloadUrgent high_disk=false frac_pending=0.00,0.00(true)),src_sls:(store=overloadSlow worst=CPURate cpu=overloadSlow writes=loadNormal bytes=loadNormal node=overloadSlow high_disk=false frac_pending=0.00,0.00(true))] [mmaid=1] result(failed): cannot shed from s1 to s6 for r2: delta load [cpu:200ns/s, write-bandwidth:0 B/s, byte-size:0 B] +[mmaid=1] load summary for dim=CPURate (s1): overloadSlow, reason: fractionUsed > 75% [load=900 meanLoad=541 fractionUsed=90.00% meanUtil=75.80% capacity=1000] +[mmaid=1] load summary for dim=WriteBandwidth (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0] +[mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=CPURate (n1): overloadSlow, reason: fractionUsed > 75% [load=900 meanLoad=541 fractionUsed=90.00% meanUtil=75.80% capacity=1000] +[mmaid=1] load summary for dim=CPURate (s4): loadLow, reason: load is >10% below mean [load=300 meanLoad=541 fractionUsed=60.00% meanUtil=75.80% capacity=500] +[mmaid=1] load summary for dim=WriteBandwidth (s4): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=500] +[mmaid=1] load summary for dim=ByteSize (s4): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=500] +[mmaid=1] load summary for dim=CPURate (n4): loadLow, reason: load is >10% below mean [load=300 meanLoad=541 fractionUsed=60.00% meanUtil=75.80% capacity=500] +[mmaid=1] load summary for 
dim=CPURate (s5): loadLow, reason: load is >10% below mean [load=280 meanLoad=541 fractionUsed=56.00% meanUtil=75.80% capacity=500] +[mmaid=1] load summary for dim=WriteBandwidth (s5): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=500] +[mmaid=1] load summary for dim=ByteSize (s5): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=500] +[mmaid=1] load summary for dim=CPURate (n5): loadLow, reason: load is >10% below mean [load=280 meanLoad=541 fractionUsed=56.00% meanUtil=75.80% capacity=500] +[mmaid=1] load summary for dim=CPURate (s6): loadLow, reason: load is >10% below mean [load=260 meanLoad=541 fractionUsed=52.00% meanUtil=75.80% capacity=500] +[mmaid=1] load summary for dim=WriteBandwidth (s6): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=500] +[mmaid=1] load summary for dim=ByteSize (s6): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=500] +[mmaid=1] load summary for dim=CPURate (n6): loadLow, reason: load is >10% below mean [load=260 meanLoad=541 fractionUsed=52.00% meanUtil=75.80% capacity=500] +[mmaid=1] load summary for dim=CPURate (s7): loadLow, reason: load is >10% below mean [load=250 meanLoad=541 fractionUsed=50.00% meanUtil=75.80% capacity=500] +[mmaid=1] load summary for dim=WriteBandwidth (s7): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=500] +[mmaid=1] load summary for dim=ByteSize (s7): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=500] +[mmaid=1] load summary for dim=CPURate (n7): loadLow, reason: load is >10% below mean [load=250 meanLoad=541 fractionUsed=50.00% meanUtil=75.80% capacity=500] [mmaid=1] considering replica-transfer r1 from s1: store load [cpu:900ns/s, write-bandwidth:0 B/s, byte-size:0 B] +[mmaid=1] candidates are: +[mmaid=1] s4: (store=loadNormal worst=WriteBandwidth cpu=loadLow writes=loadNormal bytes=loadNormal node=loadLow high_disk=false frac_pending=0.00,0.00(true)) +[mmaid=1] s5: (store=loadNormal worst=WriteBandwidth cpu=loadLow writes=loadNormal bytes=loadNormal node=loadLow high_disk=false frac_pending=0.00,0.00(true)) +[mmaid=1] s6: (store=loadNormal worst=WriteBandwidth cpu=loadLow writes=loadNormal bytes=loadNormal node=loadLow high_disk=false frac_pending=0.00,0.00(true)) +[mmaid=1] s7: (store=loadNormal worst=WriteBandwidth cpu=loadLow writes=loadNormal bytes=loadNormal node=loadLow high_disk=false frac_pending=0.00,0.00(true)) [mmaid=1] sortTargetCandidateSetAndPick: candidates: s4(SLS:loadNormal, overloadedDimLoadSummary:loadLow) s5(SLS:loadNormal, overloadedDimLoadSummary:loadLow) s6(SLS:loadNormal, overloadedDimLoadSummary:loadLow) s7(SLS:loadNormal, overloadedDimLoadSummary:loadLow), overloadedDim:CPURate, picked s7 [mmaid=1] load summary for dim=CPURate (s7): overloadUrgent, reason: fractionUsed > 90% [load=470 meanLoad=541 fractionUsed=94.00% meanUtil=75.80% capacity=500] [mmaid=1] load summary for dim=WriteBandwidth (s7): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=500] diff --git a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_lease_refusing_target.txt b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_lease_refusing_target.txt index 
0f59952fae42..3e7c5ee0314e 100644 --- a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_lease_refusing_target.txt +++ b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_lease_refusing_target.txt @@ -77,12 +77,12 @@ rebalance-stores store-id=1 [mmaid=1] load summary for dim=CPURate (n1): loadNormal, reason: load is within 5% of mean [load=1000 meanLoad=1000 fractionUsed=100.00% meanUtil=100.00% capacity=1000] [mmaid=1] result(failed): skipping r1 since store not overloaded relative to candidates [mmaid=1] attempting to shed replicas next -[mmaid=1] excluding all stores on n1 due to overload/fd status [mmaid=1] load summary for dim=CPURate (s1): overloadUrgent, reason: fractionUsed > 90% [load=1000 meanLoad=700 fractionUsed=100.00% meanUtil=70.00% capacity=1000] [mmaid=1] load summary for dim=WriteBandwidth (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=1] load summary for dim=CPURate (n1): overloadUrgent, reason: fractionUsed > 90% [load=1000 meanLoad=700 fractionUsed=100.00% meanUtil=70.00% capacity=1000] [mmaid=1] considering replica-transfer r1 from s1: store load [cpu:1µs/s, write-bandwidth:0 B/s, byte-size:0 B] +[mmaid=1] candidates are: [mmaid=1] result(failed): no candidates found for r1 after exclusions [mmaid=1] start processing shedding store s3: cpu node load overloadUrgent, store load overloadUrgent, worst dim CPURate [mmaid=1] no top-K[CPURate] ranges found for s3 with lease on local s1 diff --git a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_replica_count.txt b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_replica_count.txt index 684bb9538bbc..90a48acf5031 100644 --- a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_replica_count.txt +++ b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_replica_count.txt @@ -16,7 +16,6 @@ rebalance-stores store-id=1 max-range-move-count=1 fraction-pending-decrease-thr [mmaid=2] start processing shedding store s3: cpu node load overloadUrgent, store load overloadUrgent, worst dim CPURate [mmaid=2] top-K[CPURate] ranges for s3 with lease on local s1: r3:[cpu:100ns/s, write-bandwidth:0 B/s, byte-size:0 B] r2:[cpu:100ns/s, write-bandwidth:0 B/s, byte-size:0 B] r1:[cpu:100ns/s, write-bandwidth:0 B/s, byte-size:0 B] [mmaid=2] attempting to shed replicas next -[mmaid=2] excluding all stores on n3 due to overload/fd status [mmaid=2] load summary for dim=CPURate (s3): overloadUrgent, reason: fractionUsed > 90% [load=1000 meanLoad=525 fractionUsed=100.00% meanUtil=52.50% capacity=1000] [mmaid=2] load summary for dim=WriteBandwidth (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=2] load summary for dim=ByteSize (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] @@ -26,6 +25,8 @@ rebalance-stores store-id=1 max-range-move-count=1 fraction-pending-decrease-thr [mmaid=2] load summary for dim=ByteSize (s4): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=2] load summary for dim=CPURate (n4): 
loadLow, reason: load is >10% below mean [load=100 meanLoad=525 fractionUsed=10.00% meanUtil=52.50% capacity=1000] [mmaid=2] considering replica-transfer r3 from s3: store load [cpu:1µs/s, write-bandwidth:0 B/s, byte-size:0 B] +[mmaid=2] candidates are: +[mmaid=2] s4: (store=loadNormal worst=WriteBandwidth cpu=loadLow writes=loadNormal bytes=loadNormal node=loadLow high_disk=false frac_pending=0.00,0.00(true)) [mmaid=2] sortTargetCandidateSetAndPick: candidates: s4(SLS:loadNormal, overloadedDimLoadSummary:loadLow), overloadedDim:CPURate, picked s4 [mmaid=2] load summary for dim=CPURate (s4): loadLow, reason: load is >10% below mean [load=210 meanLoad=525 fractionUsed=21.00% meanUtil=52.50% capacity=1000] [mmaid=2] load summary for dim=WriteBandwidth (s4): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] diff --git a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_replica_frac_threshold.txt b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_replica_frac_threshold.txt index deded1f3501a..f77c7051691b 100644 --- a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_replica_frac_threshold.txt +++ b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_replica_frac_threshold.txt @@ -16,7 +16,6 @@ rebalance-stores store-id=1 max-range-move-count=999 fraction-pending-decrease-t [mmaid=2] start processing shedding store s3: cpu node load overloadUrgent, store load overloadUrgent, worst dim CPURate [mmaid=2] top-K[CPURate] ranges for s3 with lease on local s1: r3:[cpu:100ns/s, write-bandwidth:0 B/s, byte-size:0 B] r2:[cpu:100ns/s, write-bandwidth:0 B/s, byte-size:0 B] r1:[cpu:100ns/s, write-bandwidth:0 B/s, byte-size:0 B] [mmaid=2] attempting to shed replicas next -[mmaid=2] excluding all stores on n3 due to overload/fd status [mmaid=2] load summary for dim=CPURate (s3): overloadUrgent, reason: fractionUsed > 90% [load=1000 meanLoad=525 fractionUsed=100.00% meanUtil=52.50% capacity=1000] [mmaid=2] load summary for dim=WriteBandwidth (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=2] load summary for dim=ByteSize (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] @@ -26,6 +25,8 @@ rebalance-stores store-id=1 max-range-move-count=999 fraction-pending-decrease-t [mmaid=2] load summary for dim=ByteSize (s4): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=2] load summary for dim=CPURate (n4): loadLow, reason: load is >10% below mean [load=100 meanLoad=525 fractionUsed=10.00% meanUtil=52.50% capacity=1000] [mmaid=2] considering replica-transfer r3 from s3: store load [cpu:1µs/s, write-bandwidth:0 B/s, byte-size:0 B] +[mmaid=2] candidates are: +[mmaid=2] s4: (store=loadNormal worst=WriteBandwidth cpu=loadLow writes=loadNormal bytes=loadNormal node=loadLow high_disk=false frac_pending=0.00,0.00(true)) [mmaid=2] sortTargetCandidateSetAndPick: candidates: s4(SLS:loadNormal, overloadedDimLoadSummary:loadLow), overloadedDim:CPURate, picked s4 [mmaid=2] load summary for dim=CPURate (s4): loadLow, reason: load is >10% below mean [load=210 meanLoad=525 fractionUsed=21.00% meanUtil=52.50% capacity=1000] [mmaid=2] load summary for dim=WriteBandwidth (s4): loadNormal, reason: load is within 5% of mean 
[load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] diff --git a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_replica_lateral.txt b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_replica_lateral.txt new file mode 100644 index 000000000000..f9d41de9b0b9 --- /dev/null +++ b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_replica_lateral.txt @@ -0,0 +1,222 @@ +# This test verifies lateral (within-node) replica transfers are allowed. +# Stores on the shedding store's node are NOT excluded as targets, allowing +# replica movement between stores on the same node. +# +# Setup: +# - n1s1: has replica, leaseholder, not overloaded +# - n2s2: has replica, not overloaded +# - n3s3: has replica, write-bandwidth overloaded (trying to shed) +# - n3s4: same node as s3, no replica, not overloaded - VALID lateral target +# +# Expected: s3 sheds its replica to s4 (lateral transfer within n3). + +set-store + store-id=1 node-id=1 + store-id=2 node-id=2 + store-id=3 node-id=3 + store-id=4 node-id=3 +---- +node-id=1 locality-tiers=node=1 + store-id=1 attrs= +node-id=2 locality-tiers=node=2 + store-id=2 attrs= +node-id=3 locality-tiers=node=3 + store-id=3 attrs= + store-id=4 attrs= + +store-load-msg + store-id=1 node-id=1 load=[100,20000000,0] capacity=[1000,100000000,1000] secondary-load=0 load-time=0s + store-id=2 node-id=2 load=[100,20000000,0] capacity=[1000,100000000,1000] secondary-load=0 load-time=0s + store-id=3 node-id=3 load=[50,80000000,0] capacity=[1000,100000000,1000] secondary-load=0 load-time=0s + store-id=4 node-id=3 load=[50,10000000,0] capacity=[1000,100000000,1000] secondary-load=0 load-time=0s +---- + +store-leaseholder-msg +store-id=1 + range-id=1 load=[10,10000000,0] raft-cpu=10 + store-id=1 replica-id=1 type=VOTER_FULL leaseholder=true + store-id=2 replica-id=2 type=VOTER_FULL + store-id=3 replica-id=3 type=VOTER_FULL + config=num_replicas=3 constraints={} voter_constraints={} +---- + +# First call establishes s3's overload state and enters grace period. 
+rebalance-stores store-id=1 +---- +[mmaid=1] rebalanceStores begins +[mmaid=1] cluster means: (stores-load [cpu:75ns/s, write-bandwidth:32 MB/s, byte-size:0 B]) (stores-capacity [cpu:1µs/s, write-bandwidth:100 MB/s, byte-size:1.0 kB]) (nodes-cpu-load 100) (nodes-cpu-capacity 1333) +[mmaid=1] load summary for dim=CPURate (s1): overloadSlow, reason: fractionUsed < 75% [load=100 meanLoad=75 fractionUsed=10.00% meanUtil=7.50% capacity=1000] +[mmaid=1] load summary for dim=WriteBandwidth (s1): loadLow, reason: load is >10% below mean [load=20000000 meanLoad=32500000 fractionUsed=20.00% meanUtil=32.50% capacity=100000000] +[mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=CPURate (n1): loadNoChange, reason: fractionUsed < 75% [load=100 meanLoad=100 fractionUsed=10.00% meanUtil=7.50% capacity=1000] +[mmaid=1] evaluating s1: node load loadNoChange, store load overloadSlow, worst dim CPURate +[mmaid=1] overload-continued s1 ((store=overloadSlow worst=CPURate cpu=overloadSlow writes=loadLow bytes=loadNormal node=loadNoChange high_disk=false frac_pending=0.00,0.00(true))) - within grace period +[mmaid=1] store s1 was added to shedding store list +[mmaid=1] load summary for dim=CPURate (s2): overloadSlow, reason: fractionUsed < 75% [load=100 meanLoad=75 fractionUsed=10.00% meanUtil=7.50% capacity=1000] +[mmaid=1] load summary for dim=WriteBandwidth (s2): loadLow, reason: load is >10% below mean [load=20000000 meanLoad=32500000 fractionUsed=20.00% meanUtil=32.50% capacity=100000000] +[mmaid=1] load summary for dim=ByteSize (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=CPURate (n2): loadNoChange, reason: fractionUsed < 75% [load=100 meanLoad=100 fractionUsed=10.00% meanUtil=7.50% capacity=1000] +[mmaid=1] evaluating s2: node load loadNoChange, store load overloadSlow, worst dim CPURate +[mmaid=1] overload-continued s2 ((store=overloadSlow worst=CPURate cpu=overloadSlow writes=loadLow bytes=loadNormal node=loadNoChange high_disk=false frac_pending=0.00,0.00(true))) - within grace period +[mmaid=1] store s2 was added to shedding store list +[mmaid=1] load summary for dim=CPURate (s3): loadLow, reason: load is >10% below mean [load=50 meanLoad=75 fractionUsed=5.00% meanUtil=7.50% capacity=1000] +[mmaid=1] load summary for dim=WriteBandwidth (s3): overloadUrgent, reason: fractionUsed > 75% and >1.5x meanUtil [load=80000000 meanLoad=32500000 fractionUsed=80.00% meanUtil=32.50% capacity=100000000] +[mmaid=1] load summary for dim=ByteSize (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=CPURate (n3): loadNormal, reason: load is within 5% of mean [load=100 meanLoad=100 fractionUsed=5.00% meanUtil=7.50% capacity=2000] +[mmaid=1] evaluating s3: node load loadNormal, store load overloadUrgent, worst dim WriteBandwidth +[mmaid=1] overload-continued s3 ((store=overloadUrgent worst=WriteBandwidth cpu=loadLow writes=overloadUrgent bytes=loadNormal node=loadNormal high_disk=false frac_pending=0.00,0.00(true))) - within grace period +[mmaid=1] store s3 was added to shedding store list +[mmaid=1] load summary for dim=CPURate (s4): loadLow, reason: load is >10% below mean [load=50 meanLoad=75 fractionUsed=5.00% meanUtil=7.50% capacity=1000] +[mmaid=1] load summary for 
dim=WriteBandwidth (s4): loadLow, reason: load is >10% below mean [load=10000000 meanLoad=32500000 fractionUsed=10.00% meanUtil=32.50% capacity=100000000] +[mmaid=1] load summary for dim=ByteSize (s4): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=CPURate (n3): loadNormal, reason: load is within 5% of mean [load=100 meanLoad=100 fractionUsed=5.00% meanUtil=7.50% capacity=2000] +[mmaid=1] evaluating s4: node load loadNormal, store load loadNormal, worst dim ByteSize +[mmaid=1] start processing shedding store s1: cpu node load loadNoChange, store load overloadSlow, worst dim CPURate +[mmaid=1] top-K[CPURate] ranges for s1 with lease on local s1: r1:[cpu:10ns/s, write-bandwidth:10 MB/s, byte-size:0 B] +[mmaid=1] local store s1 is CPU overloaded (overloadSlow >= overloadSlow), attempting lease transfers first +[mmaid=1] considering lease-transfer r1 from s1: candidates are [2 3] +[mmaid=1] load summary for dim=CPURate (s1): overloadSlow, reason: fractionUsed < 75% [load=100 meanLoad=83 fractionUsed=10.00% meanUtil=8.33% capacity=1000] +[mmaid=1] load summary for dim=WriteBandwidth (s1): loadLow, reason: load is >10% below mean [load=20000000 meanLoad=40000000 fractionUsed=20.00% meanUtil=40.00% capacity=100000000] +[mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=CPURate (n1): loadNoChange, reason: fractionUsed < 75% [load=100 meanLoad=100 fractionUsed=10.00% meanUtil=7.50% capacity=1000] +[mmaid=1] load summary for dim=CPURate (s2): overloadSlow, reason: fractionUsed < 75% [load=100 meanLoad=83 fractionUsed=10.00% meanUtil=8.33% capacity=1000] +[mmaid=1] load summary for dim=WriteBandwidth (s2): loadLow, reason: load is >10% below mean [load=20000000 meanLoad=40000000 fractionUsed=20.00% meanUtil=40.00% capacity=100000000] +[mmaid=1] load summary for dim=ByteSize (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=CPURate (n2): loadNoChange, reason: fractionUsed < 75% [load=100 meanLoad=100 fractionUsed=10.00% meanUtil=7.50% capacity=1000] +[mmaid=1] load summary for dim=CPURate (s3): loadLow, reason: load is >10% below mean [load=50 meanLoad=83 fractionUsed=5.00% meanUtil=8.33% capacity=1000] +[mmaid=1] load summary for dim=WriteBandwidth (s3): overloadUrgent, reason: fractionUsed > 75% and >1.5x meanUtil [load=80000000 meanLoad=40000000 fractionUsed=80.00% meanUtil=40.00% capacity=100000000] +[mmaid=1] load summary for dim=ByteSize (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=CPURate (n3): loadNormal, reason: load is within 5% of mean [load=100 meanLoad=100 fractionUsed=5.00% meanUtil=7.50% capacity=2000] +[mmaid=1] candidate store 2 was discarded due to (nls=false overloadDim=true pending_thresh=false): sls=(store=overloadSlow worst=CPURate cpu=overloadSlow writes=loadLow bytes=loadNormal node=loadNoChange high_disk=false frac_pending=0.00,0.00(true)) +[mmaid=1] discarding candidates with higher load than loadThreshold(overloadSlow): s3(SLS:overloadUrgent, overloadedDimLoadSummary:loadLow), overloadedDim:CPURate +[mmaid=1] sortTargetCandidateSetAndPick: no candidates due to load +[mmaid=1] result(failed): no candidates to move lease from n1s1 for 
r1 after sortTargetCandidateSetAndPick +[mmaid=1] attempting to shed replicas next +[mmaid=1] load summary for dim=CPURate (s1): overloadSlow, reason: fractionUsed < 75% [load=100 meanLoad=75 fractionUsed=10.00% meanUtil=7.50% capacity=1000] +[mmaid=1] load summary for dim=WriteBandwidth (s1): loadLow, reason: load is >10% below mean [load=20000000 meanLoad=32500000 fractionUsed=20.00% meanUtil=32.50% capacity=100000000] +[mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=CPURate (n1): loadNoChange, reason: fractionUsed < 75% [load=100 meanLoad=100 fractionUsed=10.00% meanUtil=7.50% capacity=1000] +[mmaid=1] considering replica-transfer r1 from s1: store load [cpu:100ns/s, write-bandwidth:20 MB/s, byte-size:0 B] +[mmaid=1] candidates are: +[mmaid=1] result(failed): no candidates found for r1 after exclusions +[mmaid=1] start processing shedding store s2: cpu node load loadNoChange, store load overloadSlow, worst dim CPURate +[mmaid=1] top-K[CPURate] ranges for s2 with lease on local s1: r1:[cpu:10ns/s, write-bandwidth:10 MB/s, byte-size:0 B] +[mmaid=1] skipping remote store s2: in lease shedding grace period +[mmaid=1] start processing shedding store s3: cpu node load loadNormal, store load overloadUrgent, worst dim WriteBandwidth +[mmaid=1] top-K[WriteBandwidth] ranges for s3 with lease on local s1: r1:[cpu:10ns/s, write-bandwidth:10 MB/s, byte-size:0 B] +[mmaid=1] attempting to shed replicas next +[mmaid=1] load summary for dim=CPURate (s3): loadLow, reason: load is >10% below mean [load=50 meanLoad=75 fractionUsed=5.00% meanUtil=7.50% capacity=1000] +[mmaid=1] load summary for dim=WriteBandwidth (s3): overloadUrgent, reason: fractionUsed > 75% and >1.5x meanUtil [load=80000000 meanLoad=32500000 fractionUsed=80.00% meanUtil=32.50% capacity=100000000] +[mmaid=1] load summary for dim=ByteSize (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=CPURate (n3): loadNormal, reason: load is within 5% of mean [load=100 meanLoad=100 fractionUsed=5.00% meanUtil=7.50% capacity=2000] +[mmaid=1] load summary for dim=CPURate (s4): loadLow, reason: load is >10% below mean [load=50 meanLoad=75 fractionUsed=5.00% meanUtil=7.50% capacity=1000] +[mmaid=1] load summary for dim=WriteBandwidth (s4): loadLow, reason: load is >10% below mean [load=10000000 meanLoad=32500000 fractionUsed=10.00% meanUtil=32.50% capacity=100000000] +[mmaid=1] load summary for dim=ByteSize (s4): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=CPURate (n3): loadNormal, reason: load is within 5% of mean [load=100 meanLoad=100 fractionUsed=5.00% meanUtil=7.50% capacity=2000] +[mmaid=1] considering replica-transfer r1 from s3: store load [cpu:50ns/s, write-bandwidth:80 MB/s, byte-size:0 B] +[mmaid=1] candidates are: +[mmaid=1] s4: (store=loadNormal worst=ByteSize cpu=loadLow writes=loadLow bytes=loadNormal node=loadNormal high_disk=false frac_pending=0.00,0.00(true)) +[mmaid=1] sortTargetCandidateSetAndPick: candidates: s4(SLS:loadNormal, overloadedDimLoadSummary:loadLow), overloadedDim:WriteBandwidth, picked s4 +[mmaid=1] load summary for dim=CPURate (s4): loadLow, reason: load is >10% below mean [load=61 meanLoad=75 fractionUsed=6.10% meanUtil=7.50% capacity=1000] +[mmaid=1] load summary for 
dim=WriteBandwidth (s4): loadLow, reason: load is >10% below mean [load=21000000 meanLoad=32500000 fractionUsed=21.00% meanUtil=32.50% capacity=100000000]
+[mmaid=1] load summary for dim=ByteSize (s4): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000]
+[mmaid=1] load summary for dim=CPURate (n3): overloadSlow, reason: load is >10% above mean [load=111 meanLoad=100 fractionUsed=5.55% meanUtil=7.50% capacity=2000]
+[mmaid=1] load summary for dim=CPURate (s3): loadLow, reason: load is >10% below mean [load=40 meanLoad=75 fractionUsed=4.00% meanUtil=7.50% capacity=1000]
+[mmaid=1] load summary for dim=WriteBandwidth (s3): overloadSlow, reason: fractionUsed < 75% and >1.75x meanUtil [load=70000000 meanLoad=32500000 fractionUsed=70.00% meanUtil=32.50% capacity=100000000]
+[mmaid=1] load summary for dim=ByteSize (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000]
+[mmaid=1] load summary for dim=CPURate (n3): loadNormal, reason: load is within 5% of mean [load=90 meanLoad=100 fractionUsed=4.50% meanUtil=7.50% capacity=2000]
+[mmaid=1] cannot add load to n3s4: due to target_summary(overloadSlow)>=loadNoChange,target-node(overloadSlow)>target-store(loadNormal)
+[mmaid=1] [target_sls:(store=loadNormal worst=ByteSize cpu=loadLow writes=loadLow bytes=loadNormal node=overloadSlow high_disk=false frac_pending=0.00,0.00(true)),src_sls:(store=overloadSlow worst=WriteBandwidth cpu=loadLow writes=overloadSlow bytes=loadNormal node=loadNormal high_disk=false frac_pending=0.00,0.00(true))]
+[mmaid=1] result(failed): cannot shed from s3 to s4 for r1: delta load [cpu:10ns/s, write-bandwidth:10 MB/s, byte-size:0 B]
+[mmaid=1] rebalancing pass failures (store,reason:count): (s1,no-cand-load:1), (s3,no-cand-to-accept-load:1)
+pending(0)
+
+# Advance time beyond the lease shedding grace period.
+tick seconds=300
+----
+t=5m0s
+
+# s3 is overloaded. Since the lease is on s1 (not s3), s3 can only shed via
+# replica transfer. s4 is on the same node (n3) and is a valid lateral target.
+# TODO(during review): s4 actually gets rejected on the last mile because for
+# some reason the "due to target_summary(overloadSlow)>=loadNoChange" check fires;
+# I need to piece this together, but this doesn't seem right. We're on a low-CPU
+# node and simply trying to move IO between two stores of the same node. This
+# seems like an unambiguously good idea (if we can't move the load elsewhere).
+rebalance-stores store-id=1 +---- +[mmaid=2] rebalanceStores begins +[mmaid=2] cluster means: (stores-load [cpu:75ns/s, write-bandwidth:32 MB/s, byte-size:0 B]) (stores-capacity [cpu:1µs/s, write-bandwidth:100 MB/s, byte-size:1.0 kB]) (nodes-cpu-load 100) (nodes-cpu-capacity 1333) +[mmaid=2] evaluating s1: node load loadNoChange, store load overloadSlow, worst dim CPURate +[mmaid=2] store s1 was added to shedding store list +[mmaid=2] evaluating s2: node load loadNoChange, store load overloadSlow, worst dim CPURate +[mmaid=2] store s2 was added to shedding store list +[mmaid=2] evaluating s3: node load loadNormal, store load overloadUrgent, worst dim WriteBandwidth +[mmaid=2] store s3 was added to shedding store list +[mmaid=2] evaluating s4: node load loadNormal, store load loadNormal, worst dim ByteSize +[mmaid=2] start processing shedding store s1: cpu node load loadNoChange, store load overloadSlow, worst dim CPURate +[mmaid=2] top-K[CPURate] ranges for s1 with lease on local s1: r1:[cpu:10ns/s, write-bandwidth:10 MB/s, byte-size:0 B] +[mmaid=2] local store s1 is CPU overloaded (overloadSlow >= overloadSlow), attempting lease transfers first +[mmaid=2] considering lease-transfer r1 from s1: candidates are [2 3] +[mmaid=2] load summary for dim=CPURate (s1): overloadSlow, reason: fractionUsed < 75% [load=100 meanLoad=83 fractionUsed=10.00% meanUtil=8.33% capacity=1000] +[mmaid=2] load summary for dim=WriteBandwidth (s1): loadLow, reason: load is >10% below mean [load=20000000 meanLoad=40000000 fractionUsed=20.00% meanUtil=40.00% capacity=100000000] +[mmaid=2] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=2] load summary for dim=CPURate (n1): loadNoChange, reason: fractionUsed < 75% [load=100 meanLoad=100 fractionUsed=10.00% meanUtil=7.50% capacity=1000] +[mmaid=2] load summary for dim=CPURate (s2): overloadSlow, reason: fractionUsed < 75% [load=100 meanLoad=83 fractionUsed=10.00% meanUtil=8.33% capacity=1000] +[mmaid=2] load summary for dim=WriteBandwidth (s2): loadLow, reason: load is >10% below mean [load=20000000 meanLoad=40000000 fractionUsed=20.00% meanUtil=40.00% capacity=100000000] +[mmaid=2] load summary for dim=ByteSize (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=2] load summary for dim=CPURate (n2): loadNoChange, reason: fractionUsed < 75% [load=100 meanLoad=100 fractionUsed=10.00% meanUtil=7.50% capacity=1000] +[mmaid=2] load summary for dim=CPURate (s3): loadLow, reason: load is >10% below mean [load=50 meanLoad=83 fractionUsed=5.00% meanUtil=8.33% capacity=1000] +[mmaid=2] load summary for dim=WriteBandwidth (s3): overloadUrgent, reason: fractionUsed > 75% and >1.5x meanUtil [load=80000000 meanLoad=40000000 fractionUsed=80.00% meanUtil=40.00% capacity=100000000] +[mmaid=2] load summary for dim=ByteSize (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=2] load summary for dim=CPURate (n3): loadNormal, reason: load is within 5% of mean [load=100 meanLoad=100 fractionUsed=5.00% meanUtil=7.50% capacity=2000] +[mmaid=2] candidate store 2 was discarded due to (nls=false overloadDim=true pending_thresh=false): sls=(store=overloadSlow worst=CPURate cpu=overloadSlow writes=loadLow bytes=loadNormal node=loadNoChange high_disk=false frac_pending=0.00,0.00(true)) +[mmaid=2] discarding candidates with higher load than 
loadThreshold(overloadSlow): s3(SLS:overloadUrgent, overloadedDimLoadSummary:loadLow), overloadedDim:CPURate +[mmaid=2] sortTargetCandidateSetAndPick: no candidates due to load +[mmaid=2] result(failed): no candidates to move lease from n1s1 for r1 after sortTargetCandidateSetAndPick +[mmaid=2] attempting to shed replicas next +[mmaid=2] load summary for dim=CPURate (s1): overloadSlow, reason: fractionUsed < 75% [load=100 meanLoad=75 fractionUsed=10.00% meanUtil=7.50% capacity=1000] +[mmaid=2] load summary for dim=WriteBandwidth (s1): loadLow, reason: load is >10% below mean [load=20000000 meanLoad=32500000 fractionUsed=20.00% meanUtil=32.50% capacity=100000000] +[mmaid=2] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=2] load summary for dim=CPURate (n1): loadNoChange, reason: fractionUsed < 75% [load=100 meanLoad=100 fractionUsed=10.00% meanUtil=7.50% capacity=1000] +[mmaid=2] considering replica-transfer r1 from s1: store load [cpu:100ns/s, write-bandwidth:20 MB/s, byte-size:0 B] +[mmaid=2] candidates are: +[mmaid=2] result(failed): no candidates found for r1 after exclusions +[mmaid=2] start processing shedding store s2: cpu node load loadNoChange, store load overloadSlow, worst dim CPURate +[mmaid=2] top-K[CPURate] ranges for s2 with lease on local s1: r1:[cpu:10ns/s, write-bandwidth:10 MB/s, byte-size:0 B] +[mmaid=2] attempting to shed replicas next +[mmaid=2] load summary for dim=CPURate (s2): overloadSlow, reason: fractionUsed < 75% [load=100 meanLoad=75 fractionUsed=10.00% meanUtil=7.50% capacity=1000] +[mmaid=2] load summary for dim=WriteBandwidth (s2): loadLow, reason: load is >10% below mean [load=20000000 meanLoad=32500000 fractionUsed=20.00% meanUtil=32.50% capacity=100000000] +[mmaid=2] load summary for dim=ByteSize (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=2] load summary for dim=CPURate (n2): loadNoChange, reason: fractionUsed < 75% [load=100 meanLoad=100 fractionUsed=10.00% meanUtil=7.50% capacity=1000] +[mmaid=2] considering replica-transfer r1 from s2: store load [cpu:100ns/s, write-bandwidth:20 MB/s, byte-size:0 B] +[mmaid=2] candidates are: +[mmaid=2] result(failed): no candidates found for r1 after exclusions +[mmaid=2] start processing shedding store s3: cpu node load loadNormal, store load overloadUrgent, worst dim WriteBandwidth +[mmaid=2] top-K[WriteBandwidth] ranges for s3 with lease on local s1: r1:[cpu:10ns/s, write-bandwidth:10 MB/s, byte-size:0 B] +[mmaid=2] attempting to shed replicas next +[mmaid=2] load summary for dim=CPURate (s3): loadLow, reason: load is >10% below mean [load=50 meanLoad=75 fractionUsed=5.00% meanUtil=7.50% capacity=1000] +[mmaid=2] load summary for dim=WriteBandwidth (s3): overloadUrgent, reason: fractionUsed > 75% and >1.5x meanUtil [load=80000000 meanLoad=32500000 fractionUsed=80.00% meanUtil=32.50% capacity=100000000] +[mmaid=2] load summary for dim=ByteSize (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=2] load summary for dim=CPURate (n3): loadNormal, reason: load is within 5% of mean [load=100 meanLoad=100 fractionUsed=5.00% meanUtil=7.50% capacity=2000] +[mmaid=2] load summary for dim=CPURate (s4): loadLow, reason: load is >10% below mean [load=50 meanLoad=75 fractionUsed=5.00% meanUtil=7.50% capacity=1000] +[mmaid=2] load summary for dim=WriteBandwidth 
(s4): loadLow, reason: load is >10% below mean [load=10000000 meanLoad=32500000 fractionUsed=10.00% meanUtil=32.50% capacity=100000000] +[mmaid=2] load summary for dim=ByteSize (s4): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=2] load summary for dim=CPURate (n3): loadNormal, reason: load is within 5% of mean [load=100 meanLoad=100 fractionUsed=5.00% meanUtil=7.50% capacity=2000] +[mmaid=2] considering replica-transfer r1 from s3: store load [cpu:50ns/s, write-bandwidth:80 MB/s, byte-size:0 B] +[mmaid=2] candidates are: +[mmaid=2] s4: (store=loadNormal worst=ByteSize cpu=loadLow writes=loadLow bytes=loadNormal node=loadNormal high_disk=false frac_pending=0.00,0.00(true)) +[mmaid=2] sortTargetCandidateSetAndPick: candidates: s4(SLS:loadNormal, overloadedDimLoadSummary:loadLow), overloadedDim:WriteBandwidth, picked s4 +[mmaid=2] load summary for dim=CPURate (s4): loadLow, reason: load is >10% below mean [load=61 meanLoad=75 fractionUsed=6.10% meanUtil=7.50% capacity=1000] +[mmaid=2] load summary for dim=WriteBandwidth (s4): loadLow, reason: load is >10% below mean [load=21000000 meanLoad=32500000 fractionUsed=21.00% meanUtil=32.50% capacity=100000000] +[mmaid=2] load summary for dim=ByteSize (s4): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=2] load summary for dim=CPURate (n3): overloadSlow, reason: load is >10% above mean [load=111 meanLoad=100 fractionUsed=5.55% meanUtil=7.50% capacity=2000] +[mmaid=2] load summary for dim=CPURate (s3): loadLow, reason: load is >10% below mean [load=40 meanLoad=75 fractionUsed=4.00% meanUtil=7.50% capacity=1000] +[mmaid=2] load summary for dim=WriteBandwidth (s3): overloadSlow, reason: fractionUsed < 75% and >1.75x meanUtil [load=70000000 meanLoad=32500000 fractionUsed=70.00% meanUtil=32.50% capacity=100000000] +[mmaid=2] load summary for dim=ByteSize (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=2] load summary for dim=CPURate (n3): loadNormal, reason: load is within 5% of mean [load=90 meanLoad=100 fractionUsed=4.50% meanUtil=7.50% capacity=2000] +[mmaid=2] cannot add load to n3s4: due to target_summary(overloadSlow)>=loadNoChange,target-node(overloadSlow)>target-store(loadNormal) +[mmaid=2] [target_sls:(store=loadNormal worst=ByteSize cpu=loadLow writes=loadLow bytes=loadNormal node=overloadSlow high_disk=false frac_pending=0.00,0.00(true)),src_sls:(store=overloadSlow worst=WriteBandwidth cpu=loadLow writes=overloadSlow bytes=loadNormal node=loadNormal high_disk=false frac_pending=0.00,0.00(true))] +[mmaid=2] result(failed): cannot shed from s3 to s4 for r1: delta load [cpu:10ns/s, write-bandwidth:10 MB/s, byte-size:0 B] +[mmaid=2] rebalancing pass failures (store,reason:count): (s1,no-cand-load:1), (s2,no-cand:1), (s3,no-cand-to-accept-load:1) +pending(0) diff --git a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_replica_refusing_target.txt b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_replica_refusing_target.txt new file mode 100644 index 000000000000..863a3b958a7d --- /dev/null +++ b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_replica_refusing_target.txt @@ -0,0 +1,106 @@ +# This test verifies that the multi-metric allocator skips stores that aren't +# ready to receive replicas when considering replica 
transfers. +# +# Setup: s1 holds the lease for r1, with replicas on s1, s2, s3. +# - s1: overloaded (wants to shed load) +# - s2, s3: also overloaded (not good targets) +# - s4: low load (would be ideal target) but marked as refusing replicas +# +# Expected: s4 is filtered out due to replica disposition. +# With no valid targets remaining, no replica transfer occurs. + +set-store + store-id=1 node-id=1 + store-id=2 node-id=2 + store-id=3 node-id=3 + store-id=4 node-id=4 +---- +node-id=1 locality-tiers=node=1 + store-id=1 attrs= +node-id=2 locality-tiers=node=2 + store-id=2 attrs= +node-id=3 locality-tiers=node=3 + store-id=3 attrs= +node-id=4 locality-tiers=node=4 + store-id=4 attrs= + +store-load-msg + store-id=1 node-id=1 load=[1000,0,0] capacity=[1000,1000,1000] secondary-load=0 load-time=0s + store-id=2 node-id=2 load=[1000,0,0] capacity=[1000,1000,1000] secondary-load=0 load-time=0s + store-id=3 node-id=3 load=[1000,0,0] capacity=[1000,1000,1000] secondary-load=0 load-time=0s + store-id=4 node-id=4 load=[100,0,0] capacity=[1000,1000,1000] secondary-load=0 load-time=0s +---- + +store-leaseholder-msg +store-id=1 + range-id=1 load=[100,0,0] raft-cpu=100 + store-id=1 replica-id=1 type=VOTER_FULL leaseholder=true + store-id=2 replica-id=2 type=VOTER_FULL + store-id=3 replica-id=3 type=VOTER_FULL + config=num_replicas=3 constraints={} voter_constraints={} +---- + +# Mark s4 as refusing replicas. +set-store-status store-id=4 replicas=refusing +---- +ok refusing=replicas + +# s1 tries to shed load. Lease transfers fail (all other replica-holding stores +# are also overloaded). Replica transfer attempted but s4 is filtered out. +rebalance-stores store-id=1 +---- +[mmaid=1] rebalanceStores begins +[mmaid=1] cluster means: (stores-load [cpu:775ns/s, write-bandwidth:0 B/s, byte-size:0 B]) (stores-capacity [cpu:1µs/s, write-bandwidth:1.0 kB/s, byte-size:1.0 kB]) (nodes-cpu-load 775) (nodes-cpu-capacity 1000) +[mmaid=1] load summary for dim=CPURate (s1): overloadUrgent, reason: fractionUsed > 90% [load=1000 meanLoad=775 fractionUsed=100.00% meanUtil=77.50% capacity=1000] +[mmaid=1] load summary for dim=WriteBandwidth (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=CPURate (n1): overloadUrgent, reason: fractionUsed > 90% [load=1000 meanLoad=775 fractionUsed=100.00% meanUtil=77.50% capacity=1000] +[mmaid=1] evaluating s1: node load overloadUrgent, store load overloadUrgent, worst dim CPURate +[mmaid=1] overload-continued s1 ((store=overloadUrgent worst=CPURate cpu=overloadUrgent writes=loadNormal bytes=loadNormal node=overloadUrgent high_disk=false frac_pending=0.00,0.00(true))) - within grace period +[mmaid=1] store s1 was added to shedding store list +[mmaid=1] load summary for dim=CPURate (s2): overloadUrgent, reason: fractionUsed > 90% [load=1000 meanLoad=775 fractionUsed=100.00% meanUtil=77.50% capacity=1000] +[mmaid=1] load summary for dim=WriteBandwidth (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=ByteSize (s2): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=CPURate (n2): overloadUrgent, reason: fractionUsed > 90% [load=1000 
meanLoad=775 fractionUsed=100.00% meanUtil=77.50% capacity=1000] +[mmaid=1] evaluating s2: node load overloadUrgent, store load overloadUrgent, worst dim CPURate +[mmaid=1] overload-continued s2 ((store=overloadUrgent worst=CPURate cpu=overloadUrgent writes=loadNormal bytes=loadNormal node=overloadUrgent high_disk=false frac_pending=0.00,0.00(true))) - within grace period +[mmaid=1] store s2 was added to shedding store list +[mmaid=1] load summary for dim=CPURate (s3): overloadUrgent, reason: fractionUsed > 90% [load=1000 meanLoad=775 fractionUsed=100.00% meanUtil=77.50% capacity=1000] +[mmaid=1] load summary for dim=WriteBandwidth (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=ByteSize (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=CPURate (n3): overloadUrgent, reason: fractionUsed > 90% [load=1000 meanLoad=775 fractionUsed=100.00% meanUtil=77.50% capacity=1000] +[mmaid=1] evaluating s3: node load overloadUrgent, store load overloadUrgent, worst dim CPURate +[mmaid=1] overload-continued s3 ((store=overloadUrgent worst=CPURate cpu=overloadUrgent writes=loadNormal bytes=loadNormal node=overloadUrgent high_disk=false frac_pending=0.00,0.00(true))) - within grace period +[mmaid=1] store s3 was added to shedding store list +[mmaid=1] load summary for dim=CPURate (s4): loadLow, reason: load is >10% below mean [load=100 meanLoad=775 fractionUsed=10.00% meanUtil=77.50% capacity=1000] +[mmaid=1] load summary for dim=WriteBandwidth (s4): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=ByteSize (s4): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=CPURate (n4): loadLow, reason: load is >10% below mean [load=100 meanLoad=775 fractionUsed=10.00% meanUtil=77.50% capacity=1000] +[mmaid=1] evaluating s4: node load loadLow, store load loadNormal, worst dim WriteBandwidth +[mmaid=1] start processing shedding store s1: cpu node load overloadUrgent, store load overloadUrgent, worst dim CPURate +[mmaid=1] top-K[CPURate] ranges for s1 with lease on local s1: r1:[cpu:100ns/s, write-bandwidth:0 B/s, byte-size:0 B] +[mmaid=1] local store s1 is CPU overloaded (overloadUrgent >= overloadSlow), attempting lease transfers first +[mmaid=1] considering lease-transfer r1 from s1: candidates are [2 3] +[mmaid=1] load summary for dim=CPURate (s1): loadNormal, reason: load is within 5% of mean [load=1000 meanLoad=1000 fractionUsed=100.00% meanUtil=100.00% capacity=1000] +[mmaid=1] load summary for dim=WriteBandwidth (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=CPURate (n1): loadNormal, reason: load is within 5% of mean [load=1000 meanLoad=1000 fractionUsed=100.00% meanUtil=100.00% capacity=1000] +[mmaid=1] result(failed): skipping r1 since store not overloaded relative to candidates +[mmaid=1] attempting to shed replicas next +[mmaid=1] skipping s4 for replica transfer: replica disposition refusing (health ok) +[mmaid=1] pre-means filtered 1 stores → remaining 
[1 2 3], means: store={[1000 0 0] [1000 1000 1000] [1 0 0] [0 0]} node={1000 1000 1} +[mmaid=1] load summary for dim=CPURate (s1): loadNormal, reason: load is within 5% of mean [load=1000 meanLoad=1000 fractionUsed=100.00% meanUtil=100.00% capacity=1000] +[mmaid=1] load summary for dim=WriteBandwidth (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=ByteSize (s1): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] +[mmaid=1] load summary for dim=CPURate (n1): loadNormal, reason: load is within 5% of mean [load=1000 meanLoad=1000 fractionUsed=100.00% meanUtil=100.00% capacity=1000] +[mmaid=1] considering replica-transfer r1 from s1: store load [cpu:1µs/s, write-bandwidth:0 B/s, byte-size:0 B] +[mmaid=1] candidates are: +[mmaid=1] result(failed): no candidates found for r1 after exclusions +[mmaid=1] start processing shedding store s2: cpu node load overloadUrgent, store load overloadUrgent, worst dim CPURate +[mmaid=1] top-K[CPURate] ranges for s2 with lease on local s1: r1:[cpu:100ns/s, write-bandwidth:0 B/s, byte-size:0 B] +[mmaid=1] skipping remote store s2: in lease shedding grace period +[mmaid=1] start processing shedding store s3: cpu node load overloadUrgent, store load overloadUrgent, worst dim CPURate +[mmaid=1] top-K[CPURate] ranges for s3 with lease on local s1: r1:[cpu:100ns/s, write-bandwidth:0 B/s, byte-size:0 B] +[mmaid=1] skipping remote store s3: in lease shedding grace period +[mmaid=1] rebalancing pass failures (store,reason:count): (s1,not-overloaded:1) +pending(0) diff --git a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_replica_unbounded.txt b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_replica_unbounded.txt index ce917a8fcafd..98cd2faf0510 100644 --- a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_replica_unbounded.txt +++ b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/rebalance_stores_cpu_replica_unbounded.txt @@ -19,7 +19,6 @@ rebalance-stores store-id=1 max-range-move-count=999 fraction-pending-decrease-t [mmaid=2] start processing shedding store s3: cpu node load overloadUrgent, store load overloadUrgent, worst dim CPURate [mmaid=2] top-K[CPURate] ranges for s3 with lease on local s1: r3:[cpu:100ns/s, write-bandwidth:0 B/s, byte-size:0 B] r2:[cpu:100ns/s, write-bandwidth:0 B/s, byte-size:0 B] r1:[cpu:100ns/s, write-bandwidth:0 B/s, byte-size:0 B] [mmaid=2] attempting to shed replicas next -[mmaid=2] excluding all stores on n3 due to overload/fd status [mmaid=2] load summary for dim=CPURate (s3): overloadUrgent, reason: fractionUsed > 90% [load=1000 meanLoad=525 fractionUsed=100.00% meanUtil=52.50% capacity=1000] [mmaid=2] load summary for dim=WriteBandwidth (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=2] load summary for dim=ByteSize (s3): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] @@ -29,6 +28,8 @@ rebalance-stores store-id=1 max-range-move-count=999 fraction-pending-decrease-t [mmaid=2] load summary for dim=ByteSize (s4): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=2] load summary for dim=CPURate (n4): loadLow, reason: load is >10% below mean 
[load=100 meanLoad=525 fractionUsed=10.00% meanUtil=52.50% capacity=1000] [mmaid=2] considering replica-transfer r3 from s3: store load [cpu:1µs/s, write-bandwidth:0 B/s, byte-size:0 B] +[mmaid=2] candidates are: +[mmaid=2] s4: (store=loadNormal worst=WriteBandwidth cpu=loadLow writes=loadNormal bytes=loadNormal node=loadLow high_disk=false frac_pending=0.00,0.00(true)) [mmaid=2] sortTargetCandidateSetAndPick: candidates: s4(SLS:loadNormal, overloadedDimLoadSummary:loadLow), overloadedDim:CPURate, picked s4 [mmaid=2] load summary for dim=CPURate (s4): loadLow, reason: load is >10% below mean [load=210 meanLoad=525 fractionUsed=21.00% meanUtil=52.50% capacity=1000] [mmaid=2] load summary for dim=WriteBandwidth (s4): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] @@ -49,6 +50,8 @@ rebalance-stores store-id=1 max-range-move-count=999 fraction-pending-decrease-t [mmaid=2] load summary for dim=ByteSize (s4): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=2] load summary for dim=CPURate (n4): loadLow, reason: load is >10% below mean [load=210 meanLoad=525 fractionUsed=21.00% meanUtil=52.50% capacity=1000] [mmaid=2] considering replica-transfer r2 from s3: store load [cpu:900ns/s, write-bandwidth:0 B/s, byte-size:0 B] +[mmaid=2] candidates are: +[mmaid=2] s4: (store=loadNormal worst=WriteBandwidth cpu=loadLow writes=loadNormal bytes=loadNormal node=loadLow high_disk=false frac_pending=1.10,0.00(false)) [mmaid=2] sortTargetCandidateSetAndPick: candidates: s4(SLS:loadNormal, overloadedDimLoadSummary:loadLow), overloadedDim:CPURate, picked s4 [mmaid=2] load summary for dim=CPURate (s4): loadLow, reason: load is >10% below mean [load=320 meanLoad=525 fractionUsed=32.00% meanUtil=52.50% capacity=1000] [mmaid=2] load summary for dim=WriteBandwidth (s4): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] @@ -69,6 +72,8 @@ rebalance-stores store-id=1 max-range-move-count=999 fraction-pending-decrease-t [mmaid=2] load summary for dim=ByteSize (s4): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] [mmaid=2] load summary for dim=CPURate (n4): loadLow, reason: load is >10% below mean [load=320 meanLoad=525 fractionUsed=32.00% meanUtil=52.50% capacity=1000] [mmaid=2] considering replica-transfer r1 from s3: store load [cpu:800ns/s, write-bandwidth:0 B/s, byte-size:0 B] +[mmaid=2] candidates are: +[mmaid=2] s4: (store=loadNormal worst=WriteBandwidth cpu=loadLow writes=loadNormal bytes=loadNormal node=loadLow high_disk=false frac_pending=2.20,0.00(false)) [mmaid=2] sortTargetCandidateSetAndPick: candidates: s4(SLS:loadNormal, overloadedDimLoadSummary:loadLow), overloadedDim:CPURate, picked s4 [mmaid=2] load summary for dim=CPURate (s4): loadLow, reason: load is >10% below mean [load=430 meanLoad=525 fractionUsed=43.00% meanUtil=52.50% capacity=1000] [mmaid=2] load summary for dim=WriteBandwidth (s4): loadNormal, reason: load is within 5% of mean [load=0 meanLoad=0 fractionUsed=0.00% meanUtil=0.00% capacity=1000] diff --git a/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/replica_disposition.txt b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/replica_disposition.txt new file mode 100644 index 000000000000..0575bcd9ce58 --- /dev/null +++ 
b/pkg/kv/kvserver/allocator/mmaprototype/testdata/cluster_state/replica_disposition.txt @@ -0,0 +1,181 @@ +# Test retainReadyReplicaTargetStoresOnly which filters stores based on: +# 1. Health (must be HealthOK) +# 2. Store-level replica disposition (must be ReplicaDispositionOK) +# 3. High disk space utilization (>90%) +# +# Note: constraint-based exclusions (e.g., to avoid multiple replicas per node) +# are handled post-means at the caller level, not here. + +set-store + store-id=1 node-id=1 locality-tiers=region=us + store-id=2 node-id=2 locality-tiers=region=us + store-id=3 node-id=3 locality-tiers=region=us +---- +node-id=1 locality-tiers=region=us,node=1 + store-id=1 attrs= +node-id=2 locality-tiers=region=us,node=2 + store-id=2 attrs= +node-id=3 locality-tiers=region=us,node=3 + store-id=3 attrs= + +# Set up stores with normal disk usage (50% used). +store-load-msg + store-id=1 node-id=1 load=[100,0,50] capacity=[200,100,100] load-time=0s + store-id=2 node-id=2 load=[50,0,50] capacity=[200,100,100] load-time=0s + store-id=3 node-id=3 load=[50,0,50] capacity=[200,100,100] load-time=0s +---- + +store-leaseholder-msg +store-id=1 + range-id=1 load=[10,0,0] + config=(num_replicas=3) + store-id=1 replica-id=1 type=VOTER_FULL leaseholder=true + store-id=2 replica-id=2 type=VOTER_FULL leaseholder=false + store-id=3 replica-id=3 type=VOTER_FULL leaseholder=false +---- + +# All stores healthy and accepting replicas - all retained. +retain-ready-replica-target-stores-only in=(1,2,3) +---- +[1 2 3] + +# Only input stores matter. +retain-ready-replica-target-stores-only in=(1,3) +---- +[1 3] + +# Mark s2 as shedding leases, which should have no effect +# since we're looking at replicas. +set-store-status store-id=2 leases=shedding +---- +ok shedding=leases + +retain-ready-replica-target-stores-only in=(1,2,3) +---- +[1 2 3] + +# Make store 2 unhealthy - but we rely only on the +# disposition, so no filtering occurs. +# This can't happen in production since unhealthy +# stores can't have a green disposition. +set-store-status store-id=2 health=unhealthy leases=ok +---- +unhealthy accepting all + +retain-ready-replica-target-stores-only in=(1,2,3) +---- +[1 2 3] + +# Different kind of unhealthy. Same result. +set-store-status store-id=2 health=unknown +---- +unknown accepting all + +retain-ready-replica-target-stores-only in=(1,2,3) +---- +[1 2 3] + +# Restore store 2, make store 3 refuse replicas at store level. +set-store-status store-id=2 health=ok +---- +ok accepting all + +set-store-status store-id=3 replicas=refusing +---- +ok refusing=replicas + +retain-ready-replica-target-stores-only in=(1,2,3) +---- +skipping s3 for replica transfer: replica disposition refusing (health ok) +[1 2] + +# Shedding and refusing are treated the same. +set-store-status store-id=3 health=ok replicas=shedding +---- +ok shedding=replicas + +retain-ready-replica-target-stores-only in=(1,2,3) +---- +skipping s3 for replica transfer: replica disposition shedding (health ok) +[1 2] + +# Restore store 3. +set-store-status store-id=3 replicas=ok +---- +ok accepting all + +# All stores ready again. +retain-ready-replica-target-stores-only in=(1,2,3) +---- +[1 2 3] + +# Test high disk utilization filtering. +# Set store 2 to >90% disk usage. 
+store-load-msg + store-id=2 node-id=2 load=[50,0,95] capacity=[200,100,100] load-time=0s +---- + +retain-ready-replica-target-stores-only in=(1,2,3) +---- +skipping s2 for replica transfer: high disk utilization (health ok) +[1 3] + +# Test a combination of filters: s2 still high disk, +# s3 refusing replicas. +set-store-status store-id=3 replicas=refusing +---- +ok refusing=replicas + +retain-ready-replica-target-stores-only in=(1,2,3) +---- +skipping s2 for replica transfer: high disk utilization (health ok) +skipping s3 for replica transfer: replica disposition refusing (health ok) +[1] + +# Test the `replicas` parameter: stores in this set bypass the disposition checks +# since they already hold the replica (their load should be in the mean regardless +# of whether they are accepting new replicas). + +# Reset stores to healthy. +set-store-status store-id=2 health=ok +---- +ok accepting all + +# Reset high disk on s2. +store-load-msg + store-id=2 node-id=2 load=[50,0,0] capacity=[200,100,100] load-time=0s +---- + +set-store-status store-id=3 replicas=ok +---- +ok accepting all + +# Make store 1 refuse replicas. +set-store-status store-id=1 replicas=refusing +---- +ok refusing=replicas + +# When not listed as already holding a replica, s1 is filtered out. +retain-ready-replica-target-stores-only in=(1,2,3) +---- +skipping s1 for replica transfer: replica disposition refusing (health ok) +[2 3] + +# With an existing replica on s1, s1 bypasses the disposition check and is retained. +# (It already has a replica, so its disposition is irrelevant for the mean computation.) +retain-ready-replica-target-stores-only in=(1,2,3) replicas=(1) +---- +[1 2 3] + +# Even if s1 is ill-disposed, it's not filtered out when it's the shedding store. +# +# TODO(tbg): this is an interesting case that I need to understand +# better, especially if we also consider excluding ALL current replicas +# from the check. +set-store-status store-id=1 health=unhealthy replicas=refusing +---- +unhealthy refusing=replicas + +retain-ready-replica-target-stores-only in=(1,2,3) replicas=(1) +---- +[1 2 3]
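For readers following the replica_disposition.txt expectations above, here is a minimal, self-contained Go sketch of the filtering the retain-ready-replica-target-stores-only directive exercises: keep a candidate store only if its replica disposition is accepting and its disk utilization is below the high-utilization threshold, while stores passed via the replicas parameter (they already hold a replica, e.g. the shedding store) bypass the checks so their load stays in the mean. The type names, signature, and 90% threshold below are assumptions for illustration and do not mirror the mmaprototype package's actual retainReadyReplicaTargetStoresOnly API; health is folded into the disposition, matching the test's note that an unhealthy store cannot have an accepting disposition in production.

package main

import "fmt"

// replicaDisposition is an illustrative stand-in for a store-level replica
// disposition (accepting, shedding, or refusing new replicas); it is not the
// mmaprototype package's real definition.
type replicaDisposition int

const (
	replicaDispositionOK replicaDisposition = iota
	replicaDispositionShedding
	replicaDispositionRefusing
)

// storeInfo is a hypothetical, trimmed-down view of a store for this sketch.
type storeInfo struct {
	id                 int
	replicaDisposition replicaDisposition
	diskFractionUsed   float64 // byte-size used / capacity
}

// retainReadyReplicaTargetStores keeps only stores that look ready to receive
// a new replica: an accepting replica disposition and disk utilization at or
// below the assumed 90% threshold. Stores in hasReplica already hold a replica
// of the range, so they bypass the checks entirely; their load has to stay in
// the mean computation either way.
func retainReadyReplicaTargetStores(in []storeInfo, hasReplica map[int]bool) []int {
	const highDiskUtilThreshold = 0.9 // assumed cutoff, per the ">90%" in the test header
	var out []int
	for _, s := range in {
		if !hasReplica[s.id] {
			if s.replicaDisposition != replicaDispositionOK {
				fmt.Printf("skipping s%d for replica transfer: replica disposition not ok\n", s.id)
				continue
			}
			if s.diskFractionUsed > highDiskUtilThreshold {
				fmt.Printf("skipping s%d for replica transfer: high disk utilization\n", s.id)
				continue
			}
		}
		out = append(out, s.id)
	}
	return out
}

func main() {
	stores := []storeInfo{
		{id: 1, replicaDisposition: replicaDispositionRefusing, diskFractionUsed: 0.5},
		{id: 2, replicaDisposition: replicaDispositionOK, diskFractionUsed: 0.95},
		{id: 3, replicaDisposition: replicaDispositionOK, diskFractionUsed: 0.5},
	}
	// Without an existing replica on s1: s1 (refusing) and s2 (high disk) are dropped.
	fmt.Println(retainReadyReplicaTargetStores(stores, nil)) // [3]
	// With an existing replica on s1, s1 bypasses the checks and is retained.
	fmt.Println(retainReadyReplicaTargetStores(stores, map[int]bool{1: true})) // [1 3]
}

The two calls in main mirror the shape of the test cases above: without the bypass set an ill-disposed or nearly full store is skipped with a log line, and with the bypass set the existing replica holder is always retained.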